In [8]:
import nltk
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity      
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# import spacy
# Shared WordNet lemmatizer instance; preprocess_text() calls lemmatizer.lemmatize on every kept token
lemmatizer = nltk.stem.WordNetLemmatizer()
# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Load the tab-separated dialogue pairs. header=None means the file carries no
# header row, so the two positional columns (0 and 1) are given readable names.
# na_filter=False disables pandas' missing-value detection on read.
data = pd.read_csv(
    "dialogs.txt",
    sep='\t',
    na_filter=False,
    header=None,
).rename(columns={0: 'Question', 1: 'Answer'})
data.sample(10)

Unnamed: 0,Question,Answer
1473,"some of it is okay, i guess.","yes, the poems that rhyme and are easy to reme..."
3041,me too.,where does cheese come from?
362,"no, i couldn't make it.",you missed a really good game.
2141,they still make movies like that.,"yes, but they never make much money."
3403,"well, we have a new president.",but we have the same old problems.
1472,i don't know anyone who likes it.,"some of it is okay, i guess."
3287,that means it gets twice as much traffic.,you're right.
2629,those books will slip and you'll fall.,it's only a couple of feet.
1834,where's the car?,what do you mean?
221,thank you.,i'm so happy for you.


Here, the next step is to tokenize our text dataset.<br>
There are two types of tokenization:
    <ol><li>Word Tokenization: This is the process of breaking down a text or document into individual words or tokens.</li>
    <li>Sentence Tokenization: This is the process of breaking down the text data into individual sentences so that each sentence can be processed separately.</li></ol><br>
Lemmatization: The goal of lemmatization is to reduce a word to its canonical form so that variations of the same word can be treated as the same token.<br>
For example, the word "jumped" may be lemmatized to "jump", and the word "walking" may be lemmatized to "walk".<br>
By reducing words to their base forms, lemmatization can help to simplify text data and reduce the number of unique tokens that need to be analyzed or processed.


In [10]:
# Print the frame summary (column names, non-null counts, dtypes, memory usage)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3722 entries, 0 to 3721
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  3722 non-null   object
 1   Answer    3722 non-null   object
dtypes: object(2)
memory usage: 58.3+ KB


In [11]:

# Define a function for text preprocessing (including lemmatization)
def preprocess_text(text):
    """Normalise raw text into a single space-separated string of lemmas.

    The text is split into sentences, each sentence into word tokens. Tokens
    that are not purely alphanumeric (punctuation, contraction fragments such
    as "'m") are discarded; the remaining ones are lower-cased and passed
    through the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw input text (a question from the dataset or a user message).

    Returns
    -------
    str
        The cleaned tokens of each sentence joined by a space, with the
        per-sentence strings themselves joined by a space.
    """

    def _clean_sentence(sentence):
        # Keep alphanumeric tokens only, then lower-case and lemmatize them.
        kept = (tok for tok in nltk.word_tokenize(sentence) if tok.isalnum())
        return ' '.join(lemmatizer.lemmatize(tok.lower()) for tok in kept)

    # Sentences are cleaned one by one and re-joined, so a sentence with no
    # surviving tokens still contributes an empty string to the join.
    return ' '.join(_clean_sentence(sent) for sent in nltk.sent_tokenize(text))


# Run every question through preprocess_text and store the result in a new column.
# Despite the column name, each value is one space-joined string of lemmas
# (preprocess_text returns ' '.join(...)), not a list of tokens.
data['tokenized Question'] = data['Question'].apply(preprocess_text)
data.head()

Unnamed: 0,Question,Answer,tokenized Question
0,"hi, how are you doing?",i'm fine. how about yourself?,hi how are you doing
1,i'm pretty good. thanks for asking.,,i pretty good thanks for asking
2,no problem. so how have you been?,,no problem so how have you been
3,i've been great. what about you?,i've been good. i'm in school right now.,i been great what about you
4,what school do you go to? i go to pcc.,,what school do you go to i go to pcc


In [12]:
# Create the corpus: a plain Python list holding one preprocessed question string per row
corpus = data['tokenized Question'].tolist()

In [13]:
# Vectorize corpus
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(corpus)
# TF-IDF (term frequency-inverse document frequency) is a numerical statistic used to evaluate how important a word is to a document in a collection or corpus.
# The TfidfVectorizer learns the vocabulary from the corpus and builds a matrix X in which each row represents a document (here: one question) and each column represents a word.
# The cell values in the matrix are the TF-IDF weights, i.e. the importance of each word in each document.

In [14]:
def get_response(user_input, fallback="Sorry, I did not understand that. Could you rephrase?"):
    """Return the stored answer whose question is most similar to ``user_input``.

    The input is preprocessed with ``preprocess_text``, vectorized with the
    fitted ``tfidf_vectorizer`` and compared against every row of the corpus
    matrix ``X`` using cosine similarity.

    Parameters
    ----------
    user_input : str
        Raw message typed by the user.
    fallback : str, optional
        Reply returned when the input shares no vocabulary term with the
        corpus, i.e. when every similarity score is zero.

    Returns
    -------
    str
        The ``Answer`` of the best-matching row of ``data``, or ``fallback``
        when nothing matches.

    Side effects
    ------------
    Sets the module-level ``most_similar_index`` to the argmax of the
    similarity scores (kept because the original exposed it globally).
    """
    global most_similar_index

    # Preprocess the user's input using the preprocess_text function
    user_input_processed = preprocess_text(user_input)

    # Vectorize the preprocessed user input using the fitted TF-IDF vectorizer
    user_input_vector = tfidf_vectorizer.transform([user_input_processed])

    # Similarity between the user vector and every question vector: shape (1, n_questions)
    similarity_scores = cosine_similarity(user_input_vector, X)

    # Index of the most similar question in the corpus
    most_similar_index = similarity_scores.argmax()

    # If no input term is known to the vectorizer, every score is 0 and argmax
    # silently yields 0, which would answer with the first row's unrelated
    # answer. Report that nothing matched instead.
    if similarity_scores.max() <= 0:
        return fallback

    # Retrieve the corresponding answer from the DataFrame
    return data['Answer'].iloc[most_similar_index]

# Canned greeting messages; one is picked at random when the user says hello.
# Every entry must be followed by a comma: adjacent string literals are
# concatenated by Python, which previously merged two greetings into one.
greetings = [
    "Hi.... This is the voice of the guy Abinibee! .... I'm ready to help",
    "Hello bros.... How you dey",
    'Respect!, wetin dey happen nah',
    'How far my blood, wetin dey sup',
    "Good Day .... How can I help",
    "Hello There... How can I be useful to you today",
    "Hi Abinibee fam.... Any show for me?",
]

# User inputs (compared lower-cased) that end the conversation
exits = ['thanks bye', 'bye', 'quit', 'exit', 'bye bye', 'close']
# Messages the bot may use to say goodbye
farewell = ['Thanks....see you soon', 'Babye, See you soon', 'Bye... See you later', 'Bye... come back soon']

random_farewell = random.choice(farewell)  # Randomly select a farewell message from the list
random_greetings = random.choice(greetings)  # Randomly select a greeting message from the list

# Test your chatbot: read user messages until an exit phrase is typed.
greeting_inputs = ['hi', 'hello', 'hey', 'hi there']
while True:
    user_input = input("You: ")
    # Normalise once so that "Bye " or " HELLO" still match the keyword lists
    normalized_input = user_input.strip().lower()
    if normalized_input in exits:
        print(f"\nChatbot: {random_farewell}!")
        break
    if not normalized_input:
        # Nothing was typed; prompt again instead of querying the model with an empty string
        continue
    if normalized_input in greeting_inputs:
        # Draw a fresh greeting on every turn; a single pre-drawn message
        # would repeat for the whole session
        print(f"\nChatbot: {random.choice(greetings)}!")
    else:
        response = get_response(user_input)
        print(f"\nChatbot: {response}")


Chatbot: i'm fine. how about yourself?

Chatbot: all right, see you.

Chatbot: i'm attending pcc right now.


In [None]:
<h3><b>Model Technique</b></h3> <hr>

In [None]:
# Fit a fresh TF-IDF vectorizer on the preprocessed questions.
# Note: this rebinds the tfidf_vectorizer name that the earlier retrieval cells also use.
tfidf_vectorizer = TfidfVectorizer()
xtrain = tfidf_vectorizer.fit_transform(data['tokenized Question'])
# xtrain is the TF-IDF feature matrix built from the preprocessed questions (one row per question)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps each distinct answer string to an integer class id
le = LabelEncoder()

# Transform the Y: fit the encoder on the answers and store the ids next to them
data['Answer_ID'] = le.fit_transform(data['Answer'])
data.head()  # not the last expression of the cell, so nothing is displayed here

ytrain = data['Answer_ID'].values
# ytrain holds the integer-encoded answers (the classification targets)

In [None]:
# Preview the first rows of the frame (includes Answer_ID once the encoding cell has run)
data.head()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Multinomial Naive Bayes on the TF-IDF features; each distinct answer is one class
mnb = MultinomialNB()
mnb.fit(xtrain, ytrain)

# Random forest trained on the same data (fitted here, not evaluated in this cell)
rf = RandomForestClassifier()
rf.fit(xtrain, ytrain)

# Predictions on the training data itself: the report below measures how well
# the model reproduces its training set, not how well it generalises.
train_predict = mnb.predict(xtrain)

# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported against the wrong reference.
print(classification_report(ytrain, train_predict))

In [None]:
def get_response(user_input):
    """Return the answer predicted by the Naive Bayes classifier for ``user_input``.

    The input is preprocessed with ``preprocess_text`` and vectorized with the
    fitted ``tfidf_vectorizer``; ``mnb`` then predicts an encoded answer id,
    which is decoded back to its answer text with the fitted label encoder
    ``le``.

    Parameters
    ----------
    user_input : str
        Raw message typed by the user.

    Returns
    -------
    str
        The predicted answer text. (Previously the function only printed an
        array and returned ``None``, so callers displayed "None".)

    Side effects
    ------------
    Sets the module-level ``results`` to the raw prediction array (kept
    because the original exposed it globally).
    """
    global results

    # Preprocess the user's input using the preprocess_text function
    user_input_processed = preprocess_text(user_input)

    # Vectorize the preprocessed user input using the TF-IDF vectorizer
    user_input_vector = tfidf_vectorizer.transform([user_input_processed])

    # One encoded answer id per input row; exactly one row is passed in
    results = mnb.predict(user_input_vector)

    # Decode the id with the encoder that produced it, instead of scanning
    # every column of the DataFrame for a matching value
    return le.inverse_transform(results)[0]

# Canned greeting messages; one is picked at random when the user says hello.
# Every entry must be followed by a comma: adjacent string literals are
# concatenated by Python, which previously merged two greetings into one.
greetings = [
    "Hi.... This is the voice of the guy Abinibee! .... I'm ready to help",
    "Hello bros.... How you dey",
    'Respect!, wetin dey happen nah',
    'How far my blood, wetin dey sup',
    "Good Day .... How can I help",
    "Hello There... How can I be useful to you today",
    "Hi Abinibee fam.... Any show for me?",
]

# User inputs (compared lower-cased) that end the conversation
exits = ['thanks bye', 'bye', 'quit', 'later nah', 'exit', 'bye bye', 'close']
# Messages the bot may use to say goodbye
farewell = ['Thanks....see you soon', 'Babye, See you soon', 'Bye... See you later', 'Bye... come back soon']

random_farewell = random.choice(farewell)  # Randomly select a farewell message from the list
random_greetings = random.choice(greetings)  # Randomly select a greeting message from the list

# Test your chatbot: read user messages until an exit phrase is typed.
greeting_inputs = ['hi', 'hello', 'hey', 'hi there']
while True:
    user_input = input("You: ")
    # Normalise once so that "Bye " or " HELLO" still match the keyword lists
    normalized_input = user_input.strip().lower()
    if normalized_input in exits:
        print(f"\nChatbot: {random_farewell}!")
        break
    if not normalized_input:
        # Nothing was typed; prompt again instead of querying the model with an empty string
        continue
    if normalized_input in greeting_inputs:
        # Draw a fresh greeting on every turn; a single pre-drawn message
        # would repeat for the whole session
        print(f"\nChatbot: {random.choice(greetings)}!")
    else:
        response = get_response(user_input)
        print(f"\nChatbot: {response}")