In [1]:
import re
import pandas as pd
import numpy as np
from nltk.stem import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import nltk

# Resources needed further down the notebook:
#   * 'punkt' / 'punkt_tab' - tokenizer models used by nltk.word_tokenize.
#     Recent NLTK releases (3.9+) load 'punkt_tab' instead of the pickled
#     'punkt' package, so both are fetched to work on old and new versions.
#   * 'wordnet' - corpus backing WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akank\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the context/response pairs the bot answers from.
dialogues = pd.read_excel(io="dialog_talk_agent.xlsx")

In [4]:
# Fill every NaN with the last non-missing value above it in the same column
# (the previous context or response).
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas 2.x; rebinding instead of inplace=True keeps the cell
# free of hidden in-place mutation.
dialogues = dialogues.ffill()

In [5]:
# Preview the first five rows of the filled table.
dialogues.head(n=5)

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,I can help you work smarter instead of harder
3,Describe yourself,I can help you work smarter instead of harder
4,tell me about yourself,I can help you work smarter instead of harder


In [6]:
# function that performs text normalization steps

def text_normalization(text):
    """Return ``text`` lower-cased, stripped to letters, and lemmatized.

    Steps:
      1. Coerce the input to ``str`` and lower-case it.
      2. Delete every character that is not ``a``-``z`` or a space. Characters
         are removed, not replaced by a space, so "can't" becomes "cant".
      3. Tokenize with ``nltk.word_tokenize``.
      4. Lemmatize each token with WordNet (default part of speech) and join
         the lemmas back into one space-separated string.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^ a-z]', '', lowered)
    words = nltk.word_tokenize(letters_only)
    lemmatize = wordnet.WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, words))


In [7]:
# Overwrite the Context column with its normalized (lower-cased, lemmatized) form.
dialogues['Context'] = dialogues['Context'].map(text_normalization)

In [8]:
# Preview the first five normalized contexts.
dialogues['Context'].head(n=5)

0    tell me about your personality
1         i want to know you better
2                   define yourself
3                 describe yourself
4            tell me about yourself
Name: Context, dtype: object

In [9]:
# Copy the normalized questions and the raw responses out as NumPy arrays.
questions, answers = (
    np.array(dialogues[column]) for column in ('Context', 'Text Response')
)

In [10]:
# One flat corpus: all questions followed by all answers.
text = np.concatenate([questions, answers], axis=0)

In [11]:
# Learn the vocabulary and IDF weights from every question and answer.
# fit() returns the vectorizer itself, so x_tfidf is the same fitted object as
# `vectorizer` - it is not a TF-IDF matrix.
vectorizer = TfidfVectorizer()
x_tfidf = vectorizer.fit(raw_documents=text)

In [12]:
# Display the fitted vectorizer (x_tfidf is the object returned by vectorizer.fit).
x_tfidf    

TfidfVectorizer()

In [13]:
# Encode each normalized question as one row of a sparse TF-IDF matrix.
Question_vectors = vectorizer.transform(raw_documents=dialogues['Context'])

In [14]:
# Display the sparse matrix summary (rows = questions, columns = vocabulary terms).
Question_vectors

<1592x729 sparse matrix of type '<class 'numpy.float64'>'
	with 5060 stored elements in Compressed Sparse Row format>

In [None]:
# Interactive chat loop: answer each typed line with the response attached to
# the most similar stored question (cosine similarity over TF-IDF vectors).
print('You can start chatting with me now')
print("(type 'quit' or 'exit' to stop)")
while True:
    # read user input; leave the loop cleanly when the input stream is closed
    # or the kernel is interrupted instead of surfacing a traceback
    try:
        input_question = input()
    except (EOFError, KeyboardInterrupt):
        break

    # explicit way out of the otherwise endless loop
    if input_question.strip().lower() in {'quit', 'exit'}:
        break

    # Normalize the input exactly like the stored questions were normalized;
    # otherwise lemmatized vocabulary ("minute") never matches raw input
    # ("minutes") and apostrophes are tokenized differently.
    normalized_question = text_normalization(input_question)
    input_question_vector = vectorizer.transform([normalized_question])

    # compute similarities; result has shape (1, number_of_questions)
    similarities = cosine_similarity(input_question_vector, Question_vectors)

    # index of the closest question; axis=1 because similarities is a 1-row matrix
    closest = int(np.argmax(similarities, axis=1)[0])

    # A best score of 0 means no vocabulary overlap at all; argmax would then
    # silently pick row 0, so report the miss instead of a random answer.
    if similarities[0, closest] <= 0:
        print("BOT:Sorry, I didn't understand that.")
        continue

    # f-string formatting also copes with non-string cells in the sheet
    print(f"BOT:{dialogues['Text Response'].iloc[closest]}")
    

You can start chatting with me now
Hi
BOT:Hey!
Let's have a discussion
BOT:Talking is what I do best.
Can we be friends?
BOT:Of course we are.
Great 
BOT:Terrific!
Where do you live?
BOT:The virtual world is my playground. I'm always just a few clicks away.
When is your birthday?
BOT:Wait a minute. Are you planning a surprise party for me? I love surprises! I'll pretend you didn't say anything.
You are a genious
BOT:Thanks! The feeling is mutual.
I'll be back in a few minutes
BOT:I'll be waiting.
I am busy
BOT:I understand. If I can help you with your work, please let me know.
I cant sleep
BOT:Maybe some music would help. Try listening something relaxing.
