## Test the chatbot models (TF-IDF similarity model vs. ChatterBot)

In [None]:
# load data
import csv
# read in raw data
# Column-wise storage of the CSV: column 0 -> match_all, 1 -> time_all,
# 2 -> slot_all, 3 -> raw_all (the message text used by the models below).
raw_all = []
match_all = []
time_all = []
slot_all = []
# newline='' hands line-ending handling to the csv module, as its
# documentation requires; without it, quoted fields that contain line breaks
# can be split into separate rows.
with open('clean_withid_noblanks_part2.csv', newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        match_all.append(row[0])
        time_all.append(row[1])
        slot_all.append(row[2])
        raw_all.append(row[3])
print(len(raw_all))

In [None]:
# train model with subset and test the model against another subset
# Sizes of the training subset and of the held-out test subset.
Ntrain = 10000
Ntest = 10000

# Training subset: records 0 .. Ntrain-1.
raw = raw_all[:Ntrain]
tt = time_all[:Ntrain]
slot = slot_all[:Ntrain]
match = match_all[:Ntrain]

# Test subset: the next Ntest records. Slice ends are exclusive, so the
# training slice stops at Ntrain-1 and the test slice must start at Ntrain;
# starting at Ntrain+1 silently dropped one record.
raw_test = raw_all[Ntrain:Ntrain+Ntest]
tt_test = time_all[Ntrain:Ntrain+Ntest]
slot_test = slot_all[Ntrain:Ntrain+Ntest]
match_test = match_all[Ntrain:Ntrain+Ntest]

In [None]:
# TF-IDF approach
import nltk
import numpy as np
import string # to process standard python strings
# Corpus the similarity model searches. Take a copy rather than an alias:
# response() appends every query to sent_tokens, and with a plain alias that
# would also modify the training list `raw`.
sent_tokens = list(raw)
# WordNet is a semantically-oriented dictionary of English included in NLTK;
# this lemmatizer is the one LemTokens() applies to every token.
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    """Return a list with the lemmatized form of each token, in input order.

    Uses the module-level WordNet lemmatizer `lemmer`.
    """
    return list(map(lemmer.lemmatize, tokens))
# Translation table mapping the code point of every character in
# string.punctuation to None, so str.translate() deletes those characters.
remove_punct_dict = str.maketrans('', '', string.punctuation)
def LemNormalize(text):
    """Tokenizer used by the TF-IDF vectorizer.

    Lower-cases *text*, strips punctuation, splits it into words with
    nltk.word_tokenize and returns the lemmatized tokens.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def response(user_response):
    """Return the corpus sentence most similar to *user_response*.

    The query is appended to the module-level list ``sent_tokens`` and is
    NOT removed again here; the caller is expected to drop it afterwards.
    A TF-IDF model is fitted on the whole list (query included) and the
    query row is compared with every row by cosine similarity.

    Returns the best-matching sentence, or '' when the best match has a
    similarity of exactly 0.
    """
    sent_tokens.append(user_response)
    vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    matrix = vectorizer.fit_transform(sent_tokens)
    # Similarity of the query (last row) against every row, itself included.
    scores = cosine_similarity(matrix[-1], matrix)
    # The top-ranked entry is expected to be the query itself, so the
    # second-ranked one is taken as the answer.
    best = scores.argsort()[0][-2]
    if scores[0, best] == 0:
        return ''
    return '' + sent_tokens[best]

In [None]:
# chatterbot
from chatterbot import ChatBot
# Open the ChatterBot instance named 'dotai'; it is not trained in this
# notebook (see the trailing note).
# NOTE(review): read_only=True is presumably ChatterBot's switch that stops
# the bot from learning from the inputs it receives during evaluation --
# confirm against the installed ChatterBot version.
chatbot = ChatBot('dotai', read_only=True) # trained in chatterbot.ipynb

In [None]:
# test the model
# Walk the held-out records looking for (prompt, reply) pairs: two consecutive
# rows with the same match id where the second row's time is less than `dt`
# after the first (time units are whatever the CSV stores -- TODO confirm).
Nrec = len(raw_test)
rec = 0
dt = 10.
response1 = []  # replies of the TF-IDF similarity model
response2 = []  # replies of chatterbot
response0 = []  # actual replies found in the data
while rec < Nrec-1:
    same_match = match_test[rec+1] == match_test[rec]
    if same_match and float(tt_test[rec+1]) < float(tt_test[rec])+dt:
        print(['Found conversation at rec: ',rec])
        user_response = raw_test[rec]
        response0.append(raw_test[rec+1])
        response1.append(response(user_response))
        # response() leaves the query appended at the end of sent_tokens.
        # Pop exactly that entry: list.remove() would delete the FIRST equal
        # string instead, reordering the corpus whenever the same message
        # already occurs in it, and would scan the whole list on every call.
        sent_tokens.pop()
        response2.append(chatbot.get_response(user_response))
        # Both rows of the pair are consumed.
        rec = rec + 2
    else:
        rec = rec + 1

# save responses in txt files
# Write each list of replies to its own text file, one reply per line.
for out_path, replies in (
    ('response0.txt', response0),
    ('response1.txt', response1),
    ('response2.txt', response2),
):
    with open(out_path, 'w') as out:
        out.writelines("%s\n" % reply for reply in replies)

In [None]:
# load saved data
# Read the three reply files back, one list entry per line.
loaded = []
for in_path in ('response0.txt', 'response1.txt', 'response2.txt'):
    with open(in_path) as src:
        loaded.append(src.read().splitlines())
response0, response1, response2 = loaded

In [None]:
# check how many responses from the TF-IDF approach are valid (non-empty)
# A response counts as valid when it is a non-empty string.
Nr = sum(1 for reply in response1 if reply)
total = len(response1)
print(str(Nr)+' valid response out of '+str(total)+' for similarity model')
# Fraction of empty responses; guarded so that an empty response list
# (no conversation found) does not raise ZeroDivisionError.
if total:
    print((total-Nr)/total)
else:
    print('no responses to evaluate')

In [None]:
# word count
# Flat, lower-cased word lists for the real replies (word0), the similarity
# model (word1) and chatterbot (word2).
word0 = [tok for line in response0 for tok in nltk.word_tokenize(line.lower())]
word1 = [tok for line in response1 for tok in nltk.word_tokenize(line.lower())]
word2 = [tok for line in response2 for tok in nltk.word_tokenize(line.lower())]

from collections import Counter
# Word frequencies per source.
C0, C1, C2 = Counter(word0), Counter(word1), Counter(word2)

# The five most frequent words in the real replies, and the share (in %)
# each of them has in the three word lists.
top_data = C0.most_common(5)
word_sort = [word for word, _ in top_data]
frac0_sort = [count/len(word0)*100 for _, count in top_data]
frac1_sort = [C1[word]/len(word1)*100 for word in word_sort]
frac2_sort = [C2[word]/len(word2)*100 for word in word_sort]
print(top_data)
print(C1.most_common(5))
print(C2.most_common(5))

# Dump every (word, count) pair, most frequent first, to one CSV per source.
# The files are opened in a `with` block so they are flushed and closed
# deterministically (the bare open() calls were never closed), and with
# newline='' as the csv module requires for files it writes to.
for count_path, counts in (
    ("count0.csv", C0),
    ("count1.csv", C1),
    ("count2.csv", C2),
):
    with open(count_path, "w", newline="") as out:
        w = csv.writer(out)
        for key, val in counts.most_common():
            w.writerow([key, val])

In [None]:
# plot word frequencies as a grouped bar plot
import matplotlib.pyplot as plt 
# Grouped bar chart: for each of the top words, three bars side by side
# (data, similarity model, chatterbot) sharing a total width of 0.8.
name_list = word_sort
x = list(range(len(word_sort)))
total_width, n = 0.8, 3
width = total_width / n

plt.bar(x, frac0_sort, width=width, label='data', fc='y')
# Shift the bar positions by one bar width for each following series.
x = [pos + width for pos in x]
plt.bar(x, frac1_sort, width=width, label='similarity model', tick_label=name_list, fc='r')
x = [pos + width for pos in x]
plt.bar(x, frac2_sort, width=width, label='chatterbot', tick_label=name_list, fc='b')
plt.ylabel('frequency (%)')
plt.xlabel('word')
plt.legend()
# Save BEFORE show(): once show() returns the current figure may already be
# closed or cleared (it is with the notebook inline backend), in which case
# savefig() would write a blank image.
plt.savefig('top5_frequency.png')
plt.show()