<a href="https://colab.research.google.com/github/benschlup/csck507_team_a/blob/main/CSCK507_Team_A_ChatBot_THREE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSCK507 Natural Language Processing
## Team A

Inspired by https://medium.com/swlh/how-to-design-seq2seq-chatbot-using-keras-framework-ae86d950e91d

Additional material to review and potentially cite:
Khin, N.N., Soe, K.M., 2020. Question Answering based University Chatbot using Sequence to Sequence Model, in: 2020 23rd Conference of the Oriental COCOSDA (O-COCOSDA). doi:10.1109/o-cocosda50338.2020.9295021



In [None]:
# Imports
import codecs
import io
import os
import re
import tarfile
import urllib.request
import yaml
import random

import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from tensorflow.keras import Input, Model
from tensorflow.keras.activations import softmax
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras_preprocessing.text import Tokenizer

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


In [None]:
# Expose only the first CUDA device (index 0) to this runtime.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [None]:
# Check what GPU we have in place
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Fetch the CMU Question-Answer dataset archive into the working directory.
dataset_url = "https://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz"
archive_name = "Question_Answer_Dataset_v1.2.tar.gz"
urllib.request.urlretrieve(dataset_url, archive_name)

('Question_Answer_Dataset_v1.2.tar.gz',
 <http.client.HTTPMessage at 0x7f5fddc6fd90>)

In [None]:
# Extract the downloaded archive into the working directory.
# The context manager closes the archive even if extraction fails part-way.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# on Python versions that support it, consider passing filter='data'.
with tarfile.open('Question_Answer_Dataset_v1.2.tar.gz') as archive:
    archive.extractall('.')

In [None]:
# Import questions and answers from all courses in Spring 2008, 2009 and 2010 respectively.
# The per-course frames are collected first and concatenated once: calling
# pd.concat inside the loop would copy all previously read rows on every pass.
course_frames = []
for course in ['S08', 'S09', 'S10']:
    print(f'Reading questions and answers from course {course}')
    course_qa_df = pd.read_csv(
        f'./Question_Answer_Dataset_v1.2/{course}/question_answer_pairs.txt',
        sep='\t',
        encoding='ISO-8859-1',
    )
    # Remember which course each question/answer pair came from
    course_qa_df['course'] = course
    course_frames.append(course_qa_df)

# Row labels restart at 0 for every course (the index is deliberately left as read).
qa_df = pd.concat(course_frames)

        

Reading questions and answers from course S08
Reading questions and answers from course S09
Reading questions and answers from course S10


In [None]:
# Remove rows lacking an answer or a question: the normalisation step that
# follows applies re.sub to both columns and would fail on a missing value.
# .copy() makes qa_df an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
qa_df = qa_df.dropna(subset=['Question', 'Answer']).copy()

In [None]:
## Remove duplicates
## Add the length of the answer to the dataframe
#qa_df['answer_length'] = qa_df['Answer'].str.len()
## Sort the dataframe to have the longest answer per question at the top
#qa_df.sort_values(['Question', 'answer_length'], inplace=True)
## Remove duplicated questions, retaining only the longest answer
#qa_df.drop_duplicates(subset=['Question'], keep='last', inplace=True)


In [None]:
# Punctuation removed from both questions and answers before tokenisation
punctuation_pattern = re.compile(r"[-()\"#/@;:<>{}`+=~|.!?,]")


def _strip_and_lower(text):
    """Return `text` with the punctuation above removed, in lower case."""
    return punctuation_pattern.sub("", text).lower()


# Normalised questions
qa_df['norm_question'] = [_strip_and_lower(question) for question in qa_df['Question']]

# Normalised answers, wrapped in the start/stop markers the decoder is trained on.
# The stop marker is appended before normalisation and is therefore lower-cased;
# the start marker is prepended afterwards and keeps its upper case.
qa_df['norm_answer'] = [
    '_START_ ' + _strip_and_lower(answer + ' _STOP_')
    for answer in qa_df['Answer']
]

In [None]:
# Set-up model: tokenise the data, build the padded training tensors and define
# the encoder-decoder (seq2seq) training graph.

# Sizes of the learned representations
EMBEDDING_DIM = 200
LSTM_UNITS = 200

# Filter for tokenizer:
# Reconsider adding numbers to filter later, as encoding of numbers may create excessive vocabulary
# Check reference https://arxiv.org/abs/2103.13136
# Despite its name this is not a regular expression but the set of characters
# the tokenizer filters out of the texts. The backslash is written as '\\'
# because '\]' is an invalid escape sequence in a normal string literal.
target_regex = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\''
tokenizer = Tokenizer(filters=target_regex)
# One shared vocabulary for questions and answers (element-wise string concatenation)
tokenizer.fit_on_texts(qa_df['norm_question'] + qa_df['norm_answer'])
# +1 because index 0 is never assigned to a word; it is used for padding
VOCAB_SIZE = len(tokenizer.word_index) + 1

# Encoder input: questions, zero-padded at the end to the longest question
tokenized_questions = tokenizer.texts_to_sequences(qa_df['norm_question'])
maxlen_questions = max(len(x) for x in tokenized_questions)
encoder_input_data = pad_sequences(tokenized_questions, maxlen=maxlen_questions, padding='post')

print(f'Encoder input data shape: {encoder_input_data.shape}')

# Decoder input: answers including the leading start marker
tokenized_answers = tokenizer.texts_to_sequences(qa_df['norm_answer'])
maxlen_answers = max(len(x) for x in tokenized_answers)
decoder_input_data = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
print(decoder_input_data.shape)

# Decoder target: the same answers shifted one step to the left (start marker
# dropped), so that at every position the model learns to predict the next word.
# The targets stay integer word ids and are trained with the sparse variant of
# the cross-entropy loss. One-hot encoding them would need
# samples x maxlen_answers x VOCAB_SIZE floats, which runs into gigabytes here.
tokenized_answers = [ta[1:] for ta in tokenized_answers]
padded_answers = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
decoder_output_data = padded_answers

print(decoder_output_data.shape)

# Encoder: embeds the question and keeps only the final LSTM states
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(enc_inputs)
_, state_h, state_c = LSTM(LSTM_UNITS, return_state=True)(enc_embedding)
enc_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits a distribution over the
# vocabulary at every time step
dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(LSTM_UNITS, return_state=True, return_sequences=True)
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=enc_states)
dec_dense = Dense(VOCAB_SIZE, activation=softmax)
output = dec_dense(dec_outputs)

model = Model([enc_inputs, dec_inputs], output)
model.compile(optimizer=RMSprop(), loss='sparse_categorical_crossentropy')

model.summary()



Encoder input data shape: (3422, 44))
(3422, 158)
(3422, 158, 5701)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 200)    1140200     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 200)    1140200     ['input_2[0][0]']                
                          

In [None]:
# Model training: the encoder receives the questions, the decoder receives the
# answers and is trained against the target tensor prepared in the set-up cell.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data,
    batch_size=50,
    epochs=100,
)
#model.save('/content/drive/MyDrive/CSCK507_Team_A/qa_model.h5')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Prepare models for inferencing (separate encoder, decoder)
#model.load_weights('/content/drive/MyDrive/CSCK507_Team_A/qa_model.h5')

def make_inference_models():
    """Build the standalone encoder and decoder models used for inference.

    Both models reuse the layers of the training graph defined at notebook
    level: ``enc_inputs``/``enc_states`` for the encoder and ``dec_inputs``,
    ``dec_embedding``, ``dec_lstm`` and ``dec_dense`` for the decoder. A summary
    of each model is printed.

    Returns:
        tuple: ``(enc_model, dec_model)``.
            ``enc_model`` maps a question sequence to the encoder states
            ``[state_h, state_c]``.
            ``dec_model`` maps ``[token sequence, state_h, state_c]`` to
            ``[word probabilities, state_h, state_c]``, so the returned states
            can be fed back in for the next decoding step.
    """
    # Take the state size from the decoder LSTM itself so that it cannot drift
    # from the layer definition.
    state_size = dec_lstm.units
    dec_state_input_h = Input(shape=(state_size,))
    dec_state_input_c = Input(shape=(state_size,))
    dec_states_inputs = [dec_state_input_h, dec_state_input_c]

    # Same LSTM and Dense layers as in training, but the initial state is now
    # an explicit model input instead of being wired to the encoder.
    lstm_outputs, state_h, state_c = dec_lstm(dec_embedding,
                                              initial_state=dec_states_inputs)
    dec_states = [state_h, state_c]
    word_probabilities = dec_dense(lstm_outputs)

    dec_model = Model(
        inputs=[dec_inputs] + dec_states_inputs,
        outputs=[word_probabilities] + dec_states)
    print('Inference decoder:')
    dec_model.summary()

    enc_model = Model(inputs=enc_inputs, outputs=enc_states)
    print('Inference encoder:')
    enc_model.summary()
    return enc_model, dec_model


# If the training data is ever lemmatised, the same lemmatisation has to be
# applied here as well.
def str_to_tokens(sentence):
    """Convert a raw question into a padded sequence of word ids for the encoder.

    The sentence is normalised like the training questions (punctuation removed,
    lower-cased) and then split by the fitted ``tokenizer`` itself, so a word is
    broken up exactly as it was during training (for example at an apostrophe).
    Words that are not in the vocabulary are skipped.

    Args:
        sentence (str): The question as typed by the user.

    Returns:
        numpy.ndarray: Array of shape ``(1, maxlen_questions)`` holding the word
        ids, zero-padded at the end.
    """
    normalized = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", sentence).lower()
    token_ids = tokenizer.texts_to_sequences([normalized])

    return pad_sequences(token_ids,
                         maxlen=maxlen_questions,
                         padding='post')


# Build the inference encoder and decoder once; the evaluation and chat cells call their predict().
enc_model, dec_model = make_inference_models()



Inference decoder:
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 200)    1140200     ['input_2[0][0]']                
                                                                                                  
 input_9 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, 200)]        0           []                               
                                                                         

In [None]:
# Evaluate the chatbot: pick random questions, generate an answer for each and
# score it against all reference answers of that question with word-level BLEU.
# NOTE: the questions are drawn from the data the model was trained on, so the
# scores show how well the answers were memorised, not how well the model
# generalises.
NUM_EVAL_QUESTIONS = 100
MAX_BLEU_ORDER = 4


def _decode_answer(question):
    """Greedily decode the model's answer to `question`.

    The encoder states seed the decoder, which is then fed its own most likely
    word step by step until it emits the stop marker. Decoding also ends after
    `maxlen_answers` steps or when the predicted id has no word (id 0, the
    padding value), so the loop always terminates.

    Args:
        question (str): The raw question text.

    Returns:
        str: The predicted answer as space-separated words, without markers.
    """
    states_values = enc_model.predict(str_to_tokens(question), verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['start']

    answer_words = []
    for _ in range(maxlen_answers):
        dec_outputs, h, c = dec_model.predict([target_seq] + states_values,
                                              verbose=0)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        # Direct id -> word lookup instead of scanning the whole vocabulary
        sampled_word = tokenizer.index_word.get(sampled_word_index)
        if sampled_word is None or sampled_word == 'stop':
            break
        answer_words.append(sampled_word)

        # Feed the predicted word and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    return ' '.join(answer_words)


questions = qa_df['Question'].to_list()
# Seeded generator: the same questions are evaluated on every run.
# Sampling is without replacement, so no row is scored twice.
rng = random.Random(42)
rand_integers = rng.sample(range(len(questions)),
                           k=min(NUM_EVAL_QUESTIONS, len(questions)))
# method1 replaces zero n-gram counts by a small epsilon instead of warning
smoothing = SmoothingFunction().method1
bleu_total = 0

for i in rand_integers:
    decoded_translation = _decode_answer(questions[i])

    print(f'Original question: {questions[i]}')
    print(f'Predicted answer: {decoded_translation}')

    # All answers recorded for this question. They are split with the fitted
    # tokenizer so that references and prediction use the same words; the
    # first and last ids are the start/stop markers and are dropped.
    norm_answers = qa_df.loc[qa_df['Question'] == questions[i], 'norm_answer'].to_list()
    reference_tokens = [
        [tokenizer.index_word[token_id] for token_id in sequence[1:-1]]
        for sequence in tokenizer.texts_to_sequences(norm_answers)
    ]
    reference_answers = [' '.join(tokens) for tokens in reference_tokens]
    print(f'{reference_answers}')

    # sentence_bleu expects lists of tokens; passing plain strings would make
    # it compare characters instead of words.
    hypothesis_tokens = decoded_translation.split()
    if hypothesis_tokens and any(reference_tokens):
        # Cap the n-gram order at the answer length: a one-word answer such as
        # 'yes' has no 2-grams and would otherwise score close to 0 even when
        # it matches a reference exactly.
        order = min(MAX_BLEU_ORDER, len(hypothesis_tokens))
        weights = tuple(1.0 / order for _ in range(order))
        bleu_score = sentence_bleu(reference_tokens, hypothesis_tokens,
                                   weights=weights,
                                   smoothing_function=smoothing)
    else:
        bleu_score = 0.0
    print(f'Bleu score: {bleu_score}\n')
    bleu_total += bleu_score

if rand_integers:
    print(f'Bleu average = {bleu_total/len(rand_integers)}')
else:
    print('Bleu average = n/a (no questions to evaluate)')
    

Original question: Are all spoken varieties of Chinese tonal and analytical?
Predicated answer: yes
['yes', 'yes']
Bleu score: 1.0

Original question: Did Tesla win the Nobel Prize?
Predicated answer: no
['no']
Bleu score: 1.0



Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Original question: Who did Newton see as the master creator?
Predicated answer: god
['god', 'newton saw god as the master creator whose existence could not be denied in the face of the grandeur of all creation']
Bleu score: 1.0

Original question: Was faraday `s  earliest  chemical  work as an assistant  to Davy?
Predicated answer: yes
['yes']
Bleu score: 1.0

Original question: Are drums often used in music therapy?
Predicated answer: yes
['yes', 'yes']
Bleu score: 1.0

Original question: What may happen to red fire ants if we use boiling water on the queen?
Predicated answer: nests of red fire ants may be destroyed
['nests of red fire ants may be destroyed', 'die']
Bleu score: 1.0

Original question: Is Taipei in a valley?
Predicated answer: yes
['taipei is in the valleys of the keelung and xindian rivers', 'yes']
Bleu score: 1.0

Original question: Did he become chief engineer in the Department of Bridges and Highways in 1892?
Predicated answer: no
['no', 'yes']
Bleu score: 1.0

Ori

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Original question: Where is smoked eel considered a delicacy?
Predicated answer: northern germany the netherlands denmark sweden
['northern germany the netherlands denmark sweden', 'smoked eel is considered a delicacy in northern germany the netherlands denmark and sweden']
Bleu score: 1.0

Original question: What is Ottawa's junior ice hockey team?
Predicated answer: the middle states has the west is the center point of the center of saint lawrence river
["the ottawa 67's"]
Bleu score: 0.040050763315923193

Original question: Was Ford active about Vietnamese affairs?
Predicated answer: yes
['no', 'yes']
Bleu score: 1.0

Original question: How many seasons does Kuala Lumpur experience?
Predicated answer: 1
['1']
Bleu score: 1.0

Original question: What happened in 1764?
Predicated answer: adams married abigail smith
['adams married abigail smith']
Bleu score: 1.0

Original question: Was Grover Cleveland elected Sheriff of Erie County, New York?
Predicated answer: yes
['yes']
Bleu score

In [None]:
# Interactive chat: read a question, greedily decode the model's answer and
# print it, until the user enters 'end'.
while True:
    question = input('Ask me something, or enter \'end\' to stop: ')
    # Tolerate surrounding blanks and capitalisation in the stop command
    if question.strip().lower() == 'end':
        break
    states_values = enc_model.predict(str_to_tokens(question), verbose=0)
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']

    answer_words = []
    # Bounded loop: decoding stops at the stop marker, after maxlen_answers
    # steps, or when the predicted id has no word (id 0, the padding value).
    for _ in range(maxlen_answers):
        dec_outputs, h, c = dec_model.predict([empty_target_seq]
                                              + states_values,
                                              verbose=0)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        # Direct id -> word lookup instead of scanning the whole vocabulary
        sampled_word = tokenizer.index_word.get(sampled_word_index)
        if sampled_word is None or sampled_word == 'stop':
            break
        answer_words.append(sampled_word)

        # Feed the predicted word and the updated states into the next step
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    decoded_translation = ' '.join(answer_words)
    print(decoded_translation)

Ask me something, or enter 'end' to stop:  What is the name of the largest church in Montreal?
 the largest church in montreal is named saint joseph s oratory
Ask me something, or enter 'end' to stop:  What is the name of the church in Montreal?
 the largest church in montreal is named saint joseph s oratory
Ask me something, or enter 'end' to stop: Name of churn in Montreal?
 the spanish
Ask me something, or enter 'end' to stop: What is a name of a church in Montreal?
 a female turtle
Ask me something, or enter 'end' to stop: What is the name of the largest in Montreal?
 the largest church in montreal is named saint joseph s oratory
Ask me something, or enter 'end' to stop: end
