<a href="https://colab.research.google.com/github/benschlup/csck507_team_a/blob/main/CSCK507_Team_A_ChatBot_THREE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSCK507 Natural Language Processing
## Team A

In [1]:
# Imports
import codecs
import io
import os
import re
import tarfile
import urllib.request
import yaml

import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from tensorflow.keras import Input, Model
from tensorflow.keras.activations import softmax
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras_preprocessing.text import Tokenizer

In [2]:
# Restrict the CUDA devices exposed to this process to device index 0 by
# setting the CUDA_VISIBLE_DEVICES environment variable.
# NOTE(review): this runs after the TensorFlow imports in the cell above;
# presumably the variable is still honoured because the GPU is initialised
# lazily on first use — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
# Fetch the CMU Question-Answer dataset archive into the working directory.
# The call stays the last expression of the cell so that its
# (filename, headers) result is still displayed.
dataset_url = "https://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz"
dataset_archive = "Question_Answer_Dataset_v1.2.tar.gz"
urllib.request.urlretrieve(dataset_url, dataset_archive)

('Question_Answer_Dataset_v1.2.tar.gz',
 <http.client.HTTPMessage at 0x7f5ccd7235d0>)

In [4]:
# Extract the downloaded archive into the current working directory.
# The context manager closes the archive even when extraction fails part-way
# (the previous open()/close() pair leaked the handle on error and shadowed
# the name `file`).
# NOTE(review): extractall() writes whatever member paths the archive
# contains; that is acceptable only for an archive from a trusted source —
# consider the `filter` argument on Python versions that support it.
with tarfile.open('Question_Answer_Dataset_v1.2.tar.gz') as archive:
    archive.extractall('.')

In [5]:
# Load the question/answer pairs of the Spring 2008, 2009 and 2010 courses.
# Each course file is tab-separated and ISO-8859-1 encoded. Every frame is
# tagged with its course id and collected in a list, so that one concat builds
# the combined frame instead of re-copying a growing frame on each iteration
# (which also avoided concatenating onto an empty DataFrame).
course_frames = []
for course in ['S08', 'S09', 'S10']:
    print(f'Reading questions and answers from course {course}')
    course_qa_df = pd.read_csv(
        f'./Question_Answer_Dataset_v1.2/{course}/question_answer_pairs.txt',
        sep='\t',
        encoding='ISO-8859-1',
    )
    course_qa_df['course'] = course
    course_frames.append(course_qa_df)

# Row indices are kept as read from each file (they repeat across courses),
# exactly as before.
qa_df = pd.concat(course_frames)

        

Reading questions and answers from course S08
Reading questions and answers from course S09
Reading questions and answers from course S10


In [6]:
# Keep only rows that have both a question and an answer.
# The previous filter looked at 'Answer' only; a row with an answer but a
# missing question would survive and later break the regex clean-up, which
# expects strings.
qa_df = qa_df.dropna(subset=['Question', 'Answer'])

In [None]:
## Remove duplicates
# For every distinct question keep only the row with the longest answer:
#   1. add the answer length as a helper column ('answer_length'),
#   2. sort by question, then by answer length (ascending),
#   3. keep the last row per question, i.e. the one with the longest answer.
# Written as a non-mutating chain: assigning a column to / sorting in place on
# a frame that is itself a filtered slice triggers pandas' chained-assignment
# warning and makes the cell depend on hidden state.
qa_df = (
    qa_df
    .assign(answer_length=qa_df['Answer'].str.len())
    .sort_values(['Question', 'answer_length'])
    .drop_duplicates(subset=['Question'], keep='last')
)


In [7]:
# Punctuation that is deleted (not replaced by a space) from both questions
# and answers before tokenisation.
punctuation = re.compile(r"[-()\"#/@;:<>{}`+=~|.!?,]")

# Questions: punctuation removed, then lower-cased.
questions = [punctuation.sub("", text).lower() for text in qa_df['Question']]

# Answers: same clean-up, wrapped in the start/stop markers the decoder is
# trained with.
answers = [f"_START_ {punctuation.sub('', text).lower()} _STOP_" for text in qa_df['Answer']]

In [8]:

########################################################################################################################
############################################# MODEL SET-UP #############################################################
########################################################################################################################

# Characters the tokenizer treats as separators (despite the variable name this
# is a plain character set, not a regular expression). The backslash is now
# written as '\\' – the previous '\]' was an invalid escape sequence that only
# worked by accident and raises a warning on current Python versions. The
# resulting string is unchanged.
# Because '_' is part of this set, the '_START_' / '_STOP_' markers added to the
# answers end up as the plain tokens 'start' and 'stop'.
# NOTE(review): those tokens therefore share a vocabulary index with the
# ordinary words "start"/"stop" if they occur in the corpus — confirm whether
# that is acceptable.
target_regex = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'0123456789'

# Dimensions shared by the encoder and the decoder.
EMBEDDING_DIM = 200
LATENT_DIM = 200

# One tokenizer for questions and answers, so both share a single vocabulary.
# (An alternative tried during development capped the vocabulary with
# Tokenizer(filters=target_regex, num_words=1500) and VOCAB_SIZE = 1500.)
tokenizer = Tokenizer(filters=target_regex)
tokenizer.fit_on_texts(questions + answers)

# +1 because index 0 is kept free for padding (see mask_zero=True below).
VOCAB_SIZE = len(tokenizer.word_index) + 1

# Encoder input: questions as integer sequences, zero-padded at the end.
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max(len(sequence) for sequence in tokenized_questions)
encoder_input_data = pad_sequences(tokenized_questions,
                                   maxlen=maxlen_questions,
                                   padding='post')

print(encoder_input_data.shape)

# Decoder input: answers (including the leading start token), zero-padded.
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max(len(sequence) for sequence in tokenized_answers)
decoder_input_data = pad_sequences(tokenized_answers,
                                   maxlen=maxlen_answers,
                                   padding='post')
print(decoder_input_data.shape)

# Decoder target: the same answers shifted left by one position (first token
# dropped), so that step t is trained to predict token t+1. Built as a new
# list instead of overwriting tokenized_answers element by element.
decoder_target_sequences = [sequence[1:] for sequence in tokenized_answers]
padded_answers = pad_sequences(decoder_target_sequences,
                               maxlen=maxlen_answers,
                               padding='post')
# NOTE(review): this one-hot tensor has samples x maxlen_answers x VOCAB_SIZE
# entries and dominates memory use. Feeding `padded_answers` directly together
# with a sparse categorical cross-entropy loss would avoid it; kept as is here
# so the shape of decoder_output_data does not change.
decoder_output_data = to_categorical(padded_answers, VOCAB_SIZE)

print(decoder_output_data.shape)

# Encoder: embedding -> LSTM; only the final hidden and cell state are kept.
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(enc_inputs)
_, state_h, state_c = LSTM(LATENT_DIM, return_state=True)(enc_embedding)
enc_states = [state_h, state_c]

# Decoder: embedding -> LSTM initialised with the encoder states -> softmax
# over the vocabulary at every time step. dec_lstm and dec_dense are kept as
# layer objects because the inference decoder reuses them.
dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(LATENT_DIM, return_state=True, return_sequences=True)
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=enc_states)
dec_dense = Dense(VOCAB_SIZE, activation=softmax)
output = dec_dense(dec_outputs)

model = Model([enc_inputs, dec_inputs], output)
model.compile(optimizer=RMSprop(), loss='categorical_crossentropy')

model.summary()



(3422, 44)
(3422, 157)
(3422, 157, 5332)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 200)    1066400     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 200)    1066400     ['input_2[0][0]']                
                                                     

In [9]:
########################################################################################################################
############################################# MODEL TRAINING ###########################################################
########################################################################################################################

# Training hyper-parameters and the location the trained model is written to.
# NOTE(review): the path points into a Google Drive mount that none of the
# visible cells sets up — confirm Drive is mounted before starting a long run.
BATCH_SIZE = 50
EPOCHS = 300
MODEL_PATH = '/content/drive/MyDrive/CSCK507_Team_A/qa_model.h5'

# Teacher forcing: the model receives the question and the answer so far and
# is fitted against the answer shifted by one token.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)
model.save(MODEL_PATH)


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [10]:
#model.load_weights('/content/drive/MyDrive/CSCK507_Team_A/qa_model.h5')


def make_inference_models():
    """Build the encoder and decoder models used for step-by-step decoding.

    Both models are assembled from the tensors and layer objects of the
    training graph defined at module level (``enc_inputs``, ``enc_states``,
    ``dec_inputs``, ``dec_embedding``, ``dec_lstm``, ``dec_dense``), so they
    use the same layers as the training model.

    A summary of each model is printed as a side effect.

    Returns:
        tuple: ``(enc_model, dec_model)`` where

        * ``enc_model`` maps a tokenised question to the encoder's final
          ``[state_h, state_c]``;
        * ``dec_model`` maps ``[decoder tokens, state_h, state_c]`` to
          ``[token probabilities, new state_h, new state_c]``.
    """
    # Take the state size from the decoder LSTM itself rather than repeating
    # the literal used when the layer was created; the two must always agree.
    state_size = dec_lstm.units

    dec_state_input_h = Input(shape=(state_size,))
    dec_state_input_c = Input(shape=(state_size,))
    dec_states_inputs = [dec_state_input_h, dec_state_input_c]

    # Re-apply the decoder LSTM, this time seeded with externally supplied
    # states so the caller can feed the states of the previous step back in.
    dec_outputs, state_h, state_c = dec_lstm(dec_embedding,
                                             initial_state=dec_states_inputs)
    dec_states = [state_h, state_c]
    dec_outputs = dec_dense(dec_outputs)
    dec_model = Model(
        inputs=[dec_inputs] + dec_states_inputs,
        outputs=[dec_outputs] + dec_states)
    print('Inference decoder:')
    dec_model.summary()

    print('Inference encoder:')
    enc_model = Model(inputs=enc_inputs, outputs=enc_states)
    enc_model.summary()
    return enc_model, dec_model


def str_to_tokens(sentence: str, verbose: bool = True):
    """Convert a free-text question into a padded token array for the encoder.

    The sentence goes through the same clean-up as the training questions:
    the punctuation characters removed there are removed here too, and the
    remaining text is passed through the fitted ``tokenizer`` (which applies
    its own filters). Previously the sentence was only lower-cased and split
    on whitespace, so any word with attached punctuation (for example a
    trailing "?") was not found in the vocabulary and silently dropped.

    Words that are not in the tokenizer's vocabulary are skipped.

    Args:
        sentence: The question as typed by the user.
        verbose: When true (the default, matching the previous behaviour),
            print the list of token ids that was derived from the sentence.

    Returns:
        The result of ``pad_sequences`` for the single token list, post-padded
        to ``maxlen_questions``.
    """
    # Same pattern as used when preparing the training questions/answers.
    cleaned = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", sentence)
    tokens_list = tokenizer.texts_to_sequences([cleaned])[0]

    if verbose:
        print(tokens_list)
    return pad_sequences([tokens_list],
                         maxlen=maxlen_questions,
                         padding='post')


# Build the inference-time encoder and decoder once; make_inference_models()
# also prints a summary of each.
enc_model, dec_model = make_inference_models()



Inference decoder:
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 200)    1066400     ['input_2[0][0]']                
                                                                                                  
 input_3 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 200)]        0           []                               
                                                                         

In [None]:
def generate_answer(question, max_steps=None):
    """Greedily decode an answer for ``question`` with the inference models.

    The question is encoded once; the decoder is then run one token at a time,
    starting from the 'start' token and feeding the most probable token and
    the returned LSTM states back in, until the 'stop' token is produced or
    the step limit is reached.

    Args:
        question: The user's question as plain text.
        max_steps: Upper bound on decoder steps. Defaults to
            ``maxlen_answers + 1``, the longest output the previous
            word-count based check allowed.

    Returns:
        The decoded words, each preceded by a single space (empty string when
        nothing was decoded).
    """
    if max_steps is None:
        max_steps = maxlen_answers + 1

    states_values = enc_model.predict(str_to_tokens(question), verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['start']

    decoded_translation = ''
    # Bound the number of *steps*, not the number of decoded words: a predicted
    # index without a vocabulary entry (e.g. the padding index 0) adds no word,
    # so a word-count limit alone could loop forever.
    for _ in range(max_steps):
        dec_outputs, h, c = dec_model.predict([target_seq] + states_values,
                                              verbose=0)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        # Direct reverse lookup instead of scanning the whole word_index.
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        if sampled_word == 'stop':
            break
        if sampled_word is not None:
            decoded_translation += ' {}'.format(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    return decoded_translation


# Interactive loop: keep answering questions until the user enters 'end'
# (surrounding whitespace and letter case are ignored for that keyword).
while True:
    question = input('Ask me something, or enter \'end\' to stop: ')
    if question.strip().lower() == 'end':
        break
    print(generate_answer(question))

Ask me something, or enter 'end' to stop: In what year did Avogadro stop teaching at Turin University
[7, 9, 149, 13, 55, 2, 2626, 37, 300, 69]

Ask me something, or enter 'end' to stop: How old was Celsius when he died
[17, 89, 10, 188, 24, 34, 186]

Ask me something, or enter 'end' to stop: What is temperature would water have to be to be halfway between its standard boiling and freezing point
[9, 6, 340, 353, 105, 21, 12, 52, 12, 52, 3981, 125, 65, 187, 488, 11, 1236, 301]

Ask me something, or enter 'end' to stop: Is the electrolyte sulphuric acid
[6, 3, 3965, 3966, 3967]
 yes
Ask me something, or enter 'end' to stop: For how many years did Alessandro Volta live
[19, 17, 29, 56, 13, 162, 54, 90]

Ask me something, or enter 'end' to stop: Did Alessandro Volta live to be 80 years old
[13, 162, 54, 90, 12, 52, 56, 89]
 no
Ask me something, or enter 'end' to stop: Where did Volta enter retirement
[25, 13, 54, 1843, 2618]
 spain
Ask me something, or enter 'end' to stop: Copenhagen is ra

In [None]:
# Show the prepared answer string at index 800 for manual inspection.
# NOTE(review): the saved output uses <START>/<END> markers, whereas the
# preprocessing cell wraps answers in _START_/_STOP_ — looks like stale output
# from an earlier version; confirm by re-running.
answers[800]

'<START> well the crew is confined to the ship when we land at clavius we have to stay inside for the time it take to refit  about twentyfour hours and then we are going to back empty <END>'

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Not connected to a GPU


In [None]:
# Show the cleaned question string at index 100 for manual inspection.
# NOTE(review): the saved output does not look like it comes from the
# question/answer dataset loaded above — possibly stale; confirm by re-running.
questions[100]

'how do you get your hair to look like that'

In [None]:
# Show the prepared answer string at index 100 for manual inspection.
# NOTE(review): the saved output uses <START>/<END> markers, whereas the
# preprocessing cell wraps answers in _START_/_STOP_ — looks like stale output
# from an earlier version; confirm by re-running.
answers[100]

"<START> eber's deep conditioner every two days and i never ever use a blowdryer without the diffuser attachment <END>"