<a href="https://colab.research.google.com/github/benschlup/csck504assemblyfactory/blob/main/CSCK507_Team_A_WikiQA_Chatbot_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
### **CSCK507 Natural Language Processing, March-May 2022: End-of-Module Assignment**
# **Generative Chatbot**
---
#### Team A
Muhammad Ali (Student ID )  
Benjamin Schlup (Student ID 200050007)  
Chinedu Abonyi (Student ID )  
Victor Armenta-Valdes (Student ID )

---
# **Solution 1: LSTM without Attention Layer**
---

Dataset being used: https://www.microsoft.com/en-us/download/details.aspx?id=52419  
Paper on dataset: https://aclanthology.org/D15-1237/  
Solution inspired by https://medium.com/swlh/how-to-design-seq2seq-chatbot-using-keras-framework-ae86d950e91d  

Additional interesting materials to review, and potentially reference:
Khin, N.N., Soe, K.M., 2020. Question Answering based University Chatbot using Sequence to Sequence Model, in: .. doi:10.1109/o-cocosda50338.2020.9295021



---
Backlog:
* Strip whitespace at beginning and end of normalized questions and answers
* Add drop-out layer

For future study, i.e. to be mentioned in report
* Check if lemmatizing on question side improves performance
* Check if word embedding (e.g. using Word2Vec or GloVe) on question (i.e. input side) improves performance (beware of out-of-vocab)
---

## 1. Configuration

In [1]:
# The dataset includes invalid answers (labelled 0) and some questions 
# even have no valid answer at all: Switches allow test runs excluding invalid
# answers.
# Note that the assignment says that answers must be provided by the chatbot: 
# there is no mention that answers must be correct!
train_with_invalid_answers = True
validate_with_invalid_answers = True
test_questions_without_valid_answers = True

# The dataset contains questions with multiple valid answers
train_with_duplicate_questions = True
validate_with_duplicate_questions = True
test_with_duplicate_questions = True

# Configure the tokenizer
vocab_size_limit = 6000 + 1 # set this to None if all tokens from training shall be included (add one to number of tokens)
vocab_include_val = False   # set this to True if tokens from validation set shall be included in vocabulary
vocab_include_test = False  # set this to True if tokens from test set shall be included in vocabulary
oov_token = 1               # set this to None if out-of-vocabulary tokens should be removed from sequences
remove_oov_sentences = True # set this to True if any sentences containing out-of-vocabulary tokens should be removed from training, validation, test dataset

# Limit sentence lengths // not yet implemented
max_question_tokens = 20    # set this to None if no limit on question length
max_answer_tokens = 50      # set this to None if no limit on answer length

---

In [2]:
# Imports
import codecs
import io
import os
import re
import urllib.request
import yaml
import random
import zipfile

import numpy as np
import pandas as pd

#from gensim.models import Word2Vec

from tensorflow.keras.activations import softmax
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from keras_preprocessing.text import Tokenizer

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [3]:
# Make sure the GPU is visible to our runtime
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [4]:
# Check what GPU we have in place
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed May 11 17:57:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
# Download data: If link does not work any longer, access file manually from here: https://www.microsoft.com/en-us/download/details.aspx?id=52419
urllib.request.urlretrieve("https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip", "WikiQACorpus.zip")

('WikiQACorpus.zip', <http.client.HTTPMessage at 0x7f5872e16dd0>)

In [6]:
# Extract files
with zipfile.ZipFile('WikiQACorpus.zip', 'r') as zipfile:
   zipfile.extractall()

In [7]:
# Import questions and answers: training, validation and test datasets
train_df = pd.read_csv( f'./WikiQACorpus/WikiQA-train.tsv', sep='\t', encoding='ISO-8859-1')
val_df = pd.read_csv( f'./WikiQACorpus/WikiQA-dev.tsv', sep='\t', encoding='ISO-8859-1')
test_df = pd.read_csv( f'./WikiQACorpus/WikiQA-test.tsv', sep='\t', encoding='ISO-8859-1')       

In [8]:
# Quality checks and exploratory data analysis removed: dataset has proven clean
# Print gross volumes:
print(f'Gross training dataset size: {len(train_df)}')
print(f'Gross validation dataset size: {len(val_df)}')
print(f'Gross test dataset size: {len(test_df)}')

Gross training dataset size: 20347
Gross validation dataset size: 2733
Gross test dataset size: 6116


In [9]:
# Remove q/a pairs depending on configuration of the notebook
if not train_with_invalid_answers:
    train_df = train_df[train_df['Label'] == 1]
if not validate_with_invalid_answers:
    val_df = val_df[val_df['Label'] == 1]
if not test_questions_without_valid_answers:
    test_df = test_df[test_df['Label'] == 1]

In [10]:
# Remove duplicate questions in case configured to do so
if not train_with_duplicate_questions:
    train_df.drop_duplicates(subset=['Question'], inplace=True)
if not validate_with_duplicate_questions:
    validate_df.drop_duplicates(subset=['Question'], inplace=True)
if not test_with_duplicate_questions:
    test_df.drop_duplicates(subset=['Question'], inplace=True)

In [11]:
# Derive normalized questions and answers
for df in [train_df, val_df, test_df]:
    df.loc[:,'norm_question'] = [ re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", q).lower() for q in df['Question'] ]
    df.loc[:,'norm_answer'] = [ '_START_ '+re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", s).lower()+' _STOP_' for s in df['Sentence']]

In [12]:
# Data preparation:
# Tokenization:
# Reconsider adding digits to filter later, as encoding of numbers may create excessive vocabulary
# Also check reference on handling numbers in NLP: https://arxiv.org/abs/2103.13136
# Note that I do not yet train the tokenizer on validation and test datasets - should be challenged. 
# my be added to Tokenizer filters=target_regex = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\''

if remove_oov_sentences:
    oov_token = None
tokenizer = Tokenizer(num_words=vocab_size_limit, oov_token=oov_token)

tokenizer.fit_on_texts(train_df['norm_question'] + train_df['norm_answer'])
if vocab_include_val:
    tokenizer.fit_on_texts(val_df['norm_question'] + val_df['norm_answer'])
if vocab_include_test:
    tokenizer.fit_on_texts(test_df['norm_question'] + test_df['norm_answer'])

vocab_size = len(tokenizer.word_index) + 1
if vocab_size_limit is not None:
    vocab_size = min([vocab_size, vocab_size_limit])
print(f'Vocabulary size based on training dataset: {vocab_size}')

for df in [train_df, val_df, test_df]:
    df['tokenized_question'] = tokenizer.texts_to_sequences(df['norm_question'])
    df['tokenized_answer'] = tokenizer.texts_to_sequences(df['norm_answer'])
    df['question_tokens'] = [ len(x.split()) for x in df['norm_question'] ]
    df['answer_tokens'] = [ len(x.split()) for x in df['norm_answer'] ]
    if remove_oov_sentences:
        df.drop(df[df['question_tokens']!=df['tokenized_question'].str.len()].index, inplace=True)
        df.drop(df[df['answer_tokens']!=df['tokenized_answer'].str.len()].index, inplace=True)

Vocabulary size based on training dataset: 6001


In [13]:
# Print net volumes
print(f'Net training dataset size: {len(train_df)}')
print(f'Net validation dataset size: {len(val_df)}')
print(f'Net test dataset size: {len(test_df)}')

Net training dataset size: 2181
Net validation dataset size: 108
Net test dataset size: 252


In [14]:
# Build model

maxlen_questions = max(len(t) for t in train_df['tokenized_question'].to_list())
maxlen_answers = max(len(t) for t in train_df['tokenized_answer'].to_list())

train_encoder_input_data = pad_sequences(train_df['tokenized_question'], maxlen=maxlen_questions, padding='post')
val_encoder_input_data = pad_sequences(val_df['tokenized_question'], maxlen=maxlen_questions, padding='post')
print(f'Encoder input data shape: {train_encoder_input_data.shape}')

train_decoder_input_data = pad_sequences(train_df['tokenized_answer'], maxlen=maxlen_answers, padding='post')
val_decoder_input_data = pad_sequences(val_df['tokenized_answer'], maxlen=maxlen_answers, padding='post')
print(f'Decoder input data shape: {train_decoder_input_data.shape}')

tokenized_answers = [ ta[1:] for ta in train_df['tokenized_answer'] ]
padded_answers = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
train_decoder_output_data = to_categorical(padded_answers, vocab_size)
tokenized_answers = [ ta[1:] for ta in val_df['tokenized_answer'] ]
padded_answers = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
val_decoder_output_data = to_categorical(padded_answers, vocab_size)
print(f'Decoder output data shape: {decoder_output_data.shape}')

enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(vocab_size, 200, mask_zero=True)(enc_inputs)
_, state_h, state_c = LSTM(200, return_state=True)(enc_embedding)
enc_states = [state_h, state_c]

dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(vocab_size, 200, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(200, return_state=True, return_sequences=True)
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=enc_states)
dec_dense = Dense(vocab_size, activation=softmax)
output = dec_dense(dec_outputs)

model = Model([enc_inputs, dec_inputs], output)
model.compile(optimizer=RMSprop(), loss='categorical_crossentropy')

model.summary()


Encoder input data shape: (2181, 21)
Decoder input data shape: (2181, 52)


NameError: ignored

In [None]:
# Model training

model.fit([train_encoder_input_data, train_decoder_input_data], train_decoder_output_data,
          validation_data=([val_encoder_input_data, val_decoder_input_data], val_decoder_output_data),
          batch_size=50, epochs=200)

#model.save('/content/drive/MyDrive/CSCK507_Team_A/qa_model.h5')


In [None]:
# Prepare models for inferencing (separate encoder, decoder)
#model.load_weights('/content/drive/MyDrive/CSCK507_Team_A/qa_model.h5')

def make_inference_models():
    dec_state_input_h = Input(shape=(200,))
    dec_state_input_c = Input(shape=(200,))
    dec_states_inputs = [dec_state_input_h, dec_state_input_c]
    dec_outputs, state_h, state_c = dec_lstm(dec_embedding,
                                             initial_state=dec_states_inputs)
    dec_states = [state_h, state_c]
    dec_outputs = dec_dense(dec_outputs)

    dec_model = Model(
        inputs=[dec_inputs] + dec_states_inputs,
        outputs=[dec_outputs] + dec_states)
    print('Inference decoder:')
    dec_model.summary()

    enc_model = Model(inputs=enc_inputs, outputs=enc_states)
    print('Inference encoder:')
    enc_model.summary()
    return enc_model, dec_model

def str_to_tokens(sentence):
    words = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", sentence).lower().split()
    tokens_list = list()
    for current_word in words:
        result = tokenizer.word_index.get(current_word, '')
        if result != '':
            tokens_list.append(result)
        else:
            print(f'Warning: out-of-vocabulary token \'{current_word}\'')
            if oov_token is not None:
                tokens_list.append(oov_token)

    return pad_sequences([tokens_list],
                         maxlen=maxlen_questions,
                         padding='post')


enc_model, dec_model = make_inference_models()



In [None]:
# get 10 random numbers to choose random sentences and calculate BLEU score
# note that code must be refactored: it was merged from examples and is 
# inconsistent now
questions = train_df['Question'].to_list()
rand_integers = [random.randint(0, len(questions)-1) for i in range(1, 10)]
bleu_total = 0


for i in rand_integers:
    states_values = enc_model.predict(str_to_tokens(questions[i]))
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']

    decoded_translation = ''
    while True:
        dec_outputs, h, c = dec_model.predict([empty_target_seq]
                                              + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        for word, index in tokenizer.word_index.items():
            if sampled_word_index == index:
                if word != 'stop':
                    decoded_translation += ' {}'.format(word)
                sampled_word = word

        if sampled_word == 'stop' \
                or len(decoded_translation.split()) \
                > maxlen_answers:
            break

        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    decoded_translation = decoded_translation[1:]

    print(f'Original question: {questions[i]}')
    print(f'Predicated answer: {decoded_translation}')

    reference_answers = train_df.loc[train_df['Question']==questions[i], 'norm_answer'].to_list()
    reference_answers = [answer[8:-7] for answer in reference_answers]


    # The following should contain all possible answers, though...
    print(f'{reference_answers}')
    bleu_score = sentence_bleu(reference_answers, decoded_translation, smoothing_function=SmoothingFunction().method0)
    print(f'Bleu score: {bleu_score}\n')
    bleu_total += bleu_score

print(f'Bleu average = {bleu_total/len(rand_integers)}')
    

In [None]:
while True:
    question = input('Ask me something, or enter \'end\' to stop: ')
    if question == 'end':
        break
    states_values = enc_model.predict(str_to_tokens(question))
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']

    decoded_translation = ''
    while True:
        dec_outputs, h, c = dec_model.predict([empty_target_seq]
                                              + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        for word, index in tokenizer.word_index.items():
            if sampled_word_index == index:
                if word != 'stop':
                    decoded_translation += ' {}'.format(word)
                sampled_word = word

        if sampled_word == 'stop' \
                or len(decoded_translation.split()) \
                > maxlen_answers:
            break

        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    print(decoded_translation)