<a href="https://colab.research.google.com/github/benschlup/csck507_team_a/blob/main/CSCK507_Team_A_WikiQA_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
### **CSCK507 Natural Language Processing, March-May 2022: End-of-Module Assignment**
# **Generative Chatbot**
---
#### Team A
Muhammad Ali (Student ID 200050027)  
Benjamin Schlup (Student ID 200050007)  
Chinedu Abonyi (Student ID 200050028)  
Victor Armenta-Valdes (Student ID 222500001)

---
# **Data Analysis**
---

Dataset being used: https://www.microsoft.com/en-us/download/details.aspx?id=52419  
Paper on dataset: https://aclanthology.org/D15-1237/  

---
## 1. Configuration and framework

In [None]:
# Imports
import os
import re
import urllib.request
import zipfile

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

---
## 2. Data acquisition and loading

In [None]:
# Download data: If link does not work any longer, access file manually from here: https://www.microsoft.com/en-us/download/details.aspx?id=52419
corpus_url = "https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip"
corpus_archive = "WikiQACorpus.zip"
# urlretrieve stores the archive under the given local name in the working directory
urllib.request.urlretrieve(corpus_url, corpus_archive)

('WikiQACorpus.zip', <http.client.HTTPMessage at 0x7f207ffecbd0>)

In [None]:
# Extract files
# The archive handle gets its own name: binding it to `zipfile` would shadow the
# imported module and make this cell fail with an AttributeError when run again.
with zipfile.ZipFile('WikiQACorpus.zip', 'r') as archive:
    # Unpacks into the current working directory
    archive.extractall()

In [None]:
# Import questions and answers: training, validation and test datasets
# All three files are tab-separated and read with the same encoding
tsv_options = {'sep': '\t', 'encoding': 'ISO-8859-1'}
train_df, val_df, test_df = (
    pd.read_csv(tsv_path, **tsv_options)
    for tsv_path in (
        './WikiQACorpus/WikiQA-train.tsv',
        './WikiQACorpus/WikiQA-dev.tsv',
        './WikiQACorpus/WikiQA-test.tsv',
    )
)

---
## 3. Dataset preparation (pre-processing, transformation)
Note that no data cleansing is required: prior analysis has shown the dataset to be clean.

In [None]:
# Quality checks and exploratory data analysis removed: dataset has proven clean
# Print gross volumes (row counts before any filtering), one line per dataset:
gross_volume_lines = [
    f'Gross training dataset size: {len(train_df)}',
    f'Gross validation dataset size: {len(val_df)}',
    f'Gross test dataset size: {len(test_df)}',
]
print(*gross_volume_lines, sep='\n')

Gross training dataset size: 20347
Gross validation dataset size: 2733
Gross test dataset size: 6116


In [None]:
# Derive normalized questions and answers and count number of tokens
punctuation_pattern = re.compile(r"[-()\"#/@;:<>{}`+=~|.!?,]")


def normalize_text(text):
    """Remove the characters matched by punctuation_pattern, lower-case, and trim surrounding whitespace."""
    return punctuation_pattern.sub("", text).lower().strip()


def count_tokens(text):
    """Return the number of whitespace-separated tokens in text."""
    return len(text.split())


for frame in (train_df, val_df, test_df):
    frame.loc[:, 'norm_question'] = [normalize_text(question) for question in frame['Question']]
    # Answers are wrapped in start/stop markers, so their token counts include two extra tokens
    frame.loc[:, 'norm_answer'] = [
        '_START_ ' + normalize_text(sentence) + ' _STOP_' for sentence in frame['Sentence']
    ]
    frame['question_tokens'] = [count_tokens(text) for text in frame['norm_question']]
    frame['answer_tokens'] = [count_tokens(text) for text in frame['norm_answer']]

In [None]:
# Drop sentences which are too long
# NOTE(review): max_question_tokens and max_answer_tokens are not assigned in any cell shown
# in this notebook - presumably they belong to a configuration cell; confirm.
# A limit of None disables the corresponding filter. The answer limit is raised by 2
# because answer_tokens also counts the _START_ and _STOP_ markers.
token_limits = (
    ('question_tokens', max_question_tokens),
    ('answer_tokens', None if max_answer_tokens is None else max_answer_tokens + 2),
)
for frame in (train_df, val_df, test_df):
    for count_column, limit in token_limits:
        if limit is None:
            continue
        overlong_rows = frame[frame[count_column] > limit].index
        # In-place drop so that the frames bound to train_df / val_df / test_df are the ones modified
        frame.drop(overlong_rows, inplace=True)

In [None]:
# Remove q/a pairs depending on configuration of the notebook
# NOTE(review): the three configuration flags below are not assigned in any cell shown in this
# notebook - presumably they belong to a configuration cell; confirm.
# Rows with Label == 1 are kept (presumably these mark valid answers - verify against the dataset paper).
# .copy() gives each filtered frame its own data: later cells add columns to and drop rows from
# these frames in place, which on a plain boolean-mask selection triggers pandas'
# SettingWithCopyWarning.
if not train_with_invalid_answers:
    train_df = train_df[train_df['Label'] == 1].copy()
if not validate_with_invalid_answers:
    val_df = val_df[val_df['Label'] == 1].copy()
if not test_questions_without_valid_answers:
    test_df = test_df[test_df['Label'] == 1].copy()

In [None]:
# Remove duplicate questions in case configured to do so
# NOTE(review): the three configuration flags below are not assigned in any cell shown in this
# notebook - presumably they belong to a configuration cell; confirm.
# drop_duplicates keeps the first row of each question. The result is reassigned rather than
# applied in place, so the operation never mutates a filtered selection of another frame.
if not train_with_duplicate_questions:
    train_df = train_df.drop_duplicates(subset=['Question'])
if not validate_with_duplicate_questions:
    # The validation frame is named val_df throughout the notebook (was: undefined validate_df)
    val_df = val_df.drop_duplicates(subset=['Question'])
if not test_with_duplicate_questions:
    test_df = test_df.drop_duplicates(subset=['Question'])

In [None]:
# Data preparation:
# Tokenization:
# Reconsider adding digits to filter later, as encoding of numbers may create excessive vocabulary
# Also check reference on handling numbers in NLP: https://arxiv.org/abs/2103.13136
# Note that the tokenizer is fitted on the validation and test datasets only when
# vocab_include_val / vocab_include_test are set - should be challenged.
# May be added as the Tokenizer filters argument: '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\''
# NOTE(review): vocab_size_limit, remove_oov_sentences, vocab_include_val, vocab_include_test and
# (when remove_oov_sentences is False) oov_token are not assigned in any cell shown in this
# notebook - presumably they belong to a configuration cell; confirm.


def question_answer_texts(frame):
    """Join each normalized question with its normalized answer into one text per row.

    The explicit space keeps the last question word and the answer's _START_ marker apart
    without relying on the Tokenizer's character filters to split them.
    """
    return frame['norm_question'] + ' ' + frame['norm_answer']


# Sentences containing out-of-vocabulary words can only be detected (by length, below) if the
# tokenizer does not substitute a placeholder token for unknown words.
if remove_oov_sentences:
    oov_token = None
tokenizer = Tokenizer(num_words=vocab_size_limit, oov_token=oov_token)

tokenizer.fit_on_texts(question_answer_texts(train_df))
if vocab_include_val:
    tokenizer.fit_on_texts(question_answer_texts(val_df))
if vocab_include_test:
    tokenizer.fit_on_texts(question_answer_texts(test_df))

# +1 because word indices start at 1; index 0 is used for padding in later cells
vocab_size = len(tokenizer.word_index) + 1
if vocab_size_limit is not None:
    vocab_size = min([vocab_size, vocab_size_limit])
print(f'Vocabulary size based on training dataset: {vocab_size}')

for df in [train_df, val_df, test_df]:
    # Tokenize
    df['tokenized_question'] = tokenizer.texts_to_sequences(df['norm_question'])
    df['tokenized_answer'] = tokenizer.texts_to_sequences(df['norm_answer'])

    # Optionally remove sentences with out-of-vocabulary tokens:
    # a sequence shorter than the whitespace token count means at least one token was left out
    if remove_oov_sentences:
        df.drop(df[df['question_tokens']!=df['tokenized_question'].str.len()].index, inplace=True)
        df.drop(df[df['answer_tokens']!=df['tokenized_answer'].str.len()].index, inplace=True)

Vocabulary size based on training dataset: 6001


In [None]:
# Print net volumes (row counts after all filtering steps), one line per dataset
net_volume_lines = [
    f'Net training dataset size: {len(train_df)}',
    f'Net validation dataset size: {len(val_df)}',
    f'Net test dataset size: {len(test_df)}',
]
print(*net_volume_lines, sep='\n')

Net training dataset size: 2181
Net validation dataset size: 108
Net test dataset size: 252


In [None]:
# Transform data for training and validation by aligning lengths (i.e. padding)


def pad_token_sequences(sequences, length):
    """Zero-pad token-id sequences at the end so that all of them have the given length."""
    return pad_sequences(sequences, maxlen=length, padding='post')


def one_hot_decoder_targets(tokenized_answers, length, num_classes):
    """Build one-hot decoder targets from tokenized answers.

    Each answer is shifted left by one position (its first token, the start marker added
    during normalization, is dropped), padded to `length`, and one-hot encoded.

    Returns a float32 array of shape (number of answers, length, num_classes); padded
    positions are encoded as class 0.
    """
    shifted_answers = [tokens[1:] for tokens in tokenized_answers]
    padded_answers = pad_token_sequences(shifted_answers, length)
    targets = np.zeros(padded_answers.shape + (num_classes,), dtype='float32')
    # Set a single 1.0 per position, at the index given by the token id
    np.put_along_axis(targets, padded_answers[..., np.newaxis], 1.0, axis=-1)
    return targets


# Lengths are derived from the training data only and applied to the validation data as well
maxlen_questions = max(len(t) for t in train_df['tokenized_question'])
maxlen_answers = max(len(t) for t in train_df['tokenized_answer'])

train_encoder_input_data = pad_token_sequences(train_df['tokenized_question'], maxlen_questions)
val_encoder_input_data = pad_token_sequences(val_df['tokenized_question'], maxlen_questions)
print(f'Encoder input data shape: {train_encoder_input_data.shape}')

train_decoder_input_data = pad_token_sequences(train_df['tokenized_answer'], maxlen_answers)
val_decoder_input_data = pad_token_sequences(val_df['tokenized_answer'], maxlen_answers)
print(f'Decoder input data shape: {train_decoder_input_data.shape}')

train_decoder_output_data = one_hot_decoder_targets(train_df['tokenized_answer'], maxlen_answers, vocab_size)
val_decoder_output_data = one_hot_decoder_targets(val_df['tokenized_answer'], maxlen_answers, vocab_size)
print(f'Decoder output data shape: {train_decoder_output_data.shape}')

Encoder input data shape: (2181, 21)
Decoder input data shape: (2181, 52)
Decoder output data shape: (2181, 52, 6001)


---
## 4. Data analysis

In [None]:
# TO BE ADDED

---
# END OF NOTEBOOK
---