## Importing Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import sent_tokenize, word_tokenize, bigrams, trigrams
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from collections import Counter, defaultdict
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay,classification_report
import gensim
from gensim.models import Word2Vec
import os, random, math
from string import punctuation
import seaborn as sns
import pickle
import itertools
import warnings
warnings.filterwarnings('ignore')
sns.set()

## Loading files & Data

In [None]:
# Training files directory
# Parent directory for the training corpus. Built with os.path.join so the
# separator is correct on every platform (a hard-coded "\\" only works on Windows).
TRAINING_DIR = os.path.join(os.getcwd(), "Holmes_Training_Data")

# Get Training files
def get_training(training_dir=TRAINING_DIR):
    """List the entries of the training directory.

    Parameters
    ----------
    training_dir : str
        Directory to list; defaults to ``TRAINING_DIR``.

    Returns
    -------
    list of str
        Entry names (not full paths), sorted so that the processing order is
        the same on every run and platform (``os.listdir`` gives no ordering
        guarantee).

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` (e.g. when the directory is missing).
    """
    filenames = sorted(os.listdir(training_dir))
    print(f"There are {len(filenames)} files in the training directory: {training_dir}")
    return filenames

# Names of the corpus files found in the default training directory
training_files = get_training()

In [None]:
# Load the test set: the questions (with their five options) and the answer key
test_questions, test_answers = (
    pd.read_csv(csv_name) for csv_name in ('testing_data.csv', 'test_answer.csv')
)

# Preview the first 5 test questions
test_questions.head()

## Pre-processing

In [None]:
# Tokenization
# Removing punctuation (!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)
# Getting the base words through lemmatization
def pre_process(sentence):
    """Tokenize a sentence, drop punctuation tokens and lemmatize the rest.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        Lower-cased, lemmatized tokens in their original order.

    Notes
    -----
    * Tokens are lower-cased *before* lemmatization, so capitalised forms
      (e.g. at the start of a sentence) are looked up in their lower-case form
      instead of being passed to the lemmatizer as-is.
    * The punctuation filter is a substring test against
      ``string.punctuation``; it removes single punctuation marks but keeps
      the ``'_____'`` blank marker that ``get_previous_words`` searches for.
    """
    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(sentence)
        if token not in punctuation
    ]


In [None]:
# process text
def text_process(text):
    """Split ``text`` into sentences, pre-process each and store the tokens.

    Side effects (module-level lists, which must exist before the call):
    * ``tokens_wv``   receives one token list per sentence (word2vec input);
    * ``tokens_list`` receives every token, flattened, in reading order.

    Returns ``None``.
    """
    # map() is lazy, so each sentence is pre-processed right before it is stored
    for sentence_tokens in map(pre_process, sent_tokenize(text)):
        tokens_wv.append(sentence_tokens)
        tokens_list.extend(sentence_tokens)

In [None]:
# Read all the files from the Training directory, extract the text and process
def processfiles(training_dir=TRAINING_DIR, files=None, encoding=None):
    """Read each training file and feed its text to ``text_process``.

    Parameters
    ----------
    training_dir : str
        Directory that contains the files; defaults to ``TRAINING_DIR``.
    files : iterable of str, optional
        File names relative to ``training_dir``. ``None`` (the default) means
        "no files"; a ``None`` sentinel replaces the former mutable ``[]`` default.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` keeps the platform default,
        which is what the function used before this parameter existed.

    Notes
    -----
    Files that cannot be decoded are reported and skipped. Only the read is
    guarded, so an error raised inside ``text_process`` is not silently
    swallowed.
    """
    if files is None:
        files = []
    for afile in files:
        file_path = os.path.join(training_dir, afile)
        print(f"Processing {afile}")
        print("Path" + file_path)
        try:
            with open(file_path, encoding=encoding) as instream:
                text = instream.read()
        except UnicodeDecodeError:
            print("UnicodeDecodeError processing {}: ignoring file".format(afile))
            continue
        text_process(text)

In [None]:
# Pre-process the question text and the five option columns the same way as the
# training corpus: tokenize, remove punctuation and lemmatize.
# NOTE: the columns are overwritten in place with token lists, so this cell is
# meant to run once per load of test_questions.
for column_name in ['question', 'a)', 'b)', 'c)', 'd)', 'e)']:
    test_questions[column_name] = test_questions[column_name].apply(pre_process)

# Display
test_questions.head(7)

## Modeling

### Uni-gram Model

In [None]:
# uni-gram model
def uni_gram(tokens_list):
    """Count the tokens and turn the counts into relative frequencies.

    Parameters
    ----------
    tokens_list : iterable of str
        Flat token stream of the training corpus.

    Side effects
    ------------
    Adds the counts to the module-level ``unigram_model`` dict and then rebinds
    that name to a new dict mapping token -> count / total count.
    ``unigram_model`` must already exist (normally as an empty dict).
    """
    global unigram_model
    for token in tokens_list:
        unigram_model[token] = unigram_model.get(token, 0) + 1

    # with probabilities
    # The total is computed once; summing inside the comprehension would
    # re-add every count for every key (quadratic in the vocabulary size).
    total_count = sum(unigram_model.values())
    unigram_model = {token: count / total_count for token, count in unigram_model.items()}

### Bi-gram Model

In [None]:
# Bi-gram model
def bi_gram(tokens_list):
    """Fill the module-level ``bigram_model`` with next-word probabilities.

    ``bigram_model[w1][w2]`` first accumulates how often ``w2`` follows ``w1``
    in ``tokens_list`` (padding is requested on both sides of the sequence),
    then every count is divided by the total count of its context ``w1``.
    Works in place and returns ``None``.
    """
    # Pass 1: raw co-occurrence counts
    for previous_word, next_word in bigrams(tokens_list, pad_right=True, pad_left=True):
        bigram_model[previous_word][next_word] += 1

    # Pass 2: normalise the counts of each context into probabilities
    for followers in bigram_model.values():
        context_total = float(sum(followers.values()))
        for next_word in followers:
            followers[next_word] /= context_total

### Tri-gram Model

In [None]:
def tri_gram(tokens_list):
    for w1, w2, w3 in trigrams(tokens_list, pad_right=True, pad_left=True):
        trigram_model[(w1, w2)][w3] += 1

    # Let's transform the counts to probabilities
    for w1_w2 in trigram_model:
        total_count = float(sum(trigram_model[w1_w2].values()))
        for w3 in trigram_model[w1_w2]:
            trigram_model[w1_w2][w3] /= total_count

## Model training

In [None]:
# Fresh, empty model containers (re-created every time this cell runs)
unigram_model = dict()
bigram_model = defaultdict(lambda: defaultdict(int))
trigram_model = defaultdict(lambda: defaultdict(int))

# Corpus buffers filled by text_process():
#   tokens_wv   - one token list per sentence (word2vec input)
#   tokens_list - every token of the corpus as one flat list (n-gram input)
tokens_wv, tokens_list = list(), list()

# Read and pre-process all training files
processfiles(TRAINING_DIR, training_files)

# Train the unigram, bigram and trigram models on the flat token stream
uni_gram(tokens_list)
bi_gram(tokens_list)
tri_gram(tokens_list)


In [None]:
# word2vec model training on the per-sentence token lists
# (100-dimensional vectors, context window of 5, every word kept, 4 worker threads)
word2vec_model = Word2Vec(
    sentences=tokens_wv,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Save variables for future processing
# The target directory is created on demand so that a fresh checkout does not
# fail with FileNotFoundError.
os.makedirs('variables', exist_ok=True)

# `with` closes each file deterministically (the former bare open() calls were
# never closed). The protocol is named explicitly: the previous positional
# `True` was silently interpreted as protocol 1.
with open('variables/tokens_wv', 'wb') as outstream:
    pickle.dump(tokens_wv, outstream, protocol=pickle.HIGHEST_PROTOCOL)
with open('variables/tokens_list', 'wb') as outstream:
    pickle.dump(tokens_list, outstream, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
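# To reuse the cached tokens instead of re-processing the corpus, uncomment the
# lines below (only unpickle files that you created yourself):
#tokens_list = pickle.load(open('variables/tokens_list', 'rb'))
#tokens_wv = pickle.load(open('variables/tokens_wv', 'rb'))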

## Model predictions

In [None]:
# Get the previous two words from the blank
def get_previous_words(question):
    """Return the two tokens that precede the ``'_____'`` blank.

    Parameters
    ----------
    question : list of str
        Tokenized question containing the blank marker ``'_____'``.

    Returns
    -------
    tuple of (str, str)
        ``(first_word, second_word)`` where ``second_word`` is the token
        directly before the blank and ``first_word`` the one before that.
        A position that does not exist (blank at index 0 or 1) yields ``''``;
        explicit bounds checks prevent negative indices from wrapping around
        to the end of the sentence.

    Raises
    ------
    ValueError
        If the question contains no ``'_____'`` token.
    """
    indx_blank = question.index('_____')
    first_word = question[indx_blank - 2] if indx_blank >= 2 else ''
    second_word = question[indx_blank - 1] if indx_blank >= 1 else ''
    return first_word, second_word

In [None]:
# finding the cosine similarity between two words w1 & w2
def cosine_similarity(word2vec_model, w1, w2):
    """Similarity of ``w1`` and ``w2`` according to the word2vec vectors.

    Returns ``word2vec_model.wv.similarity(w1, w2)`` when both words have an
    index in the model's vocabulary, and ``0`` when either word is unknown.
    """
    keyed_vectors = word2vec_model.wv
    if not (keyed_vectors.has_index_for(w1) and keyed_vectors.has_index_for(w2)):
        return 0
    return keyed_vectors.similarity(w1, w2)

In [None]:
# get synonyms with wordnet
def get_synonyms(word):
    """Return ``word`` followed by its WordNet lemma names, without duplicates.

    Parameters
    ----------
    word : str
        Word to look up.

    Returns
    -------
    list of str
        ``word`` first, then the lemma names of its synsets in the order
        WordNet returns them. The order is deterministic, so a caller that
        truncates the list (e.g. ``[:5]``) always keeps the word itself and the
        same synonyms; converting a set to a list gave an arbitrary order.
    """
    # A dict is used as an insertion-ordered set.
    ordered_synonyms = {word: None}
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            ordered_synonyms.setdefault(lemma.name())
    return list(ordered_synonyms)

In [None]:
# get probability of the option
def get_probability(first_word, second_word, option_word, model):
    """Look up the n-gram probability of ``option_word`` after its context.

    Parameters
    ----------
    first_word, second_word : str
        The two tokens before the blank; the bigram model uses only
        ``second_word``, the trigram model uses the pair.
    option_word : str
        Candidate word for the blank.
    model : str
        ``'bigram'`` or ``'trigram'``.

    Returns
    -------
    number
        The stored probability, or ``0`` when the context or the option is
        unseen, or when ``model`` is neither ``'bigram'`` nor ``'trigram'``.

    Notes
    -----
    ``.get`` is used instead of indexing so that a lookup never inserts an
    empty entry into the ``defaultdict`` models and no per-call copy of the
    follower table is made.
    """
    try:
        if model == 'bigram':
            return bigram_model.get(second_word, {}).get(option_word, 0)
        if model == 'trigram':
            return trigram_model.get((first_word, second_word), {}).get(option_word, 0)
    except TypeError:
        # An unhashable context/option cannot be a key of the model.
        return 0
    return 0

In [None]:
# answer prediction with n-gram models, optionally combined with word2vec & wordnet
def _option_words(options, count=5):
    """Return one candidate word per option, position-aligned with the options.

    Each option is expected to be a token list; its first token is used and an
    empty list yields ``''``. A plain string option is used as-is. Only the
    first ``count`` options are considered.
    """
    words = []
    for option in list(options)[:count]:
        if isinstance(option, str):
            words.append(option)
        else:
            option_tokens = list(option)
            words.append(option_tokens[0] if option_tokens else '')
    return words


def _sample_context_words(model, first_word, second_word, size=10):
    """Draw ``size`` words (with replacement) from the model for the given context.

    The pool is the whole vocabulary for ``'unigram'``, the followers of
    ``second_word`` for ``'bigram'`` and the followers of
    ``(first_word, second_word)`` for ``'trigram'``. Returns ``[]`` when the
    pool is empty or the model name is unknown. Lookups use ``.get`` so the
    ``defaultdict`` models are not modified.
    """
    if model == 'unigram':
        pool = list(unigram_model)
    elif model == 'bigram':
        pool = list(bigram_model.get(second_word, ()))
    elif model == 'trigram':
        pool = list(trigram_model.get((first_word, second_word), ()))
    else:
        pool = []
    if not pool:
        return []
    return list(np.random.choice(pool, size=size))


def predict_answer(question, word2vec_model, options, model, question_no, total_questions, is_w2v_wn):
    """Pick the answer letter ('a'..'e') for one sentence-completion question.

    Parameters
    ----------
    question : list of str
        Tokenized question containing the ``'_____'`` blank.
    word2vec_model
        Trained word2vec model; only used when ``is_w2v_wn`` is true.
    options : sequence
        The five options in answer order, each a token list.
    model : str
        ``'unigram'``, ``'bigram'`` or ``'trigram'``.
    question_no, total_questions : int
        Zero-based question number and question count, used for the progress line.
    is_w2v_wn : bool
        ``False``: score the options with the n-gram model alone.
        ``True``: sample words from the n-gram model and choose the option
        whose WordNet synonyms are most similar to them under word2vec.

    Returns
    -------
    str
        The chosen letter. When no option scores at all the index stays at
        ``-1`` and ``'e'`` (the last letter) is returned.

    Notes
    -----
    Every option is reduced to a single word *per position*. Flattening the
    token lists would shift the answer letters whenever an option tokenizes to
    zero or to several tokens.
    """
    print(f"Processing question No. {question_no+1} out of {total_questions}")
    answer_list = ['a', 'b', 'c', 'd', 'e']
    max_index = -1
    first_word, second_word = get_previous_words(question)
    option_words = _option_words(options, count=len(answer_list))

    if not is_w2v_wn:
        if model == 'unigram':
            # Five words drawn uniformly from the vocabulary; the last option
            # found among them wins.
            unigram_list = list(np.random.choice(list(unigram_model), size=5))
            for indx, option_word in enumerate(option_words):
                if option_word in unigram_list:
                    max_index = indx
        elif model in ('bigram', 'trigram'):
            # get_probability uses only second_word as context for the bigram model
            max_prob = 0
            for indx, option_word in enumerate(option_words):
                prob = get_probability(first_word, second_word, option_word, model)
                if prob > max_prob:
                    max_prob = prob
                    max_index = indx
        return answer_list[max_index]

    max_cosine_similarity = -float("inf")
    for indx, option_word in enumerate(option_words):
        # NOTE(review): the candidate words are re-sampled for every option and
        # the random generator is not seeded here, so results vary between runs.
        predict_possible_answers = _sample_context_words(model, first_word, second_word)
        synonyms_list = get_synonyms(option_word)[:5]
        for possible_answer in predict_possible_answers:
            for synonym in synonyms_list:
                cos_similarity = cosine_similarity(word2vec_model, possible_answer, synonym)
                if cos_similarity > max_cosine_similarity:
                    max_cosine_similarity = cos_similarity
                    max_index = indx
    return answer_list[max_index]

### Model prediction -- uni-gram

In [None]:
# uni-gram predictions: one answer letter per test question
# (the five option columns are the last five columns of test_questions)
predicted_answers = [
    predict_answer(
        question_tokens,
        word2vec_model,
        test_questions.iloc[row, -5:],
        'unigram',
        row,
        len(test_questions),
        False,
    )
    for row, question_tokens in enumerate(test_questions['question'])
]


### Evaluation -- uni-gram

In [None]:
# Answer letters, passed explicitly so the matrix rows/columns and the report
# always match the five display labels, even if a letter never occurs.
answer_labels = ['a', 'b', 'c', 'd', 'e']

# Display confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(test_answers['answer'], predicted_answers, labels=answer_labels),
                              display_labels=answer_labels)

disp = disp.plot(cmap='YlOrRd')
plt.title("Confusion matrix - uni-gram")
plt.show()

# Classification report
clf_report = classification_report(test_answers['answer'], predicted_answers, labels=answer_labels,
                                   target_names=answer_labels, output_dict=True)

# .iloc[:-1, :] to exclude support
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True, cmap="YlGnBu")
plt.title("Classification report - uni-gram")
plt.show()

### Model prediction -- bi-gram

In [None]:
# bi-gram predictions: one answer letter per test question
# (the five option columns are the last five columns of test_questions)
predicted_answers = [
    predict_answer(
        question_tokens,
        word2vec_model,
        test_questions.iloc[row, -5:],
        'bigram',
        row,
        len(test_questions),
        False,
    )
    for row, question_tokens in enumerate(test_questions['question'])
]


### Evaluation -- bi-gram

In [None]:
# Answer letters, passed explicitly so the matrix rows/columns and the report
# always match the five display labels, even if a letter never occurs.
answer_labels = ['a', 'b', 'c', 'd', 'e']

# Display confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(test_answers['answer'], predicted_answers, labels=answer_labels),
                              display_labels=answer_labels)

disp = disp.plot(cmap='YlOrRd')
plt.title("Confusion matrix - bi-gram")
plt.show()

# Classification report
clf_report = classification_report(test_answers['answer'], predicted_answers, labels=answer_labels,
                                   target_names=answer_labels, output_dict=True)

# .iloc[:-1, :] to exclude support
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True, cmap="YlGnBu")
plt.title("Classification report - bi-gram")
plt.show()

### Model prediction -- tri-gram

In [None]:
# tri-gram predictions: one answer letter per test question
# (the five option columns are the last five columns of test_questions)
predicted_answers = [
    predict_answer(
        question_tokens,
        word2vec_model,
        test_questions.iloc[row, -5:],
        'trigram',
        row,
        len(test_questions),
        False,
    )
    for row, question_tokens in enumerate(test_questions['question'])
]


### Evaluation -- tri-gram

In [None]:
# Answer letters, passed explicitly so the matrix rows/columns and the report
# always match the five display labels, even if a letter never occurs.
answer_labels = ['a', 'b', 'c', 'd', 'e']

# Display confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(test_answers['answer'], predicted_answers, labels=answer_labels),
                              display_labels=answer_labels)

disp = disp.plot(cmap='YlOrRd')
plt.title("Confusion matrix - tri-gram")
plt.show()

# Classification report
clf_report = classification_report(test_answers['answer'], predicted_answers, labels=answer_labels,
                                   target_names=answer_labels, output_dict=True)

# .iloc[:-1, :] to exclude support
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True, cmap="YlGnBu")
plt.title("Classification report - tri-gram")
plt.show()

### Model prediction -- uni-gram with wordnet & word2vec

In [None]:
# uni-gram + wordnet & word2vec predictions: one answer letter per test question
# (the five option columns are the last five columns of test_questions)
predicted_answers = [
    predict_answer(
        question_tokens,
        word2vec_model,
        test_questions.iloc[row, -5:],
        'unigram',
        row,
        len(test_questions),
        True,
    )
    for row, question_tokens in enumerate(test_questions['question'])
]



### Evaluation -- uni-gram with wordnet & word2vec

In [None]:
# Answer letters, passed explicitly so the matrix rows/columns and the report
# always match the five display labels, even if a letter never occurs.
answer_labels = ['a', 'b', 'c', 'd', 'e']

# Display confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(test_answers['answer'], predicted_answers, labels=answer_labels),
                              display_labels=answer_labels)

disp = disp.plot(cmap='YlOrRd')
# The title names the wordnet & word2vec variant so this figure cannot be
# confused with the plain uni-gram one.
plt.title("Confusion matrix - uni-gram with wordnet & word2vec")
plt.show()

# Classification report
clf_report = classification_report(test_answers['answer'], predicted_answers, labels=answer_labels,
                                   target_names=answer_labels, output_dict=True)

# .iloc[:-1, :] to exclude support
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True, cmap="YlGnBu")
plt.title("Classification report - uni-gram with wordnet & word2vec")
plt.show()

### Model prediction -- bi-gram with wordnet & word2vec

In [None]:
# bi-gram + wordnet & word2vec predictions: one answer letter per test question
# (the five option columns are the last five columns of test_questions)
predicted_answers = [
    predict_answer(
        question_tokens,
        word2vec_model,
        test_questions.iloc[row, -5:],
        'bigram',
        row,
        len(test_questions),
        True,
    )
    for row, question_tokens in enumerate(test_questions['question'])
]



### Evaluation -- bi-gram with wordnet & word2vec

In [None]:
# Answer letters, passed explicitly so the matrix rows/columns and the report
# always match the five display labels, even if a letter never occurs.
answer_labels = ['a', 'b', 'c', 'd', 'e']

# Display confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(test_answers['answer'], predicted_answers, labels=answer_labels),
                              display_labels=answer_labels)

disp = disp.plot(cmap='YlOrRd')
# The title names the wordnet & word2vec variant so this figure cannot be
# confused with the plain bi-gram one.
plt.title("Confusion matrix - bi-gram with wordnet & word2vec")
plt.show()

# Classification report
clf_report = classification_report(test_answers['answer'], predicted_answers, labels=answer_labels,
                                   target_names=answer_labels, output_dict=True)

# .iloc[:-1, :] to exclude support
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True, cmap="YlGnBu")
plt.title("Classification report - bi-gram with wordnet & word2vec")
plt.show()

### Model prediction -- tri-gram with wordnet & word2vec

In [None]:
# tri-gram + wordnet & word2vec predictions: one answer letter per test question
# (the five option columns are the last five columns of test_questions)
predicted_answers = [
    predict_answer(
        question_tokens,
        word2vec_model,
        test_questions.iloc[row, -5:],
        'trigram',
        row,
        len(test_questions),
        True,
    )
    for row, question_tokens in enumerate(test_questions['question'])
]


### Evaluation -- tri-gram with wordnet & word2vec

In [None]:
# Answer letters, passed explicitly so the matrix rows/columns and the report
# always match the five display labels, even if a letter never occurs.
answer_labels = ['a', 'b', 'c', 'd', 'e']

# Display confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(test_answers['answer'], predicted_answers, labels=answer_labels),
                              display_labels=answer_labels)

disp = disp.plot(cmap='YlOrRd')
# The title names the wordnet & word2vec variant so this figure cannot be
# confused with the plain tri-gram one.
plt.title("Confusion matrix - tri-gram with wordnet & word2vec")
plt.show()

# Classification report
clf_report = classification_report(test_answers['answer'], predicted_answers, labels=answer_labels,
                                   target_names=answer_labels, output_dict=True)

# .iloc[:-1, :] to exclude support
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True, cmap="YlGnBu")
plt.title("Classification report - tri-gram with wordnet & word2vec")
plt.show()