In [1]:
!pip install gensim -U

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.5 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


In [2]:
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')



Mounted at /content/drive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import gensim
import gensim.downloader
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import random
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:

import pandas as pd
import csv

import itertools
import os
import operator
import numpy as np

# Word2Vec Class

In [3]:
class WordToVec:
    def __init__(self, test_path, answer_path, holmes_path, ratio, max_doc,  pre_processing, seed=True):
        self.test_path = test_path
        self.answer_path = answer_path
        self.holmes_path = holmes_path
        self.ratio = ratio
        self.max_doc = max_doc 
        self.pre_processing = pre_processing 
        self.seed = seed 

        
        
        self.file_names = os.listdir(self.holmes_path)



        self.listed_data = []
        self.process_files()
        self.testing = pd.read_csv(self.test_path)
        self.labels = pd.read_csv(self.answer_path)
    
    def score_predictions(self, prediction):
        return accuracy_score(self.labels['answer'].values, prediction)
    
    def split(self):
        if self.seed:
            random.seed(53)

        files = self.file_names
        shuffled = random.sample(files, len(files))
     
        index = int(len(files) * self.ratio)

        training = files[:index]
        held_out = files[index:]
   
        return training, held_out
    
    def process_files(self):
        training, _ = self.split()
        #print(training)
        n = 1
        stop = stopwords.words('english')
        for file in training:
            try:
                text = open(os.path.join(self.holmes_path, file), "r")              
                sentences = sent_tokenize(text.read().replace('\n', ' '))
                if not self.pre_processing:
                    tokenized = [word_tokenize(sents)  for sents in sentences if len(sents)>3]   
                else:
                    tokenized = [[token for token in word_tokenize(sentence.lower()) if token.isalpha() and token not in stop] for sentence in sentences]
                self.listed_data += tokenized
                n += 1
                if n > self.max_doc:
                    break
            except UnicodeDecodeError:
                pass


    def predict_and_score(self, all_models, param_title):
        results = {}        
        for key, model in all_models.items():
            #print(f"Processing {key} on the Holmes dataset")
            current_model = {}
            current_model["model"] = key
            current_predictions = []
            for i, question in self.testing.iterrows():
                current_prediction, _ = self.predict_question(question, model)
                current_model["id"] = current_prediction
                current_predictions.append(current_prediction)
            
            current_model["Accuracy"]  = self.score_predictions(current_predictions)
            results[key] = current_model
            
        return pd.DataFrame([(values["model"], values["Accuracy"]) for values in results.values()], columns = [param_title, "Accuracy"])   

    def single_prediction(self, model):
        predictions = []
        similarities = []
        for i, question in self.testing.iterrows():
            current_prediction, similarity = self.predict_question(question, model)
            similarities.append(similarity)
            predictions.append(current_prediction)  
        accuracy  = self.score_predictions(predictions)   
        return accuracy, similarities, predictions  

    def predict_question(self, row, model):
        testing_data = word_tokenize(row["question"].lower().replace("_____", ""))
        testing_data = [token for token in testing_data if token.isalpha() and token not in stopwords.words('english')]
        
        choices = ['a','b','c','d','e']
        similarity = {}
        
        for choice in choices:
            word = row[choice + ")"]
            total_sim = 0
            length = len(testing_data) 
            for question in testing_data:
                try:
                    total_sim += model.wv.similarity(question, word)
                except:
                    pass
            similarity[choice] = total_sim/(length + 1)
            
        return max(similarity.items(), key=operator.itemgetter(1))[0], similarity
      


# Results

In [None]:
def merge_df(doc_sizes, results, adjust=1):
    df = pd.concat(results)
    df["Doc Size"] = sorted(doc_sizes * (len(results[0]["Accuracy"])*adjust))
    df= df.sort_values(by='Accuracy', ascending=False)
    return df
    

In [5]:
# Experiment Setup
answers_path = "/content/drive/MyDrive/NLP Testing/test_answer.csv"
questions_path = "/content/drive/MyDrive/NLP Testing/testing_data.csv"
holmes_path = "/content/drive/MyDrive/Holmes_Training_Data"
ratio = 0.5
pre_processing = True


In [None]:
# CBOW vs Skip Gram
doc_sizes = [10, 50, 100, 200]
title = "Algorithm"
alg_parameters = {"CBOW": 0, "Skip Gram": 1}
hs_ns_parameters = {0: "Negative Sampling", 1: "Hierarchical Softmax"}
alg_results = []

for max_doc in doc_sizes:
    w2v = WordToVec(questions_path, answers_path, holmes_path, ratio, max_doc,  pre_processing, seed=True)
    for hn in range(2):
        models = {f"{key} ({hs_ns_parameters[hn]})": Word2Vec(sentences=w2v.listed_data, hs=hn, sg=values, seed=10) for key, values in alg_parameters.items()}
        df_algs = w2v.predict_and_score(models, title)
        alg_results.append(df_algs)





In [None]:
df = merge_df(doc_sizes, alg_results,2)
df

In [None]:
doc_sizes = [200]
# Sample Sizes
sample_name = "Sample"
sample_parameters = [1, 0.5, 0.1, 0.01, 0.001, 0.0001, 0] 
sample_results = []
# Minimum Count 
minCount_name = "Minimum Count"
minCount_parameters = list(range(1, 10))
minCount_results = []
# Window
window_name = "Window"
window_parameters = [4, 6, 8, 10, 12, 14, 16, 18]
window_results = []
# Alpha Values
alpha_name = "Alpha" 
alpha_parameters = [0.01, 0.025, 0.05, 0.1, 1.5, 2]
alpha_results = []

for max_doc in doc_sizes:
    # Running W2V model on a given doc size
    w2v = WordToVec(questions_path, answers_path, holmes_path, ratio, max_doc,  pre_processing, seed=True)
    # Getting results for different sample sizes
    models = {key: Word2Vec(sentences=w2v.listed_data, sample=values, sg=1, hs=1, seed=10) for key, values in {str(val): val for val in sample_parameters}.items()}
    df_samples = w2v.predict_and_score(models, sample_name)
    sample_results.append(df_samples)
    # Getting results for different sample sizes
    models = {key: Word2Vec(sentences=w2v.listed_data, min_count=values, sg=1, hs=1, seed=10) for key, values in {str(val): val for val in minCount_parameters}.items()}
    df_minCount = w2v.predict_and_score(models, minCount_name)
    minCount_results.append(df_minCount)
    # Getting results for different windows
    models = {key: Word2Vec(sentences=w2v.listed_data, window=values, sg=1, hs=1, seed=10) for key, values in {str(val): val for val in window_parameters}.items()}
    df_window = w2v.predict_and_score(models, window_name)
    window_results.append(df_window)
    # Getting results for different alpha values
    models = {key: Word2Vec(sentences=w2v.listed_data, alpha=values, sg=1, hs=1, seed=10) for key, values in {str(val): val for val in alpha_parameters}.items()}
    df_alpha = w2v.predict_and_score(models, alpha_name)
    alpha_results.append(df_alpha)

In [None]:
df = merge_df(doc_sizes, sample_results)

df.to_csv("/content/drive/MyDrive/NLP Testing/Sample.csv", encoding='utf-8', index=False)

In [None]:
df = merge_df(doc_sizes, minCount_results)
df.to_csv("/content/drive/MyDrive/NLP Testing/minCount.csv", encoding='utf-8', index=False)

In [None]:
df=merge_df(doc_sizes, window_results)
df.to_csv("/content/drive/MyDrive/NLP Testing/window_results.csv", encoding='utf-8', index=False)

In [None]:
df = merge_df(doc_sizes, alpha_results)

df.to_csv("/content/drive/MyDrive/NLP Testing/alpha.csv", encoding='utf-8', index=False)

In [None]:
sample_parameters = [0.5, 0.1, 0.01] 
alpha_parameters = [0.025, 0.05]

results_comb = []
w2v = WordToVec(questions_path, answers_path, holmes_path, ratio, 200,  pre_processing, seed=True)
for samp in sample_parameters:
    for alpha in alpha_parameters:
        model = Word2Vec(sentences=w2v.listed_data, sample=samp, alpha=alpha, min_count=2, window=6, sg=1, hs=1, seed=10)
        
        df_combined = w2v.single_prediction(model)
        results_comb.append((samp, alpha, df_combined))




In [None]:
results_comb

[(0.5, 0.025, 0.19903846153846153),
 (0.5, 0.05, 0.19903846153846153),
 (0.1, 0.025, 0.19903846153846153),
 (0.1, 0.05, 0.19903846153846153),
 (0.01, 0.025, 0.19903846153846153),
 (0.01, 0.05, 0.19903846153846153)]

In [31]:
df = pd.DataFrame(results_comb, columns=["Sample Size", "Alpha Value", "Accuracy"])
df

Unnamed: 0,Sample Size,Alpha Value,Accuracy
0,0.5,0.025,0.488462
1,0.5,0.05,0.473077
2,0.1,0.025,0.495192
3,0.1,0.05,0.480769
4,0.01,0.025,0.477885
5,0.01,0.05,0.486538


In [6]:
w2v = WordToVec(questions_path, answers_path, holmes_path, ratio, 200,  pre_processing, seed=True)

In [7]:
model = Word2Vec(sentences=w2v.listed_data, sample=0.05, alpha=0.025, min_count=2, window=6, sg=1, hs=1, seed=10)
a,b,c=w2v.single_prediction(model)


In [56]:
sorted([2,3,4,1], reverse=True)

[4, 3, 2, 1]

In [8]:
 sll = list(w2v.labels["answer"])
 predictions__=[1 if sll[i] == c[i] else 0 for i in range(len(c))]
 sum(predictions__)

503

In [9]:
uncertainty_index = [a[0]/a[1] if a[1] != 0 else 1 for a in [sorted(list(i.values()))[3:] for i in b]]

In [10]:
df = pd.DataFrame(data={"Uncertainty Index": uncertainty_index, "Predictions": predictions__})

In [82]:
df = df.sort_values(by='Uncertainty Index')


In [12]:
df.to_csv("/content/drive/MyDrive/NLP Testing/w2vpredsss.csv")

In [93]:
error_analysis = []
count = 0 
for index, pred in zip(list(df["Uncertainty Index"]), list(df["Predictions"])):
  count+=1
  if count % 100 == 0:
    a = df[df["Uncertainty Index"] < index]
    score = sum(list(a["Predictions"]))/len(a["Predictions"])
    error_analysis.append((count, index, score))


In [96]:
df_error = pd.DataFrame(error_analysis, columns=["Sentences Processed", "Uncertainty Index", "Accuracy"])

In [98]:
df_error.to_csv("/content/drive/MyDrive/NLP Testing/w2vError.csv")

In [99]:
df_error


Unnamed: 0,Sentences Processed,Uncertainty Index,Accuracy
0,100,0.692307,0.767677
1,200,0.77039,0.743719
2,300,0.81764,0.692308
3,400,0.850744,0.656642
4,500,0.880497,0.625251
5,600,0.907823,0.594324
6,700,0.930459,0.556509
7,800,0.953507,0.543179
8,900,0.972333,0.515017
9,1000,0.99039,0.496496
