# COMP 472 - MP3

JustAlex
- Alexandre Lavoie (40129457)

This file covers experiments and data collection.

## Imports

In [4]:
import gensim.downloader
import pandas
import random
import glob
import os.path

## Synonyms Dataset

In [None]:
# Load the synonym test set from "synonyms.csv" in the working directory.
# The frame holds one question per row: the question word, the expected
# answer, and the candidate options (columns "0" to "3" are read later by
# get_dataframe).
synonyms = pandas.read_csv("synonyms.csv")
print(synonyms)

       question        answer  ...              2                3
0    enormously  tremendously  ...   tremendously        decidedly
1    provisions  stipulations  ...  jurisdictions  interpretations
2   haphazardly      randomly  ...       randomly         linearly
3     prominent   conspicuous  ...     mysterious      conspicuous
4        zenith      pinnacle  ...         outset          decline
..          ...           ...  ...            ...              ...
75      fashion        manner  ...          craze           manner
76     marketed          sold  ...      sweetened          diluted
77       bigger        larger  ...         larger           better
78        roots       origins  ...           cure         function
79     normally    ordinarily  ...    permanently     periodically

[80 rows x 6 columns]


## Models

In [None]:
# Print the name of every pretrained model listed in the gensim downloader
# catalogue, one per line (iterating the mapping yields its keys).
for model in gensim.downloader.info()["models"]:
  print(model)

fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


In [None]:
def get_dataframe(model, dataset=None):
  """Answer every synonym question with `model` and tabulate the outcome.

  For each row, the option most similar to the question word (according to
  `model.similarity`) becomes the prediction. Only options that the model
  actually contains are ranked.

  Args:
    model: Embedding lookup supporting `word in model` and
      `model.similarity(word_a, word_b)`.
    dataset: Table with the columns "question", "answer", "0", "1", "2" and
      "3". Defaults to the notebook-level `synonyms` frame.

  Returns:
    pandas.DataFrame with the columns "question", "answer", "prediction" and
    "label". The label is "correct" or "wrong" when the model could answer,
    and "guess" when the question word or all four options are missing from
    the model (a random option is then used as the prediction).
  """
  if dataset is None:
    dataset = synonyms

  records = []
  rows = zip(dataset["question"], dataset["answer"], dataset["0"], dataset["1"], dataset["2"], dataset["3"])

  for question, answer, o1, o2, o3, o4 in rows:
    options = [o1, o2, o3, o4]
    known_options = [option for option in options if option in model]

    if question in model and known_options:
      # Rank only in-vocabulary options: a numeric placeholder for unknown
      # words could tie with (or beat) a genuine similarity score and make the
      # model "predict" a word it has never seen.
      # reversed() keeps the later option as the winner on exact ties.
      prediction = max(reversed(known_options), key=lambda option: model.similarity(question, option))
      label = "correct" if prediction == answer else "wrong"
    else:
      # The model cannot answer this question: fall back to a random option.
      prediction = random.choice(options)
      label = "guess"

    records.append({"question": question, "answer": answer, "prediction": prediction, "label": label})

  return pandas.DataFrame(records, columns=["question", "answer", "prediction", "label"])

### Word2Vec Google News 300

In [None]:
# Load the pretrained "word2vec-google-news-300" embeddings through the gensim downloader.
word2vec_google_news_300_model = gensim.downloader.load("word2vec-google-news-300")

In [None]:
# Run the synonym test with this model, save the per-question details as CSV
# (index column omitted) and show the resulting table.
# NOTE(review): the CSV is written to the working directory, while the
# Analysis section reads "./results/*-details.csv" — confirm the file is
# moved there before the analysis runs.
word2vec_google_news_300 = get_dataframe(word2vec_google_news_300_model)
word2vec_google_news_300.to_csv("word2vec-google-news-300-details.csv", index=False)
print(word2vec_google_news_300)

       question        answer    prediction    label
0    enormously  tremendously  tremendously  correct
1    provisions  stipulations  stipulations  correct
2   haphazardly      randomly      randomly  correct
3     prominent   conspicuous   conspicuous  correct
4        zenith      pinnacle      pinnacle  correct
..          ...           ...           ...      ...
75      fashion        manner        manner  correct
76     marketed          sold          sold  correct
77       bigger        larger        larger  correct
78        roots       origins       origins  correct
79     normally    ordinarily    ordinarily  correct

[80 rows x 4 columns]


### Glove Wiki Gigaword 300

In [None]:
# Load the pretrained "glove-wiki-gigaword-300" embeddings through the gensim downloader.
glove_wiki_gigaword_300_model = gensim.downloader.load("glove-wiki-gigaword-300")



In [None]:
# Run the synonym test with this model, save the per-question details as CSV
# (index column omitted) and show the resulting table.
# NOTE(review): the CSV is written to the working directory, while the
# Analysis section reads "./results/*-details.csv" — confirm the file is
# moved there before the analysis runs.
glove_wiki_gigaword_300 = get_dataframe(glove_wiki_gigaword_300_model)
glove_wiki_gigaword_300.to_csv("glove-wiki-gigaword-300-details.csv", index=False)
print(glove_wiki_gigaword_300)

       question        answer    prediction    label
0    enormously  tremendously  tremendously  correct
1    provisions  stipulations  stipulations  correct
2   haphazardly      randomly      randomly  correct
3     prominent   conspicuous   conspicuous  correct
4        zenith      pinnacle      pinnacle  correct
..          ...           ...           ...      ...
75      fashion        manner        manner  correct
76     marketed          sold          sold  correct
77       bigger        larger        larger  correct
78        roots       origins       origins  correct
79     normally    ordinarily    ordinarily  correct

[80 rows x 4 columns]


### Fasttext Wiki News Subwords 300

In [None]:
# Load the pretrained "fasttext-wiki-news-subwords-300" embeddings through the gensim downloader.
fasttext_wiki_news_subwords_300_model = gensim.downloader.load("fasttext-wiki-news-subwords-300")



In [None]:
# Run the synonym test with this model, save the per-question details as CSV
# (index column omitted) and show the resulting table.
# NOTE(review): the CSV is written to the working directory, while the
# Analysis section reads "./results/*-details.csv" — confirm the file is
# moved there before the analysis runs.
fasttext_wiki_news_subwords_300 = get_dataframe(fasttext_wiki_news_subwords_300_model)
fasttext_wiki_news_subwords_300.to_csv("fasttext-wiki-news-subwords-300-details.csv", index=False)
print(fasttext_wiki_news_subwords_300)

       question        answer    prediction    label
0    enormously  tremendously  tremendously  correct
1    provisions  stipulations  stipulations  correct
2   haphazardly      randomly      randomly  correct
3     prominent   conspicuous   conspicuous  correct
4        zenith      pinnacle      pinnacle  correct
..          ...           ...           ...      ...
75      fashion        manner        manner  correct
76     marketed          sold          sold  correct
77       bigger        larger        larger  correct
78        roots       origins       origins  correct
79     normally    ordinarily    ordinarily  correct

[80 rows x 4 columns]


### Conceptnet Numberbatch 17-06 300

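Not sure what this model is, but it does not work with `get_dataframe`: every question ends up labelled `guess` (see the output below), which means the question words are not found in the model's vocabulary. This is likely because ConceptNet Numberbatch stores its entries under language-prefixed keys (e.g. `/c/en/word`) rather than bare words.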

In [None]:
# Load the pretrained "conceptnet-numberbatch-17-06-300" embeddings through the gensim downloader.
conceptnet_numberbatch_17_06_300_model = gensim.downloader.load("conceptnet-numberbatch-17-06-300")

In [None]:
# Run the synonym test with this model, save the per-question details as CSV
# (index column omitted) and show the resulting table.
# In the recorded output every row is labelled "guess", i.e. get_dataframe
# fell back to a random option for each question.
# NOTE(review): the CSV is written to the working directory, while the
# Analysis section reads "./results/*-details.csv" — confirm the file is
# moved there before the analysis runs.
conceptnet_numberbatch_17_06_300 = get_dataframe(conceptnet_numberbatch_17_06_300_model)
conceptnet_numberbatch_17_06_300.to_csv("conceptnet-numberbatch-17-06-300-details.csv", index=False)
print(conceptnet_numberbatch_17_06_300)

       question        answer     prediction  label
0    enormously  tremendously      decidedly  guess
1    provisions  stipulations  jurisdictions  guess
2   haphazardly      randomly       linearly  guess
3     prominent   conspicuous     mysterious  guess
4        zenith      pinnacle       pinnacle  guess
..          ...           ...            ...    ...
75      fashion        manner         fathom  guess
76     marketed          sold           sold  guess
77       bigger        larger       steadier  guess
78        roots       origins           cure  guess
79     normally    ordinarily    permanently  guess

[80 rows x 4 columns]


### Glove Twitter 200

In [None]:
# Load the pretrained "glove-twitter-200" embeddings through the gensim downloader.
glove_twitter_200_model = gensim.downloader.load("glove-twitter-200")

In [None]:
# Run the synonym test with this model, save the per-question details as CSV
# (index column omitted) and show the resulting table.
# NOTE(review): the CSV is written to the working directory, while the
# Analysis section reads "./results/*-details.csv" — confirm the file is
# moved there before the analysis runs.
glove_twitter_200 = get_dataframe(glove_twitter_200_model)
glove_twitter_200.to_csv("glove-twitter-200-details.csv", index=False)
print(glove_twitter_200)

       question        answer     prediction    label
0    enormously  tremendously   tremendously  correct
1    provisions  stipulations  jurisdictions    wrong
2   haphazardly      randomly        densely    wrong
3     prominent   conspicuous     mysterious    wrong
4        zenith      pinnacle       pinnacle  correct
..          ...           ...            ...      ...
75      fashion        manner          craze    wrong
76     marketed          sold           sold  correct
77       bigger        larger         larger  correct
78        roots       origins        origins  correct
79     normally    ordinarily    permanently    wrong

[80 rows x 4 columns]


### Glove Twitter 100

In [None]:
# Load the pretrained "glove-twitter-100" embeddings through the gensim downloader.
glove_twitter_100_model = gensim.downloader.load("glove-twitter-100")



In [None]:
# Run the synonym test with this model, save the per-question details as CSV
# (index column omitted) and show the resulting table.
# NOTE(review): the CSV is written to the working directory, while the
# Analysis section reads "./results/*-details.csv" — confirm the file is
# moved there before the analysis runs.
glove_twitter_100 = get_dataframe(glove_twitter_100_model)
glove_twitter_100.to_csv("glove-twitter-100-details.csv", index=False)
print(glove_twitter_100)

       question        answer     prediction    label
0    enormously  tremendously   tremendously  correct
1    provisions  stipulations  jurisdictions    wrong
2   haphazardly      randomly        densely    wrong
3     prominent   conspicuous     mysterious    wrong
4        zenith      pinnacle       pinnacle  correct
..          ...           ...            ...      ...
75      fashion        manner          craze    wrong
76     marketed          sold        diluted    wrong
77       bigger        larger         larger  correct
78        roots       origins        origins  correct
79     normally    ordinarily    permanently    wrong

[80 rows x 4 columns]


## Analysis

In [10]:
# Directory holding the per-model "<model>-details.csv" files.
path = "./results/"

# Fixed suffix that follows the model name in every details file.
details_suffix = "-details.csv"

# Recover the model names by stripping the known suffix from each file name
# (rather than cutting at the first "."), so names containing a dot survive.
# glob returns matches in arbitrary order; sorting keeps the order of the
# analysis rows stable between runs and machines.
models = sorted(
  os.path.basename(fp)[:-len(details_suffix)]
  for fp in glob.glob(f"{path}*{details_suffix}")
)

In [12]:
# Summarise every "<model>-details.csv" file into one row per model:
#   vocabulary - number of vectors in the model
#   correct    - questions labelled "correct"
#   answered   - questions the model answered without guessing
#   accuracy   - correct / answered (0 when nothing was answered)

# The vocabulary size is taken from the gensim catalogue ("num_records")
# instead of being derived from the embedding dimension in the model name,
# which is unrelated to the number of words the model contains.
model_catalog = gensim.downloader.info()["models"]

analysis_rows = []

for model_name in models:
  details_file = f"{path}{model_name}-details.csv"
  details = pandas.read_csv(details_file)

  labels = details["label"]
  correct_count = int((labels == "correct").sum())
  answered_count = int((labels != "guess").sum())

  analysis_rows.append({
    "model": model_name,
    "vocabulary": model_catalog[model_name]["num_records"],
    "correct": correct_count,
    "answered": answered_count,
    # Avoid dividing by zero when the model had to guess every question.
    "accuracy": correct_count / answered_count if answered_count > 0 else 0,
  })

# Explicit column list keeps the header even when no details file was found.
analysis = pandas.DataFrame(analysis_rows, columns=["model", "vocabulary", "correct", "answered", "accuracy"])
analysis.to_csv(f"{path}analysis.csv", index=False)