# Thesis experiments

In [26]:
import numpy
from cade.metrics.comparative import lncs2, intersection_nn, initialize_avgs, get_neighbors_set
from gensim.models.word2vec import Word2Vec
from scipy.spatial.distance import cosine
from pandas import pandas
from pandarallel import pandarallel
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
from scipy.stats import spearmanr
from tabulate import tabulate
from config import CURRENT_EXP_DIR, config, get_logger, log_config


## Load language models and groundtruth

In [2]:
def get_models(lang: str):
    model1 = Word2Vec.load(
        CURRENT_EXP_DIR.split("_")[0]
        + "_0"
        + "/model/"
        + lang
        + "/corpus1.model"
    )
    model2 = Word2Vec.load(
        CURRENT_EXP_DIR.split("_")[0]
        + "_0"
        + "/model/"
        + lang
        + "/corpus2.model"
    )
    return model1, model2

def get_gt(lang: str, binary=True):
    binary_truth = numpy.loadtxt(
        "./data/"
        + lang
        + "/semeval2020_ulscd_"
        + lang[:3]
        + "/truth/" + ("binary" if binary else "graded") + ".txt",
        dtype=str,
        delimiter="\t",
    )
    return binary_truth

### English (get LNCS2, Intersection_NN and Cosine scores)

In [3]:
lang = "english"
# Load models
model1, model2 = get_models(lang)
# Initialize models avgs
initialize_avgs(model1, model2)
shared_vocabulary = set(model1.wv.vocab.keys()).intersection(set(model2.wv.vocab.keys()))
shared_vocabulary_df = pandas.DataFrame(shared_vocabulary, columns=["word"])
shared_vocabulary_df["lncs2"] = shared_vocabulary_df["word"].apply(
    lambda word: lncs2(word, model1, model2, 25)
)
shared_vocabulary_df["intersection_nn"] = shared_vocabulary_df["word"].apply(
    lambda word: intersection_nn(word, model1, model2)
)
shared_vocabulary_df["cosine"] = shared_vocabulary_df["word"].apply(
    lambda word: 1 - cosine(model1.wv[word], model2.wv[word])
)

Add mean of the three metrics

In [17]:
shared_vocabulary_df["mean"] = shared_vocabulary_df[["lncs2", "cosine", "intersection_nn"]].apply(
    lambda x: (x.lncs2 + x.cosine + x.intersection_nn) / 3, axis=1
)

### Add word count

In [18]:
shared_vocabulary_df["count_m1"] = shared_vocabulary_df["word"].apply(
    lambda word: model1.wv.vocab[word].count
)
shared_vocabulary_df["count_m2"] = shared_vocabulary_df["word"].apply(
    lambda word: model2.wv.vocab[word].count
)

### Save dataframe

In [19]:
shared_vocabulary_df.to_pickle("./shared_vocabulary_metrics.pkl")

### Words that changed the most (by LNCS2)

In [20]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["lncs2"], ascending=True)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,count_m1,count_m2,mean
10661,pregnant,-0.059092,0.988,-0.001763,31,158,0.309048
24935,unused,-0.059081,0.987,-0.073233,26,36,0.284895
20337,scarce,-0.044323,0.99,0.072006,329,61,0.339228
20078,incline,-0.034136,0.988,0.17663,114,25,0.376831
6585,ml,-0.029649,0.998,-0.079436,1,18,0.296305
4671,major,-0.016591,0.998,0.263108,531,1554,0.414839
5073,someday,0.002855,0.988,0.11585,1,125,0.368902
16236,mentally,0.016916,0.98,0.328319,31,90,0.441745
147,tense,0.032603,0.984,0.138092,11,139,0.384898
19984,err,0.036362,0.974,0.169528,70,11,0.393297


### Words change the most (by Intersection_NN)

In [24]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["intersection_nn"], ascending=False)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,count_m1,count_m2,mean
4043,virus,0.32396,1.0,-0.149479,13,166,0.391494
21585,clumsiness,0.552998,0.999,0.532715,1,4,0.694905
17746,rusticity,0.613301,0.999,0.544417,3,2,0.718906
23853,funding,0.344045,0.999,0.151782,3,119,0.498276
7452,coke,0.272922,0.999,0.071755,12,88,0.447892
22296,uprising,0.307683,0.998,0.156027,10,39,0.487237
17630,aura,0.217044,0.998,-0.075655,9,52,0.379796
4266,media,0.323505,0.998,0.092881,49,394,0.471462
25496,rove,0.151496,0.998,0.07378,48,19,0.407758
18442,twain,0.055887,0.998,0.011805,29,31,0.355231


### Words change the most (by cosine)

In [22]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["cosine"], ascending=True)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,count_m1,count_m2,mean
16205,ski,0.287075,0.983,-0.166819,2,218,0.367752
4043,virus,0.32396,1.0,-0.149479,13,166,0.391494
7259,fer,0.259189,0.966,-0.132545,4,47,0.364215
18503,op,0.695739,0.974,-0.127812,21,31,0.513976
17938,wilmer,0.500615,0.95,-0.111061,11,16,0.446518
17195,some--,0.068811,0.964,-0.110117,2,6,0.307565
1684,setting,0.200999,0.991,-0.109949,1,62,0.360683
11922,frightening,0.095688,0.992,-0.102361,3,76,0.328442
15049,gist,0.27267,0.994,-0.098656,15,15,0.389338
17813,cal,0.479762,0.98,-0.098061,13,66,0.4539


### Words change the most (by mean)

In [23]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["mean"], ascending=True)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,count_m1,count_m2,mean
24935,unused,-0.059081,0.987,-0.073233,26,36,0.284895
6585,ml,-0.029649,0.998,-0.079436,1,18,0.296305
17195,some--,0.068811,0.964,-0.110117,2,6,0.307565
10661,pregnant,-0.059092,0.988,-0.001763,31,158,0.309048
11922,frightening,0.095688,0.992,-0.102361,3,76,0.328442
16315,backing,0.045896,0.987,-0.038172,5,42,0.331575
20337,scarce,-0.044323,0.99,0.072006,329,61,0.339228
10305,hearing,0.064546,0.99,-0.008649,1,149,0.348632
7053,significantly,0.058707,0.998,-0.010645,31,123,0.348688
23333,hy,0.056843,0.992,0.00858,109,11,0.352474
