# Thesis experiments

In [1]:
import numpy
from cade.metrics.comparative import lncs2, intersection_nn, initialize_avgs, get_neighbors_set
from gensim.models.word2vec import Word2Vec
from scipy.spatial.distance import cosine
from pandas import pandas
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
from scipy.stats import spearmanr
from tabulate import tabulate
from config import CURRENT_EXP_DIR, config, get_logger, log_config


## Load language models and groundtruth

In [2]:
def get_models(lang: str):
    model1 = Word2Vec.load(
        CURRENT_EXP_DIR.split("_")[0]
        + "_0"
        + "/model/"
        + lang
        + "/corpus1.model"
    )
    model2 = Word2Vec.load(
        CURRENT_EXP_DIR.split("_")[0]
        + "_0"
        + "/model/"
        + lang
        + "/corpus2.model"
    )
    return model1, model2

def get_gt(lang: str, binary=True):
    binary_truth = numpy.loadtxt(
        "./data/"
        + lang
        + "/semeval2020_ulscd_"
        + lang[:3]
        + "/truth/" + ("binary" if binary else "graded") + ".txt",
        dtype=str,
        delimiter="\t",
    )
    return binary_truth

### English (get LNCS2, Intersection_NN and Cosine scores)

In [3]:
lang = "english"
# Load models
model1, model2 = get_models(lang)
# Initialize models avgs
initialize_avgs(model1, model2)
shared_vocabulary = set(model1.wv.vocab.keys()).intersection(set(model2.wv.vocab.keys()))
shared_vocabulary_df = pandas.DataFrame(shared_vocabulary, columns=["word"])
shared_vocabulary_df["lncs2"] = shared_vocabulary_df["word"].apply(
    lambda word: lncs2(word, model1, model2, 25)
)
shared_vocabulary_df["intersection_nn"] = shared_vocabulary_df["word"].apply(
    lambda word: intersection_nn(word, model1, model2)
)
shared_vocabulary_df["cosine"] = shared_vocabulary_df["word"].apply(
    lambda word: 1 - cosine(model1.wv[word], model2.wv[word])
)

### Add mean of the three metrics

In [4]:
shared_vocabulary_df["mean"] = shared_vocabulary_df[["lncs2", "cosine", "intersection_nn"]].apply(
    lambda x: (x.lncs2 + x.cosine + x.intersection_nn) / 3, axis=1
)

### Add word count

In [5]:
shared_vocabulary_df["count_m1"] = shared_vocabulary_df["word"].apply(
    lambda word: model1.wv.vocab[word].count
)
shared_vocabulary_df["count_m2"] = shared_vocabulary_df["word"].apply(
    lambda word: model2.wv.vocab[word].count
)

### Save dataframe

In [6]:
shared_vocabulary_df.to_pickle("./shared_vocabulary_metrics.pkl")

### Words changed the most (by LNCS2)

In [7]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["lncs2"], ascending=True)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,mean,count_m1,count_m2
10916,pregnant,-0.059092,0.012,-0.001763,-0.016285,31,158
12134,unused,-0.059081,0.013,-0.073233,-0.039771,26,36
13989,scarce,-0.044323,0.01,0.072006,0.012561,329,61
4183,incline,-0.034136,0.012,0.17663,0.051498,114,25
11680,ml,-0.029649,0.002,-0.079436,-0.035695,1,18
21938,major,-0.016591,0.002,0.263108,0.082839,531,1554
12794,someday,0.002855,0.012,0.11585,0.043568,1,125
21775,mentally,0.016916,0.02,0.328319,0.121745,31,90
8043,tense,0.032603,0.016,0.138092,0.062232,11,139
23205,err,0.036362,0.026,0.169528,0.077297,70,11


### Words changed the less by LNCS

In [8]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["lncs2"], ascending=False)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,mean,count_m1,count_m2
8807,ten,0.998332,0.335,0.834361,0.722564,1140,1154
22068,eleven,0.998247,0.339,0.81345,0.716899,128,214
10773,fifteen,0.997878,0.332,0.80201,0.71063,326,350
20937,twenty,0.997811,0.329,0.830547,0.719119,791,536
14380,twelve,0.997662,0.33,0.791642,0.706435,503,340
23289,13,0.99707,0.304,0.669598,0.656889,41,363
7267,11,0.996741,0.286,0.628893,0.637211,49,526
1607,eight,0.996465,0.326,0.84865,0.723705,524,868
11522,12,0.995956,0.347,0.760477,0.701144,73,529
20191,twenty-five,0.995953,0.352,0.815984,0.721312,162,143


### Words change the most (by Intersection_NN)

In [10]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["intersection_nn"], ascending=True)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,mean,count_m1,count_m2
11297,virus,0.32396,0.0,-0.149479,0.05816,13,166
4252,coke,0.272922,0.001,0.071755,0.115226,12,88
19626,rusticity,0.613301,0.001,0.544417,0.386239,3,2
7493,clumsiness,0.552998,0.001,0.532715,0.362238,1,4
23735,funding,0.344045,0.001,0.151782,0.165609,3,119
7662,inclusive,0.339527,0.002,0.072305,0.137944,13,21
11680,ml,-0.029649,0.002,-0.079436,-0.035695,1,18
21938,major,-0.016591,0.002,0.263108,0.082839,531,1554
8815,rove,0.151496,0.002,0.07378,0.075758,48,19
25692,media,0.323505,0.002,0.092881,0.139462,49,394


### Words changed the less by Intersection_NN

In [11]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["intersection_nn"], ascending=False)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,mean,count_m1,count_m2
2920,convince,0.945782,0.486,0.781441,0.737741,476,450
248,believe,0.971778,0.478,0.758603,0.736127,2948,2618
12402,swing,0.971353,0.462,0.728393,0.720582,166,626
10888,deny,0.964949,0.454,0.691601,0.703517,682,414
13761,understand,0.974703,0.442,0.727781,0.714828,1475,1683
22884,find,0.928129,0.442,0.849056,0.739728,7549,6846
22111,grass,0.945075,0.44,0.797416,0.727497,378,483
14853,slope,0.930147,0.434,0.855519,0.739888,137,200
6467,trust,0.957087,0.434,0.670041,0.687043,1050,562
2298,trunk,0.95231,0.432,0.776542,0.720284,322,180


### Words change the most (by cosine)

In [12]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["cosine"], ascending=True)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,mean,count_m1,count_m2
3112,ski,0.287075,0.017,-0.166819,0.045752,2,218
11297,virus,0.32396,0.0,-0.149479,0.05816,13,166
19687,fer,0.259189,0.034,-0.132545,0.053548,4,47
1718,op,0.695739,0.026,-0.127812,0.197976,21,31
18580,wilmer,0.500615,0.05,-0.111061,0.146518,11,16
18557,some--,0.068811,0.036,-0.110117,-0.001769,2,6
19448,setting,0.200999,0.009,-0.109949,0.03335,1,62
12377,frightening,0.095688,0.008,-0.102361,0.000442,3,76
20480,gist,0.27267,0.006,-0.098656,0.060005,15,15
12836,cal,0.479762,0.02,-0.098061,0.1339,13,66


### Words changed the less by cosine

In [13]:
shared_vocabulary_df = shared_vocabulary_df.sort_values(by=["cosine"], ascending=False)
shared_vocabulary_df.head(n=30)

Unnamed: 0,word,lncs2,intersection_nn,cosine,mean,count_m1,count_m2
3063,an,0.458082,0.107,0.924624,0.496568,20796,22248
7649,who,0.809027,0.128,0.919161,0.618729,17530,16983
8330,than,0.405342,0.123,0.908293,0.478878,12372,10985
2103,new,0.464222,0.132,0.903266,0.499829,5012,10031
26782,more,0.790212,0.149,0.901612,0.613608,14909,14446
11294,far,0.791386,0.188,0.90084,0.626742,5100,3903
21851,whom,0.829928,0.218,0.900759,0.649562,3449,918
3912,from,0.57768,0.102,0.900104,0.526594,32523,29181
21599,eye,0.942752,0.33,0.898691,0.723814,6165,4951
22107,there,0.622997,0.166,0.898492,0.562496,16028,17609


In [27]:
gt = get_gt("english")
gt_vocabulary_df = shared_vocabulary_df[shared_vocabulary_df["word"].isin(gt[:, 0])]
gt_vocabulary_df["truth"] = gt_vocabulary_df["word"].apply(
    lambda x: gt[numpy.where(gt[:, 0] == x), 1][0,0]
)
gt_vocabulary_df = gt_vocabulary_df.sort_values(by=["truth"])
gt_vocabulary_df

Unnamed: 0,word,lncs2,intersection_nn,cosine,mean,count_m1,count_m2,truth
18743,face_nn,0.934642,0.355,0.891037,0.726893,3394,3932,0
8987,chairman_nn,0.889095,0.181,0.664473,0.578189,147,683,0
2399,pin_vb,0.915103,0.407,0.667831,0.663311,114,217,0
18327,lane_nn,0.892045,0.287,0.693115,0.624053,211,289,0
3698,ball_nn,0.897295,0.308,0.696024,0.633773,440,878,0
23330,quilt_nn,0.907541,0.329,0.700923,0.645821,106,189,0
6690,fiction_nn,0.887914,0.29,0.713829,0.630581,202,326,0
23902,contemplation_nn,0.93185,0.325,0.717873,0.658241,240,111,0
11939,risk_nn,0.792147,0.216,0.729821,0.579323,286,643,0
12712,twist_nn,0.820299,0.148,0.564152,0.510817,103,186,0
