In [1]:
from gensim.models import KeyedVectors
import numpy as np 
import pandas as pd
import random

In [2]:
# Load the pre-trained word vectors stored in binary word2vec format.
model = KeyedVectors.load_word2vec_format(
    fname="../embeddings/CoNLL17/english/model.bin",
    binary=True,
)

In [3]:
# Read the automatically translated lexicons and keep only the rows whose
# `lang` column equals "eng".
automatic_trans_lex = (
    pd.read_csv("./lexicons_pons.csv")
    .loc[lambda frame: frame["lang"] == "eng"]
)

In [4]:
# Read the manually translated lexicons.
manual_trans_lex = pd.read_csv("./manually_translated_lexicons.csv")

# Overwrite the `lex` label of rows 0 and 1. The labels are used as merge keys
# further down, so the spelling is kept exactly as is -- presumably it matches
# the spelling used in lexicons_pons.csv (TODO confirm).
manual_trans_lex.loc[0, "lex"] = "modalization"
manual_trans_lex.loc[1, "lex"] = "pressuposition"

### Automatic vs. manual translation distance (Word Mover's Distance)

In [5]:
# Pair every automatic lexicon with the manual lexicon that shares the same
# (lang, lex) key; `words_x` comes from the automatic frame, `words_y` from the
# manual one.
auto_man_df = pd.merge(
    automatic_trans_lex,
    manual_trans_lex,
    how="inner",
    on=["lang", "lex"],
)

# Word Mover's Distance between the two whitespace-tokenised word lists of
# each merged row.
auto_man_dist = auto_man_df.apply(
    lambda row: model.wmdistance(row["words_x"].split(), row["words_y"].split()),
    axis=1,
)

In [6]:
# Display the automatic-vs-manual distance computed for each merged row.
auto_man_dist

0    0.721713
1    0.874691
dtype: float64

### Manual vs. random lexicon distance (Word Mover's Distance)

#### Random lexicon generation

In [7]:

def _vocabulary_words(model):
    """Return the model's vocabulary as a list of words.

    Supports both gensim layouts: ``index_to_key`` (gensim >= 4) and the
    ``vocab`` dict (gensim < 4), found either on ``model`` itself or on
    ``model.wv``.
    """
    has_vocabulary = hasattr(model, "index_to_key") or hasattr(model, "vocab")
    keyed_vectors = model if has_vocabulary else model.wv
    if hasattr(keyed_vectors, "index_to_key"):
        return list(keyed_vectors.index_to_key)
    return list(keyed_vectors.vocab.keys())


def generate_rand_word(model, lex, rng=None):
    """Draw a random word list of matching length for every lexicon entry.

    Parameters
    ----------
    model
        Word-vector model whose vocabulary is sampled from.
    lex : pandas.Series of str
        Whitespace-separated word lists; only the number of tokens in each
        entry is used.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator, so ``random.seed(...)`` controls the result.

    Returns
    -------
    pandas.Series
        Same index as ``lex``; each value is a list of distinct vocabulary
        words with as many items as the corresponding entry has tokens.

    Raises
    ------
    ValueError
        If an entry has more tokens than the vocabulary has words.
    """
    # Materialise the vocabulary once: random.sample() needs a sequence
    # (dict views are rejected on Python >= 3.11), and rebuilding it for every
    # row would be wasteful on a large vocabulary.
    vocabulary = _vocabulary_words(model)
    sampler = random if rng is None else rng
    return lex.apply(lambda entry: sampler.sample(vocabulary, len(entry.split())))

#### Distance 

In [8]:
# Number of independent random lexicons used to estimate the baseline distance.
N_RANDOM_RUNS = 40
# Fixed seed so that Restart & Run All reproduces the same baseline.
RANDOM_SEED = 42

random.seed(RANDOM_SEED)

# dists[run][row] is the Word Mover's Distance between manual lexicon `row`
# and a random word list with the same number of tokens.
dists = []
for run_idx in range(N_RANDOM_RUNS):
    rand_lex = generate_rand_word(model, manual_trans_lex.words)
    manual_rand_dist = [
        model.wmdistance(manual_words.split(), random_words)
        for manual_words, random_words in zip(manual_trans_lex.words, rand_lex)
    ]
    dists.append(manual_rand_dist)

  


### Normalization Step 

In [9]:
# Stack the per-run distance lists into a 2-D array and average over the runs
# (axis 0), leaving one mean random-baseline distance per manual lexicon.
dists = np.array(dists)
mean_dist = dists.mean(axis=0)

In [10]:
# `auto_man_dist` follows the row order of the merged frame, while `mean_dist`
# follows the row order of `manual_trans_lex`. The division below pairs them
# purely by position, so make sure both refer to the same lexicons in the same
# order before normalising.
assert auto_man_df["lex"].tolist() == manual_trans_lex["lex"].tolist(), (
    "auto_man_dist and mean_dist are not aligned: the merged lexicons differ "
    "from manual_trans_lex in content or order"
)

# Similarity = 1 - (automatic-vs-manual distance / mean manual-vs-random distance).
sim = 1 - (auto_man_dist / mean_dist)

In [11]:
# Collect the per-lexicon results in one table and persist it as CSV.
info_df = pd.DataFrame(
    {
        "lex": manual_trans_lex["lex"],
        "auto_man": auto_man_dist,
        "mean_dist": mean_dist,
        "similarity": sim,
    }
)
info_df.to_csv("sim_df.csv")