In [1]:
import csv
import random

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

In [2]:
# Pretrained word2vec vectors in binary format; every distance and
# nearest-neighbour lookup in this notebook goes through this model.
embeddings_path = "../embeddings/CoNLL17/english/model.bin"
model = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

## Lexica Import

In [3]:
# Read every lexicon in the file, then keep only the rows whose `lang` is "eng".
lexicons_pons = pd.read_csv("./lexicons_pons.csv")
automatic_trans_lex = lexicons_pons[lexicons_pons["lang"] == "eng"]

In [68]:
manual_trans_lex = pd.read_csv("./manually_translated_lexicons.csv")
# Overwrite the lexicon name of rows 0 and 1.
# NOTE(review): presumably so the names match those in lexicons_pons.csv for the
# later merge on ["lang", "lex"] -- confirm against the CSV contents.
for row_label, lex_name in ((0, "modalization"), (1, "pressuposition")):
    manual_trans_lex.at[row_label, 'lex'] = lex_name

In [5]:
# Space-separated annotation file without a header row; column names are supplied here.
mpqa_annotations = pd.read_csv(
    "./lexica_similarity/automatic_x_mpqa/subjectivity_sense_annotations/subjectivity_sense_annotations/goldstandard.connl11/CoNNL11SenseAnnotations.txt",
    sep=" ",
    names=["word", "POS", "sense_key", "subj_level"],
)

# Only the distinct values of the `word` column are used further down.
mpqa = mpqa_annotations["word"].unique()

## WMD between lexica

In [7]:
import numpy as np

def generate_rand_lex(lex_name, lex, n_samples=5):
    """Mean Word Mover's Distance between a lexicon and random lexica of equal size.

    Parameters
    ----------
    lex_name : str
        Label of the lexicon. Not used in the computation; kept so existing
        call sites keep working.
    lex : str
        Whitespace-separated words of the reference lexicon.
    n_samples : int, default 5
        Number of random lexica to draw from the model vocabulary.

    Returns
    -------
    float
        Mean of the ``n_samples`` distances (``pandas.Series.mean``).

    Notes
    -----
    Reads the module-level ``model``. Words are drawn with the global
    ``random`` state, so results differ between runs unless it is seeded.
    """
    lex_words = lex.split()

    # Materialise the vocabulary once: random.sample needs a real sequence
    # (dict key views are rejected by recent Python versions), and rebuilding
    # the list for every draw would repeat a full pass over the vocabulary.
    # `model.vocab` is the same mapping the deprecated `model.wv.vocab` exposed.
    vocabulary = list(model.vocab)

    # Each random lexicon has exactly as many words as the reference lexicon.
    random_lexica = [random.sample(vocabulary, len(lex_words)) for _ in range(n_samples)]

    # Distance from the reference lexicon to each random lexicon.
    distances = pd.Series(
        [model.wmdistance(lex_words, rand_words) for rand_words in random_lexica],
        dtype=float,
    )
    return distances.mean()

In [8]:
### Generate Most Similar lexicon 
def most_similar_lex(lex):
    """Replace every in-vocabulary word of a lexicon with its nearest neighbour.

    `lex` is a whitespace-separated string. Words missing from `model.vocab`
    are dropped, so the returned list can be shorter than the input.
    Reads the module-level `model`.
    """
    return [
        model.most_similar(token, topn=1)[0][0]  # word of the single top hit
        for token in lex.split()
        if token in model.vocab
    ]

### Generate Most Similar lexicon 

In [9]:
# Nearest-neighbour version of the lexicon stored in row 0 of manual_trans_lex.
mod_source_words = manual_trans_lex["words"][0]
mod_most_sim_lex = most_similar_lex(mod_source_words)

In [10]:
# Nearest-neighbour version of the lexicon stored in row 1 of manual_trans_lex.
pre_source_words = manual_trans_lex["words"][1]
pre_most_sim_lex = most_similar_lex(pre_source_words)


## Open output file 

In [11]:
# Output CSV for the semantic (WMD) distances. The handle stays open across
# the following cells and is closed once the last row has been written.
semantic_header = "lex,type_1,type_2,dist"+"\n"
f = open("lexica_dist_semantic_a.csv", mode="w")
f.write(semantic_header)

23

### Auto x Random  

In [12]:
import random
# Mean WMD between the row-0 lexicon and random lexica of the same size.
# NOTE(review): the section and the CSV row are labelled "auto", but the words
# come from manual_trans_lex -- confirm which lexicon was intended.
mod_auto_rand_dist = generate_rand_lex(
    lex_name=manual_trans_lex.lex[0],
    lex=manual_trans_lex.words[0],
)

  import sys


In [13]:
# One CSV row: lex=mod, type_1=auto, type_2=random, dist=<mean WMD>.
mod_auto_rand_row = "mod,auto,random," + str(mod_auto_rand_dist) + "\n"
f.write(mod_auto_rand_row)

35

In [14]:
# Mean WMD between the row-1 lexicon and random lexica of the same size.
# NOTE(review): the section and the CSV row are labelled "auto", but the words
# come from manual_trans_lex -- confirm which lexicon was intended.
pre_auto_rand_dist = generate_rand_lex(
    lex_name=manual_trans_lex.lex[1],
    lex=manual_trans_lex.words[1],
)

  import sys


In [15]:
# One CSV row: lex=pre, type_1=auto, type_2=random, dist=<mean WMD>.
pre_auto_rand_row = "pre,auto,random," + str(pre_auto_rand_dist) + "\n"
f.write(pre_auto_rand_row)

34

### Auto x Manual

In [20]:
# Pair each automatic lexicon with the manual lexicon sharing its (lang, lex).
# After the merge, `words_x` holds the automatic words (left frame) and
# `words_y` the manual words (right frame).
auto_man_df = pd.merge(automatic_trans_lex, manual_trans_lex, how="inner", on=["lang", "lex"])


def _auto_vs_manual_wmd(row):
    """WMD between the automatic and the manual word lists of one merged row."""
    return model.wmdistance(row.words_x.split(), row.words_y.split())


auto_man_dist = auto_man_df.apply(_auto_vs_manual_wmd, axis=1)

In [21]:
# Display the WMD between the automatic (words_x) and manual (words_y) lexica, one value per row of auto_man_df.
auto_man_dist

0    0.721713
1    0.874691
dtype: float64

In [22]:
# Rows 0 and 1 of auto_man_dist are written under the labels "mod" and "pre".
mod_auto_manual_row = "mod,auto,manual," + str(auto_man_dist[0]) + "\n"
f.write(mod_auto_manual_row)
pre_auto_manual_row = "pre,auto,manual," + str(auto_man_dist[1]) + "\n"
f.write(pre_auto_manual_row)

35

### Auto x Most Similar to Manual 

In [23]:
# wmdistance takes each document as a list of tokens. `words_x` is a
# whitespace-separated string (it is split the same way where auto_man_dist is
# computed), so it must be split here too; passed as a raw string it would be
# iterated character by character.
mod_auto_sim_dist = model.wmdistance(auto_man_df.words_x[0].split(), mod_most_sim_lex)

In [24]:
# One CSV row: lex=mod, type_1=auto, type_2=sim_to_manual.
mod_auto_sim_row = "mod,auto,sim_to_manual," + str(mod_auto_sim_dist) + "\n"
f.write(mod_auto_sim_row)

41

In [25]:
# wmdistance takes each document as a list of tokens. `words_x` is a
# whitespace-separated string (it is split the same way where auto_man_dist is
# computed), so it must be split here too; passed as a raw string it would be
# iterated character by character.
pre_auto_sim_dist = model.wmdistance(auto_man_df.words_x[1].split(), pre_most_sim_lex)

In [26]:
# One CSV row: lex=pre, type_1=auto, type_2=sim_to_manual.
pre_auto_sim_row = "pre,auto,sim_to_manual," + str(pre_auto_sim_dist) + "\n"
f.write(pre_auto_sim_row)

42

### Manual x Most Similar to Manual 

In [27]:
# wmdistance takes each document as a list of tokens. `words_y` is a
# whitespace-separated string (it is split the same way where auto_man_dist is
# computed), so it must be split here too; passed as a raw string it would be
# iterated character by character.
mod_manual_sim_dist = model.wmdistance(auto_man_df.words_y[0].split(), mod_most_sim_lex)

In [28]:
# One CSV row: lex=mod, type_1=manual, type_2=sim_to_manual.
mod_manual_sim_row = "mod,manual,sim_to_manual," + str(mod_manual_sim_dist) + "\n"
f.write(mod_manual_sim_row)

41

In [29]:
# wmdistance takes each document as a list of tokens. `words_y` is a
# whitespace-separated string (it is split the same way where auto_man_dist is
# computed), so it must be split here too; passed as a raw string it would be
# iterated character by character.
pre_manual_sim_dist = model.wmdistance(auto_man_df.words_y[1].split(), pre_most_sim_lex)

In [30]:
# Terminate the row like every other row written to this file, so the CSV ends
# with a newline and anything appended later starts on its own line.
f.write("pre,manual,sim_to_manual,"+str(pre_manual_sim_dist)+"\n")

43

### Auto x MPQA

In [31]:
def _wmd_to_mpqa(row):
    """WMD between one row's word list and the whole `mpqa` word array."""
    return model.wmdistance(row.words.split(), mpqa)


# One distance per row of automatic_trans_lex (the original row index is kept).
auto_mpqa_dist = automatic_trans_lex.apply(_wmd_to_mpqa, axis=1)

In [32]:
# Display the WMD of each automatic_trans_lex row against the `mpqa` word array.
auto_mpqa_dist

5    2.652556
6    2.555783
7    2.656125
8    2.841752
9    2.562749
dtype: float64

In [33]:
# Flush and release the handle opened on lexica_dist_semantic_a.csv.
f.close()

## Syntactic Similarity between Lexica

In [34]:
def n_similar_items(a, b) -> tuple:
    """Count how many items of `a` also occur in `b`.

    Returns a 3-tuple ``(n_similar, n_diff, not_similar)``:
    the number of items of `a` found in `b`, the number not found, and the
    list of those missing items (in the order they appear in `a`, duplicates
    included).
    """
    missing_from_b = []
    for element in a:
        if element not in b:
            missing_from_b.append(element)

    missing_count = len(missing_from_b)
    shared_count = len(a) - missing_count
    return (shared_count, missing_count, missing_from_b)
    
def _word_overlap(row):
    """Overlap of one merged row's automatic words (words_x) with its manual words (words_y)."""
    return n_similar_items(row.words_x.split(), row.words_y.split())


# One (n_similar, n_diff, not_similar) tuple per row of auto_man_df.
sin_dist = auto_man_df.apply(_word_overlap, axis=1)

In [35]:
# `f` is kept bound and open on purpose: a later cell still uses this handle.
f = open("lexica_dist_sintatic_a.csv", "w")

# The distinct_words field is the repr of a Python list and contains commas.
# Writing it through csv.writer quotes it; plain string concatenation would
# spill it over extra columns and break the six-column layout of the header.
sintatic_writer = csv.writer(f, lineterminator="\n")
sintatic_writer.writerow(["lex", "type_1", "type_2", "n_in_common", "n_distinct", "distinct_words"])
sintatic_writer.writerow(["mod", "auto", "man", sin_dist[0][0], sin_dist[0][1], str(sin_dist[0][2])])
sintatic_writer.writerow(["pre", "auto", "man", sin_dist[1][0], sin_dist[1][1], str(sin_dist[1][2])])

239

### WMD Change by word

How WMD changes when we replace one word from the manual lexicon with a random word

In [168]:
def generate_sim_word(word_idx, lex, model):
    """Return the top `most_similar` word for each selected position of `lex`.

    `word_idx` is an iterable of positions into the token list `lex`; the
    result has one word per position, in the same order.
    """
    neighbours = []
    for position in word_idx:
        top_hit = model.most_similar(lex[position], topn=1)[0]
        neighbours.append(top_hit[0])  # first element of the hit is the word
    return neighbours

def generate_rand_word(model):
    """Return a list of two distinct words drawn at random from the model vocabulary.

    The vocabulary is copied into a list first because random.sample needs a
    sequence (dict key views are rejected by recent Python versions).
    `model.vocab` is read directly instead of going through the deprecated
    `model.wv` alias.
    """
    return random.sample(list(model.vocab), 2)

def replace_and_compute_wmd(lex, idxs, words):
    """WMD between `lex` and a copy of it with some positions replaced.

    `idxs` and `words` are paired up with zip, so only as many substitutions
    as the shorter of the two are applied. `lex` itself is left untouched.
    Reads the module-level `model`.
    """
    altered = [*lex]
    for position, replacement in zip(idxs, words):
        altered[position] = replacement
    return model.wmdistance(altered, lex)



lex_dists = []
n_lexica = manual_trans_lex.shape[0]
for row_label in range(n_lexica):
    tokens = manual_trans_lex.words[row_label].split()
    # Pick a single random position to overwrite.
    target_positions = random.sample(range(len(tokens)), 1)
    # Only one position was picked, so replace_and_compute_wmd (which zips
    # positions with words) uses just the first candidate word.
    candidate_words = generate_rand_word(model)
    #candidate_words = generate_sim_word(target_positions, tokens, model)
    distance = replace_and_compute_wmd(tokens, target_positions, candidate_words)
    print(distance)

0.0266016307853599
0.03234575380657737


In [171]:
# Draw two random vocabulary words. The keys are copied into a list because
# random.sample needs a sequence, and `model.vocab` is read directly instead
# of going through the deprecated `model.wv` alias.
random.sample(list(model.vocab), 2)

  """Entry point for launching an IPython kernel.


['162007..', 'cronberry']

In [36]:
# Passed as `topn` to model.most_similar in generate_sim_word, which keeps only the first hit.
NUM_GENERATED_WORDS = 1

def replace_and_compute_wmd(lex, idx, word):
    """WMD between `lex` and a copy of it whose item at `idx` is `word`.

    `lex` itself is not modified. Reads the module-level `model`.
    """
    altered = [*lex]
    altered[idx] = word
    return model.wmdistance(altered, lex)

def generate_rand_word(model):
    """Return one word drawn at random from the model vocabulary.

    The vocabulary is copied into a list first because random.sample needs a
    sequence (dict key views are rejected by recent Python versions).
    `model.vocab` is read directly instead of going through the deprecated
    `model.wv` alias.
    """
    return random.sample(list(model.vocab), 1)[0]

def generate_sim_word(word, model):
    """Return the first word that `model.most_similar` reports for `word`.

    NUM_GENERATED_WORDS hits are requested, but only the first one is used.
    """
    hits = model.most_similar(word, topn=NUM_GENERATED_WORDS)
    first_hit = hits[0]
    return first_hit[0]  # the word itself, without the rest of the hit

# lex_dists[k] collects, for lexicon k, one list of distances per in-vocabulary word.
lex_dists = []
for row_label in range(manual_trans_lex.shape[0]):
    tokens = manual_trans_lex.words[row_label].split()
    per_word_dists = []
    for token in tokens:
        if token not in model.vocab:
            continue  # no neighbour can be looked up for out-of-vocabulary words
        neighbour = generate_sim_word(token, model)
        # The neighbour of `token` is substituted at every position of the
        # lexicon in turn, not only at `token`'s own position.
        # NOTE(review): confirm this is intended rather than replacing only
        # the word the neighbour was generated from.
        per_word_dists.append(
            [replace_and_compute_wmd(tokens, position, neighbour) for position in range(len(tokens))]
        )
    lex_dists.append(per_word_dists)


In [37]:
def semantic_agreement_level(n_words, one_word_dist, total_dist):
    """Share of a lexicon's words counted as semantically unchanged.

    `total_dist / one_word_dist` expresses the total distance as a number of
    single-word changes; that count is subtracted from `n_words` and the
    remainder is divided by `n_words`.
    """
    changed_word_equivalents = total_dist / one_word_dist
    unchanged_words = n_words - changed_word_equivalents
    return unchanged_words / n_words

In [43]:
# NOTE(review): `mean_dists_1` is not assigned anywhere in the visible notebook, and this
# cell's execution count is out of order; on a fresh top-to-bottom run it would raise
# NameError unless the name is defined elsewhere -- confirm or remove.
mean_dists_1

[0.05631870773955988, 0.05511815628227828]

In [39]:
# Average single-word-change distance per lexicon (mean over all collected distances).
mean_dists = list(map(np.mean, lex_dists))
# Number of words in each manual lexicon of the merged frame.
lex_sizes = auto_man_df["words_y"].map(lambda text: len(text.split()))

semantic_agreement = semantic_agreement_level(
    n_words=lex_sizes,
    one_word_dist=mean_dists,
    total_dist=auto_man_dist,
)

In [40]:
# Display the agreement score computed by semantic_agreement_level, one value per row of auto_man_df.
semantic_agreement

0    0.762689
1    0.706123
dtype: float64

In [31]:
# In top-to-bottom order `f` is the handle opened on lexica_dist_sintatic_a.csv
# (six-column header), but these rows follow the four-column layout
# lex,type_1,type_2,dist of lexica_dist_semantic_a.csv. Release the handle that
# is still open and append the rows to the semantic CSV instead.
f.close()

semantic_csv_path = "lexica_dist_semantic_a.csv"
with open(semantic_csv_path) as existing_rows:
    existing_text = existing_rows.read()
with open(semantic_csv_path, "a") as semantic_out:
    if existing_text and not existing_text.endswith("\n"):
        # The last row already in the file is unterminated; start a new line
        # so the appended rows are not glued onto it.
        semantic_out.write("\n")
    semantic_out.write("mod,manual,manual_with_single_word_change,"+str(semantic_agreement[0])+"\n")
    semantic_out.write("pre,manual,manual_with_single_word_change,"+str(semantic_agreement[1])+"\n")