In [25]:
import numpy as np
from numpy.linalg import norm
import json
from scipy import spatial



def word_vector(emb):
    """Average a stack of embeddings into a single vector.

    `emb` is converted with ``np.array`` and reduced with a mean along
    axis 0, so a sequence of equal-length vectors yields their
    element-wise mean.
    """
    return np.array(emb).mean(axis=0)

def cos_sim(vect_a: np.ndarray, vect_b: np.ndarray):
    """Return the cosine similarity of two 1-D vectors.

    The result is the dot product of the vectors divided by the product
    of their norms (``numpy.linalg.norm``).

    Args:
        vect_a: First vector; an ndarray or any array-like (e.g. a list).
        vect_b: Second vector, same length as ``vect_a``.

    Returns:
        The similarity as a NumPy scalar. If either vector has zero norm
        the division is 0/0 and the result is ``nan`` (NumPy emits a
        runtime warning rather than raising).
    """
    # Coerce so that plain Python sequences work with the `@` operator;
    # ndarrays pass through unchanged.
    vect_a = np.asarray(vect_a)
    vect_b = np.asarray(vect_b)
    return (vect_a @ vect_b) / (norm(vect_a) * norm(vect_b))



def get_similarity(word1, word2, year, embeddings_dir='../embeddings'):
    """Cosine similarity between the averaged embeddings of two words.

    Reads ``<embeddings_dir>/embeddings_<year>.json`` line by line, parsing
    each non-blank line as one JSON object. Records are matched on their
    ``'word'`` key, and the value under ``'embeddings'`` is averaged with
    ``word_vector`` before the two averages are compared with ``cos_sim``.
    If a word occurs on several lines, the last matching line is used.

    Args:
        word1: First word to look up.
        word2: Second word to look up.
        year: Year used to build the embeddings file name.
        embeddings_dir: Directory holding the ``embeddings_<year>.json``
            files. Defaults to ``'../embeddings'``.

    Returns:
        The cosine similarity of the two averaged embedding vectors.

    Raises:
        KeyError: If either word has no record in the file.
    """
    path = f"{embeddings_dir}/embeddings_{year}.json"
    targets = (word1, word2)

    # Stream the file and keep only the two requested words instead of
    # materialising every record in memory.
    found = {}
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            record = json.loads(line)
            if record['word'] in targets:
                found[record['word']] = record['embeddings']

    missing = [w for w in targets if w not in found]
    if missing:
        raise KeyError(f"no embeddings found for {missing} in {path}")

    return cos_sim(word_vector(found[word1]), word_vector(found[word2]))


    

In [26]:
# Spot check: similarity of 'administration' and 'white' for the year 1980.
get_similarity(word1='administration', word2='white', year=1980)

0.6134349271143078

In [27]:
# Years to evaluate, in chronological order.
years = [
    1980, 1985, 1987, 1989, 1990, 1992, 1995,
    2000, 2001, 2005, 2008, 2010, 2015,
]

In [28]:
# Print the 'white' / 'aids' similarity for each year, one line per year.
for year in years:
    sim_score = get_similarity('white', 'aids', year)
    print(year, sim_score)


1980 0.5335846593259473
1985 0.4138146756371786
1987 0.4301887550207626
1989 0.4595541145347521
1990 0.4760644726401668
1992 0.49492303365549833
1995 0.47107655606801113
2000 0.5062048156906076
2001 0.46654952710055386
2005 0.44585490400833594
2008 0.4883235276633935
2010 0.47534109225012744
2015 0.5224337567080692


In [29]:
# Print the 'wall' / 'cold' similarity for each year, one line per year.
for year in years:
    sim_score = get_similarity('wall', 'cold', year)
    print(year, sim_score)


1980 0.5678930864678454
1985 0.500025749016114
1987 0.5220442348009298
1989 0.5312158112787152
1990 0.5601013457768088
1992 0.5407377375144511
1995 0.5458151992101112
2000 0.5514113459383099
2001 0.5534045214571172
2005 0.5506118139311664
2008 0.537457585798878
2010 0.524317455113604
2015 0.5597489430697016


In [30]:
# Read the target words, one per line, stripping surrounding whitespace.
with open('../data/target_words/polysemous.txt', 'r') as f:
    words = [line.strip() for line in f]

print(words)

['abuse', 'administration', 'cold', 'wall', 'aids', 'authority', 'battle', 'black', 'campaign', 'lives', 'head', 'march', 'public', 'release', 'screen', 'shot', 'war', 'union', 'white']
