# Analyzing diachronic embeddings

In [10]:
import pickle
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr

In [2]:
MODEL_DIR = "./models/word2vec"

# Original hyperparameters, from 1890-1990
with open(f'{MODEL_DIR}/word2vec_a_1890-1990_diachronic.pickle', 'rb') as f:
    d = pickle.load(f)

In [3]:
words = d["w"] #['time', 'man', 'years', 'part', 'way', 'life', 'people',...]
words

['olema',
 'ja',
 'tema',
 'see',
 'mina',
 'ei',
 'et',
 'mis',
 'kui',
 'ka',
 'oma',
 'aga',
 'saama',
 'sina',
 'ning',
 'siis',
 'kes',
 'või',
 'tulema',
 'ise',
 'pidama',
 'aasta',
 'üks',
 'nii',
 'kõik',
 'võima',
 'tegema',
 'teine',
 'nagu',
 'minema',
 'ütlema',
 'eesti',
 'veel',
 'aeg',
 'jääma',
 'võtma',
 'juba',
 'mees',
 'inimene',
 'kuid',
 'hakkama',
 'teadma',
 'kas',
 'suur',
 'miski',
 'välja',
 'ära',
 'nägema',
 'nüüd',
 'andma',
 'üle',
 'tahtma',
 'mitte',
 'palju',
 'mõni',
 'töö',
 'vaid',
 'ainult',
 'kord',
 'uus',
 'küll',
 'vastu',
 'kus',
 'pärast',
 'kaks',
 'sest',
 'naine',
 'vaatama',
 'käsi',
 'enam',
 'esimene',
 'iga',
 'panema',
 'päev',
 'eest',
 'ju',
 'siin',
 'peale',
 'ega',
 'rääkima',
 'asi',
 'keegi',
 'mõtlema',
 'tundma',
 'käima',
 'kuidas',
 'väga',
 'silm',
 'jõudma',
 'elu',
 'tagasi',
 'hea',
 'sõna',
 'seal',
 'kõige',
 'läbi',
 'laps',
 'ikka',
 'kogu',
 'viimane',
 'mingi',
 'leidma',
 'maa',
 'arvama',
 'tallinn',
 'ette',
 

In [4]:
decades = d["d"] # [1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990]
decades

[1890, 1900, 1910, 1930, 1950, 1960, 1970, 1980, 1990]

In [5]:
vectors = np.asarray(d["E"])
vectors.shape

(1232, 9, 200)

In [8]:
def largest_change(start, end):
    """
    In detecting semantic change, we are looking for words, whose cosine similarity is as small as possible
    between two aligned time periods to find words which have as different contexts as possible and thus may have gone
    through semantic change. Therefore, because for the cosine distance we subtract the cosine similarity from one,
    we are looking for the highest distance (in other words, two words which vectors are as distant as possible)
    """
    start_index = decades.index(start)
    end_index = decades.index(end)
    res = []

    for i in range(len(vectors)):
        word = words[i]
        vector_first = vectors[i][start_index]
        vector_last = vectors[i][end_index]
        dist = cosine(vector_first, vector_last) #equals 1 - cosine similarity
        res.append((word, round(dist, 2)))
    return res

In [9]:
method1 = sorted(largest_change(1890, 1990), key=lambda x: -x[1])

print("Top-20 most changed words", method1[:20])
print("Top-20 least changed words", method1[-20:])

Top-20 most changed words [('pank', 1.08), ('saksa', 1.05), ('tallinn', 1.04), ('ühendus', 1.04), ('poolt', 1.03), ('', 1.02), ('tartu', 1.01), ('näitus', 1.01), ('seadus', 1.0), ('praegune', 1.0), ('riik', 0.99), ('linn', 0.99), ('valitsus', 0.99), ('punkt', 0.99), ('turg', 0.99), ('avalik', 0.99), ('juhatus', 0.99), ('nimekiri', 0.99), ('kuuluma', 0.98), ('ettevõte', 0.98)]
Top-20 least changed words [('paljas', 0.42), ('käes', 0.42), ('peksma', 0.42), ('niiviisi', 0.42), ('raiuma', 0.42), ('rahvahulk', 0.42), ('pisike', 0.41), ('viimaks', 0.41), ('sosistama', 0.41), ('ohkama', 0.41), ('lill', 0.4), ('korraks', 0.4), ('otsas', 0.4), ('kurk', 0.4), ('lõbus', 0.4), ('kuulatama', 0.4), ('poisike', 0.38), ('tasa', 0.38), ('vihaselt', 0.38), ('soo', 0.35)]


In [37]:
for pair in (method1[:21]):
    if pair[0] != "":
        print(pair[0], pair[1])

pank 1.08
saksa 1.05
tallinn 1.04
ühendus 1.04
poolt 1.03
tartu 1.01
näitus 1.01
seadus 1.0
praegune 1.0
riik 0.99
linn 0.99
valitsus 0.99
punkt 0.99
turg 0.99
avalik 0.99
juhatus 0.99
nimekiri 0.99
kuuluma 0.98
ettevõte 0.98
arv 0.98


In [62]:
# Method 2: Summing the cosine distances over two periods next to each other
def sum_distances():
    res = defaultdict(float)

    for i in range(len(vectors)):

        word = words[i]

        for j in range(len(vectors[i])-1):

            vector_j = vectors[i][j] # i-th word's j-th period
            vector_j_next = vectors[i][j+1] # i-th word's j+1-th period
            dist = cosine(vector_j, vector_j_next)
            res[word] += dist
    return res

In [23]:
method2 = sorted(sum_distances().items(), key=lambda kv: kv[1])

print("Top-20 most changed words", method2[-20:])
print("Top-20 least changed words", method2[:20])

Top-20 most changed words [('koosolek', 6.865782659500837), ('rahvas', 6.866986304521561), ('juhatus', 6.8672575410455465), ('ehitamine', 6.885198380798101), ('otsus', 6.897328887440381), ('ülesanne', 6.93874919693917), ('uus', 6.952172493853141), ('tegevus', 6.955495705828071), ('poolt', 6.963481940329075), ('käesolev', 6.975842222571373), ('eesti', 6.988644644618034), ('eest', 7.038449911400676), ('arv', 7.090138132101856), ('miljon', 7.101421665400267), ('liige', 7.145875573158264), ('valitsus', 7.150271609425545), ('aasta', 7.203878371044993), ('riik', 7.22100087814033), ('osa', 7.322705645114183), ('', 7.929676614701748)]
Top-20 least changed words [('eksima', 3.0748135149478912), ('häbi', 3.0995177924633026), ('vähegi', 3.112474024295807), ('lahke', 3.1235561668872833), ('niipea', 3.1407135128974915), ('kõlbama', 3.1852188110351562), ('pärima', 3.228432685136795), ('kippuma', 3.2297322750091553), ('ükskõik', 3.2328240871429443), ('jällegi', 3.2359923720359802), ('rõõmustama', 3.2

In [42]:
for pair in reversed(method2[-21:]):
    if pair[0] != "":
        print(pair[0], round(pair[1], 2))

osa 7.32
riik 7.22
aasta 7.2
valitsus 7.15
liige 7.15
miljon 7.1
arv 7.09
eest 7.04
eesti 6.99
käesolev 6.98
poolt 6.96
tegevus 6.96
uus 6.95
ülesanne 6.94
otsus 6.9
ehitamine 6.89
juhatus 6.87
rahvas 6.87
koosolek 6.87
asutus 6.8


In [69]:
# Method 3: Finding the nearest words (using cosine distance) for all of the timeperiods and finding for which words the change is most significant
distances = defaultdict(float) #top-down dynamic programming
def find_nearest_neighbours(word, vector, period_ind, topk=10):
    """Finds top-k nearest neighbour words for a given word, its vector and time period index."""
    res = []
    for i in range(len(vectors)):
        comparable_word = words[i]
        if comparable_word == word: #don't calculate word distance with itself
            continue
        comparable_vector = vectors[i][period_ind]
        if (word, comparable_word, period_ind) in distances:
            dist = distances[(word, comparable_word, period_ind)]
        elif (comparable_word, word, period_ind) in distances:
            dist = distances[(comparable_word, word, period_ind)]
        else:
            dist = cosine(vector, comparable_vector)
            distances[(word, comparable_word, period_ind)] = dist
        res.append((comparable_word, dist))
    return sorted(res, key=lambda x: x[1])[:topk]

def count_common_neighbours(n1, n2):
    return len(list(set([pair[0] for pair in n1]).intersection([pair[0] for pair in n2])))

def find_neighbours():
    res = defaultdict(int)

    for i in tqdm(range(len(vectors))): #loop over all words

        word = words[i] #get the currently observed word

        for j in range(len(vectors[i])-1): #loop over all time periods

            vector_j = vectors[i][j] # i-th word's j-th period
            nearest_neighbours = find_nearest_neighbours(word, vector_j, j)
            nearest_neighbours_next = find_nearest_neighbours(word, vector_j, j+1)
            common_count = count_common_neighbours(nearest_neighbours, nearest_neighbours_next)
            res[word] += common_count
    return res

In [26]:
method3 = sorted(find_neighbours().items(), key=lambda kv: kv[1])

print("Top-20 most changed words", method3[:20])
print("Top-20 least changed words", method3[-20:])

Top-20 most changed words [('ka', 0), ('või', 0), ('tulema', 0), ('aasta', 0), ('veel', 0), ('jääma', 0), ('andma', 0), ('üle', 0), ('töö', 0), ('vastu', 0), ('vaatama', 0), ('eest', 0), ('väga', 0), ('silm', 0), ('seal', 0), ('läbi', 0), ('osa', 0), ('seisma', 0), ('riik', 0), ('linn', 0)]
Top-20 least changed words [('kaaluma', 7), ('kasulik', 7), ('peatama', 7), ('aleksander', 7), ('vähegi', 7), ('viimne', 7), ('kutse', 7), ('kahekordne', 7), ('jah', 8), ('kartma', 8), ('kurk', 8), ('usaldus', 8), ('niipea', 8), ('post', 9), ('ohkama', 10), ('võrk', 10), ('rahule', 11), ('meelitama', 12), ('kevadine', 12), ('kuulsus', 13)]


In [27]:
method3_max = max(res.values())
method3_max

13

In [28]:
method3_normalized = [(w, method3_max-v) for (w,v) in method3]

In [49]:
for pair in reversed(method3[-20:]):
    print(pair[0], round(pair[1], 2))

kuulsus 0
kevadine 1
meelitama 1
rahule 2
võrk 3
ohkama 3
post 4
niipea 5
usaldus 5
kurk 5
kartma 5
jah 5
kahekordne 6
kutse 6
viimne 6
vähegi 6
aleksander 6
peatama 6
kasulik 6
kaaluma 6


In [30]:
method3 = method3_normalized

In [50]:
print(pearsonr([p[1] for p in sorted(method1)], [p[1] for p in sorted(method1)]))
print(pearsonr([p[1] for p in sorted(method1)], [p[1] for p in sorted(method2)]))
print(pearsonr([p[1] for p in sorted(method1)], [p[1] for p in sorted(method3)]))
print(pearsonr([p[1] for p in sorted(method2)], [p[1] for p in sorted(method1)]))
print(pearsonr([p[1] for p in sorted(method2)], [p[1] for p in sorted(method2)]))
print(pearsonr([p[1] for p in sorted(method2)], [p[1] for p in sorted(method3)]))
print(pearsonr([p[1] for p in sorted(method3)], [p[1] for p in sorted(method1)]))
print(pearsonr([p[1] for p in sorted(method3)], [p[1] for p in sorted(method2)]))
print(pearsonr([p[1] for p in sorted(method3)], [p[1] for p in sorted(method3)]))

(1.0, 0.0)
(0.7114744188849584, 1.0978923273780035e-190)
(0.16840911182446835, 2.719961633568773e-09)
(0.7114744188849584, 1.0978923273780035e-190)
(1.0, 0.0)
(0.3068743159119467, 2.8170059966349444e-28)
(0.16840911182446835, 2.719961633568773e-09)
(0.3068743159119467, 2.8170059966349444e-28)
(1.0, 0.0)


# PPMI + SVD

In [52]:
MODEL_DIR = "./models/svd"

# Original hyperparameters, from 1890-1990
with open(f'{MODEL_DIR}/svd_a_1890-1990_diachronic.pickle', 'rb') as f:
    d = pickle.load(f)

In [53]:
words = d["w"]
decades = d["d"]
vectors = np.asarray(d["E"])
vectors.shape

(1844, 9, 200)

In [57]:
method1 = sorted(largest_change(1930, 1990), key=lambda x: -x[1])

print("Top-20 most changed words", method1[:20])
print("Top-20 least changed words", method1[-20:])

Top-20 most changed words [('ja', 1.75), ('kui', 1.75), ('olema', 1.74), ('tema', 1.51), ('oma', 1.51), ('ning', 1.36), ('et', 1.33), ('see', 1.24), ('muu', 1.23), ('korraldamine', 1.23), ('siis', 1.22), ('juba', 1.22), ('tasu', 1.22), ('miski', 1.21), ('riik', 1.21), ('süütama', 1.19), ('vanker', 1.18), ('kuues', 1.18), ('kinni', 1.17), ('ringi', 1.17)]
Top-20 least changed words [('vihm', 0.83), ('viiskümmend', 0.83), ('soojus', 0.83), ('mõni', 0.82), ('tähtsus', 0.82), ('parandama', 0.82), ('karistama', 0.82), ('tõrjuma', 0.82), ('kurk', 0.82), ('kurjategija', 0.82), ('rahutus', 0.82), ('puu', 0.81), ('sügisene', 0.81), ('veel', 0.8), ('mai', 0.8), ('päästma', 0.8), ('tormama', 0.79), ('amet', 0.78), ('puhas', 0.77), ('jääma', 0.71)]


In [68]:
for pair in (method1[:21]):
    if pair[0] != "":
        print(pair[0], pair[1])

ja 1.75
kui 1.75
olema 1.74
tema 1.51
oma 1.51
ning 1.36
et 1.33
see 1.24
muu 1.23
korraldamine 1.23
siis 1.22
juba 1.22
tasu 1.22
miski 1.21
riik 1.21
süütama 1.19
vanker 1.18
kuues 1.18
kinni 1.17
ringi 1.17
tõesti 1.17


In [63]:
method2 = sorted(sum_distances().items(), key=lambda kv: kv[1])

print("Top-20 most changed words", method2[-20:])
print("Top-20 least changed words", method2[:20])

Top-20 most changed words [('nõnda', 8.485129404880693), ('järv', 8.488469032250098), ('plaan', 8.49364728204815), ('saksamaa', 8.512384874870358), ('veerema', 8.513721134941386), ('korter', 8.514975258276097), ('osavõtja', 8.516308726184722), ('silm', 8.51678941175701), ('mõõtma', 8.541626902247124), ('hommikune', 8.549092565704843), ('iii', 8.563702808168745), ('kes', 8.60252189334909), ('purustama', 8.620025202358619), ('tähtis', 8.709149318051239), ('mis', 8.710738329663753), ('et', 8.854942071234072), ('vaimustama', 8.861395428028262), ('tegema', 8.89205626920769), ('tema', 9.441902343889174), ('see', 9.82036738101661)]
Top-20 least changed words [('ja', 5.613479427180764), ('vaene', 7.334763989958965), ('pank', 7.355840368515209), ('puhuma', 7.386472927304948), ('jahu', 7.456929139075693), ('külg', 7.462996115134498), ('oi', 7.477721377136274), ('vihaselt', 7.485690037764211), ('pooleli', 7.498483444829085), ('kirjanik', 7.507289033301643), ('heameel', 7.50972507910737), ('end', 

In [66]:
for pair in (method2[:21]):
    if pair[0] != "":
        print(pair[0], round(pair[1], 2))

ja 5.61
vaene 7.33
pank 7.36
puhuma 7.39
jahu 7.46
külg 7.46
oi 7.48
vihaselt 7.49
pooleli 7.5
kirjanik 7.51
heameel 7.51
end 7.52
suvi 7.53
aasta 7.53
lisama 7.54
programm 7.54
vabalt 7.54
raiskama 7.55
kolm 7.55
panema 7.55
võitma 7.56


In [70]:
distances = defaultdict(float) #top-down dynamic programming
method3 = sorted(find_neighbours().items(), key=lambda kv: kv[1])

print("Top-20 most changed words", method3[:20])
print("Top-20 least changed words", method3[-20:])

100%|██████████████████████████████████████████████████████████████████████████████| 1844/1844 [15:49<00:00,  1.94it/s]

Top-20 most changed words [('mina', 0), ('ka', 0), ('ise', 0), ('pidama', 0), ('või', 0), ('siis', 0), ('sina', 0), ('minema', 0), ('aeg', 0), ('kõik', 0), ('veel', 0), ('mees', 0), ('juba', 0), ('suur', 0), ('inimene', 0), ('ära', 0), ('jääma', 0), ('välja', 0), ('kuid', 0), ('andma', 0)]
Top-20 least changed words [('olema', 3), ('oma', 3), ('aga', 3), ('teine', 3), ('ju', 3), ('mõte', 3), ('kohe', 3), ('täis', 3), ('pealt', 3), ('nahk', 3), ('kaugemale', 3), ('tunnistus', 3), ('rändama', 3), ('liig', 3), ('tall', 3), ('kuub', 3), ('tuhk', 3), ('lugupidamine', 3), ('lind', 4), ('tema', 5)]





In [76]:
max(res.values())

13

In [83]:
method3_max = max(res.values())
method3_normalized = [(w, method3_max-v) for (w,v) in method3]
method3 = method3_normalized

In [87]:
for pair in reversed(method3_normalized[-21:]):
    print(pair[0], round(pair[1], 2))

tema 8
lind 9
lugupidamine 10
tuhk 10
kuub 10
tall 10
liig 10
rändama 10
tunnistus 10
kaugemale 10
nahk 10
pealt 10
täis 10
kohe 10
mõte 10
ju 10
teine 10
aga 10
oma 10
olema 10
metsavaht 11
