# Training embeddings

## Training word2vec models

In [333]:
import numpy as np
import pickle
import gensim
from gensim.models import Word2Vec
from scipy.stats import pearsonr
from svd2vec import svd2vec
from os import listdir
from os.path import isfile, join
from scipy.spatial.distance import cosine
from collections import Counter

In [328]:
gensim.__version__

'4.0.1'

In [300]:
DATA_DIR = "./corpus/lemmas" #select either words or lemmas
MODEL_DIR = "./models/word2vec"

In [301]:
# Finding all filenames in the data folder
all_files = [f for f in listdir(DATA_DIR) if isfile(join(DATA_DIR, f))]
all_files = [filename for filename in all_files if filename[-4:] not in [".zip", "r.gz"]]

In [302]:
all_files

['1890_aja_lemmas.txt',
 '1890_ilu_lemmas.txt',
 '1900_aja_lemmas.txt',
 '1900_ilu_lemmas.txt',
 '1910_aja_lemmas.txt',
 '1910_ilu_lemmas.txt',
 '1930_aja_lemmas.txt',
 '1930_ilu_lemmas.txt',
 '1950_aja_lemmas.txt',
 '1950_ilu_lemmas.txt',
 '1960_aja_lemmas.txt',
 '1960_ilu_lemmas.txt',
 '1970_aja_lemmas.txt',
 '1970_ilu_lemmas.txt',
 '1980_aja_lemmas.txt',
 '1980_ilu_lemmas.txt',
 '1980_muu_lemmas.txt',
 '1980_tea_lemmas.txt',
 '1990e_aja_lemmas.txt',
 '1990_ilu_lemmas.txt']

In [303]:
decades = [1890, 1900, 1910, 1930, 1950, 1960, 1970, 1980, 1990] #1920 is a different data domain, 1940 was not included

In [347]:
# Training word2vec
def train_word2vec(vector_size=100, window=5, min_count=1, sg=1, negative=5):
    models = {}
    for decade in decades:
        chosen_files = [filename for filename in all_files if int(filename[:4]) == decade]
        #print(chosen_files)
        sentences = []
        for filename in chosen_files:
            with open(DATA_DIR + "/" + filename, mode="r", encoding="utf8") as f:
                for row in f:
                    row = row.strip().split(" ")
                    #row = [word.replace("w", "v") for word in row] #normalizing the old style of writing
                    #row = [word.replace("ß", "s") for word in row] #normalizing the old style of writing
                    sentences.append(row)
        # https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
        model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window, min_count=min_count, sg=sg, negative=negative, workers=8)
        models[decade] = model
        model.save(f"{MODEL_DIR}/word2vec_{vector_size}d_{window}w_{min_count}mc_{sg}algo_{negative}neg_{decade}.model")
    return models

models = train_word2vec()

In [348]:
best_val = 0
best_params = None
for vector_size in [100,200,300]:
    for window in [2,5,10]:
        for min_count in [2, 3, 5, 10]:
            for sg in [1]:
                for negative in [0, 3, 5, 10]:
                    params = {"vector_size": vector_size, "window": window, "min_count": min_count, "negative": negative}
                    print(params)
                    models = train_word2vec(vector_size=vector_size, window=window, min_count=min_count, sg=sg, negative=negative)
                    val = evaluate_word2vec(models)
                    
                    if val > best_val:
                        best_val = val
                        best_params = params
                        print("Found new best value of", best_val, "with parameters", best_params)

{'vector_size': 100, 'window': 2, 'min_count': 2, 'negative': 0}
1890 (-0.3562401306368913, 0.11295548853307931)
1900 (-0.08929424375368605, 0.6644419376080374)
1910 (-0.12464374062423125, 0.48949157032060303)
1930 (-0.19496170255458456, 0.2342913990915352)
1950 (-0.08580833204133809, 0.6405314816345085)
1960 (-0.28553703518347734, 0.12613041232061364)
1970 (-0.00281143264082713, 0.9876111717682287)
1980 (-0.3216537779087683, 0.022737712982024653)
1990 (-0.0900641067126205, 0.5754812035010126)
{'vector_size': 100, 'window': 2, 'min_count': 2, 'negative': 3}
1890 (-0.01663022950346199, 0.9429621563227648)
1900 (-0.05348737478940665, 0.7952453157685487)
1910 (0.03246959253519173, 0.8576403957456603)
1930 (0.10131261675774128, 0.5394173385322668)
1950 (0.07921791330973353, 0.6664875837061566)
1960 (0.2159341662411017, 0.2517750893764462)
1970 (0.17452153449060973, 0.33136156237013575)
1980 (0.029022645238842154, 0.8414238544742826)
1990 (0.15519843961461582, 0.33258997847065114)
Found new

1890 (-0.3562401306368913, 0.11295548853307931)
1900 (-0.08929424375368605, 0.6644419376080374)
1910 (-0.12464374062423125, 0.48949157032060303)
1930 (-0.19496170255458456, 0.2342913990915352)
1950 (-0.08580833204133809, 0.6405314816345085)
1960 (-0.28553703518347734, 0.12613041232061364)
1970 (-0.00281143264082713, 0.9876111717682287)
1980 (-0.3216537779087683, 0.022737712982024653)
1990 (-0.0900641067126205, 0.5754812035010126)
{'vector_size': 100, 'window': 5, 'min_count': 2, 'negative': 3}
1890 (0.10250840484636821, 0.6583762829137174)
1900 (0.13105254108227163, 0.523385058449692)
1910 (0.08128158258318868, 0.65294946947275)
1930 (0.017376695234562537, 0.9163801165652441)
1950 (0.05776321687299596, 0.753505830751064)
1960 (0.16309397169075354, 0.3891655632531324)
1970 (0.2725221692229367, 0.12493993032470245)
1980 (0.04867520382771494, 0.7371127191695366)
1990 (0.2219558510914291, 0.16309828114146727)
{'vector_size': 100, 'window': 5, 'min_count': 2, 'negative': 5}
1890 (0.08956137

1890 (0.08644924885446242, 0.7094455180881858)
1900 (0.15066910652039506, 0.46252797450914296)
1910 (0.17246300626888705, 0.3371855347931994)
1930 (0.10543055619712348, 0.5229686999188382)
1950 (0.12228546143550996, 0.5049393040621483)
1960 (0.16731948444806755, 0.3768264520728053)
1970 (0.23696323790077978, 0.18425556253165812)
1980 (0.08155930067976049, 0.5733912446552616)
1990 (0.23201815103865933, 0.14437226370288983)
{'vector_size': 100, 'window': 10, 'min_count': 2, 'negative': 5}
1890 (0.10246222430908546, 0.6585211888817616)
1900 (0.21998637285608164, 0.28021131461706483)
1910 (0.150244750314992, 0.40396349733541753)
1930 (0.05415304012092501, 0.7433479892101762)
1950 (0.07472965387158657, 0.6843862167411998)
1960 (0.10361727188312472, 0.5858291503304558)
1970 (0.18054330337439584, 0.31468256554881313)
1980 (0.08432318350800017, 0.5604235922245756)
1990 (0.2408007280737646, 0.12936531883302296)
{'vector_size': 100, 'window': 10, 'min_count': 2, 'negative': 10}
1890 (0.143240120

1890 (0.060759544387483405, 0.7936089219919708)
1900 (0.09646733393380486, 0.6392175266433556)
1910 (0.10449725068513621, 0.5627693728218701)
1930 (-0.045507662615457134, 0.783246081326606)
1950 (0.11055956051683752, 0.5469160000261947)
1960 (0.1416605115520699, 0.4552355431225748)
1970 (0.1718998909986036, 0.33878952381951083)
1980 (0.03202953458139641, 0.8252394465640432)
1990 (0.18479007231892455, 0.24742097463941207)
{'vector_size': 200, 'window': 2, 'min_count': 2, 'negative': 10}
1890 (0.008202593835015454, 0.9718503168338485)
1900 (0.08873824951426558, 0.6664124908460439)
1910 (0.05606393799777391, 0.7566458217262286)
1930 (-0.08884028488729404, 0.5907000288256636)
1950 (0.035267456093873065, 0.8480360836628926)
1960 (0.08610163927577805, 0.6509801817000291)
1970 (0.1494471465211715, 0.40649219495573763)
1980 (-0.019120562563907997, 0.8951463615349654)
1990 (0.2263016172911563, 0.15480595281263163)
{'vector_size': 200, 'window': 2, 'min_count': 3, 'negative': 0}
1890 (0.27659010

1890 (0.028435902329658785, 0.9026182014997199)
1900 (0.15578100959498853, 0.44730137524197705)
1910 (0.12261795161350394, 0.49663189159945925)
1930 (-0.018026819636961483, 0.9132632288108002)
1950 (0.018314711808621126, 0.9207493435802662)
1960 (0.11168817274781509, 0.5568069862890066)
1970 (0.2056014538402871, 0.25102449508404023)
1980 (0.05830011074354188, 0.6875665898211974)
1990 (0.31386169057822344, 0.04567765600589044)
{'vector_size': 200, 'window': 5, 'min_count': 3, 'negative': 0}
1890 (0.27659010587778066, 0.23779828217379195)
1900 (0.33207973938835555, 0.17819594517038956)
1910 (0.2625912645198972, 0.19497875100300016)
1930 (-0.10045483543941575, 0.5843535726989025)
1950 (0.05256145299504327, 0.7905226161448695)
1960 (0.36757651750551285, 0.07065711601847405)
1970 (0.11303551310755715, 0.5448881694209736)
1980 (0.1481260347621398, 0.3046116236536319)
1990 (-0.09028784974958506, 0.5846392037773402)
{'vector_size': 200, 'window': 5, 'min_count': 3, 'negative': 3}
1890 (0.11384

1890 (0.27659010587778066, 0.23779828217379195)
1900 (0.33207973938835555, 0.17819594517038956)
1910 (0.2625912645198972, 0.19497875100300016)
1930 (-0.10045483543941575, 0.5843535726989025)
1950 (0.05256145299504327, 0.7905226161448695)
1960 (0.36757651750551285, 0.07065711601847405)
1970 (0.11303551310755715, 0.5448881694209736)
1980 (0.1481260347621398, 0.3046116236536319)
1990 (-0.09028784974958506, 0.5846392037773402)
{'vector_size': 200, 'window': 10, 'min_count': 3, 'negative': 3}
1890 (0.23685915073161504, 0.3146740232903721)
1900 (0.46449229811335524, 0.052136137216655956)
1910 (0.13765047121639973, 0.5024968512359509)
1930 (0.14875538339949546, 0.4164797395070719)
1950 (0.11411882021021938, 0.5631135279483878)
1960 (0.15385841747655787, 0.4627678941958417)
1970 (0.2320097419888718, 0.2091476460435964)
1980 (0.11290778377872215, 0.4349868898551638)
1990 (0.2539151794143373, 0.1188081876797429)
{'vector_size': 200, 'window': 10, 'min_count': 3, 'negative': 5}
1890 (0.1729209128

1890 (0.07330755443783579, 0.7587328326607325)
1900 (0.4719973217403705, 0.04796566286042652)
1910 (0.07940359210986331, 0.6998094154141555)
1930 (-0.03225496947852295, 0.8608850650993173)
1950 (0.03485719996267236, 0.8602225951529974)
1960 (0.16501041572412356, 0.43055721248337225)
1970 (0.11247687835088786, 0.5468903543246939)
1980 (0.021211299502909613, 0.8837561496653419)
1990 (0.26928375800052073, 0.09736914832379565)
{'vector_size': 300, 'window': 2, 'min_count': 3, 'negative': 5}
1890 (0.08135211621608368, 0.7331355428476715)
1900 (0.32970196966038423, 0.18151385824444627)
1910 (-0.0004888178584599341, 0.9981090947340483)
1930 (-0.003219907073993139, 0.98604581220108)
1950 (0.06340170212597462, 0.7485763090502348)
1960 (0.16808672995987756, 0.42189085407190396)
1970 (0.19488944706857364, 0.2934316098322504)
1980 (0.02624405822465779, 0.856437505526291)
1990 (0.31200478001782295, 0.053155934174934596)
{'vector_size': 300, 'window': 2, 'min_count': 3, 'negative': 10}
1890 (0.03375

1890 (0.139160291438992, 0.5584600462592014)
1900 (0.369705987448367, 0.1310385019716983)
1910 (0.10904602170794676, 0.5959265289280326)
1930 (-0.00010200493592386894, 0.9995579174183298)
1950 (0.07810669162853881, 0.6927964558764889)
1960 (0.14675569423741092, 0.48391712698504136)
1970 (0.2075730983844437, 0.2625081265723023)
1980 (0.06807743932272001, 0.6385334070580421)
1990 (0.3479373720412475, 0.029969119270729365)
{'vector_size': 300, 'window': 5, 'min_count': 3, 'negative': 10}
1890 (0.07746339378475109, 0.745477193147596)
1900 (0.36923033572540664, 0.1315734609229544)
1910 (0.15031158510526885, 0.4636029273017348)
1930 (0.07015644416965947, 0.7027982471937775)
1950 (0.05327112146914289, 0.7877576252705348)
1960 (0.11354114895772587, 0.5889253004185143)
1970 (0.1843116325411391, 0.32091953602089945)
1980 (0.059531641479096044, 0.6813147056010784)
1990 (0.29006874094137786, 0.07324913802400229)
{'vector_size': 300, 'window': 5, 'min_count': 5, 'negative': 0}
1890 (0.1516235130477

1890 (0.14463193875164082, 0.5429364963924888)
1900 (0.35993159256162277, 0.14234011519886983)
1910 (0.15153085150704948, 0.45994233970125076)
1930 (0.15556215921971947, 0.3952281934904117)
1950 (0.04331530273660383, 0.826762527613825)
1960 (0.11560220387716853, 0.5821345456361965)
1970 (0.19731873643580675, 0.2873368884867795)
1980 (0.10269217901133193, 0.4779170913746418)
1990 (0.3269827919406022, 0.04217380950103446)
{'vector_size': 300, 'window': 10, 'min_count': 5, 'negative': 0}
1890 (0.15162351304771404, 0.561296410556901)
1900 (-0.3713488360055268, 0.1729492199214392)
1910 (-0.12812751214309873, 0.5416210161306471)
1930 (-0.001994072086038662, 0.9922863911941241)
1950 (0.041032132633714975, 0.85982828966054)
1960 (0.3185846653613978, 0.1975743793278149)
1970 (0.02328604792707878, 0.9160091572666835)
1980 (-0.04248934051269152, 0.784198151638751)
1990 (0.05488478520412758, 0.7469710686906513)
{'vector_size': 300, 'window': 10, 'min_count': 5, 'negative': 3}
1890 (0.0435997345337

In [315]:
models[1990].wv.most_similar("tere")

[('rinkner', 0.9569980502128601),
 ('glebke', 0.9474722743034363),
 ('keavin', 0.944699227809906),
 ('potapov', 0.9369103312492371),
 ('mürt', 0.9346433281898499),
 ('peetrus', 0.9338791966438293),
 ('valjusti', 0.9316402077674866),
 ('seepeale', 0.9299307465553284),
 ('adu', 0.929161012172699),
 ('tom', 0.929135799407959)]

## Training SVD vectors (from PPMI)

In [359]:
DATA_DIR = "./corpus/lemmas" #select either words or lemmas
MODEL_DIR = "./models/svd"

all_files = [f for f in listdir(DATA_DIR) if isfile(join(DATA_DIR, f))]
all_files = [filename for filename in all_files if filename[-4:] not in [".zip", "r.gz"]]

In [360]:
all_files

['1890_aja_lemmas.txt',
 '1890_ilu_lemmas.txt',
 '1900_aja_lemmas.txt',
 '1900_ilu_lemmas.txt',
 '1910_aja_lemmas.txt',
 '1910_ilu_lemmas.txt',
 '1930_aja_lemmas.txt',
 '1930_ilu_lemmas.txt',
 '1950_aja_lemmas.txt',
 '1950_ilu_lemmas.txt',
 '1960_aja_lemmas.txt',
 '1960_ilu_lemmas.txt',
 '1970_aja_lemmas.txt',
 '1970_ilu_lemmas.txt',
 '1980_aja_lemmas.txt',
 '1980_ilu_lemmas.txt',
 '1980_muu_lemmas.txt',
 '1980_tea_lemmas.txt',
 '1990e_aja_lemmas.txt',
 '1990_ilu_lemmas.txt']

In [361]:
def train_ppmisvd(size=200, window=10, min_count=10):
    svd_models = {}
    for decade in decades:
        chosen_files = [filename for filename in all_files if int(filename[:4]) == decade]
        #print(chosen_files)

        sentences = []
        for filename in chosen_files:
            with open(DATA_DIR + "/" + filename, mode="r", encoding="utf8") as f:
                for row in f:
                    row = row.strip().split(" ")
                    #row = [word.replace("w", "v") for word in row] #normalizing the old style of writing
                    #row = [word.replace("ß", "s") for word in row] #normalizing the old style of writing
                    sentences.append(row)
        # https://valentinp72.github.io/svd2vec/svd2vec.html#svd2vec.svd2vec
        #training run 1 - model = svd2vec(sentences, size=150, window=2, min_count=1, verbose=False, workers=8) svd_year.model
        #training run 2 - model = svd2vec(sentences, size=200, window=5, min_count=2, verbose=False, workers=8)
        model = svd2vec(sentences, size=size, window=window, min_count=min_count, verbose=False, workers=8)
        svd_models[decade] = model
        model.save(f"{MODEL_DIR}/svd_{size}d_{window}w_{min_count}mc_{decade}.model") #UPDATE!
    return svd_models

In [362]:
best_val = 0
best_params = None
for vector_size in [200]:
    for window in [2,5,10]:
        for min_count in [2, 3, 5, 10]:
            params = {"vector_size": vector_size, "window": window, "min_count": min_count}
            print(params)
            models = train_ppmisvd(size=vector_size, window=window, min_count=min_count)
            val = evaluate_ppmisvd(models)
            print(val)

            if val > best_val:
                best_val = val
                best_params = params
                print("Found new best value of", best_val, "with parameters", best_params)

{'vector_size': 200, 'window': 2, 'min_count': 2}
1890 (0.07294672535787637, 0.7533437110179739)
1900 (0.03685594103962618, 0.858136408774903)
1910 (0.04152047370536671, 0.8185433449463597)
1930 (0.131196900273934, 0.4259638681716837)
1950 (0.19460390592930948, 0.28583949037456213)
1960 (0.06461517239049959, 0.7344376840034168)
1970 (-0.10087147489744994, 0.5764707326603432)
1980 (0.014567102042378544, 0.9200228489278185)
1990 (-0.03354400344983666, 0.8350708146346542)
0.0703151237319507
Found new best value of 0.0703151237319507 with parameters {'vector_size': 200, 'window': 2, 'min_count': 2}
{'vector_size': 200, 'window': 2, 'min_count': 3}
1890 (-0.20137143574821395, 0.3945776554284753)
1900 (0.07711885689395838, 0.761012494224143)
1910 (0.11747299470443856, 0.5676454494119455)
1930 (-0.08853625255648451, 0.6299064175246979)
1950 (-0.11546747333704246, 0.5584788426933373)
1960 (0.3780745792691157, 0.06239990272821291)
1970 (-0.22061463278973265, 0.23301968644657706)
1980 (0.1519304

In [76]:
svd_models[1970].most_similar(positive=["inimene"], topn=10)

[['seakari', 0.9879194439089369],
 ['paganin', 0.987745913938776],
 ['tärklist', 0.986967662781884],
 ['võõramaks', 0.986838679861838],
 ['pete', 0.9868243926751153],
 ['tiido', 0.9833094800157481],
 ['mt-352', 0.9831037333916005],
 ['novelliti', 0.9825522712033941],
 ['träni', 0.9807039307605041],
 ['vabaõhulaev', 0.9807039307605039]]

In [78]:
svd_models[1970].similarity("inimene", "sukk")

-0.001326078622183164

In [253]:
# I did not use it in the end
def ppmi(sentences, window=2):
    unigram_counts = Counter([item for sublist in sentences for item in sublist])
    
    co_occ = Counter()
    for sentence in sentences:
        for i, word in enumerate(sentence): # do not count occurrences over sentence border
            context_idx = [j for j in range(i-window, i+window+1) if 0 <= j <= len(sentence) - 1  and i != j]
            
            for j in context_idx:
                co_occ[(word, sentence[j])] += 1
                co_occ[(sentence[j], word)] += 1
    
    return co_occ

In [252]:
sentence = ["tere", "mina", "olen", "üks", "väga", "pikk", "lause"]
window=4
for i, word in enumerate(sentence): # do not count occurrences over sentence border
    context_idx = [j for j in range(i-window, i+window+1) if 0 <= j <= len(sentence) - 1  and i != j]
    print(i, context_idx)

0 [1, 2, 3, 4]
1 [0, 2, 3, 4, 5]
2 [0, 1, 3, 4, 5, 6]
3 [0, 1, 2, 4, 5, 6]
4 [0, 1, 2, 3, 5, 6]
5 [1, 2, 3, 4, 6]
6 [2, 3, 4, 5]


## Reading in models from files

In [480]:
#word2vec

#%word2vec: Found new best value of 0.3487609631693991 with parameters {'vector_size': 200, 'window': 10, 'min_count': 10, 'negative': 3}

# Finding all filenames in the data folder
MODEL_DIR = "./models/word2vec"
all_files = [f for f in listdir(MODEL_DIR) if isfile(join(MODEL_DIR, f))]
all_files = [filename for filename in all_files if filename.startswith("word2vec_200d_10w_10mc_1algo_3neg_") and filename[-5:] == "model" and filename[-4:] not in [".zip", "r.gz"] and "aligned" not in filename]
models = {}

for file in all_files:
    print(file)
    model = Word2Vec.load(f"{MODEL_DIR}/{file}")
    models[int(file[-10:-6])] = model

word2vec_200d_10w_10mc_1algo_3neg_1890.model
word2vec_200d_10w_10mc_1algo_3neg_1900.model
word2vec_200d_10w_10mc_1algo_3neg_1910.model
word2vec_200d_10w_10mc_1algo_3neg_1930.model
word2vec_200d_10w_10mc_1algo_3neg_1950.model
word2vec_200d_10w_10mc_1algo_3neg_1960.model
word2vec_200d_10w_10mc_1algo_3neg_1970.model
word2vec_200d_10w_10mc_1algo_3neg_1980.model
word2vec_200d_10w_10mc_1algo_3neg_1990.model


In [461]:
models

{1890: <gensim.models.word2vec.Word2Vec at 0x11e581e88b0>,
 1900: <gensim.models.word2vec.Word2Vec at 0x11e581e8130>,
 1910: <gensim.models.word2vec.Word2Vec at 0x11e686240a0>,
 1930: <gensim.models.word2vec.Word2Vec at 0x11e68624400>,
 1950: <gensim.models.word2vec.Word2Vec at 0x11e21cfd100>,
 1960: <gensim.models.word2vec.Word2Vec at 0x11e21cfd760>,
 1970: <gensim.models.word2vec.Word2Vec at 0x11e21cfd7c0>,
 1980: <gensim.models.word2vec.Word2Vec at 0x11e61211100>,
 1990: <gensim.models.word2vec.Word2Vec at 0x11e612112e0>}

In [467]:
#svd
#%SVD: Found new best value of 0.2565087015193734 with parameters {'vector_size': 200, 'window': 5, 'min_count': 5}
# Finding all filenames in the data folder
MODEL_DIR = "./models/svd"
all_files = [f for f in listdir(MODEL_DIR) if isfile(join(MODEL_DIR, f))]
all_files = [filename for filename in all_files if filename.startswith("svd_200d_5w_5mc_") and filename[-5:] == "model" and filename[-4:] not in [".zip", "r.gz"] and "aligned" not in filename and "diff" not in filename]
svd_models = {}

for file in all_files:
    print(file)
    model = svd2vec.load(f"{MODEL_DIR}/{file}")
    svd_models[int(file[-10:-6])] = model

svd_200d_5w_5mc_1890.model
svd_200d_5w_5mc_1900.model
svd_200d_5w_5mc_1910.model
svd_200d_5w_5mc_1930.model
svd_200d_5w_5mc_1950.model
svd_200d_5w_5mc_1960.model
svd_200d_5w_5mc_1970.model
svd_200d_5w_5mc_1980.model
svd_200d_5w_5mc_1990.model


In [468]:
svd_models

{1890: <svd2vec.core.svd2vec at 0x11e21cfd280>,
 1900: <svd2vec.core.svd2vec at 0x11e21cfd2b0>,
 1910: <svd2vec.core.svd2vec at 0x11e581e8730>,
 1930: <svd2vec.core.svd2vec at 0x11e581e8850>,
 1950: <svd2vec.core.svd2vec at 0x11e6e439550>,
 1960: <svd2vec.core.svd2vec at 0x11e6e439cd0>,
 1970: <svd2vec.core.svd2vec at 0x11e581e8a90>,
 1980: <svd2vec.core.svd2vec at 0x11e5361f550>,
 1990: <svd2vec.core.svd2vec at 0x11e5361f0a0>}

In [111]:
#svd
# Finding all filenames in the data folder
MODEL_DIR = "./models/svd"
all_files = [f for f in listdir(MODEL_DIR) if isfile(join(MODEL_DIR, f))]
all_files = [filename for filename in all_files if filename[-5:] == "model" and filename[-4:] not in [".zip", "r.gz"] and "aligned" not in filename]
svd_models = {}

for file in all_files:
    print(file)
    svd_model = svd2vec.load(f"{MODEL_DIR}/{file}")
    svd_model.save_word2vec_format(f"{MODEL_DIR}/diff_{file}")
    #model = Word2Vec.load(f"{MODEL_DIR}/{file}")
    #models[int(file[:4])] = model

svd_1890.model


UnicodeEncodeError: 'charmap' codec can't encode character '\xe1' in position 2: character maps to <undefined>

In [135]:
svd_models.pop(1970)

<svd2vec.core.svd2vec at 0x11e21a14ac0>

In [130]:
svd_models[1950].words

['ja',
 'olema',
 'see',
 'tema',
 'et',
 'ei',
 'mina',
 'mis',
 'töö',
 'oma',
 'nõukogu',
 'kui',
 'kolhoos',
 'võtma',
 'suur',
 'ka',
 'kes',
 'kõik',
 'sina',
 'aasta',
 'nagu',
 'juba',
 'pidama',
 'nsv',
 'ning',
 'andma',
 'rahvas',
 'teine',
 'noor',
 'üks',
 'veel',
 'aeg',
 'aga',
 'inimene',
 'uus',
 'plaan',
 'vastu',
 'tulema',
 'saama',
 'nüüd',
 'mitte',
 'tegema',
 'maa',
 'kuid',
 'ameerika',
 'ise',
 'liit',
 'jääma',
 'läbi',
 'siis',
 'küsimus',
 'esimene',
 'näitama',
 'toimuma',
 'iga',
 'sotsialistlik',
 'kogu',
 'osa',
 '.',
 'eesti',
 'töötaja',
 'võitlus',
 'hakkama',
 'sm',
 'võistlus',
 'linn',
 'praegu',
 'kaks',
 'panema',
 'nii',
 'stalin',
 'partei',
 'kommunistlik',
 'valitsus',
 'ütlema',
 'vald',
 'nägema',
 'tass',
 'poolt',
 'kolm',
 'arv',
 'päev',
 'töötama',
 'ette',
 'tahtma',
 'võima',
 'vaid',
 'täitmine',
 'kõige',
 'vabariik',
 'riik',
 'eest',
 'ainult',
 'siin',
 'palju',
 'sama',
 'tundma',
 'teatama',
 'minema',
 'kohta',
 'rida',
 'ku

In [136]:
for decade, svd_model in svd_models.items():
    print(decade)
    #svd_model = svd2vec.load(f"{MODEL_DIR}/{file}")
    svd_model.save_word2vec_format(f"{MODEL_DIR}/diff_svd_{decade}.model")

1900
1910


### Reading in SVD vectors as word2vec format (could not use this)

In [149]:
from gensim.models import KeyedVectors

svd_models = {}

MODEL_DIR = "./models/svd"
all_files = [f for f in listdir(MODEL_DIR) if isfile(join(MODEL_DIR, f))]
all_files = [filename for filename in all_files if filename[-5:] == "model" and filename[-4:] not in [".zip", "r.gz"] and "aligned" not in filename and "diff" in filename]

for file in all_files:
    print(file)
    svd_model = KeyedVectors.load_word2vec_format(f"{MODEL_DIR}/{file}", binary=False, encoding= 'unicode_escape')
    svd_models[int(file[9:13])] = svd_model

diff_svd_1900.model
diff_svd_1910.model


# Aligning vectors

## Aligning word2vec

In [422]:
MODEL_DIR = "./models/word2vec"

In [423]:
models

{1890: <gensim.models.word2vec.Word2Vec at 0x11e608abdf0>,
 1900: <gensim.models.word2vec.Word2Vec at 0x11e2150c4f0>,
 1910: <gensim.models.word2vec.Word2Vec at 0x11e581e9250>,
 1930: <gensim.models.word2vec.Word2Vec at 0x11e581e98b0>,
 1950: <gensim.models.word2vec.Word2Vec at 0x11e581e91c0>,
 1960: <gensim.models.word2vec.Word2Vec at 0x11e581e9070>,
 1970: <gensim.models.word2vec.Word2Vec at 0x11e21d0bfd0>,
 1980: <gensim.models.word2vec.Word2Vec at 0x11e21d0b640>,
 1990: <gensim.models.word2vec.Word2Vec at 0x11e21d0b5b0>}

In [424]:
decades = list(models.keys())
print(decades)

[1890, 1900, 1910, 1930, 1950, 1960, 1970, 1980, 1990]


In [425]:
# Code from https://gist.github.com/zhicongchen/9e23d5c3f1e5b1293b16133485cd17d8

def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
        
    First, intersect the vocabularies (see `intersection_align_gensim` documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's syn0 and syn0norm numpy matrices with the aligned version.
    Return other_embed.

    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    """

    # patch by Richard So [https://twitter.com/richardjeanso) (thanks!) to update this code for new version of gensim
    # base_embed.init_sims(replace=True)
    # other_embed.init_sims(replace=True)

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs) 
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # another matrix operation
    ortho = u.dot(v) 
    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)    
    
    return other_embed

def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words: common_vocab &= set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1 - common_vocab and not vocab_m2 - common_vocab:
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"), reverse=True)
    # print(len(common_vocab))

    # Then for each model...
    for m in [m1, m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.wv.key_to_index[w] for w in common_vocab]
        old_arr = m.wv.vectors
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.vectors = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        new_key_to_index = {}
        new_index_to_key = []
        for new_index, key in enumerate(common_vocab):
            new_key_to_index[key] = new_index
            new_index_to_key.append(key)
        m.wv.key_to_index = new_key_to_index
        m.wv.index_to_key = new_index_to_key
        
        print(len(m.wv.key_to_index), len(m.wv.vectors))
        
    return (m1,m2)

In [426]:
#Set the start and end indexes
start = 1890
end = 1990

start_ind = decades.index(start)
end_ind = decades.index(end)

In [427]:
aligned_models = {}

for i in range(start_ind, end_ind):
    #print(decades[i], decades[i+1])
    # FIX from https://gist.github.com/zhicongchen/9e23d5c3f1e5b1293b16133485cd17d8?permalink_comment_id=3837234#gistcomment-3837234
    base_embed = models[decades[i]]
    other_embed = models[decades[i+1]]
    
    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed)

    # re-filling the normed vectors
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()
    
    # we can do this because time point t's in_other_embed equals t+1's in_base_embed
    aligned_models[decades[i]] = in_base_embed
    aligned_models[decades[i+1]] = in_other_embed
    
for decade, model in aligned_models.items():
    model.save(f"{MODEL_DIR}/word2vec_{start}-{end}_{decade}_aligned.model")

2402 2402
2402 2402
2087 2087
2087 2087
1728 1728
1728 1728
1321 1321
1321 1321
1252 1252
1252 1252
1233 1233
1233 1233
1233 1233
1233 1233
1232 1232
1232 1232


## Aligning SVD models

In [428]:
MODEL_DIR = "./models/svd"

In [429]:
svd_models

{1890: <svd2vec.core.svd2vec at 0x11e58d86b20>,
 1900: <svd2vec.core.svd2vec at 0x11e58d86340>,
 1910: <svd2vec.core.svd2vec at 0x11e58d867f0>,
 1930: <svd2vec.core.svd2vec at 0x11e58d869d0>,
 1950: <svd2vec.core.svd2vec at 0x11e58d86af0>,
 1960: <svd2vec.core.svd2vec at 0x11e58d86460>,
 1970: <svd2vec.core.svd2vec at 0x11e58d86580>,
 1980: <svd2vec.core.svd2vec at 0x11e58d86430>,
 1990: <svd2vec.core.svd2vec at 0x11e58d86ca0>}

In [431]:
"""
# Code from https://gist.github.com/zhicongchen/9e23d5c3f1e5b1293b16133485cd17d8

def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    
    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
        
    First, intersect the vocabularies (see `intersection_align_gensim` documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's syn0 and syn0norm numpy matrices with the aligned version.
    Return other_embed.

    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    

    # patch by Richard So [https://twitter.com/richardjeanso) (thanks!) to update this code for new version of gensim
    # base_embed.init_sims(replace=True)
    # other_embed.init_sims(replace=True)

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs) 
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # another matrix operation
    ortho = u.dot(v) 
    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)    
    
    return other_embed

def intersection_align_gensim(m1, m2, words=None):
    
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words: common_vocab &= set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1 - common_vocab and not vocab_m2 - common_vocab:
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"), reverse=True)
    # print(len(common_vocab))

    # Then for each model...
    for m in [m1, m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.wv.key_to_index[w] for w in common_vocab]
        old_arr = m.wv.vectors
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.vectors = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        new_key_to_index = {}
        new_index_to_key = []
        for new_index, key in enumerate(common_vocab):
            new_key_to_index[key] = new_index
            new_index_to_key.append(key)
        m.wv.key_to_index = new_key_to_index
        m.wv.index_to_key = new_index_to_key
        
        print(len(m.wv.key_to_index), len(m.wv.vectors))
        
    return (m1,m2)
"""

def intersection_align_svd(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.words)
    vocab_m2 = set(m2.words)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words: common_vocab &= set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1 - common_vocab and not vocab_m2 - common_vocab:
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.terms_counts[w] + m2.terms_counts[w], reverse=True)
    # print(len(common_vocab))

    # Then for each model...
    for m in [m1, m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.vocabulary[w] for w in common_vocab]
        old_arr = m.svd_w
        new_arr = np.array([old_arr[index] for index in indices])
        m.svd_w = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        new_key_to_index = {}
        new_index_to_key = []
        for new_index, key in enumerate(common_vocab):
            new_key_to_index[key] = new_index
            new_index_to_key.append(key)
        m.vocabulary = new_key_to_index
        m.words = new_index_to_key
        
        print(len(m.vocabulary), len(m.svd_w))
        
    return (m1,m2)

In [432]:
#Set the start and end indexes
start = 1890
end = 1990

start_ind = decades.index(start)
end_ind = decades.index(end)

In [433]:
aligned_svd_models = {}

for i in range(start_ind, end_ind):
    # FIX from https://gist.github.com/zhicongchen/9e23d5c3f1e5b1293b16133485cd17d8?permalink_comment_id=3837234#gistcomment-3837234
    base_embed = svd_models[decades[i]]
    other_embed = svd_models[decades[i+1]]
    
    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_svd(base_embed, other_embed)

    # re-filling the normed vectors
    #in_base_embed.wv.fill_norms(force=True)
    #in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    #base_vecs = in_base_embed.wv.get_normed_vectors()
    #other_vecs = in_other_embed.wv.get_normed_vectors()
    
    # we can do this because time point t's in_other_embed equals t+1's in_base_embed
    aligned_svd_models[decades[i]] = in_base_embed
    aligned_svd_models[decades[i+1]] = in_other_embed
    
for decade, model in aligned_svd_models.items():
    model.save(f"{MODEL_DIR}/svd_{start}-{end}_{decade}_aligned.model")

3795 3795
3795 3795
3187 3187
3187 3187
2589 2589
2589 2589
1977 1977
1977 1977
1884 1884
1884 1884
1849 1849
1849 1849
1846 1846
1846 1846
1844 1844
1844 1844


In [434]:
aligned_svd_models

{1890: <svd2vec.core.svd2vec at 0x11e58d86b20>,
 1900: <svd2vec.core.svd2vec at 0x11e58d86340>,
 1910: <svd2vec.core.svd2vec at 0x11e58d867f0>,
 1930: <svd2vec.core.svd2vec at 0x11e58d869d0>,
 1950: <svd2vec.core.svd2vec at 0x11e58d86af0>,
 1960: <svd2vec.core.svd2vec at 0x11e58d86460>,
 1970: <svd2vec.core.svd2vec at 0x11e58d86580>,
 1980: <svd2vec.core.svd2vec at 0x11e58d86430>,
 1990: <svd2vec.core.svd2vec at 0x11e58d86ca0>}

## Saving diachronic embeddings

### word2vec

In [436]:
ref_year = 1990
all_words = list(aligned_models[ref_year].wv.key_to_index.keys())

In [437]:
keys = aligned_models[ref_year].wv.key_to_index
idxs = aligned_models[ref_year].wv.index_to_key

In [438]:
keys

{'olema': 0,
 'ja': 1,
 'tema': 2,
 'see': 3,
 'mina': 4,
 'ei': 5,
 'et': 6,
 'mis': 7,
 'kui': 8,
 'ka': 9,
 'oma': 10,
 'aga': 11,
 'saama': 12,
 'sina': 13,
 'ning': 14,
 'siis': 15,
 'kes': 16,
 'või': 17,
 'tulema': 18,
 'ise': 19,
 'pidama': 20,
 'aasta': 21,
 'üks': 22,
 'nii': 23,
 'kõik': 24,
 'võima': 25,
 'tegema': 26,
 'teine': 27,
 'nagu': 28,
 'minema': 29,
 'ütlema': 30,
 'eesti': 31,
 'veel': 32,
 'aeg': 33,
 'jääma': 34,
 'võtma': 35,
 'juba': 36,
 'mees': 37,
 'inimene': 38,
 'kuid': 39,
 'hakkama': 40,
 'teadma': 41,
 'kas': 42,
 'suur': 43,
 'miski': 44,
 'välja': 45,
 'ära': 46,
 'nägema': 47,
 'nüüd': 48,
 'andma': 49,
 'üle': 50,
 'tahtma': 51,
 'mitte': 52,
 'palju': 53,
 'mõni': 54,
 'töö': 55,
 'vaid': 56,
 'ainult': 57,
 'kord': 58,
 'uus': 59,
 'küll': 60,
 'vastu': 61,
 'kus': 62,
 'pärast': 63,
 'kaks': 64,
 'sest': 65,
 'naine': 66,
 'vaatama': 67,
 'käsi': 68,
 'enam': 69,
 'esimene': 70,
 'iga': 71,
 'panema': 72,
 'päev': 73,
 'eest': 74,
 'ju': 75,
 

In [439]:
idxs

['olema',
 'ja',
 'tema',
 'see',
 'mina',
 'ei',
 'et',
 'mis',
 'kui',
 'ka',
 'oma',
 'aga',
 'saama',
 'sina',
 'ning',
 'siis',
 'kes',
 'või',
 'tulema',
 'ise',
 'pidama',
 'aasta',
 'üks',
 'nii',
 'kõik',
 'võima',
 'tegema',
 'teine',
 'nagu',
 'minema',
 'ütlema',
 'eesti',
 'veel',
 'aeg',
 'jääma',
 'võtma',
 'juba',
 'mees',
 'inimene',
 'kuid',
 'hakkama',
 'teadma',
 'kas',
 'suur',
 'miski',
 'välja',
 'ära',
 'nägema',
 'nüüd',
 'andma',
 'üle',
 'tahtma',
 'mitte',
 'palju',
 'mõni',
 'töö',
 'vaid',
 'ainult',
 'kord',
 'uus',
 'küll',
 'vastu',
 'kus',
 'pärast',
 'kaks',
 'sest',
 'naine',
 'vaatama',
 'käsi',
 'enam',
 'esimene',
 'iga',
 'panema',
 'päev',
 'eest',
 'ju',
 'siin',
 'peale',
 'ega',
 'rääkima',
 'asi',
 'keegi',
 'mõtlema',
 'tundma',
 'käima',
 'kuidas',
 'väga',
 'silm',
 'jõudma',
 'elu',
 'tagasi',
 'hea',
 'sõna',
 'seal',
 'kõige',
 'läbi',
 'laps',
 'ikka',
 'kogu',
 'viimane',
 'mingi',
 'leidma',
 'maa',
 'arvama',
 'tallinn',
 'ette',
 

In [440]:
all_words

['olema',
 'ja',
 'tema',
 'see',
 'mina',
 'ei',
 'et',
 'mis',
 'kui',
 'ka',
 'oma',
 'aga',
 'saama',
 'sina',
 'ning',
 'siis',
 'kes',
 'või',
 'tulema',
 'ise',
 'pidama',
 'aasta',
 'üks',
 'nii',
 'kõik',
 'võima',
 'tegema',
 'teine',
 'nagu',
 'minema',
 'ütlema',
 'eesti',
 'veel',
 'aeg',
 'jääma',
 'võtma',
 'juba',
 'mees',
 'inimene',
 'kuid',
 'hakkama',
 'teadma',
 'kas',
 'suur',
 'miski',
 'välja',
 'ära',
 'nägema',
 'nüüd',
 'andma',
 'üle',
 'tahtma',
 'mitte',
 'palju',
 'mõni',
 'töö',
 'vaid',
 'ainult',
 'kord',
 'uus',
 'küll',
 'vastu',
 'kus',
 'pärast',
 'kaks',
 'sest',
 'naine',
 'vaatama',
 'käsi',
 'enam',
 'esimene',
 'iga',
 'panema',
 'päev',
 'eest',
 'ju',
 'siin',
 'peale',
 'ega',
 'rääkima',
 'asi',
 'keegi',
 'mõtlema',
 'tundma',
 'käima',
 'kuidas',
 'väga',
 'silm',
 'jõudma',
 'elu',
 'tagasi',
 'hea',
 'sõna',
 'seal',
 'kõige',
 'läbi',
 'laps',
 'ikka',
 'kogu',
 'viimane',
 'mingi',
 'leidma',
 'maa',
 'arvama',
 'tallinn',
 'ette',
 

In [441]:
vecs = [[None] * len(decades) for i in range(len(all_words))]

In [442]:
for i, word in enumerate(all_words):
    for j, decade in enumerate(decades):
        vecs[i][j] = list(aligned_models[decade].wv[word])

In [443]:
saveable = {'E': vecs,
    "w": all_words,
    "d": decades}

with open(f'{MODEL_DIR}/word2vec_a_1890-1990_diachronic.pickle', 'wb') as handle:
    pickle.dump(saveable, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'{MODEL_DIR}/word2vec_a_1890-1990_diachronic.pickle', 'rb') as handle:
    b = pickle.load(handle)
    
print(saveable == b)

True


## SVD

In [444]:
MODEL_DIR = "./models/svd"

In [445]:
aligned_svd_models

{1890: <svd2vec.core.svd2vec at 0x11e58d86b20>,
 1900: <svd2vec.core.svd2vec at 0x11e58d86340>,
 1910: <svd2vec.core.svd2vec at 0x11e58d867f0>,
 1930: <svd2vec.core.svd2vec at 0x11e58d869d0>,
 1950: <svd2vec.core.svd2vec at 0x11e58d86af0>,
 1960: <svd2vec.core.svd2vec at 0x11e58d86460>,
 1970: <svd2vec.core.svd2vec at 0x11e58d86580>,
 1980: <svd2vec.core.svd2vec at 0x11e58d86430>,
 1990: <svd2vec.core.svd2vec at 0x11e58d86ca0>}

In [446]:
all_words = list(aligned_svd_models[ref_year].words)
all_words

['olema',
 'ja',
 'tema',
 'see',
 'ei',
 'mina',
 'kui',
 'et',
 'mis',
 'ka',
 'oma',
 'aga',
 'ning',
 'saama',
 'tulema',
 'nii',
 'ise',
 'pidama',
 'kes',
 'aasta',
 'või',
 'siis',
 'üks',
 'sina',
 'minema',
 'võima',
 'tegema',
 'teine',
 'nagu',
 'ütlema',
 'aeg',
 'võtma',
 'kõik',
 'veel',
 'mees',
 'juba',
 'suur',
 'eesti',
 'inimene',
 'ära',
 'jääma',
 'teadma',
 'välja',
 'kas',
 'kuid',
 'andma',
 'nägema',
 'hakkama',
 'mitte',
 'üle',
 'miski',
 'töö',
 'nüüd',
 'palju',
 'kord',
 'vaid',
 'iga',
 'uus',
 'mõni',
 'siin',
 'sest',
 'küll',
 'ju',
 'kaks',
 'tahtma',
 'kus',
 'enam',
 'väga',
 'panema',
 'pärast',
 'vaatama',
 'ainult',
 'ikka',
 'käima',
 'seal',
 'esimene',
 'rääkima',
 'vastu',
 'tundma',
 'ega',
 'päev',
 'hea',
 'kuidas',
 'eest',
 'peale',
 'käsi',
 'naine',
 'tagasi',
 'asi',
 'a',
 'silm',
 'enne',
 'kõige',
 'osa',
 'viimane',
 'sama',
 'sõna',
 'mõtlema',
 'elu',
 'mingi',
 'kogu',
 'rohkem',
 'maa',
 'tallinn',
 'jõudma',
 'laps',
 'leidma

In [447]:
aligned_svd_models[ref_year].vocabulary

{'olema': 0,
 'ja': 1,
 'tema': 2,
 'see': 3,
 'ei': 4,
 'mina': 5,
 'kui': 6,
 'et': 7,
 'mis': 8,
 'ka': 9,
 'oma': 10,
 'aga': 11,
 'ning': 12,
 'saama': 13,
 'tulema': 14,
 'nii': 15,
 'ise': 16,
 'pidama': 17,
 'kes': 18,
 'aasta': 19,
 'või': 20,
 'siis': 21,
 'üks': 22,
 'sina': 23,
 'minema': 24,
 'võima': 25,
 'tegema': 26,
 'teine': 27,
 'nagu': 28,
 'ütlema': 29,
 'aeg': 30,
 'võtma': 31,
 'kõik': 32,
 'veel': 33,
 'mees': 34,
 'juba': 35,
 'suur': 36,
 'eesti': 37,
 'inimene': 38,
 'ära': 39,
 'jääma': 40,
 'teadma': 41,
 'välja': 42,
 'kas': 43,
 'kuid': 44,
 'andma': 45,
 'nägema': 46,
 'hakkama': 47,
 'mitte': 48,
 'üle': 49,
 'miski': 50,
 'töö': 51,
 'nüüd': 52,
 'palju': 53,
 'kord': 54,
 'vaid': 55,
 'iga': 56,
 'uus': 57,
 'mõni': 58,
 'siin': 59,
 'sest': 60,
 'küll': 61,
 'ju': 62,
 'kaks': 63,
 'tahtma': 64,
 'kus': 65,
 'enam': 66,
 'väga': 67,
 'panema': 68,
 'pärast': 69,
 'vaatama': 70,
 'ainult': 71,
 'ikka': 72,
 'käima': 73,
 'seal': 74,
 'esimene': 75,
 '

In [448]:
keys = aligned_svd_models[ref_year].vocabulary
#idxs = aligned_svd_models[1970].wv.index_to_key

In [449]:
all_words

['olema',
 'ja',
 'tema',
 'see',
 'ei',
 'mina',
 'kui',
 'et',
 'mis',
 'ka',
 'oma',
 'aga',
 'ning',
 'saama',
 'tulema',
 'nii',
 'ise',
 'pidama',
 'kes',
 'aasta',
 'või',
 'siis',
 'üks',
 'sina',
 'minema',
 'võima',
 'tegema',
 'teine',
 'nagu',
 'ütlema',
 'aeg',
 'võtma',
 'kõik',
 'veel',
 'mees',
 'juba',
 'suur',
 'eesti',
 'inimene',
 'ära',
 'jääma',
 'teadma',
 'välja',
 'kas',
 'kuid',
 'andma',
 'nägema',
 'hakkama',
 'mitte',
 'üle',
 'miski',
 'töö',
 'nüüd',
 'palju',
 'kord',
 'vaid',
 'iga',
 'uus',
 'mõni',
 'siin',
 'sest',
 'küll',
 'ju',
 'kaks',
 'tahtma',
 'kus',
 'enam',
 'väga',
 'panema',
 'pärast',
 'vaatama',
 'ainult',
 'ikka',
 'käima',
 'seal',
 'esimene',
 'rääkima',
 'vastu',
 'tundma',
 'ega',
 'päev',
 'hea',
 'kuidas',
 'eest',
 'peale',
 'käsi',
 'naine',
 'tagasi',
 'asi',
 'a',
 'silm',
 'enne',
 'kõige',
 'osa',
 'viimane',
 'sama',
 'sõna',
 'mõtlema',
 'elu',
 'mingi',
 'kogu',
 'rohkem',
 'maa',
 'tallinn',
 'jõudma',
 'laps',
 'leidma

In [450]:
vecs = [[None] * len(aligned_svd_models) for i in range(len(all_words))]

In [399]:
aligned_svd_models[1990].vectors("tere")

(array([-0.05587434, -0.1951355 , -0.08260426,  0.13774018, -0.01712797,
         0.04574868, -0.02473389, -0.04623565,  0.15354832, -0.03686719,
        -0.05481276,  0.05318639, -0.05520777, -0.09894572,  0.04152091,
        -0.08698918, -0.01870924, -0.01662217,  0.07333104, -0.05576035,
         0.02212026, -0.03710021, -0.13262742,  0.03707617, -0.0099506 ,
         0.0333817 ,  0.08237399,  0.03535392, -0.01573712,  0.01395037,
         0.06340759,  0.06202773,  0.00856516,  0.00645246,  0.06215041,
        -0.05203828,  0.04194363, -0.1218261 ,  0.00947896, -0.05324343,
        -0.03202885,  0.04130483,  0.01415252,  0.05492151, -0.02720783,
         0.00536357,  0.01738864, -0.15728112, -0.11027659,  0.0097949 ,
        -0.05634966,  0.01497706, -0.10478808, -0.0842309 ,  0.06157946,
         0.00749956, -0.09636088, -0.07610998, -0.07994035,  0.11360342,
        -0.02883455,  0.00962372, -0.13614418, -0.08060705, -0.04137516,
         0.08250113,  0.04605633, -0.10614752,  0.0

In [451]:
for i, word in enumerate(all_words):
    for j, decade in enumerate(aligned_svd_models.keys()):
        #print(decade)
        vecs[i][j] = list(aligned_svd_models[decade].vectors(word)[0])

In [452]:
saveable = {'E': vecs,
    "w": all_words,
    "d": decades}

with open(f'{MODEL_DIR}/svd_a_1890-1990_diachronic.pickle', 'wb') as handle:
    pickle.dump(saveable, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'{MODEL_DIR}/svd_a_1890-1990_diachronic.pickle', 'rb') as handle:
    b = pickle.load(handle)
    
print(saveable == b)

True


# Evaluating vectors

In [470]:
rg65 = {("juhe", "naeratus"): 0.02,
    ("kukk", "reis"): 0.04, #voyage - reis
    ("keskpäev", "nöör"): 0.04,
    ("puuvili", "ahi"): 0.05,
    ("autogramm", "kallas"): 0.06,
    ("automobiil", "võlur"): 0.11,
    ("küngas", "pliit"): 0.14,
    ("muigama", "rakendama"): 0.18,
    ("varjupaik", "puuvili"): 0.19,
    ("varjupaik", "munk"): 0.39,
    ("surnuaed", "hullumaja"): 0.42,
    ("klaas", "mustkunstnik"): 0.44,
    ("poiss", "kukk"): 0.44,
    ("padi", "juveel"): 0.45,
    ("munk", "ori"): 0.57,
    ("varjupaik", "kalmistu"): 0.79,
    ("rannik", "mets"): 0.85,
    ("muigama", "noormees"): 0.88,
    ("kallas", "metsamaa"): 0.90,
    ("munk", "oraakel"): 0.91,
    ("poiss", "mõttetark"): 0.96,
    ("automobiil", "padi"): 0.97,
    ("küngas", "kallas"): 0.97,
    ("noormees", "võlur"): 0.99,
    ("mets", "surnuaed"): 1.00,
    ("toit", "kukk"): 1.09,
    ("kalmistu", "metsamaa"): 1.18,
    ("kallas", "reis"): 1.22,
    ("lind", "metsamaa"): 1.24,
    ("rannik", "mägi"): 1.26,
    ("ahi", "rakendama"): 1.37,
    ("kurg", "kukk"): 1.41,
    ("mägi", "metsamaa"): 1.48,
    ("auto", "teekond"): 1.55, #car - journey (teekond)
    ("kalmistu", "küngas"): 1.69,
    ("klaas", "juveel"): 1.78,
    ("mustkunstnik", "oraakel"): 1.82,
    ("kurg", "rakendama"): 2.37,
    ("vend", "noormees"): 2.41,
    ("mõttetark", "võlur"): 2.46,
    ("oraakel", "mõttetark"): 2.61,
    ("lind", "kurg"): 2.63,
    ("lind", "kukk"): 2.63,
    ("toit", "puuvili"): 2.69,
    ("vend", "munk"): 2.74,
    ("varjupaik", "hullumaja"): 3.04,
    ("ahi", "pliit"): 3.11,
    ("mustkunstnik", "võlur"): 3.21,
    ("mägi", "küngas"): 3.29,
    ("juhe", "nöör"): 3.41,
    ("klaas", "trummel"): 3.45,
    ("muigama", "naeratama"): 3.46,
    ("pärisorjus", "ori"): 3.46,
    ("teekond", "reis"): 3.58, #journey - voyage (reis)
    ("autogramm", "allkiri"): 3.59,
    ("rannik", "kallas"): 3.60,
    ("mets", "metsamaa"): 3.65,
    ("rakendama", "tööriist"): 3.66,
    #("cock", "rooster"): 3.68, #the exact same words in Estonian
    ("poiss", "noormees"): 3.82,
    #("cushion", "pillow"): 3.84, #the exact same words in Estonian
    ("kalmistu", "surnuaed"): 3.88,
    ("automobiil", "auto"): 3.92,
    #("midday", "noon"): 3.94, #the exact same words in Estonian
    ("kalliskivi", "juveel"): 3.94}

In [471]:
for e in [w[1] for w in list(rg65)]:
    print(e)

naeratus
reis
nöör
ahi
kallas
võlur
pliit
rakendama
puuvili
munk
hullumaja
mustkunstnik
kukk
juveel
ori
kalmistu
mets
noormees
metsamaa
oraakel
mõttetark
padi
kallas
võlur
surnuaed
kukk
metsamaa
reis
metsamaa
mägi
rakendama
kukk
metsamaa
teekond
küngas
juveel
oraakel
rakendama
noormees
võlur
mõttetark
kurg
kukk
puuvili
munk
hullumaja
pliit
võlur
küngas
nöör
trummel
naeratama
ori
reis
allkiri
kallas
metsamaa
tööriist
noormees
surnuaed
auto
juveel


In [472]:
for v in list(rg65.values()):
    print(v)

0.02
0.04
0.04
0.05
0.06
0.11
0.14
0.18
0.19
0.39
0.42
0.44
0.44
0.45
0.57
0.79
0.85
0.88
0.9
0.91
0.96
0.97
0.97
0.99
1.0
1.09
1.18
1.22
1.24
1.26
1.37
1.41
1.48
1.55
1.69
1.78
1.82
2.37
2.41
2.46
2.61
2.63
2.63
2.69
2.74
3.04
3.11
3.21
3.29
3.41
3.45
3.46
3.46
3.58
3.59
3.6
3.65
3.66
3.82
3.88
3.92
3.94


In [453]:
aligned_models

{1890: <gensim.models.word2vec.Word2Vec at 0x11e608abdf0>,
 1900: <gensim.models.word2vec.Word2Vec at 0x11e2150c4f0>,
 1910: <gensim.models.word2vec.Word2Vec at 0x11e581e9250>,
 1930: <gensim.models.word2vec.Word2Vec at 0x11e581e98b0>,
 1950: <gensim.models.word2vec.Word2Vec at 0x11e581e91c0>,
 1960: <gensim.models.word2vec.Word2Vec at 0x11e581e9070>,
 1970: <gensim.models.word2vec.Word2Vec at 0x11e21d0bfd0>,
 1980: <gensim.models.word2vec.Word2Vec at 0x11e21d0b640>,
 1990: <gensim.models.word2vec.Word2Vec at 0x11e21d0b5b0>}

In [481]:
#word2vec

def evaluate_word2vec(models):
    c = 0
    for decade in decades:
        print(decade, end = " ")
        model_i = models[decade]

        rg_sim = [] # Rubenstein & Goodenough ground truth similarities
        calc_sim = [] # Calculated similarities for the model
        for w1,w2 in rg65:
            #print(w1, w2)
            if w1 in model_i.wv and w2 in model_i.wv: # Only calculate/find similarities if both words in vocabulary
                calc_sim.append(model_i.wv.similarity(w1, w2))
                rg_sim.append(rg65[(w1,w2)])
        c += pearsonr(calc_sim, rg_sim)[0]
        print(pearsonr(calc_sim, rg_sim))
    return c / len(decades) #Average over all Pearson correlations

evaluate_word2vec(models)

1890 (0.2796759476857532, 0.4048868323151496)
1900 (0.7497101310517166, 0.02001698035815378)
1910 (0.6419452258705292, 0.16935294419687294)
1930 (0.09219599228435847, 0.6756605141168575)
1950 (-0.09327659890855272, 0.761823320426157)
1960 (0.6502813100866606, 0.02205284402105857)
1970 (0.325463333931819, 0.18753044657904436)
1980 (0.22382881145730327, 0.21050418655170539)
1990 (0.26902451506500447, 0.13651618229852516)


0.3487609631693991

In [456]:
cosine(aligned_svd_models[1900].vectors("eesti")[0], aligned_svd_models[1910].vectors("eesti")[0])

1.0010308129600733

In [457]:
aligned_svd_models

{1890: <svd2vec.core.svd2vec at 0x11e58d86b20>,
 1900: <svd2vec.core.svd2vec at 0x11e58d86340>,
 1910: <svd2vec.core.svd2vec at 0x11e58d867f0>,
 1930: <svd2vec.core.svd2vec at 0x11e58d869d0>,
 1950: <svd2vec.core.svd2vec at 0x11e58d86af0>,
 1960: <svd2vec.core.svd2vec at 0x11e58d86460>,
 1970: <svd2vec.core.svd2vec at 0x11e58d86580>,
 1980: <svd2vec.core.svd2vec at 0x11e58d86430>,
 1990: <svd2vec.core.svd2vec at 0x11e58d86ca0>}

In [479]:
#SVD

def evaluate_ppmisvd(svd_models):
    c = 0

    for decade, model_i in svd_models.items():
        print(decade, end = " ")
        rg_sim = []
        calc_sim = []
        for w1,w2 in rg65:
            if w1 in model_i.words and w2 in model_i.words:
                calc_sim.append(model_i.similarity(w1, w2))
                rg_sim.append(rg65[(w1,w2)])
        c += pearsonr(calc_sim, rg_sim)[0]
        print(pearsonr(calc_sim, rg_sim))
    return c / len(svd_models)

evaluate_ppmisvd(svd_models)

1890 (0.2463029341117754, 0.34060364255315434)
1900 (0.09006529443663089, 0.7495680482422213)
1910 (0.47654387651696983, 0.01602488014774559)
1930 (0.15228887549115777, 0.45767419105155105)
1950 (0.18332664276840116, 0.4263564860094457)
1960 (0.17482500216838837, 0.4877802571699536)
1970 (0.2181701765186414, 0.31727515014646435)
1980 (0.02188268644766417, 0.8878767560158614)
1990 (-0.02435327934338869, 0.886233394397038)


0.1710058010129156