# Estonian word embeddings

## Necessary imports

In [6]:
from estnltk import Text # Estonian lemmatization
from estnltk.corpus_processing.parse_enc import parse_enc_file_iterator # corpora parsing
from gensim.models import Word2Vec # main model
from gensim.models import KeyedVectors # for loading pre-trained models
from pathlib import Path # operating system independent file paths
from platform import python_version
import tempfile # for saving model
import pickle # for serializing the corpora
print(python_version()) # Should be 3.7

3.7.9


## Loading the corpora

In [4]:
#https://metashare.ut.ee/repository/browse/estonian-national-corpus-2019-vrt-format/be71121e733b11eaa6e4005056b4002483e6e5cdf35343e595e6ba4576d839fb/
#NB!!! .VERT files, not .PREVERT

#Place all .vert files to be trained in the folder `corpora`
corpora_path = Path('./corpora')

corpora_names = []

for filename in corpora_path.glob('*.vert'):
    print(filename)
    corpora_names.append(filename)

corpora\etnc19_balanced_corpus.vert
corpora\etnc19_doaj.vert


In [7]:
# https://github.com/estnltk/estnltk/blob/version_1.6/tutorials/corpus_processing/importing_text_objects_from_corpora.ipynb

for i in range(len(corpora_names)):
    # input file
    input_file = corpora_names[i]
    print("Reading corpora file", input_file)

    all_lemmas = []

    # iterate over corpus and extract Text objects one-by-one
    for text in parse_enc_file_iterator(input_file, 
                                        tokenization="preserve_partially", 
                                        line_progressbar='ascii',
                                        restore_morph_analysis=True): #Add logger?

        lemmas = text.original_morph_analysis.lemma
        all_lemmas.extend([lemma[0] for lemma in lemmas if lemma[0] != None])


print(len(all_lemmas))
print(all_lemmas.count(None))

corpora\etnc19_doaj.vert


100%|#################################################################| 10793527/10793527 [15:09<00:00, 11869.69line/s]


8014367
119


In [None]:
# Save all lemmas in a pickled file
with open('corpora.pkl', 'wb') as f:
    pickle.dump(all_lemmas, f)

In [None]:
# Loading the corpora from the pickled file
with open('corpora.pkl', 'rb') as f:
    all_lemmas = pickle.load(f)

## Training Word2vec model

In [42]:
#Maybe that option won't work, let's see
class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        
        # input file
        
        for i in range(len(corpora_names)): # iterate over all corpora files
            input_file = corpora_names[i]
            print("Reading corpora file", input_file)

            # iterate over corpus and extract Text objects one-by-one
            for text in parse_enc_file_iterator(input_file, 
                                                tokenization="preserve_partially", 
                                                line_progressbar='ascii',
                                                restore_morph_analysis=True): #Add logger?

                #[['Mustamäe'], ['ühiselamu'], ...
                lemmas = text.original_morph_analysis.lemma

                #Filter out nonexisting (None) lemmas
                # Mustamäe
                doc_lemmas = [x[0] for x in lemmas if x[0] != None]

                yield doc_lemmas

In [49]:
sentences = MyCorpus()
model = Word2Vec(sentences=sentences)

In [39]:
model.save("models/word2vec.model") # for some reason I could not get pathlib to work...

INFO:utils.py:551: saving Word2Vec object under models/word2vec.model, separately None
INFO:utils.py:657: not storing attribute vectors_norm
INFO:utils.py:657: not storing attribute cum_table
INFO:utils.py:565: saved models/word2vec.model


## Evaluating the model

In [20]:
model.wv['Tallinn']

array([ 3.6329436 , -4.2133846 ,  0.16787282,  4.0586267 ,  1.5857413 ,
        0.68046236,  3.3129284 , -2.5557702 , -1.3165423 ,  3.172179  ,
       -0.34429678,  0.8267788 , -0.9596452 , -0.48876774,  2.5859125 ,
        2.1190176 ,  3.011986  ,  1.5091019 , -1.0206378 ,  0.45206624,
        5.0228815 , -2.29621   , -0.24138844,  1.0629926 ,  1.3579892 ,
        0.92665577, -4.411577  , -0.7759427 ,  3.4071152 , -0.64127296,
       -3.215266  , -1.6152894 ,  1.1413366 , -0.82576925,  1.8092237 ,
       -4.341474  , -1.7522033 , -2.0108278 ,  1.575736  ,  4.101202  ,
        3.3476107 ,  3.9264216 , -3.0812013 , -4.7063084 ,  2.5465696 ,
       -0.75586706,  1.7816662 ,  2.1636212 ,  2.0037787 , -6.004615  ,
       -1.7273428 , -0.28177536,  1.062909  , -1.6998087 , -0.21927491,
        0.27253053,  1.9296978 ,  0.1006563 , -1.7084851 , -2.089545  ,
        1.7609588 ,  2.919195  ,  2.726189  ,  1.3555702 , -1.9165683 ,
       -2.7238922 , -0.07057456, -0.62147635,  2.9888675 ,  3.46

In [41]:
model = Word2Vec.load("model_doaj_5ep")

#Most common lemmas
model.wv.index2entity[:10]

INFO:utils.py:431: loading Word2Vec object from model_doaj_5ep
INFO:utils.py:465: loading wv recursively from model_doaj_5ep.wv.* with mmap=None
INFO:utils.py:503: setting ignored attribute vectors_norm to None
INFO:utils.py:465: loading vocabulary recursively from model_doaj_5ep.vocabulary.* with mmap=None
INFO:utils.py:465: loading trainables recursively from model_doaj_5ep.trainables.* with mmap=None
INFO:utils.py:503: setting ignored attribute cum_table to None
INFO:utils.py:437: loaded model_doaj_5ep


[',', '.', 'olema', 'ja', ')', '(', 'see', ':', '-', '"']

In [19]:
model.wv.most_similar("sinine")

[('kollane', 0.9136537313461304),
 ('must', 0.8914934396743774),
 ('valge', 0.8838784694671631),
 ('õis', 0.8835828304290771),
 ('hall', 0.866241455078125),
 ('punane', 0.8651580810546875),
 ('roheline', 0.8572124242782593),
 ('pruun', 0.8451236486434937),
 ('luik', 0.8428295850753784),
 ('vares', 0.8385834693908691)]