# Estonian word embeddings

## Necessary imports

In [3]:
from estnltk import Text # Estonian lemmatization
from gensim.models import Word2Vec # main model
from gensim.models import KeyedVectors # for loading pre-trained models
from tqdm.notebook import tqdm # progress bar
from pathlib import Path # operating system independent file paths

## Load in the corpora

In [4]:
# Directory that holds the corpus dumps (*.prevert files).
corpora_path = Path('./corpora')

# Path.glob yields matches in an arbitrary, filesystem-dependent order.
# Later cells pick a corpus by position (corpora_names[0]), so sort the
# paths to make that choice deterministic across machines and re-runs.
corpora_names = sorted(corpora_path.glob('*.prevert'))

for filename in corpora_names:
    print(filename)

corpora\etnc19_balanced_corpus.prevert
corpora\etnc19_doaj.prevert
corpora\etnc19_reference_corpus.prevert
corpora\etnc19_web_2013.prevert
corpora\etnc19_web_2017.prevert
corpora\etnc19_web_2019.prevert
corpora\etnc19_wikipedia_2017.prevert
corpora\etnc19_wikipedia_2019.prevert


In [9]:
# Based on the EstNLTK tutorial:
# https://github.com/estnltk/estnltk/blob/version_1.6/tutorials/corpus_processing/importing_text_objects_from_corpora.ipynb

from estnltk.corpus_processing.parse_enc import parse_enc_file_iterator

# Corpus file to inspect: the first discovered .prevert file.
input_file = corpora_names[0]
print(input_file)

# Walk the corpus one Text object at a time and show each document's metadata.
# The loop stops once the counter exceeds 10, i.e. after the 11th document,
# so the rest of the file is not processed.
documents = parse_enc_file_iterator(input_file, line_progressbar='ascii')
for doc_number, text_obj in enumerate(documents, start=1):
    # TODO: do something with the Text object
    print(text_obj.meta)
    print()

    if doc_number > 10:
        break

corpora\etnc19_balanced_corpus.prevert


  0%|                                                                      | 378/2233718 [00:00<00:17, 124740.12line/s]

{'id': '2184', 'src': 'Balanced Corpus', 'filename': 'aja_EPL_2002_02_12.tasak.ma', 'texttype_nc': 'periodicals', 'newspaperNumber': 'Eesti Päevaleht 12.02.2002', 'heading': 'Majandus', 'title': 'Mustamäe ühiselamute üks omanik on USAs registreeritud firma', 'texttype': 'Journals', 'texttype_src': 'source data'}

{'id': '2185', 'src': 'Balanced Corpus', 'filename': 'aja_EPL_2002_02_12.tasak.ma', 'texttype_nc': 'periodicals', 'newspaperNumber': 'Eesti Päevaleht 12.02.2002', 'heading': 'Majandus', 'title': 'Kinnisvarahaldur BREM tasus Tallinna Veele vana võla', 'texttype': 'Journals', 'texttype_src': 'source data'}

{'id': '2186', 'src': 'Balanced Corpus', 'filename': 'aja_EPL_2002_02_12.tasak.ma', 'texttype_nc': 'periodicals', 'newspaperNumber': 'Eesti Päevaleht 12.02.2002', 'heading': 'Majandus', 'title': 'British Petroleumi kasum vähenes 46 protsenti', 'texttype': 'Journals', 'texttype_src': 'source data'}

{'id': '2187', 'src': 'Balanced Corpus', 'filename': 'aja_EPL_2002_02_12.tasak




In [40]:
import re  # not used in this cell; kept in case later cells rely on it

# Collect the text lines of the first corpus file.
# Lines that start with "<" are markup tags of the .prevert format and are
# skipped. Blank lines are skipped as well: they carry no tokens and would
# otherwise be stored as empty "sentences" that take up slots in later slices.
with open(corpora_names[0], encoding="utf-8") as f:
    sentences = [
        stripped
        for stripped in (line.strip() for line in f if not line.startswith("<"))
        if stripped
    ]

In [45]:
# Take the first extracted corpus line as a small sample to try the EstNLTK pipeline on.
sentence = sentences[0]

In [50]:
# Build the Text object from the raw corpus line rather than from `sentence`
# itself. The original `sentence = Text(sentence)` rebinds the name from str
# to Text, so a second run of the cell would hand a Text object to Text().
# Reading from `sentences[0]` keeps the cell idempotent.
sentence = Text(sentences[0])
# Tag up to the morphological analysis layer; the returned Text is displayed as the cell output.
sentence.tag_layer(["morph_analysis"])

text
"Mustamäe ühiselamutel on hooneregistri andmetel kokku neli omanikku, sealhulgas ka USA Oklahoma osariigis registreeritud Cremo Capital L.L.C. Ameerika firmast kui võimalikust omanikust rääkis Mustamäe ühiselamute initsiatiivgrupi liige Jevgenia Ruzmanova."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,36
compound_tokens,"type, normalized",,tokens,False,1
words,normalized_form,,,True,32
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,32


In [54]:
# Show the surface form of every token in the `words` layer of the sample sentence.
list(sentence.words.text)

['Mustamäe',
 'ühiselamutel',
 'on',
 'hooneregistri',
 'andmetel',
 'kokku',
 'neli',
 'omanikku',
 ',',
 'sealhulgas',
 'ka',
 'USA',
 'Oklahoma',
 'osariigis',
 'registreeritud',
 'Cremo',
 'Capital',
 'L',
 '.',
 'L.C. Ameerika',
 'firmast',
 'kui',
 'võimalikust',
 'omanikust',
 'rääkis',
 'Mustamäe',
 'ühiselamute',
 'initsiatiivgrupi',
 'liige',
 'Jevgenia',
 'Ruzmanova',
 '.']

In [69]:
from tqdm import tqdm

# Number of corpus lines to tokenise for this trial run; raise it to train on more data.
MAX_SENTENCES = 6000

# Tokenise each line with EstNLTK and keep its word forms as one training sentence.
# The loop variables are named so they do not overwrite the notebook-level
# `sentence` created in an earlier cell.
# NOTE(review): only surface forms (`words.text`) are collected; the
# morph_analysis layer is tagged but its lemmas are not used. Confirm whether
# lemmas were intended.
words = []
for raw_sentence in tqdm(sentences[:MAX_SENTENCES], desc="tokenising"):
    analysed = Text(raw_sentence)
    analysed.tag_layer(["morph_analysis"])
    words.append(list(analysed.words.text))

In [70]:
# Train a Word2Vec model on the tokenised sentences using library defaults.
# According to the log recorded below, that means min_count=5,
# 100-dimensional vectors and 3 worker threads.
model = Word2Vec(sentences=words)

INFO:word2vec.py:1399: collecting all words and their counts
INFO:word2vec.py:1384: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:word2vec.py:1407: collected 28863 word types from a corpus of 99733 raw words and 6000 sentences
INFO:word2vec.py:1458: Loading a fresh vocabulary
INFO:word2vec.py:1482: effective_min_count=5 retains 2688 unique words (9% of original 28863, drops 26175)
INFO:word2vec.py:1488: effective_min_count=5 leaves 62885 word corpus (63% of original 99733, drops 36848)
INFO:word2vec.py:1547: deleting the raw counts dictionary of 28863 items
INFO:word2vec.py:1550: sample=0.001 downsamples 33 most-common words
INFO:word2vec.py:1553: downsampling leaves estimated 44779 word corpus (71.2% of prior 62885)
INFO:base_any2vec.py:1008: estimated required memory for 2688 words and 100 dimensions: 3494400 bytes
INFO:word2vec.py:1699: resetting layer weights
INFO:base_any2vec.py:1196: training model with 3 workers on 2688 vocabulary and 100 features, using

In [80]:
# Ten nearest neighbours of "Eesti" by cosine similarity.
# NOTE(review): in the recorded output every neighbour scores about 0.9999 and
# most are function words, which suggests the model trained on this small
# sample has not learned meaningful distinctions yet.
model.wv.most_similar(positive=["Eesti"], topn=10)

[('oli', 0.9999585747718811),
 ('Tallinna', 0.9999529123306274),
 ('kus', 0.9999476671218872),
 ('poolt', 0.9999476075172424),
 ('ning', 0.999946653842926),
 ('vaid', 0.9999442100524902),
 ('vastu', 0.9999439120292664),
 ('välja', 0.9999438524246216),
 ('üle', 0.9999434947967529),
 ('pärast', 0.9999428391456604)]