# Model: export the fastText vocabulary

In [None]:
import gensim

In [None]:
# Path to the saved model file that is loaded in the next cell.
# NOTE(review): this is an absolute path on one specific machine; adjust it before re-running.
original_path = 'C:/Users/ddale/Downloads/NLP/rusvectores/model.model'

In [None]:
# Load the fastText keyed vectors from `original_path` via gensim's own load().
big_ft = gensim.models.fasttext.FastTextKeyedVectors.load(original_path)

In [None]:
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# try the new attribute first and fall back to the 3.x one so the cell runs on both.
try:
    vocab_words = big_ft.key_to_index.keys()
except AttributeError:
    vocab_words = big_ft.vocab.keys()

# Dump the vocabulary as plain text: one word per line, in sorted order.
with open('../data/model_vocab/model_vocab.txt', 'w', encoding='utf-8') as f:
    f.writelines(w + '\n' for w in sorted(vocab_words))

# Taiga social: count words and lemmas in the corpus

In [27]:
from collections import Counter
import re
import os

In [28]:
# Directory whose *.txt files are scanned in the counting cell below.
# NOTE(review): absolute path on one specific machine; adjust it before re-running.
text_path = 'C:/Users/ddale/Downloads/NLP/taiga/home/tsha/social/texts'

In [31]:
%%time

# Every character that is not a lowercase Cyrillic or Latin letter acts as a separator.
non_letter_re = re.compile('[^а-яёa-z]')

# Frequency of every lowercased word found in the text files under `text_path`.
raw_counter = Counter()

for fn in os.listdir(text_path):
    if not fn.endswith('txt'):
        continue
    print(fn)  # progress: one line per processed file
    with open(os.path.join(text_path, fn), 'r', encoding='utf-8') as f:
        # Stream the file line by line; readlines() would hold the whole file in memory.
        for line in f:
            # str.split() never yields empty strings, so no emptiness check is needed.
            raw_counter.update(non_letter_re.sub(' ', line.lower()).split())

fbtexts.txt
LiveJournalPostsandcommentsGICR.txt
twtexts.txt
vktexts.txt
Wall time: 2min 22s


In [32]:
# Number of distinct words collected in raw_counter.
print(len(raw_counter))

1037674


In [33]:
# Show the 20 most frequent words with their counts.
raw_counter.most_common(20)

[('и', 1795982),
 ('в', 1710324),
 ('не', 1233161),
 ('на', 896576),
 ('databaseitem', 782967),
 ('что', 730842),
 ('а', 645086),
 ('o', 598672),
 ('с', 576451),
 ('d', 495602),
 ('это', 463749),
 ('я', 456344),
 ('dc', 419553),
 ('dd', 418957),
 ('как', 410708),
 ('a', 410427),
 ('то', 406144),
 ('c', 398109),
 ('quot', 395213),
 ('b', 374821)]

In [34]:
from tqdm.auto import tqdm

In [35]:
from pymorphy2 import MorphAnalyzer
# Single shared analyzer instance; w2lemma() below reads it as a global.
morphAnalyzer = MorphAnalyzer()

def w2lemma(w):
    """Return the lemma of ``w`` with every 'ё' replaced by 'е'.

    The lemma is the ``normal_form`` of the first parse returned by the
    global ``morphAnalyzer``. When the analyzer returns no parses, or the
    resulting normal form is empty, ``w`` itself is returned unchanged.
    """
    analyses = morphAnalyzer.parse(w)
    if analyses:
        lemma = analyses[0].normal_form.replace('ё', 'е')
        if lemma:
            return lemma
    # No usable analysis: keep the surface form as is.
    return w

In [36]:
# Fold the per-word frequencies into per-lemma frequencies.
lemma_counter = Counter()
for word, freq in tqdm(raw_counter.items()):
    lemma = w2lemma(word)
    lemma_counter[lemma] = lemma_counter[lemma] + freq

HBox(children=(FloatProgress(value=0.0, max=1037674.0), HTML(value='')))




In [37]:
# Number of distinct lemmas after merging word forms.
print(len(lemma_counter))

561921


In [38]:
import pickle

In [42]:
# Save the raw (unlemmatized) word counts to disk.
with open('../data/model_vocab/taiga_social_vocab_raw.pkl', 'wb') as out_file:
    pickle.Pickler(out_file).dump(raw_counter)

In [43]:
# Save the per-lemma counts to disk.
with open('../data/model_vocab/taiga_social_vocab_lemma.pkl', 'wb') as out_file:
    pickle.Pickler(out_file).dump(lemma_counter)

# Diff: corpus lemmas inside vs. outside the model vocabulary

In [21]:
# Read the model vocabulary file into a set: one entry per line, surrounding whitespace stripped.
model_vocab = set()
with open('../data/model_vocab/model_vocab.txt', 'r', encoding='utf-8') as f:
    model_vocab.update(line.strip() for line in f)

In [49]:
# All 33 lowercase letters of the Russian alphabet, including 'й'
# (easy to drop between 'и' and 'к').
russian_letters = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

def is_ru_word(w):
    """Tell whether ``w`` looks like a Russian word.

    Returns True when ``w`` has at least two characters and contains at
    least one lowercase Russian letter; returns False otherwise.
    Uppercase letters are not recognised, because ``russian_letters``
    holds lowercase letters only.
    """
    return len(w) >= 2 and not russian_letters.isdisjoint(w)

In [50]:
# Russian-looking lemmas from the corpus that are absent from the model vocabulary.
oov_lemmas = Counter({
    lemma: count
    for lemma, count in lemma_counter.items()
    if lemma not in model_vocab and is_ru_word(lemma)
})

In [74]:
# Russian-looking lemmas from the corpus that are present in the model vocabulary.
inv_lemmas = Counter({
    lemma: count
    for lemma, count in lemma_counter.items()
    if lemma in model_vocab and is_ru_word(lemma)
})

In [68]:
# Every Russian-looking lemma from the corpus, regardless of model coverage.
all_lemmas = Counter({
    lemma: count
    for lemma, count in lemma_counter.items()
    if is_ru_word(lemma)
})

In [71]:
# Number of distinct Russian-looking lemmas missing from the model vocabulary.
# (No trailing comma: with it the cell evaluates to a 1-tuple instead of an int.)
len(oov_lemmas)

361238

In [64]:
# Out-of-vocabulary lemmas seen at least 10 times, most frequent first.
top_oov_lemmas = [
    lemma
    for lemma, count in oov_lemmas.most_common()
    if count >= 10
]
print(len(top_oov_lemmas))

26186


In [72]:
from random import sample

In [73]:
# All Russian-looking lemmas seen at least 10 times, most frequent first.
top_all_lemmas = [
    lemma
    for lemma, count in all_lemmas.most_common()
    if count >= 10
]
print(len(top_all_lemmas))

79930


In [75]:
# In-vocabulary lemmas seen at least 10 times, most frequent first.
top_inv_lemmas = [
    lemma
    for lemma, count in inv_lemmas.most_common()
    if count >= 10
]
print(len(top_inv_lemmas))

53744


In [76]:
# Eyeball 10 randomly chosen frequent out-of-vocabulary lemmas.
# NOTE(review): no random.seed() call is visible in this notebook, so the sample differs between runs.
sample(top_oov_lemmas, 10)

['разо',
 'вконтактик',
 'эмергенция',
 'ковалек',
 'мелиссандра',
 'йный',
 'гюрза',
 'авалон',
 'путний',
 'асеана']

In [77]:
# Write each lemma list to its own text file, one lemma per line.
outputs = (
    ('../data/model_vocab/taiga_social_in_vocab.txt', top_inv_lemmas),
    ('../data/model_vocab/taiga_social_out_of_vocab.txt', top_oov_lemmas),
)
for path, lemmas in outputs:
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lemma + '\n' for lemma in lemmas)