In [5]:
import numpy as np
from collections import Counter, defaultdict
import spacy
import textacy.datasets
# Handle on textacy's Wikipedia dataset for the Russian edition (lang="ru");
# the extraction loop further down pulls page texts from it via ds.records().
ds = textacy.datasets.Wikipedia(lang="ru", version="current")

In [12]:
# spaCy pipeline for Russian; its POS tags, lemmas, morphological features and
# dependency labels are what the extraction loop below relies on.
nlp = spacy.load("ru_core_news_sm")

In [33]:
# Per-lemma statistics for verbs seen as active, finite, plural past-tense forms:
#   vlemma_cnt            - how many such occurrences were seen
#   vlemma_transitive_cnt - how many of them govern an accusative `obj` child
#   vlemma2plpast         - lower-cased surface forms with their frequencies
vlemma_cnt = defaultdict(int)
vlemma_transitive_cnt = defaultdict(int)
vlemma2plpast = defaultdict(Counter)

# Per-lemma statistics for nouns seen in the nominative plural:
#   nlemma_cnt   - number of occurrences
#   nlemma2plnom - lower-cased surface forms with their frequencies
nlemma_cnt = defaultdict(int)
nlemma2plnom = defaultdict(Counter)

NOMINAL_POS = ('NOUN', 'PRON', 'PROPN')
PLURAL_NOM_FEATS = ('Number=Plur', 'Case=Nom')
PLURAL_PAST_VERB_FEATS = ('Number=Plur', 'Tense=Past', 'VerbForm=Fin', 'Voice=Act')


def _has_feats(token, feats):
    """Return True when every `Feature=Value` string in `feats` is in `token.morph`."""
    return all(feat in token.morph for feat in feats)


def _has_accusative_object(verb):
    """Return True when `verb` has a nominal child labelled `obj` in the accusative."""
    return any(
        child.pos_ in NOMINAL_POS
        and child.dep_ == 'obj'
        and 'Case=Acc' in child.morph
        for child in verb.children
    )


for text, _meta in ds.records(limit=20000):
    for token in nlp(text):
        lemma = token.lemma_
        surface = token.text.lower()

        if token.pos_ == "NOUN":
            if _has_feats(token, PLURAL_NOM_FEATS):
                nlemma_cnt[lemma] += 1
                nlemma2plnom[lemma][surface] += 1
        elif token.pos_ == "VERB" and _has_feats(token, PLURAL_PAST_VERB_FEATS):
            vlemma2plpast[lemma][surface] += 1
            vlemma_cnt[lemma] += 1
            # Only occurrences with an overt accusative object count as transitive uses.
            if _has_accusative_object(token):
                vlemma_transitive_cnt[lemma] += 1

In [34]:
# Frequency cut-off: lemmas counted fewer than MIN_FREQ times in the target
# form are not considered when the verb and noun lists are built.
MIN_FREQ = 5

In [35]:
# Build the verb list, most frequent lemma first.  A lemma is kept when at
# least half of its counted occurrences had an accusative direct object; it is
# represented by its most common plural past-tense surface form, and each
# surface form is listed only once.
verbs_to_use = []
verbs_by_frequency = sorted(vlemma_cnt.items(), key=lambda pair: pair[1], reverse=True)
for vlemma, cnt in verbs_by_frequency:
    if cnt < MIN_FREQ:
        # Descending order: every lemma after this one is too rare as well.
        break
    if vlemma not in vlemma2plpast:
        continue
    verb_past = vlemma2plpast[vlemma].most_common(1)[0][0]
    transitive_share = vlemma_transitive_cnt[vlemma] / cnt
    if transitive_share >= .5 and verb_past not in verbs_to_use:
        verbs_to_use.append(verb_past)

In [36]:
# Persist the selected verb forms, one per line.
# The encoding is pinned to UTF-8: the words are Cyrillic and this file is read
# back with encoding='utf-8' further down, so relying on the platform's default
# encoding could raise UnicodeEncodeError or produce a file the reader cannot
# decode.
with open('ru_transitive_verbs_past.txt', 'w', encoding='utf-8') as f:
    for item in verbs_to_use:
        f.write("%s\n" % item)

In [39]:
# Build the noun list, most frequent lemma first.  Each lemma contributes its
# most common nominative-plural surface form, unless that form is already
# listed, coincides with a selected verb form, or the lemma was also counted
# as a verb.
nouns_to_use = []
nouns_by_frequency = sorted(nlemma_cnt.items(), key=lambda pair: pair[1], reverse=True)
for lemma, cnt in nouns_by_frequency:
    if cnt < MIN_FREQ:
        # Descending order: every lemma after this one is too rare as well.
        break
    if lemma not in nlemma2plnom:
        continue
    noun_pl = nlemma2plnom[lemma].most_common(1)[0][0]
    already_listed = noun_pl in nouns_to_use
    clashes_with_verb = noun_pl in verbs_to_use or lemma in vlemma2plpast
    if not already_listed and not clashes_with_verb:
        nouns_to_use.append(noun_pl)

In [40]:
# Persist the selected noun forms, one per line.
# The encoding is pinned to UTF-8: the words are Cyrillic and this file is read
# back with encoding='utf-8' further down, so relying on the platform's default
# encoding could raise UnicodeEncodeError or produce a file the reader cannot
# decode.
with open('ru_plural_nouns.txt', 'w', encoding='utf-8') as f:
    for item in nouns_to_use:
        f.write("%s\n" % item)

In [41]:
# Display how many verb forms and noun forms were selected.
len(verbs_to_use),len(nouns_to_use)

(832, 4413)

In [43]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Identifier of the pretrained checkpoint loaded for both the model and its tokenizer.
gpt_model_id = 'sberbank-ai/rugpt3small_based_on_gpt2' 
 
gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_id).to(device)
# The model is only used for scoring, so put it in evaluation mode.
gpt_model.eval()
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_model_id)

def process(sentence):
    """Score `sentence` with the GPT model and return the score as a float.

    The sentence is tokenized with `gpt_tokenizer` and passed to `gpt_model`
    with the input ids also serving as labels.  The first element of the model
    output (the loss) is multiplied by the number of input tokens.

    NOTE(review): for a causal LM this loss is presumably the mean per-token
    cross-entropy, which would make the result an approximate summed negative
    log-likelihood (smaller = more probable sentence) rather than a
    log-likelihood; confirm against the model documentation.
    """
    input_ids = gpt_tokenizer(sentence, return_tensors='pt').input_ids
    ids_on_device = input_ids.to(device)
    with torch.no_grad():
        loss = gpt_model(ids_on_device, labels=ids_on_device)[0]
    score = loss * input_ids.size(1)
    return float(score.item())

Downloading:   0%|          | 0.00/551M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [45]:
# Only the first USE_NOUNS entries of nouns_to_use and the first USE_VERBS
# entries of verbs_to_use are combined into sentences in the next cell.
USE_NOUNS = 300
USE_VERBS = 200

In [49]:
# Score every noun x verb combination twice: once with the animate indefinite
# object ('кого-то', kind 'body') and once with the inanimate one ('что-то',
# kind 'thing').  Each entry is
#   (score from process(), sentence, noun index, verb index, kind)
# and the list ends up sorted in ascending tuple order, i.e. by score first.
sents = []
for v_idx, verb in enumerate(verbs_to_use[:USE_VERBS]):
    for n_idx, subj in enumerate(nouns_to_use[:USE_NOUNS]):
        for kind, pronoun in (('body', 'кого-то'), ('thing', 'что-то')):
            sentence = 'все {} {} {}.'.format(subj, pronoun, verb)
            sents.append((process(sentence), sentence, n_idx, v_idx, kind))
sents.sort()

In [50]:
# Dump the scored sentences as tab-separated rows, one tuple per line, with
# every field converted through str().
with open('ru_basic_sentences.tsv', 'w', encoding='utf-8') as ofh:
    ofh.writelines(
        "\t".join(str(field) for field in row) + "\n"
        for row in sents
    )

In [7]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file `path`, without line terminators.

    The file is opened in a `with` block so the handle is closed right away,
    and iterating the file (rather than `read().split('\n')`) means a final
    newline does not yield a spurious empty last entry.
    """
    with open(path, encoding='utf-8') as fh:
        return [line.rstrip('\n') for line in fh]


# Reload the artefacts written by the earlier cells so the cells below can run
# from the files alone.  Each sentence row becomes a list of string fields:
# score, sentence, noun index, verb index, kind.
sents = [line.split('\t') for line in _read_lines('ru_basic_sentences.tsv')]
nouns_to_use = _read_lines('ru_plural_nouns.txt')
verbs_to_use = _read_lines('ru_transitive_verbs_past.txt')

In [8]:
# Number of rows taken from the front of `sents` when the test sentences are generated below.
USE_SENTS = 10000

In [9]:
# For each of the first USE_SENTS rows write four variants built around the
# negative pronoun ('никого' for kind 'body', 'ничего' for kind 'thing'):
#   aff  - noun + pronoun + verb
#   neg  - noun + pronoun + 'не' + verb
#   many - 'многие' + aff
#   few  - 'немногие' + aff
# followed by the row's noun index, verb index and kind.  Rows of any other
# kind are skipped.
ni_pronouns = {'body': 'никого', 'thing': 'ничего'}

with open('ru_test_sentences_ni.tsv', 'w', encoding='utf-8') as ofh:
    for row in sents[:USE_SENTS]:
        pronoun = ni_pronouns.get(row[-1])
        if pronoun is None:
            continue
        noun = nouns_to_use[int(row[2])]
        verb = verbs_to_use[int(row[3])]
        aff = noun + ' ' + pronoun + ' ' + verb + '.'
        neg = noun + ' ' + pronoun + ' не ' + verb + '.'
        many = 'многие ' + aff
        few = 'немногие ' + aff
        fields = [aff, neg, many, few] + [str(field) for field in row[2:]]
        print("\t".join(fields), file=ofh)

In [10]:
# For each of the first USE_SENTS rows write four variants with a '-либо'
# pronoun placed after the verb:
#   aff  - noun + verb + pronoun
#   neg  - noun + 'не' + verb + pronoun
#   many - 'многие' + aff
#   few  - 'немногие' + aff
# followed by the row's noun index, verb index and kind.  Rows of any other
# kind are skipped.
libo_pronouns = {
    # kind: (pronoun in aff/many/few, pronoun in the negated sentence)
    'body': ('кого-либо', 'кого-либо'),
    'thing': ('что-либо', 'чего-либо'),
}

with open('ru_test_sentences_libo.tsv', 'w', encoding='utf-8') as ofh:
    for row in sents[:USE_SENTS]:
        pronouns = libo_pronouns.get(row[-1])
        if pronouns is None:
            continue
        plain_pronoun, negated_pronoun = pronouns
        noun = nouns_to_use[int(row[2])]
        verb = verbs_to_use[int(row[3])]
        aff = noun + ' ' + verb + ' ' + plain_pronoun + '.'
        neg = noun + ' не ' + verb + ' ' + negated_pronoun + '.'
        many = 'многие ' + aff
        few = 'немногие ' + aff
        fields = [aff, neg, many, few] + [str(field) for field in row[2:]]
        print("\t".join(fields), file=ofh)