In [12]:
import numpy as np
from collections import Counter, defaultdict
import textacy.datasets
import spacy_udpipe
import re
from anyascii import anyascii
spacy_udpipe.download("tr")
nlp = spacy_udpipe.load("tr")
ds = textacy.datasets.Wikipedia(lang="tr", version="current")

Already downloaded a model for the 'tr' language


In [None]:
nlemma_cnt = defaultdict(int)
nlemma2pl = defaultdict(Counter)
nlemma2sg = defaultdict(Counter)
vlemma_cnt = defaultdict(int)
vlemma_transitive_cnt = defaultdict(int)
vlemma2pastneg = defaultdict(Counter)
vlemma2pastaff = defaultdict(Counter)

for text,record in ds.records():
    processed = nlp(text)
    for token in processed:
        if token.pos_ == 'NOUN' and 'Case=Nom' in token.morph and re.match(r'^[A-Za-z]*$',anyascii(token.text)):
            nlemma_cnt[token.lemma_.lower()] += 1
            if len([x for x in token.morph if re.search(r'psor',x)])==0:
                if 'Number=Plur' in token.morph:
                    nlemma2pl[token.lemma_.lower()].update([token.text.lower(),])
                if 'Number=Sing' in token.morph:
                    nlemma2sg[token.lemma_.lower()].update([token.text.lower(),])
        if token.pos_ == 'VERB' and 'Number=Sing' in token.morph and 'Tense=Past' in token.morph and 'Person=3' in token.morph and 'Mood=Ind' in token.morph and 'Aspect=Perf' in token.morph and 'Voice=Cau' not in token.morph and 'Voice=Pass' not in token.morph and re.match(r'^[A-Za-z]*$',anyascii(token.text)):
            vlemma_cnt[token.lemma_.lower()] += 1
            if 'Polarity=Neg' in token.morph:
                vlemma2pastneg[token.lemma_.lower()].update([token.text.lower(),])
            if 'Polarity=Pos' in token.morph:
                vlemma2pastaff[token.lemma_.lower()].update([token.text.lower(),])
            directObject = False
            for item in token.children:
                if item.pos_ not in ['NOUN','PRON','PROPN']: continue
                if item.dep_ == "obj" and 'Case=Acc' in item.morph:
                    directObject = True
            if directObject == True:
                vlemma_transitive_cnt[token.lemma_] += 1

In [87]:
MIN_FREQ = 1

In [102]:
verbs_to_use_aff = []
verbs_to_use_neg = []
for vlemma, cnt in sorted(vlemma_cnt.items(), key=lambda x:-x[1]):
    if cnt < MIN_FREQ: break
    if vlemma in vlemma2pastneg and vlemma in vlemma2pastaff:
        verb_aff = vlemma2pastaff[vlemma].most_common()[0][0]
        verb_neg = vlemma2pastneg[vlemma].most_common()[0][0]
        s1 = vlemma_transitive_cnt[vlemma]/cnt
        if s1>=.375 and verb_aff not in verbs_to_use_aff and verb_neg not in verbs_to_use_neg:
            verbs_to_use_aff.append(verb_aff)
            verbs_to_use_neg.append(verb_neg)

In [103]:
len(verbs_to_use_aff),len(verbs_to_use_neg)

(103, 103)

In [105]:
with open('transitive_verbs_aff.txt', 'w') as f:
    for item in verbs_to_use_aff:
        f.write("%s\n" % item)

In [106]:
with open('transitive_verbs_neg.txt', 'w') as f:
    for item in verbs_to_use_neg:
        f.write("%s\n" % item)

In [104]:
MIN_FREQ = 5

In [109]:
nouns_to_use_pl = []
nouns_to_use_sg = []
for lemma, cnt in sorted(nlemma_cnt.items(), key=lambda x:-x[1]):
    if cnt < MIN_FREQ: break
    if lemma in nlemma2pl and lemma in nlemma2sg:
        noun_pl = nlemma2pl[lemma].most_common()[0][0]
        noun_sg = nlemma2sg[lemma].most_common()[0][0]
        if noun_pl not in nouns_to_use_pl+nouns_to_use_sg+verbs_to_use_aff+verbs_to_use_neg and noun_sg not in nouns_to_use_pl+nouns_to_use_sg+verbs_to_use_aff+verbs_to_use_neg: 
            nouns_to_use_pl.append(noun_pl)
            nouns_to_use_sg.append(noun_sg)

In [110]:
len(nouns_to_use_pl),len(nouns_to_use_sg)

(19378, 19378)

In [None]:
list(zip(nouns_to_use_pl,nouns_to_use_sg))[178]

In [111]:
with open('nouns_sg.txt', 'w') as f:
    for item in nouns_to_use_sg:
        f.write("%s\n" % item)

In [112]:
with open('nouns_pl.txt', 'w') as f:
    for item in nouns_to_use_pl:
        f.write("%s\n" % item)

In [113]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt_model_id = 'redrussianarmy/gpt2-turkish-cased'  
 
gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_id).to(device)
gpt_model.eval()
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_model_id)

def process(sentence):
    encodings = gpt_tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        outputs = gpt_model(encodings.input_ids.to(device), labels=encodings.input_ids.to(device))
        log_likelihood = outputs[0] * encodings.input_ids.size(1)
    return float(log_likelihood.detach().cpu().numpy())

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [114]:
process('çok güzel')

5.545171737670898

In [124]:
USE_NOUNS = 350
USE_VERBS = 200

In [125]:
sents = []
for v_idx,verb in enumerate(verbs_to_use_aff):
    if v_idx>=USE_VERBS: break
    for n_idx,subj in enumerate(nouns_to_use_sg):
        if n_idx>=USE_NOUNS: break
        sent_body = 'iki ' + subj + ' birini ' + verb + '.'
        ppl_body = process(sent_body)
        sent_thing = 'iki ' + subj + ' bir şey ' + verb + '.'
        ppl_thing = process(sent_thing)
        sents.append((ppl_body,sent_body,n_idx,v_idx,'body'))
        sents.append((ppl_thing,sent_thing,n_idx,v_idx,'thing'))
sents = sorted(sents)

In [126]:
with open('basic_sentences.tsv', 'w', encoding='utf-8') as ofh:
    for s in sents:
        print("\t".join(map(str, s)), file=ofh)

In [127]:
USE_SENTS = 10000

In [128]:
with open('test_sentences.tsv', 'w', encoding='utf-8') as ofh:
    for s in sents[:USE_SENTS]:
        if s[-1]=='body':
            aff = nouns_to_use_pl[int(s[2])] + ' kimseyi ' + verbs_to_use_aff[int(s[3])] + '.'
            neg = nouns_to_use_pl[int(s[2])] + ' kimseyi ' + verbs_to_use_neg[int(s[3])] + '.'
            many = 'birçok '+ nouns_to_use_sg[int(s[2])] + ' kimseyi ' + verbs_to_use_aff[int(s[3])] + '.'
            few = 'birkaç '+ nouns_to_use_sg[int(s[2])] + ' kimseyi ' + verbs_to_use_aff[int(s[3])] + '.'
        elif s[-1]=='thing':
            aff = nouns_to_use_pl[int(s[2])] + ' hiçbir şey ' + verbs_to_use_aff[int(s[3])] + '.'
            neg = nouns_to_use_pl[int(s[2])] + ' hiçbir şey ' + verbs_to_use_neg[int(s[3])] + '.'
            many = 'birçok '+ nouns_to_use_sg[int(s[2])] + ' hiçbir şey ' + verbs_to_use_aff[int(s[3])] + '.'
            few = 'birkaç '+ nouns_to_use_sg[int(s[2])] + ' hiçbir şey ' + verbs_to_use_aff[int(s[3])] + '.'
        print("\t".join([aff,neg,many,few]+list(map(str, s[2:]))), file=ofh)

In [130]:
with open('test_sentences_cok_az.tsv', 'w', encoding='utf-8') as ofh:
    for s in sents[:USE_SENTS]:
        if s[-1]=='body':
            aff = nouns_to_use_pl[int(s[2])] + ' kimseyi ' + verbs_to_use_aff[int(s[3])] + '.'
            neg = nouns_to_use_pl[int(s[2])] + ' kimseyi ' + verbs_to_use_neg[int(s[3])] + '.'
            many = 'birçok '+ nouns_to_use_sg[int(s[2])] + ' kimseyi ' + verbs_to_use_aff[int(s[3])] + '.'
            few = 'çok az '+ nouns_to_use_sg[int(s[2])] + ' kimseyi ' + verbs_to_use_aff[int(s[3])] + '.'
        elif s[-1]=='thing':
            aff = nouns_to_use_pl[int(s[2])] + ' hiçbir şey ' + verbs_to_use_aff[int(s[3])] + '.'
            neg = nouns_to_use_pl[int(s[2])] + ' hiçbir şey ' + verbs_to_use_neg[int(s[3])] + '.'
            many = 'birçok '+ nouns_to_use_sg[int(s[2])] + ' hiçbir şey ' + verbs_to_use_aff[int(s[3])] + '.'
            few = 'çok az '+ nouns_to_use_sg[int(s[2])] + ' hiçbir şey ' + verbs_to_use_aff[int(s[3])] + '.'
        print("\t".join([aff,neg,many,few]+list(map(str, s[2:]))), file=ofh)