In [1]:
import numpy as np
from collections import Counter, defaultdict
import spacy
import textacy
import textacy.datasets
import re
from anyascii import anyascii
# spaCy pipeline used below for POS tags, lemmas, morphology and dependencies.
nlp = spacy.load("fr_core_news_sm")
# Handle on textacy's French Wikipedia dataset ("current" dump version).
ds = textacy.datasets.Wikipedia(lang="fr", version="current")

In [3]:
# Fetch the dataset before reading records (the recorded run transferred about 12.1 GB).
ds.download()

100%|██████████| 12.1G/12.1G [08:31<00:00, 23.6MB/s]  


In [15]:
# Corpus statistics gathered from the first 20000 dataset records:
#   nlemma_cnt / nlemma2pl      - plural nouns: count per lemma, surface forms per lemma
#   vlemma_cnt / vlemma2part    - masc. sing. past participles: count per lemma, surface forms per lemma
#   vlemma_transitive_cnt       - how many of those participles have a nominal direct object
vlemma_cnt = defaultdict(int)
vlemma_transitive_cnt = defaultdict(int)
vlemma2part = defaultdict(Counter)
nlemma_cnt = defaultdict(int)
nlemma2pl = defaultdict(Counter)

# A token is kept only if its anyascii transliteration consists of ASCII letters.
ascii_word = re.compile(r'^[A-Za-z]*$')
# Morphological features a verb token must carry to count as a usable participle.
participle_feats = ('Gender=Masc', 'Number=Sing', 'Tense=Past', 'VerbForm=Part')
# Parts of speech accepted for a direct-object child.
object_pos = ('NOUN', 'PRON', 'PROPN')

for text, _record in ds.records(limit=20000):
    for token in nlp(text):
        if token.pos_ == 'NOUN':
            if 'Number=Plur' in token.morph and ascii_word.match(anyascii(token.text)):
                lemma = token.lemma_
                nlemma_cnt[lemma] += 1
                nlemma2pl[lemma][token.text.lower()] += 1
        elif token.pos_ == 'VERB':
            if not all(feat in token.morph for feat in participle_feats):
                continue
            if not ascii_word.match(anyascii(token.text)):
                continue
            lemma = token.lemma_
            vlemma_cnt[lemma] += 1
            vlemma2part[lemma][token.text.lower()] += 1
            # Transitive use: at least one child with dependency "obj" and a nominal POS.
            if any(child.dep_ == "obj" and child.pos_ in object_pos
                   for child in token.children):
                vlemma_transitive_cnt[lemma] += 1

In [31]:
# Minimum corpus count for a lemma to be considered; the verb and noun
# selection loops stop at the first lemma whose count falls below it.
MIN_FREQ = 5

In [47]:
# Build the verb list: walk lemmas from most to least frequent and keep the
# most common participle form of every lemma that is used with a direct
# object in at least 42.5% of its occurrences. Each form is kept once.
verbs_to_use = []
seen_participles = set()
for vlemma, cnt in sorted(vlemma_cnt.items(), key=lambda pair: -pair[1]):
    if cnt < MIN_FREQ:
        break  # counts are descending, so nothing further can qualify
    if vlemma not in vlemma2part:
        continue
    verb_part = vlemma2part[vlemma].most_common(1)[0][0]
    transitive_share = vlemma_transitive_cnt[vlemma] / cnt
    if transitive_share >= .425 and verb_part not in seen_participles:
        seen_participles.add(verb_part)
        verbs_to_use.append(verb_part)

In [48]:
# Save the selected participles, one per line. The encoding is set explicitly
# because the words contain accented characters and the platform default is
# locale-dependent; the TSV outputs of this notebook are written as UTF-8 too.
with open('fr_transitive_verbs_part.txt', 'w', encoding='utf-8') as f:
    for item in verbs_to_use:
        f.write("%s\n" % item)

In [49]:
# Build the noun list: walk lemmas from most to least frequent and keep the
# most common plural form of each lemma, skipping
#   - lemmas that were also seen as a participle lemma,
#   - forms that coincide with a selected verb participle,
#   - forms that were already selected.
# Sets are used for the membership tests so the loop stays linear in the
# number of lemmas instead of rescanning the growing result list.
nouns_to_use = []
seen_plurals = set()
verb_forms = set(verbs_to_use)
for lemma, cnt in sorted(nlemma_cnt.items(), key=lambda pair: -pair[1]):
    if cnt < MIN_FREQ:
        break  # counts are descending, so nothing further can qualify
    if lemma not in nlemma2pl or lemma in vlemma2part:
        continue
    noun_pl = nlemma2pl[lemma].most_common(1)[0][0]
    if noun_pl in seen_plurals or noun_pl in verb_forms:
        continue
    seen_plurals.add(noun_pl)
    nouns_to_use.append(noun_pl)

In [50]:
# Save the selected plural nouns, one per line. The encoding is set explicitly
# because the words contain accented characters and the platform default is
# locale-dependent; the TSV outputs of this notebook are written as UTF-8 too.
with open('fr_plural_nouns.txt', 'w', encoding='utf-8') as f:
    for item in nouns_to_use:
        f.write("%s\n" % item)

In [51]:
# Sanity check: sizes of the two word lists, as (verbs, nouns).
len(verbs_to_use),len(nouns_to_use)

(223, 11189)

In [27]:
import torch
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer

In [29]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Checkpoint identifier handed to from_pretrained for both the model and the tokenizer.
gpt_model_id = 'antoiloui/belgpt2' 
 
gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_id).to(device)
# Evaluation mode: the model is only used for scoring sentences in process().
gpt_model.eval()
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_model_id)

def process(sentence):
    """Return a language-model score for ``sentence`` as a Python float.

    The sentence is tokenized with ``gpt_tokenizer`` and passed to
    ``gpt_model`` with the input ids also supplied as labels. The first
    model output (the loss, when labels are given) is multiplied by the
    number of input tokens.

    NOTE(review): the result is a loss-based quantity, so lower presumably
    means "more plausible to the model". The loss is presumably a mean over
    predicted tokens; confirm whether scaling by the full token count
    (rather than count - 1) is intended.
    """
    input_ids = gpt_tokenizer(sentence, return_tensors='pt').input_ids
    token_count = input_ids.size(1)
    with torch.no_grad():
        ids_on_device = input_ids.to(device)
        loss = gpt_model(ids_on_device, labels=ids_on_device)[0]
    return (loss * token_count).item()

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [55]:
# Only the first USE_NOUNS nouns and the first USE_VERBS verbs of the selected
# lists are combined into candidate sentences (two sentences per noun/verb pair).
USE_NOUNS = 300
USE_VERBS = 200

In [56]:
# Score two frames for every noun/verb pair among the first USE_VERBS verbs
# and first USE_NOUNS nouns: one with an animate object ('body') and one with
# an inanimate object ('thing'). Each entry is
# (score, sentence, noun index, verb index, object kind); the list is finally
# sorted ascending, i.e. primarily by score.
sents = []
object_frames = ((" quelqu'un.", 'body'), (' quelque chose.', 'thing'))
for v_idx, verb in enumerate(verbs_to_use[:USE_VERBS]):
    for n_idx, subj in enumerate(nouns_to_use[:USE_NOUNS]):
        stem = 'deux ' + subj + ' ont ' + verb
        for object_text, object_kind in object_frames:
            sentence = stem + object_text
            sents.append((process(sentence), sentence, n_idx, v_idx, object_kind))
sents.sort()

In [57]:
# Dump every scored sentence as one tab-separated row:
# score, sentence, noun index, verb index, object kind.
with open('fr_basic_sentences.tsv', 'w', encoding='utf-8') as ofh:
    ofh.writelines(
        "\t".join(str(field) for field in row) + "\n"
        for row in sents
    )

In [58]:
# Only the first USE_SENTS entries of the sorted `sents` list are turned into test sentences.
USE_SENTS = 10000

In [59]:
# Initial letters that trigger elision of "de" to "d'" in the quantifier
# frames ("beaucoup d'...", "peu d'..."). The original entries are kept in
# place; the accented vowels that were missing (e.g. 'î' as in "îles",
# 'ô', 'û') are appended so such nouns no longer get "de " by mistake.
# NOTE: every h-initial noun is treated as eliding; words with an aspirated h
# are not distinguished here.
vowels = ['a','h','é','e','è','ê','à','á','â','i','o','u','œ','y',
          'ë','î','ï','ô','ö','ù','û','ü','æ']

In [62]:
# For each of the first USE_SENTS scored sentences, write four variants of
# the same noun/verb pair (affirmative, negated, "beaucoup", "peu") followed
# by the noun index, verb index and object kind, tab-separated.
# The object kind selects the final phrase; an unknown kind raises KeyError
# instead of silently reusing the previous row's sentences.
object_phrase = {
    'body': ' qui que ce soit.',
    'thing': ' quoi que ce soit.',
}

with open('fr_test_sentences.tsv', 'w', encoding='utf-8') as ofh:
    for s in sents[:USE_SENTS]:
        noun = nouns_to_use[s[2]]
        verb = verbs_to_use[s[3]]
        ending = object_phrase[s[-1]]

        # "de" elides to "d'" before a noun starting with one of `vowels`.
        elide = noun.startswith(tuple(vowels))
        clause = noun + ' ont ' + verb + ending

        aff = 'les ' + clause
        neg = 'les ' + noun + " n'ont pas " + verb + ending
        many = ("beaucoup d'" if elide else "beaucoup de ") + clause
        few = ("peu d'" if elide else "peu de ") + clause

        print("\t".join([aff, neg, many, few] + list(map(str, s[2:]))), file=ofh)