In [None]:
import spacy
import numpy as np
from collections import defaultdict
import textacy.datasets
import re
from collections import Counter, defaultdict
import en_core_web_sm
nlp = en_core_web_sm.load()

cw = textacy.datasets.CapitolWords()

### MQNLI words to filter out

In [2]:
with open('mqnli_transitive_verbs.txt', 'r') as f:
    mqnli_verbs = f.readlines()
mqnli_verbs = [x.split()[2].strip('\n') for x in mqnli_verbs]

In [3]:
with open('mqnli_agents.txt', 'r') as f:
    mqnli_agents = f.readlines()
mqnli_agents = [x.strip('\n') for x in mqnli_agents]

In [4]:
with open('mqnli_things.txt', 'r') as f:
    mqnli_things = f.readlines()
mqnli_things = [x.strip('\n') for x in mqnli_things]

In [5]:
mqnli_nouns = mqnli_agents + mqnli_things

### words to use

In [7]:
vlemma_cnt = defaultdict(int)
vlemma_transitive_cnt = defaultdict(int)
vlemma2past = defaultdict(Counter)
nounlemma_cnt = defaultdict(int)
nounlemma2pl = defaultdict(Counter)

for text,record in cw.records():
    processed = nlp(text)
    for token in processed:
        if token.pos_ == 'NOUN' and 'Number=Plur' in token.morph and re.match(r'^[A-Za-z]*$',token.text):
            nounlemma_cnt[token.lemma_] += 1
            nounlemma2pl[token.lemma_].update([token.text.lower(),])
        if token.pos_ == 'VERB' and 'Tense=Past' in token.morph and 'VerbForm=Part' not in token.morph and re.match(r'^[A-Za-z]*$',token.text):
            vlemma_cnt[token.lemma_] += 1
            vlemma2past[token.lemma_].update([token.text.lower(),])
            directObject = False
            for item in token.children:
                if item.pos_ not in ['NOUN','PRON']: continue
                if (item.dep_ == "dobj"):
                    directObject = True
                if directObject == True:
                    vlemma_transitive_cnt[token.lemma_] += 1

In [8]:
MIN_FREQ = 5

In [9]:
verbs_to_use = []
for vlemma, cnt in sorted(vlemma_cnt.items(), key=lambda x:-x[1]):
    if cnt < MIN_FREQ: break
    if vlemma in vlemma2past:
        verb_past = vlemma2past[vlemma].most_common()[0][0]
        s1 = vlemma_transitive_cnt[vlemma]/cnt
        if s1>=.5 and vlemma not in mqnli_verbs and verb_past not in verbs_to_use:
            verbs_to_use.append(verb_past)

In [10]:
with open('en_transitive_verbs_past.txt', 'w') as f:
    for item in verbs_to_use:
        f.write("%s\n" % item)

In [11]:
nouns_to_use = []
for lemma, cnt in sorted(nounlemma_cnt.items(), key=lambda x:-x[1]):
    if cnt < MIN_FREQ: break
    if lemma in nounlemma2pl:
        noun_pl = nounlemma2pl[lemma].most_common()[0][0]
        if noun_pl not in nouns_to_use and noun_pl not in verbs_to_use and lemma not in vlemma2past and lemma not in mqnli_nouns:
            nouns_to_use.append(noun_pl)

In [12]:
with open('en_plural_nouns.txt', 'w') as f:
    for item in nouns_to_use:
        f.write("%s\n" % item)

In [13]:
len(verbs_to_use),len(nouns_to_use)

(463, 2185)

## Generate and evaluate sentences

In [15]:
import torch
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer

In [16]:
model_id = 'gpt2'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained(model_id)

def process(sentence):
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence)
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    tokens_ids = torch.tensor([tokens_ids,], dtype=torch.long).to(device)
    with torch.no_grad():
        outputs = model(tokens_ids, lm_labels=tokens_ids)
        log_likelihood = outputs.item()
    return np.exp(log_likelihood) 

In [17]:
USE_NOUNS = 300
USE_VERBS = 200

In [18]:
sents = []
for v_idx,verb in enumerate(verbs_to_use):
    if v_idx>=USE_VERBS: break
    for n_idx,subj in enumerate(nouns_to_use):
        if n_idx>=USE_NOUNS: break
        sent_body = 'two '+ subj + ' ' + nlp(verb)[0].lemma_ + ' somebody.'
        ppl_body = process(sent_body)
        sent_thing = 'two '+ subj + ' ' + nlp(verb)[0].lemma_ + ' something.'
        ppl_thing = process(sent_thing)
        sents.append((ppl_body,sent_body,n_idx,v_idx,'body'))
        sents.append((ppl_thing,sent_thing,n_idx,v_idx,'thing'))
sents = sorted(sents)

In [19]:
with open('en_basic_sentences.tsv', 'w', encoding='utf-8') as ofh:
    for s in sents:
        print("\t".join(map(str, s)), file=ofh)

### making test sentences

In [20]:
USE_SENTS=10000

In [21]:
with open('en_test_sentences.tsv', 'w', encoding='utf-8') as ofh:
    for s in sents[:USE_SENTS]:
        aff = 'the '+ nouns_to_use[s[2]] + ' ' + verbs_to_use[s[3]] + ' any'+s[4]+'.'
        neg = 'the ' + nouns_to_use[s[2]] + ' did not ' + s[1].split(' ')[2]  + ' any'+s[4]+'.'
        many = 'many '+ nouns_to_use[s[2]] + ' ' + verbs_to_use[s[3]] + ' any'+s[4]+'.'
        few = 'few '+ nouns_to_use[s[2]] + ' ' + verbs_to_use[s[3]] + ' any'+s[4]+'.'
        print("\t".join([aff,neg,many,few]+list(map(str, s[2:]))), file=ofh)