In [5]:
import numpy as np
import torch
import pandas as pd
from transformers import PreTrainedTokenizerFast
import re
import spacy
nlp = spacy.load("en_core_web_sm")

In [6]:
tokenizer_bert = PreTrainedTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True,return_offsets_mapping = True, max_length=512,truncate=True,add_special_tokens=False,return_token_type_ids=False,return_attention_mask=False)

In [7]:
vocab_sorted = {k: v for k, v in sorted(tokenizer_bert.vocab.items(), key=lambda item: item[1])}

## Picking adjectives

In [8]:
words=[]
for item in vocab_sorted.items():
    if re.match('[a-z]{2,}$',item[0]):
        words.append(item[0])
len(words)

21719

In [9]:
nouns = []
adjs = []
for ix,word in enumerate(words):
    if nlp(word)[0].pos_ == 'NOUN' and len(nouns) < 1000:
        nouns.append(nlp(word)[0].text)
    elif nlp(word)[0].pos_ == 'ADJ' and len(adjs) < 2000:
        adjs.append(nlp(word)[0].text)

## Finding gradable adjectives

In [10]:
from collections import defaultdict
import textacy
import textacy.datasets
cw = textacy.datasets.CapitolWords()
cw.download()

In [11]:
adjectives_encountered = []
unique_adjectives_encountered = set()

In [12]:
for text,record in cw.records():
    processed = nlp(text)
    
    adjectives_encountered += [token for token in processed if token.text in adjs]
    
    for token in processed:
        if token.text in adjs:
            unique_adjectives_encountered |= set([token.text])

KeyboardInterrupt: 

In [649]:
len(adjectives_encountered),len(unique_adjectives_encountered)

(316114, 1573)

In [650]:
gradable = defaultdict(int)
non_gradable = defaultdict(int)

In [651]:
modifiers = ['somewhat','very','really','extremely','rather']

In [652]:
for adj in adjectives_encountered:
    if len([x for x in adj.children if x.text in modifiers])>0:
        gradable[adj.text] += 1
    else:
        non_gradable[adj.text]+=1

In [653]:
combined = defaultdict(list)

In [654]:
for adj in unique_adjectives_encountered:
    toAdd = []

    toAdd.append(gradable[adj])
    toAdd.append(non_gradable[adj])
    combined[adj] = toAdd

In [681]:
adjs = defaultdict(list)
for adj in combined:
    occurences = sum(combined[adj])
    gradability_score = round(float((combined[adj][0])/occurences) * 100, 3)
    if occurences > 100 and gradability_score > 0.6:
        adjs[adj] = gradability_score

In [682]:
len(adjs)

200

In [684]:
with open('gradable_adjectives.txt', 'w') as f:
    for item in adjs:
        f.write("%s\n" % item)

## Generating sentences

In [687]:
sentences = []
for noun in nouns:
    for adj in adjs:
        sentences.append('The '+noun+' is '+adj+'.')
        sentences.append('The '+noun+' are '+adj+'.')

In [688]:
len(sentences)

400000

In [689]:
sentences[:10]

['The time is valuable.',
 'The time are valuable.',
 'The time is rare.',
 'The time are rare.',
 'The time is successful.',
 'The time are successful.',
 'The time is sorry.',
 'The time are sorry.',
 'The time is broad.',
 'The time are broad.']

## Filtering by GPT perplexity

In [15]:
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
device = torch.device('cuda:0')
model_id = 'gpt2'
model_gpt = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer_gpt = GPT2Tokenizer.from_pretrained(model_id)

In [16]:
def process_gpt(sentence):
    tokens = ["[CLS]"] + tokenizer_gpt.tokenize(sentence)
    tokens_ids = tokenizer_gpt.convert_tokens_to_ids(tokens)
    tokens_ids = torch.tensor([tokens_ids,], dtype=torch.long).to(device)
    with torch.no_grad():
        outputs = model_gpt(tokens_ids, lm_labels=tokens_ids)
        log_likelihood = outputs.item()
    return np.exp(log_likelihood) 

In [694]:
pairs = {}
for sentence in sentences:
    pairs[sentence] = process_gpt(sentence)

In [695]:
df = pd.DataFrame.from_dict(pairs, orient='index').reset_index()
df = df.rename(columns={"index": "sentence", 0: "perplexity"})
df.sort_values(by='perplexity', ascending=True)

Unnamed: 0,sentence,perplexity
104082,The reason is simple.,32.092901
145282,The answer is simple.,33.160138
191283,The rules are simple.,35.773233
96482,The plan is simple.,36.188385
67282,The idea is simple.,37.175624
...,...,...
74621,The wouldn are junior.,17539.405174
19341,The wasn are richest.,18833.300436
74573,The wouldn are rural.,19602.368611
19421,The wasn are junior.,20574.691978


In [699]:
df.sort_values(by='perplexity', ascending=True).to_csv('/home/lisa/hobbies/modifiers_all.csv')
df.sort_values(by='perplexity', ascending=True).head(10000).to_csv('/home/lisa/hobbies/modifiers_top10k.csv')