In [1]:
import torch

In [2]:
# Report whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [3]:
from train import main
from argparse import Namespace
import test
from vocab import Vocab
import numpy as np

In [4]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from transformers import glue_convert_examples_to_features
from transformers import glue_processors
from typing import List, Optional, Union
from dataclasses import dataclass

In [5]:
@dataclass
class InputExample:
    """One raw text pair to be classified, with an optional string label."""
    guid: str  # unique identifier of the example
    text_a: str  # first sentence of the pair
    text_b: str  # second sentence of the pair
    label: Optional[str] = None  # label name; None when unlabeled
        
@dataclass(frozen=True)
class InputFeatures:
    """Immutable container for the tokenized form of one example."""
    input_ids: List[int]  # token ids of the encoded example
    attention_mask: Optional[List[int]] = None  # per-token mask, when the tokenizer provides one
    token_type_ids: Optional[List[int]] = None  # per-token segment ids, when the tokenizer provides them
    label: Optional[Union[int, float]] = None  # numeric label; None when unlabeled

In [6]:
def get_model(path, vocab):
    """Rebuild the AAE model from a checkpoint and put it in eval mode.

    Args:
        path: Location of a checkpoint that holds the training arguments under
            'args' and the model weights under 'model'.
        vocab: Vocabulary object handed to ``test.AAE``.

    Returns:
        The model, moved to the notebook-level ``device``, with weights loaded
        and ``eval()`` applied.
    """
    # NOTE(review): torch.load unpickles arbitrary Python objects; only open
    # checkpoints from a trusted source.
    # map_location keeps loading independent of the device the checkpoint was
    # saved from (e.g. a different GPU index, or a CPU-only machine).
    ckpt = torch.load(path, map_location=device)
    train_args = ckpt['args']
    model = test.AAE(vocab, train_args).to(device)
    model.load_state_dict(ckpt['model'])
    model.flatten()
    model.eval()
    return model

In [7]:
def encode(sents, vocab, batch_size, model, device, enc='mu'):
    """Encode sentences into latent vectors with the given model.

    Args:
        sents: Sentences to encode, passed to ``test.get_batches``.
        vocab: Vocabulary passed to ``test.get_batches``.
        batch_size: Batch size passed to ``test.get_batches``.
        model: Object exposing ``encode(inputs) -> (mu, logvar)``.
        device: Device passed to ``test.get_batches``.
        enc: 'mu' returns the mean; any other value returns a sample drawn via
            ``test.reparameterize(mu, logvar)``.

    Returns:
        A NumPy array of latent vectors, one row per sentence. Rows are
        scattered to the positions given by ``order`` from ``get_batches``
        (presumably restoring the input order -- confirm against get_batches).
    """
    batches, order = test.get_batches(sents, vocab, batch_size, device)
    z = []
    # Inference only: avoid building an autograd graph for every batch.
    with torch.no_grad():
        for inputs, _ in batches:
            mu, logvar = model.encode(inputs)
            zi = mu if enc == 'mu' else test.reparameterize(mu, logvar)
            z.append(zi.detach().cpu().numpy())
    z = np.concatenate(z, axis=0)
    z_ = np.zeros_like(z)
    z_[np.array(order)] = z
    return z_

def decode(z, vocab, batch_size, max_len, model, device, dec='sample'):
    """Decode latent vectors back into token lists.

    Args:
        z: Array of latent vectors, one row per sentence.
        vocab: Vocabulary whose ``idx2word`` maps token ids to words.
        batch_size: Number of latent vectors decoded per ``model.generate`` call.
        max_len: Maximum length passed to ``model.generate``.
        model: Object exposing ``generate(z, max_len, dec)``.
        device: Device on which each latent batch is materialised.
        dec: Decoding mode forwarded to ``model.generate``.

    Returns:
        The result of ``test.strip_eos`` applied to the decoded word lists.
    """
    sents = []
    # Inference only: generation does not need gradients.
    with torch.no_grad():
        for start in range(0, len(z), batch_size):
            zi = torch.tensor(z[start: start + batch_size], device=device)
            outputs = model.generate(zi, max_len, dec).t()
            for token_ids in outputs:
                # Position 0 is skipped: it holds the <go> token.
                sents.append([vocab.idx2word[idx] for idx in token_ids[1:].tolist()])
    return test.strip_eos(sents)

In [8]:
def load_data(premise, hypotheses, tokenizer):
    """Tokenize (premise, hypothesis) pairs into a TensorDataset.

    Every hypothesis is paired with the same premise. Each example carries the
    fixed placeholder label 'contradiction' (index 0), because the true label
    of the generated hypotheses is unknown at this point.

    Args:
        premise: Premise text shared by all pairs.
        hypotheses: Iterable of hypothesis strings.
        tokenizer: Callable tokenizer accepting a list of text pairs together
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_token_type_ids``.

    Returns:
        A TensorDataset of (input_ids, attention_mask, token_type_ids, labels),
        all ``torch.long``, with sequences padded/truncated to 128 tokens.
    """
    label_list = ["contradiction", "entailment", "neutral"]
    label_map = {label: i for i, label in enumerate(label_list)}

    examples = [
        InputExample(guid=f'test-{i}', text_a=premise, text_b=hypothesis, label='contradiction')
        for i, hypothesis in enumerate(hypotheses)
    ]
    labels = [label_map[example.label] for example in examples]

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=128,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True
    )

    # Regroup the column-wise tokenizer output into one record per example.
    features = [
        InputFeatures(**{k: batch_encoding[k][i] for k in batch_encoding}, label=labels[i])
        for i in range(len(examples))
    ]

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

## Load models and generate augmented premise-hypothesis pairs

In [9]:
# Vocabulary that belongs to the AAE checkpoint loaded below.
vocab = Vocab('../checkpoints/aae_epoch100/vocab.txt')

# Fix the seeds so the noisy decodings are repeatable.
# NOTE(review): the latent noise is later drawn with np.random -- confirm that
# test.set_seed also seeds NumPy.
test.set_seed(598)
torch.manual_seed(598)

# Prefer the GPU, but keep the notebook runnable on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = get_model('../checkpoints/aae_epoch100/model.pt', vocab)

# Standard deviation of the Gaussian noise added to the latent code.
perturb_noise = 0.25

In [10]:
# Directory from which the config, tokenizer and classifier weights are all read.
classifier_path = '../checkpoints/mnli_baseline_distilbert-2023-04-07_10-48-14/checkpoint-last'

# NOTE(review): the two dropout options below look like BERT-style names;
# DistilBertConfig documents `attention_dropout` / `dropout` -- confirm that
# these overrides actually take effect.
config_overrides = dict(
    num_labels=3,
    finetuning_task='mnli',
    attention_probs_dropout_prob=0,
    hidden_dropout_prob=0.1,
)
config = DistilBertConfig.from_pretrained(classifier_path, **config_overrides)

tokenizer = DistilBertTokenizer.from_pretrained(classifier_path, do_lower_case=True)

# NOTE(review): ignore_mismatched_sizes=True lets loading proceed when a weight
# shape differs from the config -- confirm the classification head is really
# restored from the checkpoint.
classifier = DistilBertForSequenceClassification.from_pretrained(
    classifier_path, config=config, ignore_mismatched_sizes=True
)

In [17]:
import pandas as pd
import Levenshtein
import string
import jiwer

data = pd.read_csv('data/mnli/train2.tsv', sep='\t')

label_list = ["contradiction", "entailment", "neutral"]

n = 10                    # noisy decodings generated per original hypothesis
MAX_DECODE_LEN = 30       # max_len handed to decode()
MAX_WORD_EDITS = 2        # keep candidates at most this many word edits from the original
FLUSH_EVERY = 500         # write buffered rows to disk once this many have accumulated
MAX_AUG_FRACTION = 0.05   # stop after augmenting this fraction of the input rows

aug_batch = []
aug_i = 0
data_start = 0
data_appended = data_start

premise_seen = {}


def _flatten_parse(parse):
    """Return the space-separated parse string with the '(' and ')' tokens removed."""
    return " ".join(tok for tok in parse.split(' ') if tok != "(" and tok != ")")


def _strip_trailing_punct(text):
    """Drop one trailing punctuation character (and whitespace before it), if any."""
    if text and text[-1] in string.punctuation:
        return text[:-1].rstrip()
    return text


def _write_aug_batch(rows, suffix):
    """Write the buffered rows to data/aug<suffix>.tsv using the columns of `data`."""
    pd.DataFrame(rows, columns=data.columns).to_csv('data/aug{}.tsv'.format(suffix), sep="\t")


for index, row in data.loc[data_start:].iterrows():
    # Load premise, hypothesis, and label
    premise_text = row['sentence1']
    orig_label = row['gold_label']
    try:
        premise = _flatten_parse(row['sentence1_binary_parse'])
        hypothesis = _flatten_parse(row['sentence2_binary_parse'])
    except AttributeError:
        # A missing parse is not a string (no .split); skip the row.
        continue

    # Check that premise is unique
    if premise in premise_seen:
        continue
    premise_seen[premise] = True

    # Generate sentences by perturbing the latent code of the hypothesis
    sents = [hypothesis.split()]
    z = encode(sents, vocab, 1, model, device)

    orig_hypothesis = hypothesis
    hypotheses = []
    for i in range(n):
        z_noise = z + np.random.normal(0, perturb_noise, size=z.shape).astype('f')
        decoded = decode(z_noise, vocab, 1, MAX_DECODE_LEN, model, device, dec='greedy')
        hypotheses.append(' '.join(decoded[0]))

    # Run classifier on new hypotheses
    dataset = load_data(premise, hypotheses, tokenizer)
    eval_dataloader = DataLoader(dataset, batch_size=16)
    classifier.eval()
    batch_preds = []
    # The forward pass itself must sit inside no_grad, not just the input dict.
    with torch.no_grad():
        for batch in eval_dataloader:
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            _, logits = classifier(**inputs)[:2]
            batch_preds.append(np.argmax(logits.detach().cpu().numpy(), axis=1))
    # Keep predictions for every batch so they stay aligned with `hypotheses`.
    preds = np.concatenate(batch_preds)

    # Remove trailing punctuation for comparison
    orig_hypothesis_comp = _strip_trailing_punct(orig_hypothesis)
    n_ref_words = len(orig_hypothesis_comp.split())

    min_dist = 1e9
    best_sent = None
    best_label = None

    for sentence, pred in zip(hypotheses, preds):
        if '<unk>' in sentence:
            continue

        sentence_comp = _strip_trailing_punct(sentence)
        if not sentence_comp:
            # Nothing left to compare (empty or punctuation-only decoding).
            continue

        # Choose best sentence based on WER to original (with different label)
        if orig_label != label_list[pred] and orig_hypothesis_comp != sentence_comp:
            # WER * reference length is the number of word edits; round away
            # float error so the cut-off comparison is exact.
            dist = round(jiwer.wer(orig_hypothesis_comp, sentence_comp) * n_ref_words)
            if 0 < dist <= MAX_WORD_EDITS and dist < min_dist:
                min_dist = dist
                best_sent = sentence
                best_label = label_list[pred]

    # Skip if no close sentences were found
    if best_sent is None:
        continue

    print('Premise: {}'.format(premise))
    print('Original hypothesis: {} --> {}'.format(orig_hypothesis, orig_label))
    print('Best sentence: {} --> {}\n'.format(best_sent, best_label))

    # Fill aug_data row with necessary info: eight blank leading columns, then
    # premise, new hypothesis, blank, label, blank.
    # NOTE(review): the row is saved with orig_label, not the classifier's
    # best_label -- confirm this is the intended target for augmented examples.
    aug_row = [''] * 8 + [premise_text, best_sent, '', orig_label, '']

    aug_batch.append(aug_row)
    aug_i += 1
    data_appended += 1

    # Write to TSV every FLUSH_EVERY rows in case it's too slow overall
    if aug_i >= FLUSH_EVERY:
        _write_aug_batch(aug_batch, data_appended)

        # Reset aug data and counter
        aug_batch = []
        aug_i = 0

    if data_appended >= data.shape[0] * MAX_AUG_FRACTION:
        break

# Flush the final partial batch so the rows collected since the last write are not lost.
if aug_batch:
    _write_aug_batch(aug_batch, data_appended)
    aug_batch = []
    aug_i = 0

    

  data = pd.read_csv('data/mnli/train2.tsv', sep='\t')


Premise: How do you know ? All this is their information again .
Original hypothesis: This information belongs to them . --> entailment
Best sentence: This information belongs to them Tommy. --> neutral

Premise: I burst through a set of cabin doors , and fell to the ground -
Original hypothesis: I burst through the doors and fell down . --> entailment
Best sentence: I burst through the doors and fell down slowly. --> neutral





Premise: Issues in Data Synthesis .
Original hypothesis: Problems in data synthesis . --> entailment
Best sentence: Problems in data is fair. --> neutral





Premise: The other men shuffled .
Original hypothesis: The other men were shuffled around . --> entailment
Best sentence: The other men were shuffled around quicker. --> neutral





Premise: well it 's been very interesting
Original hypothesis: It has been very intriguing . --> entailment
Best sentence: It has very intriguing trails --> neutral





Premise: He started slowly back to the bunkhouse .
Original hypothesis: He returned slowly to the bunkhouse . --> entailment
Best sentence: He returned slowly to the Red . --> neutral

Premise: and it 's it 's quite a bit i think six something is the state and and uh the rest of the pie goes elsewhere but we 're in a particular part of the state that 's pretty well off so it 's it 's like we get a lot of that back as far as local taxation goes
Original hypothesis: I do not know exactly where the local taxes go . --> neutral
Best sentence: I do not know where the American taxes go . --> contradiction





Premise: Postal Service were to reduce delivery frequency .
Original hypothesis: The postal service could deliver less frequently . --> entailment
Best sentence: The postal service could deliver significantly postage . --> contradiction





Premise: Felicia 's Journey takes place behind the eyes of its central a young Irish girl , Felicia , who crosses the sea to England in a hopeful quest to find the father of her unborn child ; and the fat , middle-aged catering manager , Hiditch , who takes a paternal interest in the lass when it becomes clear that her young man has caddishly given her the slip .
Original hypothesis: The woman did not care where the man was as long as it was far . --> contradiction
Best sentence: The woman did not know where the man was as long as it was right --> neutral





Premise: You have access to the facts .
Original hypothesis: The facts are accessible to you . --> entailment
Best sentence: The facts are easy to you . --> neutral





Premise: Build environment Engineering Manufacturing Production -LRB- all rate tooling -RRB- -LRB- 1st set of production tooling -RRB-
Original hypothesis: It is the first set of production tooling for manufacturing . --> entailment
Best sentence: It is the first set of production tooling for advertising. --> neutral

Premise: I did not mention Monica in my lecture , but the first question I was asked was how President Clinton could do his job with all the distractions caused by the Monica Lewinsky affair .
Original hypothesis: They wanted to get through the lecture without any problems . --> neutral
Best sentence: They wanted to get through the faces without any issues --> contradiction





Premise: Hills and mountains are especially sanctified in the cult of Jainism .
Original hypothesis: The cult of Jainism hates nature . --> contradiction
Best sentence: The cult of Jainism likes nature . --> neutral





Premise: The famous tenements -LRB- or lands -RRB- began to be built .
Original hypothesis: The land remained deserted . --> contradiction
Best sentence: The land remained completely delicious. --> neutral





Premise: Each of the men wore leather armor and dressed in the style of heavy riders .
Original hypothesis: The men were naked . --> contradiction
Best sentence: The men were white. --> neutral



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Premise: which i mean i think it should be anyway
Original hypothesis: I do n't think it should be that way --> contradiction
Best sentence: I do think it should be that way --> entailment

Premise: no oh no oh well take care
Original hypothesis: Bye for now . --> entailment
Best sentence: Sara for now . --> contradiction





Premise: The With attorneys one year out of graduate school facing an average debt of just less than $ 90,000 and starting salaries at legal aid organizations averaging $ 31,000 , they could n't afford the job .
Original hypothesis: New attorneys have massive law school debt . --> entailment
Best sentence: New attorneys have a law school disability --> neutral





Premise: Here you 'll see a shrunken head , a two-headed goat , and a statue of Marilyn Monroe made of shredded money , among other curiosities .
Original hypothesis: One of the curiosities is a two-headed goat . --> entailment
Best sentence: One of the curiosities is a refreshing character. --> neutral





Premise: right right well it 's it 's a beautiful city and but the problem is like first example when i was young they they took me to Las Vegas and that was the most boring place on earth
Original hypothesis: I think Las Vegas is the most boring place I know . --> entailment
Best sentence: I think Las Vegas is the most exciting place I know the . --> contradiction



## Test

In [20]:
# The seed sentence sits at index 0; its noisy decodings are appended after it.
hypotheses = ["I don't know how cold it got last night ."]

sents = [hypotheses[0].split()]
z = encode(sents, vocab, 1, model, device)

n = 10
for i in range(n):
    # Perturb the latent code with Gaussian noise, then decode greedily.
    noise = np.random.normal(0, perturb_noise, size=z.shape).astype('f')
    z_noise = z + noise
    decoded = decode(z_noise, vocab, 1, 30, model, device, dec='greedy')

    sentence = ' '.join(decoded[0])
    hypotheses.append(sentence)
    print(sentence)

I don't know how much it last night got like.
I don't know how much it last night was said.
I don't know how much it cold last night last night
I don't know how much it last night was
I don't know how much it last night last night
I don't know how much it was last night
I don't know how it got cold last night
I don't know how much it came last night again.
I don't know how long it was night again.
I don't know how much it last night was loud.


In [53]:
# Only the seed sentence is stored; the decodings are printed, not kept.
hypotheses = ["Product and geography are what make cream skimming work ."]

sents = [hypotheses[0].split()]
z = encode(sents, vocab, 1, model, device)

n = 10
for i in range(n):
    # Perturb the latent code with Gaussian noise, then decode greedily.
    noise = np.random.normal(0, perturb_noise, size=z.shape).astype('f')
    z_noise = z + noise
    decoded = decode(z_noise, vocab, 1, 30, model, device, dec='greedy')
    print(' '.join(decoded[0]))

Product and geography are what makes it go ahead and programming
Product and geography are the quickest to work poorly.
Product and geography are the quickest location of weight and Windows .
Product and geography are what makes it go .
Home and geography are what to make weight .
Fiscal and geography are what mailers can be programming programming is.
Product and geography are the reason to weight programming .
Product and geography are what make it go home programming
Product and geography are what mailers go Windows programming
Product and geography are the quickest to make weight programming .


## Find labels


In [14]:
# Hand-written example: one premise, its original hypothesis and label, and a
# single candidate hypothesis to run through the classifier.
premise, orig_hypothesis, orig_label = (
    "Postal Service were to reduce delivery frequency.",
    "The postal service could deliver less frequently.",
    "entailment",
)
hypotheses = ["The postal service could deliver significantly often."]

dataset = load_data(premise, hypotheses, tokenizer)



In [15]:
eval_dataloader = DataLoader(dataset, batch_size=16)

classifier.eval()
batch_preds = []
# The forward pass itself must sit inside no_grad, not just the input dict.
with torch.no_grad():
    for batch in eval_dataloader:
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
        _, logits = classifier(**inputs)[:2]
        batch_pred = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_preds.append(batch_pred)

        print(batch_pred.tolist())

# Keep the predictions of every batch so `preds` lines up with `hypotheses`
# even when there are more than 16 of them.
preds = np.concatenate(batch_preds) if batch_preds else np.array([], dtype=int)

# Class names in the index order used to interpret `preds`.
label_list = ["contradiction", "entailment", "neutral"]

[0]


In [16]:
# Header: the example being examined, followed by a separator.
summary_lines = (
    f'Premise: {premise}',
    f'Original hypothesis: {orig_hypothesis}',
    f'Label: {orig_label}',
    '--------------------',
)
for line in summary_lines:
    print(line)

# One line per candidate hypothesis with its predicted class name.
for sentence, pred in zip(hypotheses, preds):
    print(f'{sentence} --> {label_list[pred]}')

Premise: Postal Service were to reduce delivery frequency.
Original hypothesis: The postal service could deliver less frequently.
Label: entailment
--------------------
The postal service could deliver significantly often. --> contradiction


In [62]:
# Same report as before: header lines, separator, then one line per candidate.
print(
    f'Premise: {premise}',
    f'Original hypothesis: {orig_hypothesis}',
    f'Label: {orig_label}',
    '--------------------',
    sep='\n',
)
for sentence, pred in zip(hypotheses, preds):
    print(f'{sentence} --> {label_list[pred]}')


In [64]:
# Pick the candidate whose predicted label differs from the original label and
# whose text is closest to the original hypothesis by character-level
# Levenshtein distance.
min_dist = 1e9
best_sent = ''
best_label = ''

# Remove trailing punctuation for comparison. The original is loop-invariant,
# so normalise it once; the emptiness check avoids an IndexError on ''.
orig_hypothesis_comp = (
    orig_hypothesis[:-1].rstrip()
    if orig_hypothesis and orig_hypothesis[-1] in string.punctuation
    else orig_hypothesis
)

for sentence, pred in zip(hypotheses, preds):
    sentence_comp = (
        sentence[:-1].rstrip()
        if sentence and sentence[-1] in string.punctuation
        else sentence
    )
    if not sentence_comp:
        # Nothing left to compare (empty or punctuation-only candidate).
        continue

    if orig_label != label_list[pred] and orig_hypothesis_comp != sentence_comp:
        dist = Levenshtein.distance(orig_hypothesis_comp, sentence_comp)
        if dist < min_dist:
            min_dist = dist
            best_sent = sentence
            best_label = label_list[pred]

print(f'Original hypothesis: {orig_hypothesis}')
print('best sentence: {} --> {}'.format(best_sent, best_label))


I don't know how cold it got last night
I don't know how long it got last night
I don't know how cold it got last night
I don't know how how it last night was yesterday
I don't know how cold it got last night
I don't know how bad it last night constantly
I don't know how cold it got last night
I don't know how much it when last night cup
I don't know how cold it got last night
I don't know how much it last night got wet
I don't know how cold it got last night
I don't know how much water it last night night
I don't know how cold it got last night
I don't know how bad it last night got constantly
I don't know how cold it got last night
I don't know how much it last night went constantly
I don't know how cold it got last night
I don't know how long it got last night
Original hypothesis: I don't know how cold it got last night.
best sentence: I don't know how long it got last night . --> neutral
