In [1]:
import random
import sys
sys.path.append("/scratch/ratch/tjf324/pytorch-pretrained-BERT/")
from pytorch_pretrained_bert import modeling, tokenization
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
import logging
import torch
import numpy as np
import pandas as pd
import tqdm

from language_modeling.runners import (
    tokenize_example, InputExample,
    convert_example_to_features, features_to_data,
)

WNLI_TRAIN_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/train.tsv"
WNLI_DEV_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/dev.tsv"

import spacy
nlp = spacy.load('en_core_web_sm')

def get_pos(sent):
    return [token.pos_ for token in nlp(sent)]

def is_noun(pos):
    return pos in ["PRON", "PROPN", "NOUN"]

def get_pos_dict(sent):
    return {
        token.text.lower(): token.pos_
        for token in nlp(sent)
    }

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# device = torch.device("cuda:0")
# bert_model_name = "bert-large-uncased"
# max_sequence_length = 128
# MASK = "[MASK]"

# model = modeling.BertForPreTraining.from_pretrained(bert_model_name)
# model.to(device);
# model.eval();



device = torch.device("cuda:0")
bert_model_name = "bert-large-uncased"
max_sequence_length = 128
MASK = "[MASK]"

model = modeling.BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=2)
model.to(device);

tokenizer = tokenization.BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True);


param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,
                     warmup=0.1,
                     t_total=300)

In [3]:
train_df = pd.read_csv(WNLI_TRAIN_PATH, sep="\t")
val_df = pd.read_csv(WNLI_DEV_PATH, sep="\t")

print((train_df["label"]==0).mean())
print((val_df["label"]==0).mean())

0.5086614173228347
0.5633802816901409


In [47]:
def remove_in_blacklist(chunk):
    BLACKLIST = ["he", "she", "it", "they", "who", "her", "we", "them",
                 "him", "his", "us", "me", "i", "anything", "anyone", "somebody", 
                 "someone"]
    return chunk.text.lower() not in BLACKLIST
    

def get_all_sentences(df, skip_false=False):
    sent1s, sent2s = [], []
    for _, row in tqdm.tqdm_notebook(df.iterrows(), total=len(df)):

        if skip_false:
            if not row.label:
                continue
        
        sent1, sent2 = row.sentence1, row.sentence2

        sent1s.append(sent1)
        sent2s_this = {"true": sent2, "alt": []}

        sent1_doc = nlp(sent1)
        sent2_doc = nlp(sent2)
        
        
        noun_chunks_sent1 = list(filter(remove_in_blacklist, sent1_doc.noun_chunks))
        noun_chunks_sent2 = list(filter(remove_in_blacklist, sent2_doc.noun_chunks))

        for chunk2 in noun_chunks_sent2[:1]:
            for chunk1 in noun_chunks_sent1:
          
                if chunk2.root.text.lower() == chunk1.root.text.lower():
                    continue
                alt_sent = sent2.replace(chunk2.text, chunk1.text)
                if alt_sent.lower() == sent2.lower():
                    continue
                sent2s_this["alt"].append(alt_sent)
        sent2s.append(sent2s_this)
        
    return sent1s, sent2s


In [8]:
# sent1s, sent2s = get_all_sentences(train_df, True)

HBox(children=(IntProgress(value=0, max=635), HTML(value='')))

In [50]:
preds = []
selected = []

stop_at = 1000

labels = train_df.label[:stop_at]
labels = [1] * len(sent1s)

model.train()

steps = 0

for sent1, sent2, label in tqdm.tqdm_notebook(zip(sent1s[:stop_at], 
                                                   sent2s[:stop_at], 
                                                   labels),
                                               total=min(stop_at, len(sent1s))):
    steps += 1
    example = InputExample(
        guid=0,
        text_a=sent1,
        text_b=sent2["true"],
        is_next=True,
    )
    tokenized_example = tokenize_example(example, tokenizer)
    tokenized_examples = [tokenized_example]
   
    for sent2_alt in sent2["alt"]:
        example = InputExample(
            guid=0,
            text_a=sent1,
            text_b=sent2_alt,
            is_next=True,
        )
        tokenized_example = tokenize_example(example, tokenizer)
        tokenized_examples.append(tokenized_example)
        
    if len(tokenized_examples) > 5:
        tokenized_examples = [tokenized_examples[0]] + list(np.random.choice(tokenized_examples, 2))
        

    
        
    features = [convert_example_to_features(ex, tokenizer, max_sequence_length, select_prob=0.0)
                for ex in tokenized_examples]
    
    batch = features_to_data(features).to(device)

    labels = torch.zeros(len(features), dtype=torch.long).to(device=device, )
    if label:
        labels[0] = 1
    
    loss = model(
                batch.input_ids, 
                batch.segment_ids, 
                batch.input_mask, 
                labels,
            )
    
    loss.backward()
    if not steps % 5:
        optimizer.step()
        optimizer.zero_grad()

            

HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

In [53]:
for i, j, l in zip(train_df['sentence1'], train_df['sentence2'], train_df['label']):
    print(i, j, l)

I stuck a pin through a carrot. When I pulled the pin out, it had a hole. The carrot had a hole. 1
John couldn't see the stage with Billy in front of him because he is so short. John is so short. 1
The police arrested all of the gang members. They were trying to stop the drug trade in the neighborhood. The police were trying to stop the drug trade in the neighborhood. 1
Steve follows Fred's example in everything. He influences him hugely. Steve influences him hugely. 0
When Tatyana reached the cabin, her mother was sleeping. She was careful not to disturb her, undressing and climbing back into her berth. mother was careful not to disturb her, undressing and climbing back into her berth. 0
George got free tickets to the play, but he gave them to Eric, because he was particularly eager to see it. George was particularly eager to see it. 0
John was jogging through the park when he saw a man juggling watermelons. He was very impressive. John was very impressive. 0
I couldn't put the pot on

In [44]:
# val_sent1s, val_sent2s = get_all_sentences(val_df, False)

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))

In [51]:
model.eval()
preds = []
for sent1, sent2 in tqdm.tqdm_notebook(zip(val_sent1s, val_sent2s), total=len(val_sent1s)):
    example = InputExample(
        guid=0,
        text_a=sent1,
        text_b=sent2["true"],
        is_next=True,
    )
    tokenized_example = tokenize_example(example, tokenizer)
    features = [convert_example_to_features(tokenized_example, tokenizer, max_sequence_length, select_prob=0.0)]
    batch = features_to_data(features).to(device)
    with torch.no_grad():   
        result = model(
                batch.input_ids, 
                batch.segment_ids, 
                batch.input_mask, 
            )
    print(result)
    pred = int(torch.softmax(result, 1)[0, 1] > 0.3)
    preds.append(pred)

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))

tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], device='cuda:0')
tensor([[0.8163, 0.0467]], devic

In [31]:
preds = np.array(preds)
print(preds.mean())
print((preds == val_df["label"][:stop_at]).mean())

0.0
0.5633802816901409


In [25]:
for _, (i, j) in enumerate(zip(sent1s, selected)):
    print(i, j)
    if _ == 5:
        break

I stuck a pin through a carrot. When I pulled the pin out, it had a hole. The carrot had a hole.
John couldn't see the stage with Billy in front of him because he is so short. Billy is so short.
The police arrested all of the gang members. They were trying to stop the drug trade in the neighborhood. the gang members were trying to stop the drug trade in the neighborhood.
Steve follows Fred's example in everything. He influences him hugely. Fred's example influences him hugely.
When Tatyana reached the cabin, her mother was sleeping. She was careful not to disturb her, undressing and climbing back into her berth. mother was careful not to disturb her, undressing and climbing back into her berth.
George got free tickets to the play, but he gave them to Eric, because he was particularly eager to see it. George was particularly eager to see it.


In [38]:
# train_pred_arr, train_all_alt_ls = pred_v1(train_df, tokenizer, model)
# val_pred_arr, val_all_alt_ls = pred_v1(val_df, tokenizer, model)

100%|██████████| 635/635 [10:12<00:00,  1.04it/s]
100%|██████████| 71/71 [01:05<00:00,  1.07it/s]


In [41]:
# print((train_pred_arr==train_df["label"]).mean())
# print((val_pred_arr==val_df["label"]).mean())

0.5606299212598426
0.6197183098591549


In [66]:
# def visualize_sample(i):
#     print(f"Label: {bool(train_df.label.iloc[i])}, Predicted: {train_pred_arr[i]}")
#     print("Sentence 1: ", train_df.iloc[i].sentence1)
#     print("Sentence 2: ", train_df.iloc[i].sentence2)
#     print(train_all_alt_ls[i])

# visualize_sample(20)

## V2 and other stuff

In [45]:
train_pred_arr, train_all_alt_ls = pred_v2(train_df, tokenizer, model)
val_pred_arr, val_all_alt_ls = pred_v2(val_df, tokenizer, model)

100%|██████████| 635/635 [00:59<00:00, 10.66it/s]
100%|██████████| 71/71 [00:05<00:00, 11.60it/s]


In [46]:
print((train_pred_arr==train_df["label"]).mean())
print((val_pred_arr==val_df["label"]).mean())

0.5795275590551181
0.5915492957746479


In [None]:
def pred_v2(df, tokenizer, model):
    pred_ls = []
    all_alt_ls = []
    tokenized_examples = []
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        result_ls = [ ]
        alt_ls = []
        example = InputExample(
            guid=0,
            text_a=row["sentence1"],
            text_b=row["sentence2"],
            is_next=True,
        )
        tokenized_example = tokenize_example(example, tokenizer)
        tokenized_examples.append(tokenized_example)
        tokens_a_pos_dict = get_pos_dict(example.text_a)
        tokens_b_pos_dict = get_pos_dict(example.text_b)
        tokens_a_nouns = {
            tokenizer.vocab[word]
            for word in tokenized_example.tokens_a
            if is_noun(tokens_a_pos_dict.get(word))
        }
        for word in ["he", "she", "it", "her", "him"]:
            word_id = tokenizer.vocab[word]
            if word_id in tokens_a_nouns:
                tokens_a_nouns.remove(word_id)
        tokens_a_ids = np.array(list(tokens_a_nouns))
        pred_result = True
        for i in range(len(tokenized_example.tokens_b)):
            b_token = tokenized_example.tokens_b[i]
            if not is_noun(tokens_b_pos_dict.get(b_token)):
                continue
            tokenized_example = tokenize_example(example, tokenizer)
            tokenized_example.tokens_b[i] = MASK
            features = convert_example_to_features(tokenized_example, tokenizer, max_sequence_length, select_prob=0.0)
            batch = features_to_data([features]).to(device)
            with torch.no_grad():
                result = model(
                    batch.input_ids, 
                    batch.segment_ids, 
                    batch.input_mask, 
                )
            masked_indices = np.arange(batch.input_ids.shape[1])[batch.input_ids[0].cpu().numpy()==103]
            assert len(masked_indices) == 1
            masked_index = masked_indices[0]
            srs = pd.Series(
                result[0][0][masked_index].cpu().numpy()[tokens_a_ids],
                index=[tokenizer.ids_to_tokens[i] for i in tokens_a_ids],
            ).sort_values()
            result_ls.append(srs.index[-1])
            if not srs.index[-1]==b_token:
                pred_result = False
            alt_ls.append(" ".join(tokenized_example.tokens_b).replace(
                MASK, srs.index[-1].upper(),
            ))

        pred_ls.append(pred_result)
        all_alt_ls.append(alt_ls)
    pred_arr = np.array(pred_ls)
    return pred_arr, all_alt_ls

In [None]:
def pred_v1(df, tokenizer, model):
    pred_ls = []
    all_alt_ls = []
    tokenized_examples = []
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        result_ls = [ ]
        alt_ls = []
        example = InputExample(
            guid=0,
            text_a=row["sentence1"],
            text_b=row["sentence2"],
            is_next=True,
        )
        tokenized_example = tokenize_example(example, tokenizer)
        tokenized_examples.append(tokenized_example)
        tokens_a_pos = get_pos(" ".join(tokenized_example.tokens_a))
        tokens_b_pos = get_pos(" ".join(tokenized_example.tokens_b))
        tokens_a_nouns = {
            tokenizer.vocab[word]
            for word, pos in zip(tokenized_example.tokens_a, tokens_a_pos)
            if is_noun(pos)
        }
        tokens_a_ids = np.array(list(tokens_a_nouns))
        pred_result = True
        for i in range(len(tokenized_example.tokens_b)):
            if not is_noun(tokens_b_pos[i]):
                continue
            b_token = tokenized_example.tokens_b[i]
            tokenized_example = tokenize_example(example, tokenizer)
            tokenized_example.tokens_b[i] = MASK
            features = convert_example_to_features(tokenized_example, tokenizer, max_sequence_length, select_prob=0.0)
            batch = features_to_data([features]).to(device)
            with torch.no_grad():
                result = model(
                    batch.input_ids, 
                    batch.segment_ids, 
                    batch.input_mask, 
                )
            masked_indices = np.arange(batch.input_ids.shape[1])[batch.input_ids[0].cpu().numpy()==103]
            assert len(masked_indices) == 1
            masked_index = masked_indices[0]
            srs = pd.Series(
                result[0][0][masked_index].cpu().numpy()[tokens_a_ids],
                index=[tokenizer.ids_to_tokens[i] for i in tokens_a_ids],
            ).sort_values()
            result_ls.append(srs.index[-1])
            if not srs.index[-1]==b_token:
                pred_result = False
            alt_ls.append(" ".join(tokenized_example.tokens_b).replace(
                MASK, srs.index[-1].upper(),
            ))

        pred_ls.append(pred_result)
        all_alt_ls.append(alt_ls)
    pred_arr = np.array(pred_ls)
    return pred_arr, all_alt_ls