In [1]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import re

In [0]:
!pip install transformers

In [0]:
!pip install transformers
!pip install wget
# unzipping glue datasets
!unzip 60c2bdb54d156a41194446737ce03e2e-17b8dd0d724281ed7c3b2aeeda662b92809aadd5.zip

In [0]:
!python '/content/60c2bdb54d156a41194446737ce03e2e-17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py'

In [2]:
# Ask TensorFlow for the visible GPU and stop right away if it is not GPU 0.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# PyTorch device handle; later cells move every batch onto it with .to(device).
device = torch.device("cuda")
# Report the name of CUDA device 0.
print('We are using a ', torch.cuda.get_device_name(0))

Found GPU at: /device:GPU:0
We are using a  Tesla T4


-----
### Randomly selecting 100 cases from each subcase
To do so, we refactored our `read_and_convert_hans` function into `read_and_convert_hans_2`, which additionally returns the heuristic, subcase and template of every example.

In [0]:
from utils import read_and_convert_hans_2
# Read the HANS training file through the project helper. Its result is
# unpacked into seven names - presumably parallel per-example sequences
# (TODO confirm in utils); they become DataFrame columns in the next cell.
premises, hypotheses, pairIDs, gold_labels, heuristics, subcases, template = read_and_convert_hans_2('heuristics_train_set.txt')

In [0]:
# One table with a column per HANS field, in the order the fields were read.
df = pd.DataFrame().assign(
    premise=premises,
    hypothesis=hypotheses,
    pairID=pairIDs,
    gold_label=gold_labels,
    heuristic=heuristics,
    subcase=subcases,
    template=template,
)

In [0]:
# Rows whose gold label is 'non-entailment'.
non_entail = df.loc[df['gold_label'] == 'non-entailment']

In [0]:
# Rows whose gold label is 'entailment'.
entail = df.loc[df['gold_label'] == 'entailment']

----- 
A function that randomly samples a given number (`amount`) of examples per subcase from HANS

In [0]:
# A function to randomly sample x subcase examples from HANS
def get_hans_examples(df, amount, random_state=72):
  """Randomly draw `amount` distinct rows for every subcase found in `df`.

  Within a subcase the draw is spread evenly over its templates
  (int(amount / number_of_templates) rows each). Whatever is still missing
  to reach `amount` is then drawn from the subcase rows that were not picked
  yet, so no row is returned twice.

  Args:
    df: DataFrame with at least the columns 'premise', 'hypothesis',
      'pairID', 'gold_label', 'subcase' and 'template'.
    amount: number of rows to return per subcase.
    random_state: seed handed to every DataFrame.sample call.

  Returns:
    Four equally long lists: premises, hypotheses, pairIDs, gold_labels.

  Raises:
    ValueError: raised by DataFrame.sample when a template or a subcase
      holds too few rows to supply the requested number without repetition.
  """
  subcase_names = df['subcase'].unique()
  per_subcase = []
  for subcase in subcase_names:
    # A fresh 0..n-1 index guarantees unique labels, which makes the
    # "not picked yet" filter below exact. Sampling is positional, so the
    # rows chosen per template are unaffected by the re-indexing.
    sub_df = df[df['subcase'] == subcase].reset_index(drop=True)
    templates = sub_df['template'].unique()
    # equal distribution for each template
    per_template = int(amount/len(templates))
    picked = pd.concat([
        sub_df[sub_df['template'] == template].sample(n=per_template, random_state=random_state)
        for template in templates
    ])

    # if the even split falls short, top up from the rows not selected so far
    shortfall = amount - len(picked)
    if shortfall:
      remaining = sub_df.drop(index=picked.index)
      picked = pd.concat([picked, remaining.sample(n=shortfall, random_state=random_state)])
    per_subcase.append(picked)

  # An input without any subcase yields an empty frame with the same columns.
  output = pd.concat(per_subcase) if per_subcase else df.iloc[0:0]

  # every subcase present in the input contributes exactly `amount` rows
  assert len(output) == amount * len(subcase_names)

  # convert into lists required for HANS training
  premises = list(output['premise'])
  hypotheses = list(output['hypothesis'])
  pairIDs = list(output['pairID'])
  gold_labels = list(output['gold_label'])

  assert len(pairIDs) == len(premises)
  assert len(premises) == len(hypotheses)
  assert len(pairIDs) == len(gold_labels)

  return premises, hypotheses, pairIDs, gold_labels

In [0]:
#from utils import get_hans_examples
# Sample 100 rows per subcase, once from the entailment rows and once from the
# non-entailment rows; each call returns (premises, hypotheses, pairIDs, labels).
hans_premises_e, hans_hypotheses_e, hans_pairIDs_e, hans_labels_e = get_hans_examples(entail, 100)
hans_premises_n, hans_hypotheses_n, hans_pairIDs_n, hans_labels_n = get_hans_examples(non_entail, 100)

In [0]:
# Join both label groups, entailment rows first. Pair IDs are not carried over.
hans_premises = [*hans_premises_e, *hans_premises_n]
hans_hypotheses = [*hans_hypotheses_e, *hans_hypotheses_n]
hans_labels = [*hans_labels_e, *hans_labels_n]

In [0]:
from utils import read_data
# reading in MNLI dataset
# read_data is a project helper; each call is unpacked into three values that
# are used as premises, hypotheses and labels from here on.
train_premises, train_hypotheses, train_labels = read_data('/content/glue_data/MNLI/train.tsv')
val_premises, val_hypotheses, val_labels = read_data('/content/glue_data/MNLI/dev_matched.tsv')

In [0]:
# Append the sampled HANS examples to the MNLI training data (assumes read_data
# returns plain lists, so that `+` concatenates - TODO confirm).
# NOTE(review): the train_* names are rebound in place, so running this cell a
# second time would append the HANS sample again.
train_premises = train_premises + hans_premises
train_hypotheses = train_hypotheses + hans_hypotheses
train_labels = train_labels + hans_labels

In [82]:
from transformers import BertTokenizer
# loading bert tokenizer
# This one instance is handed to tokenize_sentences and tokenize_labels_hans
# in the following cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
from utils import tokenize_sentences
# Encode the premise/hypothesis pairs. The helper's three results are used as
# model input ids (*_inputs), token_type_ids (*_ids) and attention masks
# (*_masks) in the training loop. 128 is presumably the maximum sequence
# length - TODO confirm in utils.
train_inputs, train_ids, train_masks = tokenize_sentences(train_premises, train_hypotheses, 128, tokenizer)
val_inputs, val_ids, val_masks = tokenize_sentences(val_premises, val_hypotheses, 128, tokenizer)

In [0]:
from utils import tokenize_labels_hans
# Convert the label values with the project helper; the results are turned into
# tensors below. Presumably this yields two classes to match num_labels = 2 in
# the model cell - TODO confirm in utils.
# NOTE(review): train_labels / val_labels are rebound in place, so this cell is
# not meant to be run twice on the same kernel state.
train_labels = tokenize_labels_hans(train_labels, tokenizer)
val_labels = tokenize_labels_hans(val_labels, tokenizer)

In [0]:
# Turn the encoded inputs of both splits into tensors.
train_inputs, val_inputs = (
    torch.tensor(encoded) for encoded in (train_inputs, val_inputs)
)

In [0]:
# Turn the segment ids of both splits into tensors.
train_ids, val_ids = (
    torch.tensor(encoded) for encoded in (train_ids, val_ids)
)

In [0]:
# Turn the attention masks of both splits into tensors.
train_masks, val_masks = (
    torch.tensor(encoded) for encoded in (train_masks, val_masks)
)

In [0]:
# Turn the labels of both splits into tensors.
train_labels, val_labels = (
    torch.tensor(encoded) for encoded in (train_labels, val_labels)
)

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Mini-batch size shared by the training, validation and HANS test loaders.
batch_size = 32

In [0]:
# Datasets: every item is (input ids, attention mask, segment ids, label).
train_data = TensorDataset(train_inputs, train_masks, train_ids, train_labels)
validation_data = TensorDataset(val_inputs, val_masks, val_ids, val_labels)

# Training batches are drawn in random order, validation batches in stored order.
train_sampler = RandomSampler(data_source=train_data)
validation_sampler = SequentialSampler(data_source=validation_data)

# Loaders for both splits, using the shared batch size.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=batch_size, sampler=validation_sampler)

In [91]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained uncased BERT base checkpoint wrapped for sequence classification.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2,  # two output classes
    output_attentions = False,  # attention weights are not requested
    output_hidden_states = False,  # hidden states are not requested
)

# run model on GPU
model.cuda()

# Optimizer over all model parameters; its learning rate is driven by the
# scheduler that a later cell builds around this object.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8
                )

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [0]:
from transformers import get_linear_schedule_with_warmup

# number of training epochs (authors recommend between 2 and 4)
epochs = 1 # manually train 3 times to avoid GPU connection issues

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_dataloader)*epochs

# create the learning rate scheduler
# NOTE(review): the schedule is sized for a single epoch. If the training cell
# is re-run by hand for further epochs, this cell presumably has to be re-run
# first so the schedule starts over - TODO confirm.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [0]:
import re
from utils import tokenize_sentences
def read_and_convert_hans(filepath):
  """Parse a tab-separated HANS file into four lists.

  Every line is split on runs of tab characters. Columns 5 and 6 (premise and
  hypothesis) are collected for every line, the first (header) line included.
  Column 0 (gold label) and column 7 (pair id, with 'ex' removed and converted
  to int) are collected for every line after the first. As a result the two
  sentence lists hold one more entry than the id and label lists whenever the
  file is non-empty.

  Returns:
    (premises, hypotheses, pairIDs, gold_labels)
  """
  premises, hypotheses = [], []
  pairIDs, gold_labels = [], []
  with open(filepath) as file:
    for row_index, raw_line in enumerate(file):
      fields = re.split(r'\t+', raw_line)
      # ids and labels exist only on the rows that follow the header
      if row_index > 0:
        pairIDs.append(int(fields[7].replace('ex', '')))
        gold_labels.append(fields[0])
      premises.append(fields[5])
      hypotheses.append(fields[6])

    # sanity checks on the pairs of lists that must line up
    assert len(premises) == len(hypotheses)
    assert len(pairIDs) == len(gold_labels)

    return premises, hypotheses, pairIDs, gold_labels

In [94]:
import random
import time
import datetime
import re
import os
from google.colab import files


# NOTE(review): this switches the default dtype for floating-point tensors
# created from here on; the model was already instantiated in an earlier
# cell - confirm float64 is really intended.
torch.set_default_dtype(torch.float64)

seed = 72

# Seed every random number generator in use so runs are repeatable.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Average training loss of each epoch, in epoch order.
loss_values = []

for epoch in range(0, epochs):
  print('---------- Epoch %s ----------' % str(epoch))
  # start clock
  t0 = time.time()

  # reset loss for epoch
  total_loss = 0

  # put model into training mode
  model.train()

  # for each batch of the training data
  for step, batch in enumerate(train_dataloader):

    # progress report every 100 batches (not for the very first batch)
    if step % 100 == 0 and not step == 0:
      time_elapsed = str(datetime.timedelta(seconds=int(round(time.time() - t0))))
      print('\t Batch %i of %i. Time elapsed: %s' % (step, len(train_dataloader), time_elapsed))

    # retrieve tensors from dataloader (order as stored in the TensorDataset)
    # copy each to GPU using to(device)
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    sequence_ids = batch[2].to(device)
    labels = batch[3].to(device)

    # clear previously calculated gradients
    model.zero_grad()

    # perform forward pass
    # the loss is returned because labels are supplied
    outputs = model(
        input_ids = input_ids.long(),
        attention_mask = attention_mask.long(),
        token_type_ids = sequence_ids.long(),
        labels = labels.long()
        )

    loss = outputs[0]
    total_loss += loss.item()

    # perform backward pass to calculate gradients
    loss.backward()

    # Clip the norm of the gradients to 1.0 to help prevent "exploding gradients"
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update the learning rate
    scheduler.step()

  # Checkpointing is best-effort: a failed save must not abort the run, but
  # the reason is reported. `except Exception` (instead of a bare `except`)
  # lets KeyboardInterrupt through so the cell can still be stopped by hand.
  try:
    # NOTE: despite its name, output_dir is used as the path of a single file
    output_dir = '/content/saved_model'
    print("Saving model to %s" % output_dir)
    torch.save(model.state_dict(), output_dir)
    torch.save(model, '/content/entire_model.pth')
  except Exception as err:
    print('Saving Failed')
    print('\t%s: %s' % (type(err).__name__, err))

  # Calculate the average loss over the training data.
  avg_train_loss = total_loss / len(train_dataloader)
  loss_values.append(avg_train_loss)

  print('--- Average Training Loss: %f' % avg_train_loss)

  # Everything below is evaluation only.
  model.eval()

  # HANS predictions are best-effort as well: on failure the error is reported
  # and the epoch continues with the MNLI validation.
  try:
    hans_premises, hans_hypotheses, hans_pairIDs, hans_labels = read_and_convert_hans('/content/heuristics_evaluation_set.txt')
    test_inputs, test_ids, test_masks = tokenize_sentences(hans_premises, hans_hypotheses, 128, tokenizer)
    test_inputs = torch.tensor(test_inputs)
    test_ids = torch.tensor(test_ids)
    test_masks = torch.tensor(test_masks)
    hans_pairIDs = torch.tensor(hans_pairIDs)
    # pair IDs take the place of labels so predictions can be matched back
    test_data = TensorDataset(test_inputs, test_masks, test_ids, hans_pairIDs)
    # Evaluation needs no shuffling; a sequential pass keeps the prediction
    # file in input order and leaves the global torch RNG untouched.
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
    predictions = []
    pair_ids = []

    for batch in test_dataloader:

      batch = tuple(t.to(device) for t in batch)

      # Unpack the inputs from dataloader
      input_ids, attention_mask, sequence_ids, batch_pair_ids = batch

      # no need for grad since evaluation
      with torch.no_grad():
        outputs = model(input_ids = input_ids.long(),
                        attention_mask = attention_mask.long(),
                        token_type_ids = sequence_ids.long())

      # Move logits and pair ids to CPU
      logits = outputs[0].detach().cpu().numpy()
      batch_pair_ids = batch_pair_ids.to('cpu').numpy()

      # one [logit_0, logit_1] pair and one 'ex<id>' string per example
      for i in range(0, len(logits)):
        predictions.append([float(logits[i][0]), float(logits[i][1])])
        pair_ids.append('ex' + str(batch_pair_ids[i]))

    df = pd.DataFrame()
    df['pairID'] = pair_ids
    df['gold_label'] = predictions
    df.to_csv('hans_predictions_post.csv', index=False)

    print('---- HANS Testing Completed ----')
  except Exception as err:
    print('---- HANS testing failed ----')
    print('\t%s: %s' % (type(err).__name__, err))

  # Measure performance on validation set. The clock restarts here so that the
  # reported duration covers the validation pass only, not the HANS pass.
  t0 = time.time()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  nb_eval_correct = 0

  for batch in validation_dataloader:

    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from dataloader
    input_ids, attention_mask, sequence_ids, labels = batch

    # no need for grad since evaluation
    with torch.no_grad():
      outputs = model(input_ids = input_ids.long(),
                      attention_mask = attention_mask.long(),
                      token_type_ids = sequence_ids.long())

    # Get the "logits" output by the model. The "logits" are the output
    # values prior to applying an activation function like the softmax.
    # Move logits and labels to CPU
    logits = outputs[0].detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    # Count correct predictions and examples, so that a shorter final batch
    # is weighted by its size instead of counting as a full batch.
    nb_eval_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten()))
    nb_eval_examples += len(label_ids)

    # Track the number of batches
    nb_eval_steps += 1

  # Report the final accuracy for this validation run (nan if nothing was evaluated).
  eval_accuracy = nb_eval_correct / nb_eval_examples if nb_eval_examples else float('nan')
  print("  Accuracy: {0:.2f}".format(eval_accuracy))
  print("  Validation took: {:}".format((datetime.timedelta(seconds=int(round(time.time() - t0))))))

print('Training complete.')

---------- Epoch 0 ----------
	 Batch 100 of 12366. Time elapsed: 0:01:11
	 Batch 200 of 12366. Time elapsed: 0:02:22
	 Batch 300 of 12366. Time elapsed: 0:03:33
	 Batch 400 of 12366. Time elapsed: 0:04:44
	 Batch 500 of 12366. Time elapsed: 0:05:54
	 Batch 600 of 12366. Time elapsed: 0:07:05
	 Batch 700 of 12366. Time elapsed: 0:08:16
	 Batch 800 of 12366. Time elapsed: 0:09:27
	 Batch 900 of 12366. Time elapsed: 0:10:38
	 Batch 1000 of 12366. Time elapsed: 0:11:48
	 Batch 1100 of 12366. Time elapsed: 0:12:59
	 Batch 1200 of 12366. Time elapsed: 0:14:10
	 Batch 1300 of 12366. Time elapsed: 0:15:21
	 Batch 1400 of 12366. Time elapsed: 0:16:31
	 Batch 1500 of 12366. Time elapsed: 0:17:42
	 Batch 1600 of 12366. Time elapsed: 0:18:53
	 Batch 1700 of 12366. Time elapsed: 0:20:03
	 Batch 1800 of 12366. Time elapsed: 0:21:14
	 Batch 1900 of 12366. Time elapsed: 0:22:25
	 Batch 2000 of 12366. Time elapsed: 0:23:36
	 Batch 2100 of 12366. Time elapsed: 0:24:46
	 Batch 2200 of 12366. Time elapse

In [0]:
import re
from utils import tokenize_sentences
def read_and_convert_hans(filepath):
  """Parse a tab-separated HANS file into four lists.

  Every line is split on runs of tab characters. Columns 5 and 6 (premise and
  hypothesis) are collected for every line, the first (header) line included.
  Column 0 (gold label) and column 7 (pair id, with 'ex' removed and converted
  to int) are collected for every line after the first. As a result the two
  sentence lists hold one more entry than the id and label lists whenever the
  file is non-empty.

  Returns:
    (premises, hypotheses, pairIDs, gold_labels)
  """
  premises, hypotheses = [], []
  pairIDs, gold_labels = [], []
  with open(filepath) as file:
    for row_index, raw_line in enumerate(file):
      fields = re.split(r'\t+', raw_line)
      # ids and labels exist only on the rows that follow the header
      if row_index > 0:
        pairIDs.append(int(fields[7].replace('ex', '')))
        gold_labels.append(fields[0])
      premises.append(fields[5])
      hypotheses.append(fields[6])

    # sanity checks on the pairs of lists that must line up
    assert len(premises) == len(hypotheses)
    assert len(pairIDs) == len(gold_labels)

    return premises, hypotheses, pairIDs, gold_labels

In [0]:
# Load the HANS evaluation set. NOTE: this rebinds hans_premises, hans_hypotheses
# and hans_labels, which earlier held the sampled HANS training examples.
hans_premises, hans_hypotheses, hans_pairIDs, hans_labels = read_and_convert_hans('/content/heuristics_evaluation_set.txt')

In [0]:
# Encode the HANS evaluation pairs with the same helper, tokenizer and third
# argument (128) that were used for the training and validation data.
test_inputs, test_ids, test_masks = tokenize_sentences(hans_premises, hans_hypotheses, 128, tokenizer)

In [0]:
# Turn the encoded HANS evaluation data and its pair ids into tensors.
test_inputs, test_ids, test_masks, hans_pairIDs = (
    torch.tensor(values)
    for values in (test_inputs, test_ids, test_masks, hans_pairIDs)
)

In [0]:
# Bundle the HANS tensors; the pair ids ride along in place of labels so each
# prediction can be matched back to its example.
test_data = TensorDataset(test_inputs, test_masks, test_ids, hans_pairIDs)
# Evaluation needs no shuffling: a sequential pass keeps the exported
# predictions in input order and does not draw from the global torch RNG.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [36]:
# Collected across all batches: one [logit_0, logit_1] pair and one 'ex<id>'
# string per HANS example.
predictions = []
pair_ids = []

# inference mode for the model
model.eval()

# Tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in test_dataloader:

    # move the whole batch to the GPU, then split it into its four parts
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_mask, sequence_ids, batch_pair_ids = batch

    # forward pass only; no gradients are needed for prediction
    with torch.no_grad():
        outputs = model(input_ids = input_ids.long(),
                        attention_mask = attention_mask.long(),
                        token_type_ids = sequence_ids.long())

    # bring logits and pair ids back to the CPU as numpy arrays
    logits = outputs[0].detach().cpu().numpy()
    batch_pair_ids = batch_pair_ids.to('cpu').numpy()

    for example_logits, example_id in zip(logits, batch_pair_ids):
        predictions.append([float(example_logits[0]), float(example_logits[1])])
        pair_ids.append('ex' + str(example_id))

# write one row per example: pair id plus its two logits
df = pd.DataFrame().assign(pairID=pair_ids, gold_label=predictions)
df.to_csv('hans_predictions_post.csv', index=False)

print('---- HANS Testing Completed ----')

---- HANS Testing Completed ----


In [0]:
# Write the collected pair ids and predictions to the CSV file that is
# submitted to the Kaggle competition.
df = pd.DataFrame().assign(pairID=pair_ids, gold_label=predictions)
df.to_csv('hans_predictions_trained.csv', index=False)