In [0]:
import tensorflow as tf
import torch
import os
import pandas as pd
import numpy as np

### Using a GPU for faster training time

In [0]:
# Ask TensorFlow which GPU device (if any) this runtime exposes.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    # Fail fast: later cells move the model and every batch to CUDA.
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# Device handle used by later cells when calling `.to(device)`.
device = torch.device("cuda")
# Print the name of CUDA device 0 as a sanity check.
print('We are using a ', torch.cuda.get_device_name(0))

Found GPU at: /device:GPU:0
We are using a  Tesla P100-PCIE-16GB


### Installing dependencies

In [0]:
!pip install transformers
!pip install wget
# unzipping glue datasets
!unzip 60c2bdb54d156a41194446737ce03e2e-17b8dd0d724281ed7c3b2aeeda662b92809aadd5.zip

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |████████████████████████████████| 501kB 2.6MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 12.2MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 16.6MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████

In [0]:
# downloading datasets
# Runs the download script shipped in the archive unzipped by the install cell.
# NOTE(review): later cells read from /content/glue_data/MNLI, so the script
# presumably writes to ./glue_data when run from /content — confirm.
!python '/content/60c2bdb54d156a41194446737ce03e2e-17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py'

Downloading and extracting CoLA...
	Completed!
Downloading and extracting SST...
	Completed!
Processing MRPC...
Local MRPC data not specified, downloading data from https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt
	Completed!
Downloading and extracting QQP...
	Completed!
Downloading and extracting STS...
	Completed!
Downloading and extracting MNLI...
	Completed!
Downloading and extracting SNLI...
	Completed!
Downloading and extracting QNLI...
	Completed!
Downloading and extracting RTE...
	Completed!
Downloading and extracting WNLI...
	Completed!
Downloading and extracting diagnostic...
	Completed!


-------------------
### Reading Dataset

In [0]:
import csv
import sys
import re

# function to read and convert MNLI dataset
def read_data(filepath):
  """Read a tab-separated MNLI file into parallel lists.

  Every non-empty line is split on single tab characters, so an empty field
  keeps its column position. Columns 8, 9 and 10 are collected as premise,
  hypothesis and label. The header line is NOT skipped: element 0 of each
  returned list is the corresponding column name.

  Args:
    filepath: path to the TSV file (read as UTF-8).

  Returns:
    A tuple ``(premises, hypotheses, labels)`` of equally long lists of strings.

  Raises:
    IndexError: if a non-empty line has fewer than 11 tab-separated fields.
  """
  premises = []
  hypotheses = []
  labels = []
  with open(filepath, encoding='utf-8') as tsvfile:
    for raw_line in tsvfile:
      raw_line = raw_line.rstrip('\r\n')
      # Blank lines (e.g. a trailing newline at end of file) carry no example.
      if not raw_line:
        continue
      # Plain split: no quote handling, quotes inside sentences stay literal.
      line = raw_line.split('\t')
      premises.append(line[8])
      hypotheses.append(line[9])
      # NOTE(review): column 10 is the first label column; for files that carry
      # several annotator labels the consensus label may live in a later
      # column — confirm this is the intended one for the dev set.
      labels.append(line[10])
  assert (len(premises) == len(hypotheses))
  assert (len(premises) == len(labels))

  return premises, hypotheses, labels

In [0]:
# Each returned list still has the TSV header as element 0 (read_data keeps
# every row); the tokenizing helpers below skip that first element.
train_premises, train_hypotheses, train_labels = read_data('/content/glue_data/MNLI/train.tsv')
val_premises, val_hypotheses, val_labels = read_data('/content/glue_data/MNLI/dev_matched.tsv')

-----------------
### Tokenizing

- Add beginning and end tokens
- Pad & Truncate to single length
- Add attention mask

e.g:

      For sequence pairs:
      tokens:    [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
      type_ids:      0 0  0    0    0     0       0 0     1  1  1  1   1 1
      attention_mask:1 1  1    1    1     1       1 1     1  1  1  1   1 1

      source: https://github.com/google-research/bert/blob/cc7051dc592802f501e8a6f71f8fb3cf9de95dc9/run_classifier.py#L161

  Note that all three of these will be padded with 0's to match the max sequence length

In [0]:
from transformers import BertTokenizer
# loading bert tokenizer
# This module-level `tokenizer` is read by tokenize_sentences and is saved
# next to the model in the checkpoint cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
def tokenize_sentences(premises, hypotheses, max_sequence_length):
  """Encode premise/hypothesis pairs as fixed-length BERT-style inputs.

  Element 0 of ``premises`` and ``hypotheses`` is skipped (it is treated as
  the header row). Every other pair becomes
  ``[CLS] premise [SEP] hypothesis [SEP]`` followed by zero padding, using
  the module-level ``tokenizer`` to turn text into token ids.

  Args:
    premises: list of premise strings, header first.
    hypotheses: list of hypothesis strings, header first; indexed in step
      with ``premises``.
    max_sequence_length: exact length of every encoded example.

  Returns:
    A tuple ``(encoded_sentences, type_ids, attention_masks)``. Each is a
    list with one entry per non-header pair, and each entry is a list of
    ``max_sequence_length`` Python ints (token ids, segment ids 0/1 and
    mask values 0/1 respectively).

  Raises:
    ValueError: if ``max_sequence_length`` cannot hold the three special tokens.
  """
  cls_token_id = 101  # [CLS]
  sep_token_id = 102  # [SEP]

  if max_sequence_length < 3:
    raise ValueError('max_sequence_length must be at least 3 to fit [CLS] and two [SEP] tokens')

  # Room left for real tokens once [CLS] and the two [SEP] are accounted for.
  token_budget = max_sequence_length - 3

  # premises, hypothesis, and labels should all have same length
  encoded_sentences = []
  type_ids = []
  attention_masks = []

  # Start at 1: position 0 is the header line.
  for i in range(1, len(premises)):
    # encoding premise
    encoded_premise = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(premises[i]))
    # encoding hypothesis
    encoded_hypothesis = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(hypotheses[i]))

    # truncate if length is too large
    # following heuristic in https://github.com/google-research/bert/blob/cc7051dc592802f501e8a6f71f8fb3cf9de95dc9/run_classifier.py#L557
    # i.e. drop one token at a time from the LONGER sequence (ties trim the hypothesis)
    while len(encoded_premise) + len(encoded_hypothesis) > token_budget:
      if len(encoded_premise) > len(encoded_hypothesis):
        encoded_premise.pop()
      else:
        encoded_hypothesis.pop()

    # build vectors; plain ints so the resulting tensors are integer-typed
    encoded_sentence = [cls_token_id] + encoded_premise + [sep_token_id]
    type_id = [0] * len(encoded_sentence)      # segment 0: [CLS] premise [SEP]

    encoded_sentence += encoded_hypothesis
    encoded_sentence.append(sep_token_id)
    type_id += [1] * (len(encoded_hypothesis) + 1)  # segment 1: hypothesis [SEP]

    assert len(encoded_sentence) == len(type_id)

    # attention mask: 1 for every real token, 0 for the padding added below
    attention_mask = [1] * len(encoded_sentence)

    # pad to max_sequence_length
    padding_length = max_sequence_length - len(encoded_sentence)
    encoded_sentence += [0] * padding_length
    type_id += [0] * padding_length
    attention_mask += [0] * padding_length

    assert len(encoded_sentence) == max_sequence_length
    assert len(type_id) == max_sequence_length
    assert len(attention_mask) == max_sequence_length

    encoded_sentences.append(encoded_sentence)
    type_ids.append(type_id)
    attention_masks.append(attention_mask)

  return encoded_sentences, type_ids, attention_masks

In [0]:
def tokenize_labels(labels):
  """Map label strings to class ids, ignoring the header entry.

  'entailment' becomes 0, 'contradiction' becomes 1 and every other value
  becomes 2. The first element of ``labels`` (the column header, 'label1')
  is dropped, so the result is one element shorter than the input.
  """
  def to_id(label):
    if label == 'entailment':
      return 0
    return 1 if label == 'contradiction' else 2

  # Slice from 1 to leave out the header entry.
  return [to_id(label) for label in labels[1:]]

The following steps may take a few minutes

In [0]:
# 128 is the fixed length every encoded example is truncated/padded to.
train_inputs, train_ids, train_masks = tokenize_sentences(train_premises, train_hypotheses, 128)

In [0]:
# Same sequence length as the training split so both share one input shape.
val_inputs, val_ids, val_masks = tokenize_sentences(val_premises, val_hypotheses, 128)

In [0]:
# NOTE: these assignments replace the raw string labels with integer ids.
# Running this cell a second time would feed the ids back through
# tokenize_labels (dropping the first one and mapping the rest to 2), so
# re-run the data-loading cell before re-running this one.
train_labels = tokenize_labels(train_labels)
val_labels = tokenize_labels(val_labels)

We now need to convert these vectors to tensors

In [0]:
# Token-id lists -> tensors (one row per example).
train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)

In [0]:
# Segment (token type) id lists -> tensors; dtype is inferred by torch.tensor,
# and the training/evaluation loops cast each batch with .long().
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)

In [0]:
# Attention-mask lists -> tensors; cast with .long() per batch later on.
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

In [0]:
# Integer class ids (0, 1, 2) -> tensors.
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

### Creating DataLoaders
DataLoaders are Python iterables that yield the data one mini-batch at a time.

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [0]:
# Number of examples per mini-batch for the DataLoaders built below.
batch_size = 32

In [0]:
# Create DataLoader for training set
# Tensor order (inputs, masks, type ids, labels) is the order in which the
# training and evaluation loops unpack each batch.
train_data = TensorDataset(train_inputs, train_masks, train_ids, train_labels)
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create DataLoader for validation set
validation_data = TensorDataset(val_inputs, val_masks, val_ids, val_labels)
# Validation batches are read in dataset order.
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

--------
### Loading BERT

In [0]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [0]:
# Load 'bert-base-uncased' into a sequence-classification model.
# num_labels = 3 matches the three class ids produced by tokenize_labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 3,  
    output_attentions = False, 
    output_hidden_states = False, 
)

# run model on GPU
model.cuda()

# AdamW over all model parameters; lr and eps are fixed here and the learning
# rate is subsequently driven by the scheduler created in the next cell.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8
                )

In [0]:
from transformers import get_linear_schedule_with_warmup

# number of training epochs (authors recommend between 2 and 4)
epochs = 3

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps.
total_steps = len(train_dataloader)*epochs

# create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

### Train the model

Again, this script is based on https://mccormickml.com/2019/07/22/BERT-fine-tuning/#11-using-colab-gpu-for-training

In [0]:
import random
import time
import datetime
import os
from google.colab import files


# NOTE(review): this changes torch's default floating-point dtype from here on;
# the model was already built in an earlier cell — confirm it is still needed.
torch.set_default_dtype(torch.float64)

seed = 72

# Seed every RNG used below (Python, NumPy, torch CPU and CUDA).
# NOTE(review): the model was instantiated in an earlier cell, i.e. before
# these seeds are set — confirm whether its initialisation must be
# reproducible as well.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Average training loss of each epoch, as plain Python floats.
loss_values = []

for epoch in range(0, epochs):
  print('---------- Epoch %s ----------' % str(epoch))
  # start clock
  t0 = time.time()

  # reset loss for epoch
  total_loss = 0.0

  # put model into training mode
  model.train()

  # for each batch of the training data
  for step, batch in enumerate(train_dataloader):

    # progress report every 100 batches
    if step % 100 == 0 and not step == 0:
      time_elapsed = str(datetime.timedelta(seconds=int(round(time.time() - t0))))
      print('\t Batch %i of %i. Time elapsed: %s' % (step, len(train_dataloader), time_elapsed))

    # retrieve tensors from dataloader
    # copy each to GPU using to(device)
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    sequence_ids = batch[2].to(device)
    labels = batch[3].to(device)

    # clear previously calculated gradients
    model.zero_grad()

    # perform forward pass
    # the loss is returned first because labels are supplied
    outputs = model(
        input_ids = input_ids.long(),
        attention_mask = attention_mask.long(),
        token_type_ids = sequence_ids.long(),
        labels = labels.long()
        )

    loss = outputs[0]
    # Accumulate a Python float: adding the tensor itself would keep a
    # graph-attached GPU tensor alive for the whole epoch.
    total_loss += loss.item()

    # perform backward pass to calculate gradients
    loss.backward()

    # Clip the norm of the gradients to 1.0 to help prevent "exploding gradients"
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update the learning rate
    scheduler.step()

  # ---- checkpoint after every epoch ----
  # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
  output_dir = '/content/model_save/'
  print("Saving model to %s" % output_dir)

  # Defined outside the try so the name always exists for later cells.
  model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
  try:
    # Create the directory first so saving does not depend on it already existing.
    os.makedirs(output_dir, exist_ok=True)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
  except Exception as exc:
    # Best effort: a failed checkpoint must not abort training, but the reason
    # is reported. KeyboardInterrupt is deliberately not caught.
    print('Saving Failed')
    print(repr(exc))

  # Calculate the average loss over the training data.
  avg_train_loss = total_loss / len(train_dataloader)
  loss_values.append(avg_train_loss)

  print('--- Average Training Loss: %f' % avg_train_loss)

  # Measure performance on validation set
  t0 = time.time()
  model.eval()

  # Tracking variables: correct predictions and examples seen so far
  eval_correct, nb_eval_examples = 0, 0

  for batch in validation_dataloader:

      batch = tuple(t.to(device) for t in batch)

      # Unpack the inputs from dataloader
      input_ids, attention_mask, sequence_ids, labels = batch

      # no need for grad since evaluation
      with torch.no_grad():

        outputs = model(input_ids = input_ids.long(),
                          attention_mask = attention_mask.long(),
                          token_type_ids = sequence_ids.long())

      # Get the "logits" output by the model. The "logits" are the output
      # values prior to applying an activation function like the softmax.
      logits = outputs[0]

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = labels.to('cpu').numpy()

      # Count correct predictions per example rather than averaging per-batch
      # accuracies, so a shorter final batch is not over-weighted.
      eval_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten()))
      nb_eval_examples += len(label_ids)

  # Report the final accuracy for this validation run.
  eval_accuracy = eval_correct / nb_eval_examples
  print("  Accuracy: {0:.2f}".format(eval_accuracy))
  print("  Validation took: {:}".format((datetime.timedelta(seconds=int(round(time.time() - t0))))))

print('Training complete.')


---------- Epoch 0 ----------
	 Batch 100 of 12272. Time elapsed: 0:01:13
	 Batch 200 of 12272. Time elapsed: 0:02:23
	 Batch 300 of 12272. Time elapsed: 0:03:34
	 Batch 400 of 12272. Time elapsed: 0:04:44
	 Batch 500 of 12272. Time elapsed: 0:05:55
	 Batch 600 of 12272. Time elapsed: 0:07:06
	 Batch 700 of 12272. Time elapsed: 0:08:16
	 Batch 800 of 12272. Time elapsed: 0:09:27
	 Batch 900 of 12272. Time elapsed: 0:10:38
	 Batch 1000 of 12272. Time elapsed: 0:11:48
	 Batch 1100 of 12272. Time elapsed: 0:12:59
	 Batch 1200 of 12272. Time elapsed: 0:14:10
	 Batch 1300 of 12272. Time elapsed: 0:15:20
	 Batch 1400 of 12272. Time elapsed: 0:16:31
	 Batch 1500 of 12272. Time elapsed: 0:17:41
	 Batch 1600 of 12272. Time elapsed: 0:18:52
	 Batch 1700 of 12272. Time elapsed: 0:20:03
	 Batch 1800 of 12272. Time elapsed: 0:21:13
	 Batch 1900 of 12272. Time elapsed: 0:22:24
	 Batch 2000 of 12272. Time elapsed: 0:23:35
	 Batch 2100 of 12272. Time elapsed: 0:24:45
	 Batch 2200 of 12272. Time elapse

In [0]:
# Stand-alone validation pass over the dev set with the current model weights.
model.eval()

# Tracking variables: correct predictions and examples seen so far
eval_correct, nb_eval_examples = 0, 0

for batch in validation_dataloader:

    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from dataloader
    input_ids, attention_mask, sequence_ids, labels = batch

    # no need for grad since evaluation
    with torch.no_grad():

      outputs = model(input_ids = input_ids.long(),
                      attention_mask = attention_mask.long(),
                      token_type_ids = sequence_ids.long())

    # Get the "logits" output by the model. The "logits" are the output
    # values prior to applying an activation function like the softmax.
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    # Count correct predictions per example rather than averaging per-batch
    # accuracies, so a shorter final batch is not over-weighted.
    eval_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten()))
    nb_eval_examples += len(label_ids)

# Report the final accuracy for this validation run.
eval_accuracy = eval_correct / nb_eval_examples
print("  Accuracy: {0:.2f}".format(eval_accuracy))

  Accuracy: 0.85


----------
### Testing Model

In [0]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): no cell visible here reads from or writes to the mount —
# confirm it is still required.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# since no labels are provided in the data set, we must redefine the read_data function
def read_test_data(filepath):
  """Read an unlabeled tab-separated MNLI test file.

  Every non-empty line is split on single tab characters, so an empty field
  keeps its column position. Columns 8 and 9 are collected as premise and
  hypothesis for every line INCLUDING the header, so element 0 of those two
  lists is the column name. Column 2 is parsed as an integer pair id for
  every line except the header.

  Args:
    filepath: path to the TSV file (read as UTF-8).

  Returns:
    A tuple ``(premises, hypotheses, pair_ids)``. ``premises`` and
    ``hypotheses`` are lists of strings with the header first; ``pair_ids``
    is a list of ints with no header entry (one element shorter).

  Raises:
    IndexError: if a non-empty line has fewer than 10 tab-separated fields.
    ValueError: if the pair id of a data line is not an integer.
  """
  premises = []
  hypotheses = []
  pair_ids = []
  with open(filepath, encoding='utf-8') as tsvfile:
    first = True
    for raw_line in tsvfile:
      raw_line = raw_line.rstrip('\r\n')
      # Blank lines (e.g. a trailing newline at end of file) carry no example.
      if not raw_line:
        continue
      # Plain split: no quote handling, quotes inside sentences stay literal.
      line = raw_line.split('\t')
      premises.append(line[8])
      hypotheses.append(line[9])
      if first:
        # The header contributes to the text lists only; it has no pair id.
        first = False
        continue
      pair_ids.append(int(line[2]))
  assert (len(premises) == len(hypotheses))

  return premises, hypotheses, pair_ids

In [0]:
# The text lists keep the header as element 0 while test_pair_ids does not;
# tokenize_sentences skips element 0, so the encoded rows line up with the ids.
test_premises, test_hypotheses, test_pair_ids = read_test_data('/content/glue_data/MNLI/test_matched.tsv')
test_inputs, test_ids, test_masks = tokenize_sentences(test_premises, test_hypotheses, 128)

In [0]:
# Convert the encoded test lists (and the integer pair ids) to tensors so
# they can be wrapped in a TensorDataset.
test_inputs = torch.tensor(test_inputs)
test_ids = torch.tensor(test_ids)
test_masks = torch.tensor(test_masks)
test_pair_ids = torch.tensor(test_pair_ids)

In [0]:
# Re-assigns the same batch size used for training and validation.
batch_size = 32

In [0]:
# Pair ids travel with each example so predictions can be matched back to rows.
test_data = TensorDataset(test_inputs, test_masks, test_ids, test_pair_ids)
# Inference gains nothing from shuffling: read the test set in file order so
# the prediction file is deterministic, like the validation loader.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [0]:
# Filled batch by batch: one logits row and one pair id per test example.
predictions = []
pair_ids = []

model.eval()

# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in test_dataloader:

    # Move every tensor of the batch to the GPU, then name the four parts.
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_mask, sequence_ids, batch_pair_ids = batch

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
      outputs = model(input_ids = input_ids.long(),
                      attention_mask = attention_mask.long(),
                      token_type_ids = sequence_ids.long())

    # Bring logits and pair ids back to the CPU as NumPy arrays.
    logits = outputs[0].detach().cpu().numpy()
    batch_pair_ids = batch_pair_ids.to('cpu').numpy()

    # Store results example by example, in batch order.
    predictions.extend(logits)
    pair_ids.extend(batch_pair_ids[:len(logits)])

print('---- Testing Completed ----')

---- Testing Completed ----


In [0]:
# Class ids follow tokenize_labels: 0 -> entailment, 1 -> contradiction,
# anything else -> neutral.
index_to_label = {0: 'entailment', 1: 'contradiction'}

label_predictions = []
for prediction in predictions:
  # Position of the largest logit is the predicted class id.
  label_index = np.argmax(prediction)
  label_predictions.append(index_to_label.get(int(label_index), 'neutral'))

In [0]:
# Build the Kaggle submission table in one step (pair id + predicted label)
# and write it without the index column.
df = pd.DataFrame({
    'pairID': pair_ids,
    'gold_label': label_predictions,
})
df.to_csv('predictions.csv', index=False)

In [0]:
# Display the submission table as this cell's output.
df

Unnamed: 0,pairID,gold_label
0,55615,neutral
1,85595,neutral
2,42972,entailment
3,133689,contradiction
4,108690,neutral
...,...,...
9791,113013,entailment
9792,14918,neutral
9793,101150,entailment
9794,69735,contradiction


In [0]:
output_dir = '/content/model_save_final/'
print("Saving model to %s" % output_dir)

# Create the target directory so saving does not depend on it already existing.
os.makedirs(output_dir, exist_ok=True)

# Resolve the object to save in this cell rather than relying on a variable
# left behind by the training cell.
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to /content/model_save_final/


('/content/model_save_final/vocab.txt',
 '/content/model_save_final/special_tokens_map.json',
 '/content/model_save_final/added_tokens.json')

In [0]:
def read_and_convert_hans(filepath):
  """Read the tab-separated HANS evaluation file.

  Every non-empty line is split on single tab characters, so an empty field
  keeps its column position. Columns 5 and 6 are collected as premise and
  hypothesis for every line INCLUDING the header, so element 0 of those two
  lists is the column name. For every line except the header, column 7 has
  its 'ex' text removed and is parsed as an integer pair id, and column 0 is
  collected as the gold label.

  Args:
    filepath: path to the HANS file (read as UTF-8).

  Returns:
    A tuple ``(premises, hypotheses, pairIDs, gold_labels)``. The first two
    lists have the header first; the last two have no header entry and are
    therefore one element shorter.

  Raises:
    IndexError: if a non-empty line has too few tab-separated fields.
    ValueError: if a pair id is not an integer once 'ex' is removed.
  """
  premises = []
  hypotheses = []
  pairIDs = []
  gold_labels = []
  first_line = True
  with open(filepath, encoding='utf-8') as file:
    for fline in file:
      fline = fline.rstrip('\r\n')
      # Blank lines (e.g. a trailing newline at end of file) carry no example.
      if not fline:
        continue
      line = fline.split('\t')
      if first_line:
        # The header contributes to the text lists only.
        first_line = False
        premises.append(line[5])
        hypotheses.append(line[6])
        continue
      pairIDs.append(int(re.sub('ex', '', line[7])))
      gold_labels.append(line[0])
      premises.append(line[5])
      hypotheses.append(line[6])

  assert(len(premises) == len(hypotheses))
  assert(len(pairIDs) == len(gold_labels))

  return premises, hypotheses, pairIDs, gold_labels

In [0]:
# The text lists keep the header as element 0; hans_pairIDs and hans_labels do not.
hans_premises, hans_hypotheses, hans_pairIDs, hans_labels = read_and_convert_hans('/content/heuristics_evaluation_set.txt')

In [0]:
# NOTE: this reuses the test_* names, overwriting the MNLI test tensors.
test_inputs, test_ids, test_masks = tokenize_sentences(hans_premises, hans_hypotheses, 128)

In [0]:
# Convert the encoded HANS lists (and the integer pair ids) to tensors.
test_inputs = torch.tensor(test_inputs)
test_ids = torch.tensor(test_ids)
test_masks = torch.tensor(test_masks)
hans_pairIDs = torch.tensor(hans_pairIDs)

In [0]:
test_data = TensorDataset(test_inputs, test_masks, test_ids, hans_pairIDs)
# Build a sampler over THIS dataset. Reusing a sampler created for an earlier
# dataset would draw indices from that dataset's length instead of this one's.
# Sequential order keeps the prediction file deterministic.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [0]:
# Filled batch by batch: one logits row and one 'ex<N>' pair id per HANS example.
predictions = []
pair_ids = []

model.eval()

# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in test_dataloader:

    # Move every tensor of the batch to the GPU, then name the four parts.
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_mask, sequence_ids, batch_pair_ids = batch

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
      outputs = model(input_ids = input_ids.long(),
                      attention_mask = attention_mask.long(),
                      token_type_ids = sequence_ids.long())

    # Bring logits and pair ids back to the CPU as NumPy arrays.
    logits = outputs[0].detach().cpu().numpy()
    batch_pair_ids = batch_pair_ids.to('cpu').numpy()

    # Store results example by example; the numeric id gets its 'ex' prefix
    # back so it matches the identifiers in the HANS file.
    predictions.extend(logits)
    pair_ids.extend('ex' + str(pair_id) for pair_id in batch_pair_ids[:len(logits)])

print('---- Testing Completed ----')

---- Testing Completed ----


In [0]:
# Dump the raw HANS outputs to CSV.
# NOTE: `predictions` holds the per-example logit arrays at this point, so the
# 'gold_label' column of this file contains logits, not label strings; the
# labelled file is written further down as hans_predictions_with_labels.csv.
df = pd.DataFrame()
df['pairID'] = pair_ids
df['gold_label'] = predictions
df.to_csv('hans_predictions.csv', index=False)

In [0]:
# Writes the same frame to the same path as the previous cell (redundant).
df.to_csv('hans_predictions.csv', index=False)

In [0]:
# HANS is scored as a two-way task: class id 0 stays 'entailment' and every
# other class id is reported as 'non-entailment'.
label_predictions = []
for prediction in predictions:
  label_index = np.argmax(prediction)
  is_entailment = (label_index == 0)
  label_predictions.append('entailment' if is_entailment else 'non-entailment')

In [0]:
# Pair each HANS example id with its predicted two-way label.
df = pd.DataFrame({
    'pairID': pair_ids,
    'gold_label': label_predictions,
})

In [0]:
# This is the file passed to evaluate_heur_output.py below.
df.to_csv('hans_predictions_with_labels.csv', index=False)

In [0]:
# NOTE(review): no cell visible in this notebook writes
# 'predictions_with_labels.csv' (the labelled HANS file is
# 'hans_predictions_with_labels.csv'), and `test` is not used afterwards —
# confirm the intended path or whether this cell can be dropped.
import pandas as pd
test = pd.read_csv('predictions_with_labels.csv')

-----
Evaluating Results on HANS

In [0]:
# Score the labelled HANS predictions; evaluate_heur_output.py is expected to
# be in the working directory.
!python evaluate_heur_output.py hans_predictions_with_labels.csv

Heuristic entailed results:
lexical_overlap: 0.9588
subsequence: 0.9852
constituent: 0.992

Heuristic non-entailed results:
lexical_overlap: 0.4464
subsequence: 0.0958
constituent: 0.1524

Subcase results:
ln_subject/object_swap: 0.425
ln_preposition: 0.616
ln_relative_clause: 0.484
ln_passive: 0.022
ln_conjunction: 0.685
le_relative_clause: 0.963
le_around_prepositional_phrase: 0.999
le_around_relative_clause: 0.988
le_conjunction: 0.845
le_passive: 0.999
sn_NP/S: 0.028
sn_PP_on_subject: 0.247
sn_relative_clause_on_subject: 0.132
sn_past_participle: 0.004
sn_NP/Z: 0.068
se_conjunction: 0.933
se_adjective: 1.0
se_understood_object: 0.995
se_relative_clause_on_obj: 0.998
se_PP_on_obj: 1.0
cn_embedded_under_if: 0.398
cn_after_if_clause: 0.03
cn_embedded_under_verb: 0.201
cn_disjunction: 0.022
cn_adverb: 0.111
ce_embedded_under_since: 0.968
ce_after_since_clause: 1.0
ce_embedded_under_verb: 0.992
ce_conjunction: 1.0
ce_adverb: 1.0

Template results:
temp1: 0.425
temp5: 0.3885350318471338


In [0]:
# Reload the labelled HANS predictions written earlier for further inspection.
hans = pd.read_csv('hans_predictions_with_labels.csv')