**CSI5138F - Intro to DL and RL - Project**

This notebook is heavily influenced by the following Tutorial:
  https://mccormickml.com/2019/07/22/BERT-fine-tuning/ 

# Set up the notebook

In [13]:
# install pytorch transformers
!pip install pytorch-transformers

You should consider upgrading via the '/Users/aidanaarts/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


## Imports and mount drive

In [14]:
import torch
import os
import string
import copy
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_transformers import *
import numpy as np
import json
import collections
#from google.colab import drive
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

# Import nltk WordNet
import nltk
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()

# Mount google drive containing the datasets
#drive.mount('/content/drive', force_remount=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aidanaarts/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Get RoBERTa tokenizer

In [15]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

## Utility functions

In [16]:
# Create a function to import json objects from jsonl files
def load_json_objects_from_file(filename):
  # Array for json objects
  json_objects = []
  # Read file line by line
  with open(filename, mode = "r") as jsonl_file:
      for line in jsonl_file:
          json_objects.append(json.loads(line))
  return json_objects

#Takes a list of words (strings) and a sentence (as a RoBERTa tokenized ID list) and returns a list
#of pairs indicating the tokens' start and end positions in the sentence for each word
#NOTE: it can also apply to a group of words separated by spaces
#NOTE: It is important that the list of words given describes a sentence, because the order is relevant to do the matching properly
      
def find_word_in_tokenized_sentence(word,token_ids):
  decomposedWord = tokenizer.encode(word)
  # Iterate through to find a matching sublist of the token_ids
  for i in range(len(token_ids)):
    if token_ids[i] == decomposedWord[0] and token_ids[i:i+len(decomposedWord)] == decomposedWord:
      return (i,i+len(decomposedWord)-1)
  # This is the ouput when no matching pattern is found
  return (-1,-1)
  
def find_words_in_tokenized_sentences(wordList,token_ids):
  intList = []
  for word in wordList:
    if len(intList) == 0:
      intList.append(find_word_in_tokenized_sentence(word,token_ids))
    else:
      afterLastInterval = intList[-1][1]+1
      interv = find_word_in_tokenized_sentence(word,token_ids[afterLastInterval:])
      actualPositions = (interv[0] + afterLastInterval,interv[1]+afterLastInterval)
      intList.append(actualPositions)
  return intList

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels, return_predict_correctness = False):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  if return_predict_correctness:
    return np.sum(pred_flat == labels_flat) / len(labels_flat), pred_flat == labels_flat
  else:
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def flat_predictions(preds):
  pred_flat = np.argmax(preds, axis=1).flatten()
  return pred_flat == 1

## Constants

In [17]:
BATCH_SIZE = 64
EPOCHS = 50
PATIENCE = 10
# Prepare Torch to use GPU
device = torch.device("cpu")
#n_gpu = torch.cuda.device_count()
#torch.cuda.get_device_name(0)

## Import and Process the WordInContext Training data

In [18]:
# Create a function to preprocess the WiC data
def wic_preprocessing(json_objects, training = True, shuffle_data = False, verbose = False):
  wic_sentences = []
  wic_encoded = []
  wic_labels = []
  wic_word_locs = []
  wic_indexes = []
  for index, example in enumerate(json_objects):
    #wic_indexes.append(example['idx']) # Is it the index??
    wic_indexes.append(index)
    sentence = f"<s>{example['sentence1']}</s><s>{example['sentence2']}</s>"
    wic_sentences.append(sentence)
    # Then encode the sentences
    wic_encoded.append(tokenizer.encode(sentence, add_special_tokens=False))
    # Find the word in each sentences
    word = example['word']
    word_locs = (-1, -1)
    # Split the 2 sentences on space. (Also, lemmatize and uncapitilize each word)
    sent1_split = example['sentence1'].split(' ')
    sent2_split = example['sentence2'].split(' ')
    # Get the index of word in both sentences
    sent1_word_char_loc = (example['start1'], example['end1'])
    sent2_word_char_loc = (example['start2'], example['end2'])
    # Create a variable to keep track of the number of characters parsed in each sentence as we loop
    sent_chars = 0
    # Loop over the words in the first sentence
    i, j = 0, 0
    word1_not_found, word2_not_found = True, True
    while word1_not_found and i < len(sent1_split):
      word_len = len(sent1_split[i])
      if sent_chars >= sent1_word_char_loc[0] or sent_chars + word_len >= sent1_word_char_loc[1]:
        word_locs = (i, -1) # Found the word in the sentence
        word1_not_found = False
      elif sent_chars > sent1_word_char_loc[1]:
        # If we somehow got past the word. Assume it was the previous word
        word_locs = (i - 1, -1) # Found the word in the sentence
        word1_not_found = False
      else:
        # Look at the next word
        sent_chars += word_len + 1 # Plus one for the space
        i += 1
    # Loop over the words in the second
    sent_chars = 0 # Reset
    while word2_not_found and j < len(sent2_split):
      word_len = len(sent2_split[j])
      if sent_chars >= sent2_word_char_loc[0] or sent_chars + word_len >= sent2_word_char_loc[1]:
        word_locs = (i, j) # Found the word in the sentence
        word2_not_found = False
      elif sent_chars > sent2_word_char_loc[1]:
        # If we somehow got past the word. Assume it was the previous word
        word_locs = (i, j - 1) # Found the word in the sentence
        word2_not_found = False
      else:
        # Look at the next word
        sent_chars += word_len + 1 # Plus one for the space
        j += 1
    # For testing
    if verbose:
      print(word)
      print(sent1_split)
      print(sent2_split)
      print(word_locs)
    # Now to find the word in the tokenized sentences
    word1 = sent1_split[word_locs[0]].translate(str.maketrans('', '', string.punctuation)) #Remove punctuation (See https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string)
    word2 = sent2_split[word_locs[1]].translate(str.maketrans('', '', string.punctuation)) #Remove punctuation
    token_word_locs = find_words_in_tokenized_sentences([word1, word2], wic_encoded[-1])
    wic_word_locs.append(token_word_locs)
    # Get the label if we expect it to be there
    if training:
      if example['label']:
        wic_labels.append(1)
      else:
        wic_labels.append(0)
  # Pad the sequences and find the encoded word location in the combined input
  max_len = np.array([len(ex) for ex in wic_encoded]).max()
  wic_padded = {"input_ids" : [], "attention_mask" : [], "token_type_ids" : [], "word1_locs": [], "word2_locs" : [], "index" : wic_indexes}
  for i in range(0, len(wic_encoded)):
    enc_sentence = wic_encoded[i]
    word_locs = wic_word_locs[i]
    # Pad the sequences
    ex_len = len(enc_sentence)
    padded_sentence = enc_sentence.copy()
    padded_sentence.extend([0]*(max_len - ex_len))
    wic_padded["input_ids"].append(padded_sentence)
    padded_mask = [1] * ex_len
    padded_mask.extend([0]*(max_len - ex_len))
    wic_padded["attention_mask"].append(padded_mask)
    # Create the vector to get back the words after RoBERTa
    token_word_locs = wic_word_locs[i]
    first_word_loc = []
    second_word_loc = []
    len_first_word = token_word_locs[0][1] - token_word_locs[0][0] + 1
    len_second_word = token_word_locs[1][1] - token_word_locs[1][0] + 1
    for j in range(0, max_len):
      if j >= token_word_locs[0][0] and j <= token_word_locs[0][1]:
        #Part of the first word
        first_word_loc.append(1.0 / len_first_word)
      else:
        first_word_loc.append(0.0)
      if j >= token_word_locs[1][0] and j <= token_word_locs[1][1]:
        #Part of the second word
        second_word_loc.append(1.0 / len_second_word)
      else:
        second_word_loc.append(0.0)
    # We want to append a [1, max_len] vector instead of a [max_len] vector so wrap in an array
    wic_padded["word1_locs"].append([first_word_loc])
    wic_padded["word2_locs"].append([second_word_loc])
    # token_type_ids is a mask that tells where the first and second sentences are
    token_type_id = []
    first_sentence = True
    sentence_start = True
    for token in padded_sentence:
      if first_sentence and sentence_start and token == 0:
        # Allows 0 at the start of the first sentence
        token_type_id.append(0)
      elif first_sentence and token > 0:
        if sentence_start:
          sentence_start = False
        token_type_id.append(0)
      elif first_sentence and not sentence_start and token == 0:
        first_sentence = False
        # Start of second sentence
        token_type_id.append(1)
      else:
        # Second sentence
        token_type_id.append(1)
    wic_padded["token_type_ids"].append(token_type_id)
  if training:
    if shuffle_data:
      # Shuffle the data
      raw_set = {"input_ids": [], "token_type_ids": [], "attention_mask": [], "labels": [], "word1_locs": [], "word2_locs" : [], "index" : []}
      raw_set["input_ids"], raw_set["token_type_ids"], raw_set["attention_mask"], raw_set["labels"], raw_set["word1_locs"], raw_set["word2_locs"], raw_set["index"] = shuffle(
          wic_padded["input_ids"], wic_padded["token_type_ids"], wic_padded["attention_mask"], wic_labels, wic_padded["word1_locs"], wic_padded["word2_locs"], wic_padded["index"])
    else:
      raw_set = {"input_ids": wic_padded["input_ids"], "token_type_ids": wic_padded["token_type_ids"],
                 "attention_mask": wic_padded["attention_mask"], "labels": wic_labels, "index" : wic_padded["index"],
                 "word1_locs": wic_padded["word1_locs"], "word2_locs" : wic_padded["word2_locs"]}
  else: # No labels present (Testing set)
    # Do not shuffle the testing set
    raw_set = {"input_ids": wic_padded["input_ids"], "token_type_ids": wic_padded["token_type_ids"], 
               "attention_mask": wic_padded["attention_mask"], "index" : wic_padded["index"], 
               "word1_locs": wic_padded["word1_locs"], "word2_locs" : wic_padded["word2_locs"]}
  # Return the raw data (Need to put them in a PyTorch tensor and dataset)
  return raw_set

In [19]:
# Process the data
train_json_objs = load_json_objects_from_file("/Users/aidanaarts/Documents/Fall2021/CSCI 3832 Natural Language Processing/Class Project/WiC/train.jsonl")
raw_train_set = wic_preprocessing(train_json_objs, shuffle_data=True, verbose = False) # We do not want to shuffle for now.
print(train_json_objs[raw_train_set["index"][15]])
print(raw_train_set["input_ids"][15]),
print(raw_train_set["token_type_ids"][15]),
print(raw_train_set["attention_mask"][15]),
print(raw_train_set["labels"][15])
print(raw_train_set["word1_locs"][15]) #right, so it's better to use one-hot vectors to do a selection afterwards, I can compute it more precisely with the token intervals however
print(raw_train_set["word2_locs"][15])

{'word': 'face', 'sentence1': 'An angry face.', 'sentence2': 'He looked out at a roomful of faces.', 'idx': 4400, 'label': False, 'start1': 9, 'start2': 30, 'end1': 13, 'end2': 35, 'version': 1.1}
[0, 660, 5800, 652, 4, 2, 0, 91, 1415, 66, 23, 10, 929, 2650, 9, 2419, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0
[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [20]:
print(len(raw_train_set["labels"])/BATCH_SIZE)

78.125


### Load into a PyTorch dataset

In [21]:
# Create a PyTorch dataset for it
train_data = TensorDataset(
    torch.tensor(raw_train_set["input_ids"]),
    torch.tensor(raw_train_set["token_type_ids"]),
    torch.tensor(raw_train_set["attention_mask"]),
    torch.tensor(raw_train_set["labels"]),
    torch.tensor(raw_train_set["word1_locs"]),
    torch.tensor(raw_train_set["word2_locs"]),
    torch.tensor(raw_train_set["index"])
)
# Create a sampler and loader
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

### Repeat the procedure to load the WiC validation and testing sets

In [22]:
# Load the json objects from each file
test_json_objs = load_json_objects_from_file("/Users/aidanaarts/Documents/Fall2021/CSCI 3832 Natural Language Processing/Class Project/WiC/test.jsonl")
print(test_json_objs)
valid_json_objs = load_json_objects_from_file("/Users/aidanaarts/Documents/Fall2021/CSCI 3832 Natural Language Processing/Class Project/WiC/dev.jsonl")
# Process the objects
raw_test_set = wic_preprocessing(test_json_objs, training = False) # The labels for the testing set are unknown
raw_valid_set = wic_preprocessing(valid_json_objs)
# Create PyTorch datasets
test_data = TensorDataset(
    torch.tensor(raw_test_set["input_ids"]),
    torch.tensor(raw_test_set["token_type_ids"]),
    torch.tensor(raw_test_set["attention_mask"]),
    torch.tensor(raw_test_set["word1_locs"]),
    torch.tensor(raw_test_set["word2_locs"]),
    torch.tensor(raw_test_set["index"])
)
validation_data = TensorDataset(
    torch.tensor(raw_valid_set["input_ids"]),
    torch.tensor(raw_valid_set["token_type_ids"]),
    torch.tensor(raw_valid_set["attention_mask"]),
    torch.tensor(raw_valid_set["labels"]),
    torch.tensor(raw_valid_set["word1_locs"]),
    torch.tensor(raw_valid_set["word2_locs"]),
    torch.tensor(raw_valid_set["index"])
)
# Create a sampler and loader for each
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)



In [23]:
print(len(raw_valid_set["labels"])/BATCH_SIZE)

6.6875


# RoBERTa

## Loading the model

In [24]:
# Variable for the path to the model
NAME = 'Aidan'
PATH = '/Users/aidanaarts/Documents/Fall2021/CSCI 3832 Natural Language Processing/Class Project/AIDAN'
if not os.path.exists(PATH):
  os.mkdir(PATH) # Make the directory

# Load the base RoBERTa model
#model = RobertaModel.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')
#roberta_init_weights = model.state_dict()

# This is where the model weights would be loaded for our fine-tuned models
#Model_name = "RoBERTa_0.0_1.0_0.0_Epochs2"
#PATH = f'/content/drive/My Drive/CSI5138_Project/Models/{NAME}/{Model_name}'
#model = RobertaModel.from_pretrained(PATH)
#model = RobertaForMaskedLM.from_pretrained(PATH)

## Create a custom head for WiC instead of a classification head
Based on https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html and https://huggingface.co/transformers/_modules/transformers/modeling_roberta.html#RobertaModel



In [25]:
class WiC_Head(torch.nn.Module):
    def __init__(self, roberta_based_model, embedding_size = 768):
        """
        Keeps a reference to the provided RoBERTa model. 
        It then adds a linear layer that takes the distance between two 
        """
        super(WiC_Head, self).__init__()
        self.embedding_size = embedding_size
        self.embedder = roberta_based_model
        self.linear_diff = torch.nn.Linear(embedding_size, 250, bias = True)
        self.linear_seperator = torch.nn.Linear(250, 2, bias = True)
        self.loss = torch.nn.CrossEntropyLoss()
        self.activation = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax()

    def forward(self, input_ids=None, attention_mask=None, labels=None,
                word1_locs = None, word2_locs = None):
        """
        Takes in the same argument as RoBERTa forward plus two tensors for the location of the 2 words to compare
        """
        if word1_locs is None or word2_locs is None:
          raise ValueError("The tensors (word1_locs, word1_locs) containing the location of the words to compare in the input vector must be provided.")
        elif input_ids is None:
          raise ValueError("The input_ids tensor must be provided.")
        elif word1_locs.shape[0] != input_ids.shape[0] or word2_locs.shape[0] != input_ids.shape[0]:
          raise ValueError("All provided vectors should have the same batch size.")
        batch_size = word1_locs.shape[0]
        # Get the embeddings
        embs, _ = self.embedder.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Get the words
        word1s = torch.matmul(word1_locs, embs).view(batch_size, self.embedding_size)
        word2s = torch.matmul(word2_locs, embs).view(batch_size, self.embedding_size)
        diff = word1s - word2s
        # Calculate outputs using activation
        layer1_results = self.activation(self.linear_diff(diff))
        logits = self.softmax(self.linear_seperator(layer1_results))
        outputs = logits
        # Calculate the loss
        if labels is not None:
            #  We want seperation like a SVM so use Hinge loss
            loss = self.loss(logits.view(-1, 2), labels.view(-1))
            outputs = (loss, logits)
        return outputs

## Create the WiC model

In [26]:
class_model = WiC_Head(model, embedding_size = 768)

## The training loop

In [27]:
# Variable for minimal accuracy
MIN_ACCURACY = 0.73 # Based on the average accuracy
REACHED_MIN_ACCURACY = False
best_weights = class_model.state_dict()
# Want to maximize accuracy
max_val_acc = (0, 0)
# Put the model in GPU
###class_model.cuda()
# Create the optimizer
param_optimizer = list(class_model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
# I use the one that comes with the models, but any other optimizer could be used
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
# Store our loss and accuracy for plotting
fit_history = {"loss": [],  "accuracy": [], "val_loss": [], "val_accuracy": []}
epoch_number = 0
epoch_since_max = 0
continue_learning = True
while epoch_number < EPOCHS and continue_learning:
  epoch_number += 1
  print(f"Training epoch #{epoch_number}")
  # Tracking variables
  tr_loss, tr_accuracy = 0, 0
  nb_tr_examples, nb_tr_steps = 0, 0
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  # Training
  # Set our model to training mode (as opposed to evaluation mode)
  class_model.train()
  # Freeze RoBERTa weights
  #class_model.embedder.eval()
  class_model.embedder.requires_grad_ = False
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    ###batch = tuple(t.cuda() for t in batch)
    batch = tuple(t for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)   
    loss, logits = class_model(b_input_ids, attention_mask=b_input_mask, 
                               labels=b_labels, word1_locs = b_word1, word2_locs = b_word2) 
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()
    # Calculate the accuracy
    b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification
    # Append to fit history
    fit_history["loss"].append(loss.item()) 
    fit_history["accuracy"].append(b_accuracy) 
    # Update tracking variables
    tr_loss += loss.item()
    tr_accuracy += b_accuracy
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
    if nb_tr_steps%10 == 0:
      print("\t\tTraining Batch {}: Loss: {}; Accuracy: {}".format(nb_tr_steps, loss.item(), b_accuracy))
  print("Training:\n\tLoss: {}; Accuracy: {}".format(tr_loss/nb_tr_steps, tr_accuracy/nb_tr_steps))
  # Validation
  # Put model in evaluation mode to evaluate loss on the validation set
  class_model.eval()
  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    ###batch = tuple(t.cuda() for t in batch)
    batch = tuple(t for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)
      loss, logits = class_model(b_input_ids, attention_mask=b_input_mask, 
                                 labels=b_labels, word1_locs = b_word1, word2_locs = b_word2)
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()
    # Calculate the accuracy
    b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification
    # Append to fit history
    fit_history["val_loss"].append(loss.item()) 
    fit_history["val_accuracy"].append(b_accuracy) 
    # Update tracking variables
    eval_loss += loss.item()
    eval_accuracy += b_accuracy
    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1
    if nb_eval_steps%10 == 0:
      print("\t\tValidation Batch {}: Loss: {}; Accuracy: {}".format(nb_eval_steps, loss.item(), b_accuracy))
  eval_acc = eval_accuracy/nb_eval_steps
  if eval_acc >= max_val_acc[0]:
    max_val_acc = (eval_acc, epoch_number)
    continue_learning = True
    epoch_since_max = 0 # New max
    best_weights = copy.deepcopy(class_model.state_dict()) # Keep the best weights
    # See if we have reached min_accuracy
    if eval_acc >= MIN_ACCURACY:
      REACHED_MIN_ACCURACY = True
    # Save to file only if it has reached min acc
    if REACHED_MIN_ACCURACY:
      # Save the best weights to file
      torch.save(best_weights, os.path.join(PATH,'WiCHead.pt'))
      continue_learning = False # Stop learning. Reached baseline acc for this model
  else:
    epoch_since_max += 1
    if epoch_since_max > PATIENCE:
      continue_learning = False # Stop learning, starting to overfit
  print("Validation:\n\tLoss={}; Accuracy: {}".format(eval_loss/nb_eval_steps, eval_accuracy/nb_eval_steps))
print(f"Best accuracy ({max_val_acc[0]}) obtained at epoch #{max_val_acc[1]}.")
# Reload the best weights (from memory)
class_model.load_state_dict(best_weights)

Training epoch #1


  logits = self.softmax(self.linear_seperator(layer1_results))
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /Users/distiller/project/conda/conda-bld/pytorch_1607370249289/work/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


		Training Batch 10: Loss: 0.6912416219711304; Accuracy: 0.484375
		Training Batch 20: Loss: 0.6872810125350952; Accuracy: 0.5
		Training Batch 30: Loss: 0.6806833148002625; Accuracy: 0.671875
		Training Batch 40: Loss: 0.6856250762939453; Accuracy: 0.5625
		Training Batch 50: Loss: 0.6972631812095642; Accuracy: 0.453125
		Training Batch 60: Loss: 0.6756414771080017; Accuracy: 0.65625
		Training Batch 70: Loss: 0.6912974119186401; Accuracy: 0.515625
Training:
	Loss: 0.6870369601853287; Accuracy: 0.533623417721519
Validation:
	Loss=0.6723114422389439; Accuracy: 0.6523944805194805
Training epoch #2
		Training Batch 10: Loss: 0.6456862092018127; Accuracy: 0.671875
		Training Batch 20: Loss: 0.6352983117103577; Accuracy: 0.796875
		Training Batch 30: Loss: 0.6364216804504395; Accuracy: 0.640625
		Training Batch 40: Loss: 0.6566485166549683; Accuracy: 0.609375
		Training Batch 50: Loss: 0.6083206534385681; Accuracy: 0.71875
		Training Batch 60: Loss: 0.6504693627357483; Accuracy: 0.640625
	

<All keys matched successfully>

### Save fit history to json file

In [0]:
###with open(os.path.join(PATH, "fit_history.json"), 'w') as json_file:
##  json.dump(fit_history, json_file)

### Save a model that was close to the baseline (Manual choice)

In [0]:
# Save the best weights to file
# torch.save(best_weights, os.path.join(PATH,'WiCHead.pt'))

## Load an already trained WiC model from file

In [0]:
# Load the model
#class_model.load_state_dict(torch.load(os.path.join(PATH,'WiCHead.pt')))
# Put the model in GPU
#class_model.cuda()

## Get the predictions on the validation set

In [31]:
validation_predictions_correctness = {}
# Validation
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Put model in evaluation mode
class_model.eval()
# Evaluate data for one epoch
for batch in validation_dataloader:
  # Add batch to GPU
  ###batch = tuple(t.cuda() for t in batch)
  batch = tuple(t for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up validation
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)
    loss, logits = class_model(b_input_ids, attention_mask=b_input_mask, 
                                labels=b_labels, word1_locs = b_word1, word2_locs = b_word2)
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.cpu().numpy()
  # Calculate the accuracy
  b_accuracy, b_pred_correctness = flat_accuracy(logits, label_ids, return_predict_correctness = True) # For RobertaForClassification
  indexes = b_index.detach().cpu().numpy() # Get the indexes
  # Add to predictions
  for index, pred in zip(indexes, b_pred_correctness):
    validation_predictions_correctness[index] = pred
  # Update tracking variables
  eval_loss += loss.item()
  eval_accuracy += b_accuracy
  nb_eval_examples += b_input_ids.size(0)
  nb_eval_steps += 1
  if nb_eval_steps%10 == 0:
    print("\t\tValidation Batch {}: Loss: {}; Accuracy: {}".format(nb_eval_steps, loss.item(), b_accuracy))
print("Validation:\n\tLoss={}; Accuracy: {}".format(eval_loss/nb_eval_steps, eval_accuracy/nb_eval_steps))
validation_predictions_correctness = collections.OrderedDict(sorted(validation_predictions_correctness.items()))
print(validation_predictions_correctness)

  logits = self.softmax(self.linear_seperator(layer1_results))


Validation:
	Loss=0.5554699301719666; Accuracy: 0.7556818181818182
OrderedDict([(0, True), (1, False), (2, True), (3, True), (4, True), (5, True), (6, True), (7, True), (8, True), (9, True), (10, True), (11, False), (12, True), (13, True), (14, False), (15, True), (16, True), (17, True), (18, True), (19, False), (20, False), (21, True), (22, True), (23, True), (24, True), (25, False), (26, False), (27, True), (28, True), (29, True), (30, False), (31, True), (32, True), (33, True), (34, True), (35, True), (36, True), (37, True), (38, True), (39, True), (40, False), (41, True), (42, True), (43, True), (44, True), (45, True), (46, True), (47, False), (48, True), (49, False), (50, True), (51, True), (52, True), (53, True), (54, True), (55, True), (56, False), (57, True), (58, True), (59, True), (60, False), (61, True), (62, True), (63, False), (64, True), (65, False), (66, False), (67, True), (68, True), (69, True), (70, True), (71, True), (72, True), (73, False), (74, False), (75, True), 

## Get the testing results
**Works for now. Would need to be modified to submit for competition reward.**

See https://competitions.codalab.org/competitions/20010#learn_the_details-evaluation

**Update:** The jsonl file from superglue is not in the same order as the text file taken from WiC. It doesn't match the idx parameter either. That is weird.

In [32]:
test_predictions = {}
nb_test_examples, nb_test_steps = 0, 0
# Testing
# Put model in evaluation mode to evaluate loss on the validation set
class_model.eval()
# Evaluate data for one epoch
for batch in test_dataloader:
  # Add batch to GPU
  ###batch = tuple(t.cuda() for t in batch)
  batch = tuple(t for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_token_ids, b_input_mask, b_word1, b_word2, b_index = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up validation
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = class_model(b_input_ids, attention_mask=b_input_mask, word1_locs = b_word1, word2_locs = b_word2)
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  # Get the predictions
  b_preds = flat_predictions(logits)
  indexes = b_index.detach().cpu().numpy() # Get the indexes
  for index, pred in zip(indexes, b_preds):
    test_predictions[index] = pred
  # Update tracking variables
  nb_test_examples += b_input_ids.size(0)
  nb_test_steps += 1
  if nb_test_steps%10 == 0:
    print("\t\tTest Batch {}".format(nb_test_steps))
# Print final results
print("Testing Done!")
test_predictions = collections.OrderedDict(sorted(test_predictions.items()))
print(test_predictions)

  logits = self.softmax(self.linear_seperator(layer1_results))


		Test Batch 10
Testing Done!
OrderedDict([(0, True), (1, False), (2, False), (3, True), (4, True), (5, True), (6, True), (7, True), (8, True), (9, True), (10, False), (11, False), (12, True), (13, True), (14, True), (15, False), (16, False), (17, False), (18, True), (19, True), (20, True), (21, True), (22, False), (23, True), (24, False), (25, True), (26, False), (27, True), (28, True), (29, True), (30, True), (31, True), (32, True), (33, True), (34, False), (35, False), (36, True), (37, True), (38, False), (39, True), (40, True), (41, True), (42, False), (43, True), (44, False), (45, True), (46, False), (47, True), (48, True), (49, False), (50, True), (51, False), (52, False), (53, True), (54, True), (55, True), (56, True), (57, True), (58, False), (59, True), (60, True), (61, True), (62, True), (63, False), (64, True), (65, False), (66, True), (67, True), (68, True), (69, True), (70, True), (71, True), (72, False), (73, False), (74, True), (75, True), (76, True), (77, True), (78, Tr

In [1]:
test_predicitons = [(0, True), (1, True), (2, True), (3, False), (4, True), (5, True), (6, True), (7, True), (8, True), (9, True), (10, True), (11, True), (12, True), (13, True), (14, True), (15, True), (16, False), (17, False), (18, True), (19, True), (20, True), (21, True), (22, False), (23, True), (24, False), (25, True), (26, False), (27, True), (28, False), (29, True), (30, True), (31, True), (32, True), (33, True), (34, False), (35, False), (36, True), (37, True), (38, False), (39, True), (40, False), (41, False), (42, False), (43, True), (44, False), (45, True), (46, False), (47, True), (48, False), (49, False), (50, True), (51, False), (52, False), (53, True), (54, True), (55, True), (56, True), (57, False), (58, True), (59, False), (60, False), (61, False), (62, True), (63, False), (64, True), (65, True), (66, True), (67, True), (68, True), (69, True), (70, True), (71, True), (72, False), (73, False), (74, True), (75, True), (76, True), (77, False), (78, True), (79, False), (80, True), (81, True), (82, True), (83, False), (84, True), (85, True), (86, True), (87, False), (88, True), (89, True), (90, False), (91, True), (92, False), (93, False), (94, True), (95, True), (96, True), (97, False), (98, True), (99, True), (100, True), (101, True), (102, False), (103, False), (104, False), (105, True), (106, False), (107, True), (108, True), (109, True), (110, False), (111, True), (112, True), (113, False), (114, False), (115, True), (116, False), (117, False), (118, False), (119, False), (120, True), (121, True), (122, False), (123, False), (124, True), (125, False), (126, True), (127, True), (128, True), (129, True), (130, True), (131, True), (132, True), (133, True), (134, False), (135, True), (136, False), (137, True), (138, True), (139, True), (140, True), (141, False), (142, False), (143, True), (144, False), (145, True), (146, True), (147, True), (148, False), (149, True), (150, False), (151, False), (152, True), (153, True), (154, False), (155, True), (156, True), (157, False), (158, True), (159, False), (160, False), (161, False), (162, True), (163, False), (164, True), (165, True), (166, True), (167, True), (168, True), (169, True), (170, False), (171, False), (172, True), (173, False), (174, True), (175, False), (176, False), (177, True), (178, False), (179, False), (180, True), (181, False), (182, False), (183, True), (184, False), (185, True), (186, False), (187, True), (188, False), (189, True), (190, False), (191, True), (192, False), (193, True), (194, True), (195, True), (196, True), (197, False), (198, False), (199, False), (200, False), (201, False), (202, True), (203, True), (204, False), (205, False), (206, False), (207, False), (208, False), (209, False), (210, True), (211, True), (212, True), (213, False), (214, False), (215, False), (216, True), (217, True), (218, True), (219, True), (220, False), (221, False), (222, True), (223, True), (224, True), (225, False), (226, False), (227, True), (228, False), (229, False), (230, True), (231, False), (232, False), (233, False), (234, True), (235, True), (236, True), (237, False), (238, False), (239, True), (240, True), (241, True), (242, True), (243, True), (244, True), (245, False), (246, True), (247, False), (248, True), (249, True), (250, True), (251, False), (252, False), (253, True), (254, True), (255, True), (256, True), (257, True), (258, True), (259, True), (260, True), (261, True), (262, True), (263, True), (264, False), (265, True), (266, False), (267, False), (268, True), (269, False), (270, True), (271, False), (272, True), (273, False), (274, True), (275, True), (276, False), (277, True), (278, True), (279, False), (280, True), (281, False), (282, False), (283, False), (284, True), (285, True), (286, True), (287, False), (288, False), (289, True), (290, True), (291, False), (292, True), (293, True), (294, False), (295, True), (296, True), (297, True), (298, False), (299, False), (300, True), (301, True), (302, True), (303, True), (304, True), (305, True), (306, False), (307, True), (308, True), (309, True), (310, True), (311, True), (312, False), (313, False), (314, True), (315, False), (316, True), (317, True), (318, True), (319, True), (320, False), (321, True), (322, False), (323, True), (324, False), (325, True), (326, True), (327, True), (328, True), (329, False), (330, False), (331, True), (332, True), (333, True), (334, True), (335, True), (336, False), (337, True), (338, False), (339, False), (340, True), (341, False), (342, True), (343, True), (344, False), (345, False), (346, True), (347, True), (348, True), (349, True), (350, True), (351, True), (352, False), (353, True), (354, False), (355, True), (356, True), (357, True), (358, True), (359, True), (360, False), (361, True), (362, True), (363, True), (364, True), (365, False), (366, True), (367, False), (368, True), (369, True), (370, True), (371, False), (372, False), (373, True), (374, True), (375, True), (376, True), (377, True), (378, False), (379, True), (380, False), (381, True), (382, True), (383, True), (384, True), (385, False), (386, False), (387, False), (388, True), (389, True), (390, True), (391, True), (392, True), (393, True), (394, True), (395, True), (396, False), (397, True), (398, False), (399, True), (400, True), (401, False), (402, True), (403, True), (404, False), (405, False), (406, False), (407, False), (408, True), (409, True), (410, True), (411, False), (412, False), (413, False), (414, False), (415, False), (416, True), (417, False), (418, True), (419, False), (420, False), (421, True), (422, True), (423, True), (424, True), (425, True), (426, True), (427, True), (428, True), (429, True), (430, True), (431, False), (432, True), (433, False), (434, False), (435, False), (436, True), (437, True), (438, True), (439, False), (440, False), (441, True), (442, True), (443, True), (444, True), (445, False), (446, True), (447, True), (448, True), (449, True), (450, True), (451, False), (452, True), (453, False), (454, True), (455, True), (456, True), (457, False), (458, True), (459, False), (460, True), (461, True), (462, True), (463, True), (464, False), (465, False), (466, True), (467, True), (468, False), (469, True), (470, False), (471, False), (472, True), (473, True), (474, True), (475, False), (476, True), (477, True), (478, False), (479, False), (480, True), (481, True), (482, True), (483, False), (484, True), (485, True), (486, True), (487, False), (488, False), (489, True), (490, False), (491, False), (492, False), (493, True), (494, True), (495, True), (496, False), (497, True), (498, True), (499, True), (500, True), (501, False), (502, True), (503, True), (504, True), (505, True), (506, True), (507, False), (508, False), (509, True), (510, True), (511, True), (512, False), (513, False), (514, False), (515, True), (516, True), (517, True), (518, True), (519, True), (520, True), (521, True), (522, True), (523, False), (524, False), (525, False), (526, True), (527, False), (528, False), (529, False), (530, True), (531, False), (532, False), (533, True), (534, True), (535, True), (536, True), (537, True), (538, True), (539, False), (540, True), (541, True), (542, False), (543, False), (544, True), (545, False), (546, True), (547, False), (548, False), (549, True), (550, False), (551, False), (552, True), (553, False), (554, False), (555, True), (556, True), (557, True), (558, True), (559, False), (560, True), (561, True), (562, False), (563, True), (564, True), (565, False), (566, True), (567, False), (568, True), (569, False), (570, True), (571, False), (572, False), (573, True), (574, False), (575, False), (576, False), (577, True), (578, False), (579, True), (580, True), (581, True), (582, True), (583, True), (584, False), (585, False), (586, True), (587, True), (588, True), (589, True), (590, False), (591, True), (592, False), (593, True), (594, True), (595, False), (596, True), (597, True), (598, True), (599, False), (600, True), (601, True), (602, True), (603, True), (604, True), (605, False), (606, True), (607, True), (608, True), (609, True), (610, False), (611, True), (612, True), (613, False), (614, True), (615, True), (616, True), (617, True), (618, False), (619, True), (620, True), (621, False), (622, True), (623, True), (624, False), (625, True), (626, True), (627, True), (628, False), (629, True), (630, True), (631, True), (632, True), (633, False), (634, False), (635, True), (636, True), (637, True)]
test_json_objs = load_json_objects_from_file("/Users/aidanaarts/Documents/Fall2021/CSCI 3832 Natural Language Processing/Class Project/WiC/test.jsonl")
count = 0
for i in range(0, len(test_predicitons)):
    tp_label = test_predicitons[i][1]
    #print(label)
    tjo_label = test_json_objs[i]['label']
    #print(tjo_label)
#    test_json_objs
    if tp_label == tjo_label:
        count += 1
percent = count / 638
print(percent)

NameError: name 'load_json_objects_from_file' is not defined