This code is taken from our first homework in 685, and is modified to solve our task!

In [None]:
# Please note that this code just follows the provided video
# Mount Google Drive at /content/drive so the project files are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Project folder, given relative to "My Drive"
folderName = 'UMass/Spring 2022/COMPSCI685/CS685 Project/Sanity Check'
# NOTE(review): folderName is a literal assigned on the line above, so this
# check cannot fail as written; it only matters if the value is replaced by None.
assert folderName is not None, "[Error] Please enter folder name."


# Make python files stored in the project folder importable
import sys
sys.path.append('/content/drive/My Drive/{}'.format(folderName))

# Switch the working directory to the project folder; the relative paths used
# by later cells presumably resolve against it.
%cd /content/drive/My\ Drive/$folderName/ 

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1s39Gy1mP7wbq26JnrSjY-Rsch9COaNmZ/Spring 2022/COMPSCI685/CS685 Project/Sanity Check


# Text classification

Now we'll move on to fine-tuning pretrained language models specifically on your dataset. This part of the homework is meant to be an introduction to the HuggingFace library, and it contains code that will potentially be useful for your final projects. Since we're dealing with large models, the first step is to change to a GPU runtime.

## Adding a hardware accelerator

Please go to the menu and add a GPU as follows:

`Edit > Notebook Settings > Hardware accelerator > (GPU)`

Run the following cell to confirm that the GPU is detected.

In [None]:
import torch

# Confirm that the GPU is detected; fail with an actionable message instead of
# a bare AssertionError when the runtime has no accelerator attached.
assert torch.cuda.is_available(), (
    "[Error] No GPU detected. Enable one via "
    "Edit > Notebook Settings > Hardware accelerator > (GPU)."
)

# Report the GPU device name and how many GPUs are visible.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# CUDA device handle kept as a notebook-level variable.
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


## Installing Hugging Face's Transformers library
We will use Hugging Face's Transformers (https://github.com/huggingface/transformers), an open-source library that provides general-purpose architectures for natural language understanding and generation with a collection of various pretrained models made by the NLP community. This library will allow us to easily use pretrained models like `BERT` and perform experiments on top of them. We can use these models to solve downstream target tasks, such as text classification, question answering, and sequence labeling.

Run the following cell to install Hugging Face's Transformers library

In [None]:
# Install dependencies into the Colab runtime.
# NOTE(review): neither package is version-pinned, so a re-run may install
# different releases than the ones this notebook was originally executed with.
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds the name `drive`, which the first cell imported
# from google.colab; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)
print('success!')

# NOTE(review): os and zipfile are not used by this cell.
import os
import zipfile

# Download helper functions file: look the file up by its Drive id and save
# its content locally under the name helpers.py.
helper_file = drive.CreateFile({'id': '16HW-z9Y1tM3gZ_vFpJAuwUDohz91Aac-'})
helper_file.GetContentFile('helpers.py')
print('helper file downloaded! (helpers.py)')

Collecting transformers
  Downloading transformers-4.19.0-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 64.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.

# Data Prep and Model Specifications

## Create train/test/validation splits

In [None]:
from sklearn.model_selection import train_test_split
from helpers import tokenize_and_format, flat_accuracy
import pandas as pd

def build_data_set(texts, labels):
  """Tokenize `texts` and bundle each example with its label.

  Args:
    texts: sequence of input texts handed to tokenize_and_format().
    labels: one label per text, in the same order; converted with torch.tensor().

  Returns:
    A list with one (input_ids, attention_mask, label) tuple per entry of
    `texts`, each element being a slice of the concatenated tensors.
  """
  # tokenize_and_format() is the helper provided in helpers.py; it yields two
  # sequences of tensors that are joined along dim 0 below.
  id_chunks, mask_chunks = tokenize_and_format(texts)

  all_input_ids = torch.cat(id_chunks, dim=0)
  all_attention_masks = torch.cat(mask_chunks, dim=0)
  label_tensor = torch.tensor(labels)

  data_set = []
  for idx in range(len(texts)):
    example = (all_input_ids[idx], all_attention_masks[idx], label_tensor[idx])
    data_set.append(example)
  return data_set

def get_test_set_text(filename, text_column="Generated Text", label=1):
  """Load generated texts from a CSV file and build a classifier test set.

  Args:
    filename: path of the CSV file to read.
    text_column: name of the column holding the texts to classify.
    label: label assigned to every row (the same value for the whole file).

  Returns:
    (classifier_df, test_set, test_text) where classifier_df has the columns
    "text" and "label", test_set is the output of build_data_set(), and
    test_text is the array of texts.

  Raises:
    KeyError: if `text_column` is not a column of the CSV file.
  """
  raw_df = pd.read_csv(filename)
  if text_column not in raw_df.columns:
    raise KeyError(
        f"Column '{text_column}' not found in {filename}; "
        f"available columns: {list(raw_df.columns)}"
    )

  # read_csv parses empty fields as NaN by default; map those back to empty
  # strings so every entry handed to the tokenizer is a str.
  classifier_df = (
      raw_df[[text_column]]
      .rename(columns={text_column: "text"})
      .assign(text=lambda df: df["text"].fillna("").astype(str), label=label)
  )

  texts = classifier_df["text"].values
  labels = classifier_df["label"].values

  test_set = build_data_set(texts, labels)

  return classifier_df, test_set, texts

In [None]:
# Load the generations that are scored under the "Parallel" heading further down.
classifier_df, test_set, test_text = get_test_set_text(
    filename="Optimization/002/gen_predictions.csv"
)
# Alternative prediction files that were swapped in here previously:
#   "T5-10-epochs-test1-outputs/gen_predictions.csv"
#   "outputs_pseudo_parallel/003/gen_predictions.csv"

print(f"Data length: {len(classifier_df)}")
print(f"Test: {len(test_text)}")
classifier_df.head()

Data length: 1462
Test: 1462


Unnamed: 0,text,label
0,a jumbled confession can only receive A jumble...,1
1,I love the rich Capulet's daughter.,1
2,", but we must have you to marry us.",1
3,I'll tell thee more in anon how and where we m...,1
4,", Holy Saint Francis, this is a changeable!",1


In [None]:
# Load the generations that are scored under the "Pseudo Parallel" heading further down.
(
    pseudo_parallel_classifier_df,
    pseudo_parallel_test_set,
    pseudo_parallel_test_text,
) = get_test_set_text(filename="outputs_pseudo_parallel/003O/gen_predictions.csv")
# Alternative prediction files that were swapped in here previously:
#   "T5_2-5-epochs-test1-outputs-removed-shakespeare/001/gen_predictions.csv"
#   "T5-10-epochs-test1-outputs/gen_predictions.csv"

print(f"Data length: {len(pseudo_parallel_classifier_df)}")
print(f"Test: {len(pseudo_parallel_test_text)}")
pseudo_parallel_classifier_df.head()

Data length: 1462
Test: 1462


Unnamed: 0,text,label
0,", Shakespear, A jumbled confession can only re...",1
1,I love rich Capulet's daughter.,1
2,"Shakespear, we're bound to each other in every...",1
3,I'll tell you more later of when and where we ...,1
4,"Holy Saint Francis, this is a drastic change!",1


Here we choose the model we want to finetune from https://huggingface.co/transformers/pretrained_models.html. Because the task requires us to label sentences, we will be using BertForSequenceClassification below. You may see a warning that states that `some weights of the model checkpoint at [model name] were not used when initializing. . .` This warning is expected and means that you should fine-tune your pre-trained model before using it on your downstream task. See [here](https://github.com/huggingface/transformers/issues/5421#issuecomment-652582854) for more info.

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

def get_model(model_dir="classifier/001"):
  """Load a BERT sequence-classification checkpoint and move it to the GPU.

  Args:
    model_dir: checkpoint location handed to from_pretrained(). Defaults to
      "classifier/001", the checkpoint this notebook evaluates.

  Returns:
    The loaded BertForSequenceClassification model, placed on the GPU.
  """
  model = BertForSequenceClassification.from_pretrained(model_dir)

  # Tell pytorch to run this model on the GPU.
  model.cuda()

  return model

model = get_model()

# Hyperparameters #

In [None]:
# Number of examples per forward pass.
batch_size = 128

# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favor of torch.optim.AdamW. weight_decay is set to 0.0
# explicitly because torch's default is 0.01, whereas the transformers
# implementation defaulted to 0.0.
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8
                  weight_decay = 0.0
                )

# Number of fine-tuning epochs.
epochs = 5



# Fine-tune your model
Here we provide code for fine-tuning your model, monitoring the loss, and checking your validation accuracy. Rerun both of the below cells when you change your hyperparameters above.

In [None]:
import numpy as np
# function to get validation accuracy
def get_validation_performance(model, val_set, batch_size=100):
    """Score `model` on `val_set` and report per-example correctness.

    Args:
        model: classifier invoked as
            model(input_ids, token_type_ids=None, attention_mask=...) whose
            output exposes `.logits` with one row of class scores per example.
        val_set: sliceable sequence of (input_ids, attention_mask, label)
            tensor tuples.
        batch_size: maximum number of examples per forward pass.

    Returns:
        (accuracy, results): the fraction of examples whose arg-max prediction
        equals the label, and a list of booleans (one per example, in
        `val_set` order) telling whether each prediction was correct.

    Raises:
        ValueError: if `val_set` is empty or `batch_size` is not positive.
    """
    num_examples = len(val_set)
    if num_examples == 0:
        raise ValueError("val_set is empty; accuracy is undefined.")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}.")

    # Put the model in evaluation mode
    model.eval()

    # Run the inputs on whatever device the model lives on, rather than
    # relying on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    results = []

    # No compute graph is needed: this is inference only.
    with torch.no_grad():
        # Stepping by batch_size never produces an empty batch, including
        # when the data size is an exact multiple of batch_size.
        for start in range(0, num_examples, batch_size):
            batch = val_set[start:start + batch_size]

            b_input_ids = torch.stack([example[0] for example in batch]).to(model_device)
            b_input_mask = torch.stack([example[1] for example in batch]).to(model_device)
            b_labels = torch.stack([example[2] for example in batch])

            # Forward pass; labels are not passed because the loss is not
            # reported by this function.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

            # Compare arg-max predictions with the labels on the CPU.
            logits = outputs.logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()
            labels_flat = b_labels.detach().cpu().numpy().flatten()
            results.extend((pred_flat == labels_flat).tolist())

    # Report the final accuracy for this validation run.
    avg_val_accuracy = sum(results) / num_examples
    return avg_val_accuracy, results



# Evaluate model on the test set


## Parallel ##

In [None]:
# Accuracy and per-example correctness flags for the parallel test set.
acc, results = get_validation_performance(
    model,
    val_set=test_set,
    batch_size=batch_size,
)
print(acc)

0.8331053351573188


In [None]:
import pandas as pd

# Attach the per-example correctness flags in `results` to the predictions
# file as a 0/1 "labels" column and save the result next to it.
labeledOutput = pd.read_csv("Optimization/002/gen_predictions.csv")

# `results` must come from this same file, one flag per row.
assert len(results) == len(labeledOutput), (
    f"results has {len(results)} entries but the CSV has {len(labeledOutput)} rows"
)
labeledOutput['labels'] = [1 if label else 0 for label in results]

# Drop the index column left over from an earlier to_csv() call, if present;
# errors='ignore' keeps the cell working when the column is absent.
labeledOutput = labeledOutput.drop(columns='Unnamed: 0', errors='ignore')

labeledOutput.to_csv("Optimization/002/labeled_en_predictions.csv")

# Preview the saved frame (must be the last expression to be displayed).
labeledOutput.head()

## Pseudo Parallel ##

In [None]:
# Accuracy on the pseudo-parallel test set; the per-example flags are discarded.
acc, _ = get_validation_performance(
    model,
    val_set=pseudo_parallel_test_set,
    batch_size=batch_size,
)
print(acc)

0.4466484268125855
