# Performance Experiments on BERTweet and RoBERTa

In [None]:
!pip install transformers
!pip install emoji
!pip install datasets
!pip install wget
!pip install torch

In [None]:
####### Import necessary libraries

### Computational Libraries
import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW
import transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer
from transformers import get_linear_schedule_with_warmup

### Data Interpretation and Retrieval
from torch.utils.data import TensorDataset, DataLoader
import emoji
import datasets
from datasets import load_dataset

### Visualization
from tqdm import tqdm
import warnings

### For test_results storage
import json

### Miscellaneous
import wget
import os
import random
import time
import datetime
from pprint import pprint

In [None]:
####### General config

### Set seed for reproducibility
SEED = 12

# Seed every random number generator used in this notebook: PyTorch (CPU and
# all CUDA devices), NumPy and Python's built-in `random` module.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Request deterministic cuDNN behaviour and switch off its benchmark mode so
# repeated runs are more likely to give the same numbers.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

### Disable warnings
# Restrict the transformers and datasets loggers to error-level messages.
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

In [None]:
###### Helper function for loading data
def get_dataloader(inputs, masks, labels, batch_size=32, shuffle=True):
    """Wrap tokenized inputs and their labels in a ``DataLoader``.

    Args:
        inputs: Tensor of token ids, one row per example.
        masks: Tensor of attention masks, aligned row-for-row with ``inputs``.
        labels: Labels for each example (list, NumPy array or tensor).
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the examples on every pass. Defaults to
            ``True`` to match the previous hard-coded behaviour; pass
            ``False`` for a stable evaluation order.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)``
        batches.
    """
    # as_tensor avoids the copy (and the UserWarning) torch.tensor() incurs
    # when the labels are already a tensor.
    labels = torch.as_tensor(labels)
    data = TensorDataset(inputs, masks, labels)
    return DataLoader(data, shuffle=shuffle, batch_size=batch_size)

In [None]:
####### Load Datasets into Memory
def _load_split_dataloader(hub_name, task, split, text_column, tokenizer,
                           max_length, batch_size):
    """Load one dataset split, tokenize it and return ``(dataloader, num_classes)``."""
    dataset = load_dataset(hub_name, task, split=split)
    num_classes = dataset.info.features['label'].num_classes

    # Pad/truncate every example to exactly `max_length` tokens so the whole
    # split can be stacked into a single tensor.
    tokenized = tokenizer(
        dataset[text_column],
        add_special_tokens=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
        truncation=True,
    )
    labels = np.asarray(dataset['label'])
    dataloader = get_dataloader(
        tokenized['input_ids'],
        tokenized['attention_mask'],
        labels,
        batch_size=batch_size,
    )
    return dataloader, num_classes


def load_datasets(max_length: int, batch_size: int):
    """Tokenize and load every configured dataset split into memory.

    Reads the module-level ``model_names``, ``glue_sets``, ``tweet_sets`` and
    ``dataset_splits`` lists as well as the ``roberta_tokenizer`` and
    ``bertweet_tokenizer`` globals, so those must be defined before calling.

    Args:
        max_length: Token length every example is padded or truncated to.
        batch_size: Batch size of the returned dataloaders.

    Returns:
        A nested dict ``loaded[model][benchmark][task]`` where ``benchmark``
        is ``"glue"`` or ``"tweet"``. Each task entry holds ``"num_classes"``
        plus one dataloader per loaded split name.
    """
    # Named `loaded` rather than `datasets` so the imported `datasets` module
    # is not shadowed inside this function.
    loaded = {
        "roberta": {"glue": {}, "tweet": {}},
        "tweet": {"glue": {}, "tweet": {}},
    }

    # (result key, Hugging Face dataset name, tasks, text column, log label)
    benchmarks = (
        ("glue", "glue", glue_sets, "sentence", "GLUE"),
        ("tweet", "tweet_eval", tweet_sets, "text", "tweet_eval"),
    )

    for benchmark_key, hub_name, tasks, text_column, label in benchmarks:
        for model in model_names:
            # Any model name other than 'roberta' uses the BERTweet tokenizer.
            tokenizer = roberta_tokenizer if model == 'roberta' else bertweet_tokenizer

            for task in tasks:
                for split in dataset_splits:
                    dataloader, num_classes = _load_split_dataloader(
                        hub_name, task, split, text_column, tokenizer,
                        max_length, batch_size,
                    )

                    task_entry = loaded[model][benchmark_key].setdefault(task, {})
                    # Recorded once, from the first split that is loaded.
                    task_entry.setdefault('num_classes', num_classes)
                    task_entry[split] = dataloader
                    print("Loaded dataset %s's %s split for %s benchmark for model %s"
                          % (task, split, label, model))

    return loaded

In [None]:
#### Function for loading models into memory
def _load_frozen_classifier(checkpoint, num_labels):
    """Load a sequence classifier and freeze everything except its classifier head."""
    model = RobertaForSequenceClassification.from_pretrained(
        checkpoint, num_labels=num_labels
    )

    ### Disable gradient for params learned in pre-training
    for name, param in model.named_parameters():
        if 'classifier' not in name:
            param.requires_grad = False

    return model


def load_models(datasets, roberta=True, tweet=True):
    """Instantiate one fresh classifier per (model, benchmark, task) combination.

    Reads the module-level ``model_names``, ``glue_sets`` and ``tweet_sets``
    lists to decide which combinations to build.

    Args:
        datasets: Nested dict as returned by ``load_datasets``; only the
            ``'num_classes'`` entry of each task is read.
        roberta: When false, no ``"roberta"`` models are loaded.
        tweet: When false, no ``"tweet"`` (BERTweet) models are loaded.

    Returns:
        A nested dict ``models[model_name][benchmark][task]`` holding the
        loaded models. Combinations that were skipped are absent.
    """
    models = {
        "roberta": {"glue": dict(), "tweet": dict()},
        "tweet": {"glue": dict(), "tweet": dict()},
    }

    # Checkpoints that are enabled for this call; any other name is skipped.
    checkpoints = {}
    if roberta:
        checkpoints["roberta"] = "roberta-base"
    if tweet:
        checkpoints["tweet"] = "vinai/bertweet-base"

    # (result key, tasks, label used in the progress message)
    benchmarks = (
        ("glue", glue_sets, "GLUE"),
        ("tweet", tweet_sets, "tweet_eval"),
    )

    for benchmark_key, tasks, label in benchmarks:
        for model_name in model_names:
            checkpoint = checkpoints.get(model_name)
            if checkpoint is None:
                continue

            for task in tasks:
                num_labels = datasets[model_name][benchmark_key][task]['num_classes']
                models[model_name][benchmark_key][task] = _load_frozen_classifier(
                    checkpoint, num_labels
                )
                print("Loaded model %s for %s benchmark %s" % (model_name, label, task))

    return models

In [None]:
####### Instantiate Tokenizers
# Both tokenizers are created once at module level; load_datasets() reads
# them as globals and picks one per model name.
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
print("BERTweet Tokenizer successfully instantiated")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
print("RoBERTa Tokenizer successfully instantiated")

In [None]:
##### Declare which models and datasets will be loaded into memory
### Loading everything at once is only feasible with a LOT of system RAM.
### A model/dataset combination must be listed here before it can be trained,
### because load_datasets() and load_models() read these lists as globals.

### All possible models ["roberta", "tweet"]
model_names = ["tweet"]

### All possible glue tasks ["cola", "sst2"]
glue_sets = ["cola"]

### All possible tweet_eval tasks ["emoji", "emotion", "hate", "irony",
#     "offensive", "sentiment", "stance_abortion", "stance_atheism",
#     "stance_climate", "stance_feminist", "stance_hillary"]
tweet_sets = ["emoji", "emotion", "hate"]

### All possible dataset splits ["train", "validation", "test"]
### (the training cell below looks up all three splits)
dataset_splits = ["train", "validation", "test"]

In [None]:
### Load datasets
### Tokenizes every configured split up front and keeps the dataloaders in `data`
MAX_LEN = 30        # Every input is padded or truncated to this many tokens
BATCH_SIZE = 32     # Number of examples per dataloader batch
data = load_datasets(MAX_LEN, BATCH_SIZE)

In [None]:
### Load models
### Re-run this cell whenever you want to train the same model again:
##  train_model() updates the model in place, so reloading is the only way
##  to get back to the untrained classifier weights
models = load_models(data)

In [None]:
##### Functions for training models

### Return accuracy from a set of predictions and true labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max equals the label.

    ``preds`` holds one row of class scores per example; ``labels`` holds the
    true class ids and is flattened before comparison.
    """
    true_classes = labels.flatten()
    predicted_classes = np.argmax(preds, axis=1).flatten()
    num_correct = np.sum(predicted_classes == true_classes)
    return num_correct / len(true_classes)

### So time prints pretty
def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string, rounded to whole seconds."""
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

### Get optimizer and scheduler for model. We use AdamW for optimization
##  and we linearly scale the learning rate to its max lr for the first 10% of
##  training
def get_optimizer_and_scheduler(model, total_steps, lr, weight_decay):
    """Build the AdamW optimizer and warm-up learning-rate scheduler for ``model``.

    Args:
        model: Module whose ``named_parameters()`` are optimized.
        total_steps: Total number of optimizer steps in the training run.
        lr: Learning rate handed to AdamW (the schedule's peak value).
        weight_decay: Weight decay applied to every parameter except biases
            and LayerNorm weights, which always use ``0.0``.

    Returns:
        An ``(optimizer, scheduler)`` tuple.
    """
    # Apply weight decay to all parameters beside the biases or LayerNorm weights
    no_decay = ('bias', 'LayerNorm.weight')
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]
    # Drop a group that ended up empty so the optimizer only sees real groups.
    optimizer_grouped_parameters = [
        group for group in optimizer_grouped_parameters if group['params']
    ]

    # Pass the grouped parameters (not model.parameters()) so the requested
    # weight decay is actually honoured; previously the groups were built but
    # never used and AdamW silently fell back to its default weight decay.
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # Warmup learning rate for first 10% of training steps
        num_warmup_steps=int(0.10 * total_steps),
        num_training_steps=total_steps,
    )
    return optimizer, scheduler


### Train the model, return the loss values and validation accuracies for
##  each epoch and return the final test accuracy
def _evaluate_accuracy(model, dataloader, device):
    """Return the fraction of examples in ``dataloader`` that ``model`` labels correctly."""
    model.eval()

    num_correct = 0
    num_examples = 0

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            # Forward pass without labels: only the logits are needed
            outputs = model(
                input_ids=b_input_ids,
                attention_mask=b_input_mask
            )

            predictions = outputs.logits.argmax(dim=1)
            # Count per example instead of averaging per-batch accuracies, so
            # a smaller final batch does not get a disproportionate weight.
            num_correct += (predictions == b_labels.flatten()).sum().item()
            num_examples += b_labels.numel()

    if num_examples == 0:
        # Nothing to evaluate; report NaN rather than dividing by zero.
        return float("nan")
    return num_correct / num_examples


def train_model(model, epochs, train_dataloader, validation_dataloader,
                test_dataloader, lr, weight_decay, device="cuda"):
    """Fine-tune ``model`` and report its validation and test accuracy.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            (during training) ``labels``; its output must expose ``.loss``
            and ``.logits``.
        epochs: Number of passes over ``train_dataloader``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)``.
        validation_dataloader: Same batch layout; evaluated after each epoch.
        test_dataloader: Same batch layout; evaluated once after training.
        lr: Learning rate passed to ``get_optimizer_and_scheduler``.
        weight_decay: Weight decay passed to ``get_optimizer_and_scheduler``.
        device: Device to train on. If a CUDA device is requested but CUDA is
            unavailable, training falls back to the CPU.

    Returns:
        ``(loss_values, eval_accs, test_acc)``: the average training loss per
        epoch, the validation accuracy per epoch, and the final test accuracy.
    """
    # Use GPU, if available
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available; falling back to CPU.")
        device = torch.device("cpu")
    model = model.to(device)

    # Setup optimizer and LR scheduler
    total_steps = len(train_dataloader) * epochs
    optimizer, scheduler = get_optimizer_and_scheduler(
        model, total_steps, lr=lr, weight_decay=weight_decay
    )

    loss_values = []
    eval_accs = []

    for epoch in range(0, epochs):
        t0 = time.time()

        total_loss = 0
        model.train()

        with tqdm(train_dataloader, unit="batch") as train_pbar:
            for batch in train_pbar:
                train_pbar.set_description(f"Training (epoch {epoch + 1})")
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)

                model.zero_grad()

                # Forward pass; passing labels makes the model return a loss
                outputs = model(
                    input_ids=b_input_ids,
                    attention_mask=b_input_mask,
                    labels=b_labels
                )

                # Calculate loss for this batch
                loss = outputs.loss

                # Add loss for total in epoch
                total_loss += loss.item()

                # Calculate gradients
                loss.backward()

                # Clip the norm of the gradients to 1.0.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                # Update parameters based on gradient, learning rate, weight decay, etc.
                optimizer.step()

                # Update the learning rate.
                scheduler.step()

        # Calculate the average loss over the training data (guarding against
        # an empty dataloader).
        avg_train_loss = total_loss / max(len(train_dataloader), 1)

        # Store the loss value.
        loss_values.append(avg_train_loss)

        print("  * Average training loss: {0:.2f}".format(avg_train_loss))
        print("  * Training epoch took: {:}".format(format_time(time.time() - t0)))

        print("Running Validation...")

        t0 = time.time()
        avg_eval_acc = _evaluate_accuracy(model, validation_dataloader, device)
        print("  * Accuracy: {0:.2f}".format(avg_eval_acc))
        print("  * Validation took: {:}".format(format_time(time.time() - t0)))
        eval_accs.append(avg_eval_acc)

    ##### Obtain test accuracy
    print("Running Testing...")

    t0 = time.time()
    test_acc = _evaluate_accuracy(model, test_dataloader, device)
    print("  * Test Accuracy: {0:.2f}".format(test_acc))
    print("  * Testing took: {:}".format(format_time(time.time() - t0)))

    print("Training complete!")
    return loss_values, eval_accs, test_acc

In [None]:
##### For storage for training results

### Template showing the layout test_results.json must have when it is read:
### one (initially empty) list of runs per model / benchmark / task
_GLUE_TASK_NAMES = ("cola", "sst2")
_TWEET_TASK_NAMES = (
    "emoji",
    "emotion",
    "hate",
    "irony",
    "offensive",
    "sentiment",
    "stance_abortion",
    "stance_atheism",
    "stance_climate",
    "stance_feminist",
    "stance_hillary",
)

# The comprehensions create a separate list for every slot, so appending a
# result to one task never leaks into another.
test_results_example = {
    model_key: {
        "glue": {task_name: [] for task_name in _GLUE_TASK_NAMES},
        "tweet": {task_name: [] for task_name in _TWEET_TASK_NAMES},
    }
    for model_key in ("roberta", "tweet")
}

### Seed test_results.json with the empty template unless the file already exists
if not os.path.isfile("test_results.json"):
    example_json = json.dumps(test_results_example, indent=2, sort_keys=True)
    with open('test_results.json', 'w') as f:
        f.write(example_json)

In [None]:
#### Pick the model / benchmark / task combination to train in the next cell
model_string = 'tweet'     # One of: 'roberta', 'tweet'
benchmark = 'glue'   # One of: 'glue', 'tweet'

task = 'cola'      # When benchmark is 'glue':
                        # 'cola', 'sst2'
                      # When benchmark is 'tweet':
                        # "emoji", "emotion", "hate", "irony",
                        # "offensive", "sentiment", "stance_abortion", "stance_atheism",
                        # "stance_climate", "stance_feminist", "stance_hillary"


# Look the selection up in the structures built by the loading cells above.
_selected_loaders = data[model_string][benchmark][task]
dataloader_train, dataloader_val, dataloader_test = (
    _selected_loaders[split_name]
    for split_name in ('train', 'validation', 'test')
)
model = models[model_string][benchmark][task]

In [None]:
### Run this cell to train the model selected in the cell above
lr = 5e-3            # Learning rate handed to train_model
weight_decay = 0.01  # Weight decay handed to train_model
epochs=7             # Number of passes over the training dataloader

# train_model returns the per-epoch training losses, the per-epoch validation
# accuracies and the final test accuracy.
loss_vals, eval_accs, test_acc = train_model(
    model=model,
    epochs=epochs,
    train_dataloader=dataloader_train,
    validation_dataloader=dataloader_val,
    test_dataloader=dataloader_test,
    lr=lr,
    weight_decay=weight_decay,
    device="cuda"
)

In [None]:
### Append the results of the previous training run to test_results.json
results = {
    # Uncomment to also store the per-epoch curves:
    # "training_losses": loss_vals,
    # "validation_accs": eval_accs,
    # Coerce to a plain float so the value serializes the same way whether
    # it arrived as a Python or a NumPy scalar.
    "test_acc": float(test_acc),
    "hyperparams": {
        "batch_size": BATCH_SIZE,
        "max_len": MAX_LEN,
        "lr": lr,
        "weight_decay": weight_decay,
        "epochs": epochs,
    }
}

with open('test_results.json', 'r') as f:
    file_data = json.load(f)

# setdefault keeps this working when the file predates the selected
# model / benchmark / task and has no list for it yet.
task_runs = (
    file_data
    .setdefault(model_string, {})
    .setdefault(benchmark, {})
    .setdefault(task, [])
)
task_runs.append(results)

# Serialize before reopening the file so a serialization error cannot leave
# a truncated test_results.json behind.
serialized_results = json.dumps(file_data, indent=4, sort_keys=True)

with open('test_results.json', 'w') as f:
    f.write(serialized_results)