In [1]:
import pandas as pd
import os
import numpy as np
import torch
import logging
import time
import random
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
import datetime
import functools
import operator
#import boto3, re, sys, math, json, sagemaker

from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, BertTokenizer, RobertaForSequenceClassification, RobertaTokenizer, DistilBertForSequenceClassification, DistilBertTokenizer, AlbertForSequenceClassification, AlbertTokenizer, get_linear_schedule_with_warmup
from transformers import BartForSequenceClassification, BartTokenizer, XLNetForSequenceClassification, XLNetTokenizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from collections import defaultdict, Counter
from tqdm import tqdm, trange
#from sagemaker import get_execution_role

In [None]:
# df = pd.read_csv("", encoding='utf-8')

In [None]:
def stratify(data, strata_field: str, split_sizes, **kwargs):
    """Stratify and split the data.

    Every distinct value of ``strata_field`` is split on its own with the same
    proportions; the per-class pieces are then concatenated and shuffled, so
    each returned split keeps the class balance of ``data``.

    :data (pd.DataFrame): dataset to split.
    :strata_field (str): Name of label field.
    :split_sizes (float | list[float]): Fraction of each class assigned to each split, given as
        ``[train]`` (test becomes ``1 - train``), ``[train, test]`` or ``[train, dev, test]``.
        Only the dev/test fractions are handed to ``train_test_split``; train receives the remainder.
    :kwargs: ``random_state`` (default 42) seeds both the splits and the final shuffle.
        Any other keyword is accepted and ignored.
    :returns train, dev, test: Stratified splits with a fresh 0..n-1 index. ``dev`` is ``None``
        when ``split_sizes`` contains no dev fraction.
    :raises ValueError: If ``split_sizes`` does not contain 1, 2 or 3 entries.
    """
    # Accept a bare number as shorthand for [train_fraction].
    if isinstance(split_sizes, (int, float)):
        split_sizes = [split_sizes]

    num_splits = len(split_sizes)
    dev_size = None  # stays None unless a three-way split is requested
    if num_splits == 1:
        test_size = 1 - split_sizes[0]
    elif num_splits == 2:
        test_size = split_sizes[-1]
    elif num_splits == 3:
        dev_size, test_size = split_sizes[1], split_sizes[2]
    else:
        raise ValueError(
            "split_sizes must contain 1, 2 or 3 entries, got {}".format(num_splits)
        )

    random_state = kwargs.get("random_state", 42)

    train_parts, dev_parts, test_parts = [], [], []

    # unique() yields labels in order of first appearance, so the concatenation
    # order (and therefore the seeded shuffle below) is the same on every run.
    for label in data[strata_field].unique():
        stratum = data[data[strata_field] == label]

        if dev_size is not None:
            # First carve off dev+test together, then divide that remainder
            # so that dev and test keep their requested relative sizes.
            train_part, held_out = train_test_split(
                stratum, test_size=test_size + dev_size, random_state=random_state
            )
            dev_part, test_part = train_test_split(
                held_out, test_size=test_size / (test_size + dev_size), random_state=random_state
            )
            dev_parts.append(dev_part)
        else:
            train_part, test_part = train_test_split(
                stratum, test_size=test_size, random_state=random_state
            )

        train_parts.append(train_part)
        test_parts.append(test_part)

    def _combine(parts):
        # Mix the classes together and discard the original row index.
        return shuffle(pd.concat(parts), random_state=random_state).reset_index(drop=True)

    train = _combine(train_parts)
    dev = _combine(dev_parts) if dev_size is not None else None
    test = _combine(test_parts)

    return train, dev, test

In [None]:
# NOTE(review): `df` is only defined if the commented-out read_csv cell above is restored, and the
# strata_field argument is still an empty placeholder - presumably it should name the label column
# (later cells read a "labels" column); TODO confirm before running.
# Requests an 80% / 10% / 10% train / dev / test split.
train, dev, test = stratify(df, "", [0.8, 0.1, 0.1])

In [2]:
#role = get_execution_role()
#train = pd.read_csv("s3://eastasianprejudice/Data/east_asian_prejudice_train.csv", encoding='utf-8')
#dev = pd.read_csv("s3://eastasianprejudice/Data/east_asian_prejudice_dev.csv", encoding='utf-8')
#test = pd.read_csv("s3://eastasianprejudice/Data/east_asian_prejudice_test.csv", encoding='utf-8')

In [3]:
class TransformerClassifier(nn.Module):
    """
    Wrapper that fine-tunes, evaluates and saves a HuggingFace sequence classification model.

    Intended call order is fine_tune(...) -> test(...) -> save(...): test() reuses the pad length
    and batch size stored by fine_tune(), and save() needs the loss histories, tokenizer and
    results that those two calls store on the instance.
    """

    def __init__(self, model, num_labels: int):
        """
        model: Key of self.model_types selecting the architecture (e.g. "BERT", "RoBERTa")

        num_labels (int): Number of annotation classes
        """
        super(TransformerClassifier, self).__init__()
        # name -> [model class, tokenizer class, pretrained checkpoint name]
        self.model_types = {"BERT": [BertForSequenceClassification, BertTokenizer, 'bert-large-uncased'], #'bert-base-uncased'
          "RoBERTa": [RobertaForSequenceClassification, RobertaTokenizer, 'roberta-large'], #'roberta-base'
          "DistilBERT": [DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-cased'],
          "AlBERT": [AlbertForSequenceClassification, AlbertTokenizer, 'albert-xlarge-v2'],  # 'albert-xlarge-v2' "albert-large-v2" 'albert-base-v2' 'albert-xxlarge-v2'
          # NOTE(review): newer transformers releases may expect "facebook/bart-large" here - verify against the installed version
          "BART": [BartForSequenceClassification, BartTokenizer, "bart-large"],
          "XLNet": [XLNetForSequenceClassification, XLNetTokenizer, "xlnet-large-cased"], #"xlnet-base-cased"
          }
        if model not in self.model_types:
            raise KeyError("Unknown model {!r}; choose one of {}".format(model, sorted(self.model_types)))

        self.model_selection = model
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.seed_val = 22

        # Seed BEFORE the model is loaded so that any weights from_pretrained initialises
        # randomly are covered by the seed as well.
        random.seed(self.seed_val)
        np.random.seed(self.seed_val)
        torch.manual_seed(self.seed_val)
        # Compare the device *type*: a torch.device object is not equal to the plain string 'cuda'.
        if self.device.type == 'cuda':
            torch.cuda.manual_seed_all(self.seed_val)

        model_class, _, checkpoint = self.model_types[self.model_selection]
        self.model = model_class.from_pretrained(checkpoint, num_labels = num_labels).to(self.device)

    def clean(self, data, normalize_list, annotate_list, labels):
        """
        This function preprocesses the text using the Ekphrasis library and returns the
        DataLoader produced by preprocesser()

        data: Pandas series object containing strings of text

        normalize_list: list of data features to clean

        annotate_list: list of data features to annotate

        labels: Tensor of annotations aligned with data (forwarded unchanged to preprocesser)
        """

        text_processor = TextPreProcessor(
            normalize= normalize_list,
            annotate= annotate_list,
            fix_html=True,
            segmenter="twitter",
            #unpack_hashtags=False,
            #unpack_contractions=False,
            #spell_correct_elong=False,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons]
        )

        # pre_process_doc returns a token list; re-join it so the model tokenizer gets a string.
        clean_data = data.map(lambda x: " ".join(text_processor.pre_process_doc(x)))

        return self.preprocesser(clean_data, labels)


    def preprocesser(self, sequence, labels):
        """
        This function converts strings of text into a tokenized format compatible with the
        selected model and returns them as a DataLoader (see batcher)

        sequence: An iterable series of data (i.e. Pandas Series, list..) where elements are strings

        labels: Tensor of annotations aligned with sequence

        Requires self.pad_length and self.batch_size, which fine_tune() sets.
        """

        # Load the tokenizer once and reuse it for the train, dev and test sets.
        if getattr(self, "tokenizer", None) is None:
            _, tokenizer_class, checkpoint = self.model_types[self.model_selection]
            self.tokenizer = tokenizer_class.from_pretrained(checkpoint)

        indexed_tokens = []
        attention_masks = []

        for counter, sentence in enumerate(sequence):
            if counter % 1000 == 0:
                print("Processing row {}".format(counter))

            encoded_dict = self.tokenizer.encode_plus(
                      sentence,
                      add_special_tokens = True,
                      max_length = self.pad_length,
                      pad_to_max_length = True,
                      return_attention_mask = True,
                      return_tensors = 'pt',
                  )

            indexed_tokens.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])

        # Reported after the loop: inside it the counter never reaches len(sequence).
        print("Done!")

        return self.batcher(torch.cat(indexed_tokens, dim=0), torch.cat(attention_masks, dim=0), labels)


    def batcher(self, indexed_tokens, attention_masks, labels):
        """
        This function creates batches of a specified size to save on memory

        indexed_tokens: Tokenized text output by model preprocesser

        attention_masks: Attention masks output by model preprocesser

        labels: Tensor of annotations, one per row of indexed_tokens

        Returns a DataLoader that yields (input_ids, attention_mask, labels) batches of
        self.batch_size rows in their original order.
        """

        data = TensorDataset(indexed_tokens, attention_masks, labels)
        sampler = SequentialSampler(data)
        dataloader = DataLoader(data, sampler=sampler, batch_size = self.batch_size)

        return dataloader


    def fine_tune(self, train_data, train_labels, dev_data, dev_labels, normalize_list, annotate_list, pad_length: int, early_stop_vals: dict, batch_size: int = 32, epochs: int = 3, learning_rate: float = 2e-5, weight_decay: float = 0.1, warmup: float = 0.06):
        """
        Updates pre-trained model's weights based on given dataset, then plots the loss curves

        train_data: Pandas series object containing text data for train set

        train_labels: Pandas series object containing ground truth annotations for train set

        dev_data: Pandas series object containing text data for dev set

        dev_labels: Pandas series object containing ground truth annotations for dev set

        normalize_list: list of data features to clean

        annotate_list: list of data features to annotate

        pad_length (int): Max sentence length

        early_stop_vals: Dictionary containing "patience" (int) and "delta" (float); see early_stopping

        batch_size (int): Number of sentences in batch. Default is 32.

        epochs (int): Maximum number of times to run through all batches. Default value is 3 according to 2-4 recommended in original BERT paper.

        learning_rate (float): Default value is 2e-5 according to recommended value from original BERT paper.

        weight_decay (float): Default value is 0.1

        warmup (float): Default value is 0.06; fraction of all training steps spent in warmup
        """

        self.pad_length = pad_length
        self.batch_size = batch_size
        self.early_stop_vals = early_stop_vals
        self.train_labels = torch.tensor(train_labels.values, dtype=torch.int64)
        self.dev_labels = torch.tensor(dev_labels.values, dtype=torch.int64)
        self.train_dataloader = self.clean(train_data, normalize_list, annotate_list, self.train_labels)
        self.val_dataloader = self.clean(dev_data, normalize_list, annotate_list, self.dev_labels)

        self.optimizer = optim.AdamW(self.model.parameters(), lr = learning_rate, weight_decay = weight_decay)

        # num_training_steps is the TOTAL number of optimizer steps (warmup included); passing a
        # smaller value would drive the learning rate to zero before training has finished.
        total_steps = len(self.train_dataloader) * epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps = int(warmup * total_steps),
            num_training_steps = total_steps,
        )

        self.train_loss_values, self.val_loss_values = [], []

        for epoch in trange(epochs, desc= "Epoch"):
            # Checked before each epoch, using the validation losses recorded so far.
            if self.early_stopping():
                print("Stopping early...")
                break

            print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
            print('Training...')

            avg_train_loss = self._train_one_epoch()
            self.train_loss_values.append(avg_train_loss)

            print()
            print("  Average training loss: {0:.2f}".format(avg_train_loss))

            print()
            print("Running Validation...")
            print()

            avg_val_loss, val_acc = self._evaluate(self.val_dataloader)
            self.val_loss_values.append(avg_val_loss)

            print("Epoch | Validation Accuracy | Training Loss | Validation Loss")
            print(f"{epoch+1:5d} |       {val_acc:.5f}       |    {avg_train_loss:.5f}    |     {avg_val_loss:.5f}")

            print()

        print("Training complete!")
        return self.training_plot()

    def _train_one_epoch(self):
        """
        Runs one optimisation pass over self.train_dataloader and returns the mean batch loss
        """
        self.model.train()

        num_batches = len(self.train_dataloader)
        total_loss = 0

        for step, batch in enumerate(self.train_dataloader):
            if step % 50 == 0:
                print("Processing batch...{}".format(step))
                print("  Batch {:>5,}  of  {:>5,}.".format(step, num_batches))

            b_input_ids, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)

            self.model.zero_grad()

            # With labels supplied, outputs[0] is the loss.
            outputs = self.model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]

            total_loss += loss.item()
            loss.backward()

            # Clip the gradient norm to 1.0 before stepping the optimizer and the LR schedule.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()

        return total_loss / num_batches

    def _evaluate(self, dataloader):
        """
        Returns (mean batch loss, accuracy) of the model on dataloader without updating weights
        """
        self.model.eval()

        total_loss, num_correct, num_examples = 0, 0, 0

        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

            total_loss += outputs[0].item()

            # outputs[1] holds the logits; the predicted class is the arg-max over dim 1.
            predicted = outputs[1].argmax(dim=1)
            num_correct += (predicted == b_labels).sum().item()
            num_examples += b_labels.shape[0]

        return total_loss / len(dataloader), num_correct / num_examples

    def early_stopping(self):
        """
        Determines whether or not training should stop, based on the patience and delta given relative to the val loss

        Returns False until more than "patience" validation losses exist. After that it returns
        True (stop) unless the latest validation loss is at least "delta" below the mean of the
        "patience" losses that precede it.
        """
        patience = self.early_stop_vals["patience"]
        if len(self.val_loss_values) <= patience:
            return False

        baseline = np.mean(np.array(self.val_loss_values[-1 - patience:-1]))
        improved = self.val_loss_values[-1] <= baseline - self.early_stop_vals["delta"]
        return not improved


    def test(self, test_data, test_labels, normalize_list, annotate_list):
        """
        Tests the model's performance based on several metrics

        test_data: Pandas series object containing text data

        test_labels: Pandas series object containing labels

        normalize_list: list of data features to clean

        annotate_list: list of data features to annotate

        Returns the (results, labels, pred_flat) tuple produced by metrics(). Must be called
        after fine_tune(), which sets the pad length and batch size used here.
        """
        self.test_labels = torch.tensor(test_labels.values, dtype=torch.int64)
        self.test_dataloader = self.clean(test_data, normalize_list, annotate_list, self.test_labels)

        print('Predicting labels for {} sentences...'.format(len(self.test_labels)))

        self.model.eval()

        predictions, true_labels = [], []

        for batch in self.test_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                # No labels are passed here, so outputs[0] is the logits rather than a loss.
                outputs = self.model(b_input_ids, attention_mask=b_input_mask)

            predictions.append(outputs[0].detach().cpu().numpy())
            true_labels.append(b_labels.to('cpu').numpy())

        print('    DONE.')

        return self.metrics(true_labels, predictions)


    def training_plot(self):
        """
        Plots the per-epoch training and validation loss stored by fine_tune
        (self.train_loss_values and self.val_loss_values)
        """
        sns.set(style='darkgrid')
        plt.rcParams["figure.figsize"] = (12,6)

        plt.plot(self.train_loss_values, 'b-o', label="train")
        plt.plot(self.val_loss_values, 'g-o', label="valid")

        #plt.title("Training and Validation loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()

        return plt.show()


    def metrics(self, labels, preds):
        """
        Computes the Matthew's correlation coefficient, accuracy, confusion matrix, and weighted precision, recall and f1 score

        labels: list of per-batch arrays of correct labels

        preds: list of per-batch arrays of model logits

        Returns (results dict, flat list of labels, flat array of predicted classes); the
        results dict is also stored as self.results.
        """

        # labels and preds are lists of per-batch arrays --> flatten to one entry per example
        labels = [label for batch in labels for label in batch]
        preds = [row for batch in preds for row in batch]

        pred_flat = np.argmax(preds, axis=1).flatten()

        mcc = matthews_corrcoef(labels, pred_flat)
        acc = accuracy_score(labels, pred_flat)
        cm = confusion_matrix(labels, pred_flat)

        f1 = f1_score(labels, pred_flat, average= "weighted")
        precision = precision_score(labels, pred_flat, average= "weighted")
        recall = recall_score(labels, pred_flat, average= "weighted")

        self.results = {
            "mcc": mcc,
            "acc": acc,
            "confusion_matrix": cm,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

        return self.results, labels, pred_flat

    def save(self, output_directory, test_name, train_name, predictions_name, labels, pred_flat):
        """
        This function saves the model, tokenizer, loss histories, test metrics and test
        predictions to <output_directory>/<model name>.

        output_directory: Base folder; a sub-folder named after the selected model is created inside it

        test_name: File name for dictionary containing model test performance across metrics

        train_name: File name for dictionary containing the train and validation loss per epoch

        predictions_name: File name for the CSV of test labels and predictions

        labels: List of labels for test set

        pred_flat: List of model predictions after passed through argmax()
        """

        # Create the folder that is actually written to (base folder + model name).
        # exist_ok avoids a race/exists check and joining first also handles an empty base path.
        output_directory = os.path.join(output_directory, self.model_selection)
        os.makedirs(output_directory, exist_ok=True)

        output_test_file = os.path.join(output_directory, test_name)
        output_train_file = os.path.join(output_directory, train_name)
        output_predictions_file = os.path.join(output_directory, predictions_name)

        print(output_directory)

        # Unwrap the model if it has been wrapped in an object exposing `.module`.
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(output_directory)
        self.tokenizer.save_pretrained(output_directory)

        training_dict = {"train_loss": self.train_loss_values, "val_loss": self.val_loss_values}
        np.save(output_train_file, training_dict)
        np.save(output_test_file, self.results)

        test_predictions = pd.DataFrame({'Labels': labels, 'Predictions': pred_flat})
        test_predictions.to_csv(output_predictions_file)

        return print("Saving complete.")

In [4]:
# Hyperparameters for the grid search below (original note: RoBERTa fine-tuning hyperparameters for GLUE).
NORMALIZE_LIST = ["url", "hashtag", "user"]  # forwarded to Ekphrasis TextPreProcessor(normalize=...)
ANNOTATE_LIST = []  # forwarded to Ekphrasis TextPreProcessor(annotate=...)
LEARNING_RATE = [1e-5, 2e-5, 3e-5]  # every value is tried in the grid search
N_EPOCHS = 10  # passed as fine_tune(epochs=...); early stopping may end training sooner
EARLY_STOPPING = {"patience": 2, "delta": 0.03}  # read by TransformerClassifier.early_stopping
N_LABELS =  4  # number of annotation classes given to the classifier
PAD_LENGTH = 64  # passed as fine_tune(pad_length=...), used as the tokenizer max_length
BATCH_SIZE = [16, 32]  # every value is tried in the grid search
WEIGHT_DECAY = 0.1  # passed to the AdamW optimizer
WARMUP = 0.06  # warmup fraction passed to fine_tune(warmup=...)
OUTPUT_DIR = "" # base output folder; create separate folders for each model beforehand
TEST_NAME = "dogwhistle_test_results.npy"
# NOTE(review): "dogwhistlee" looks like a typo for "dogwhistle"; left unchanged because it is the on-disk file name.
TRAIN_NAME = "dogwhistlee_train_results.npy"
PREDICTIONS_NAME = "dogwhistle_test_predictions.csv"

In [None]:
# Grid search over batch size x learning rate.
# results_dict[batch_size][learning_rate] holds the test metrics of that run.
results_dict = {}
max_f1_value = 0
MODEL_NAME = "RoBERTa"

for batch_size in BATCH_SIZE:
    learning_rate_dict = {}
    for learning_rate in LEARNING_RATE:
        Classifier = TransformerClassifier(MODEL_NAME, N_LABELS)
        Classifier.fine_tune(train["text.clean"], train["labels"], dev["text.clean"], dev["labels"], NORMALIZE_LIST, ANNOTATE_LIST, PAD_LENGTH, EARLY_STOPPING, batch_size, N_EPOCHS, learning_rate, WEIGHT_DECAY, WARMUP)
        learning_rate_dict[learning_rate], labels, preds_flat = Classifier.test(test["text.clean"], test["labels"], NORMALIZE_LIST, ANNOTATE_LIST)

        # The comparison must run for EVERY (batch size, learning rate) pair, i.e. inside the
        # inner loop; otherwise only the last learning rate of each batch size is considered.
        # NOTE(review): the best model is selected on test-set F1, which makes the reported
        # test score optimistic - consider selecting on the dev set instead.
        if learning_rate_dict[learning_rate]["f1"] >= max_f1_value: #only save best model
            max_f1_value = learning_rate_dict[learning_rate]["f1"]
            Classifier.save(OUTPUT_DIR, TEST_NAME, TRAIN_NAME, PREDICTIONS_NAME, labels, preds_flat)

    results_dict[batch_size] = learning_rate_dict

#save complete training results
results_directory = os.path.join(OUTPUT_DIR, MODEL_NAME)
os.makedirs(results_directory, exist_ok=True)  # do not rely on an earlier save() having created it
np.save(os.path.join(results_directory, "dogwhistle_total_training_results.npy"), results_dict)