# NLU Group 1: Final Project
## Example: Fine-tuning a classifier on the Social Bias Inference Corpus

Based on: https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb

This notebook by: Cameron Clarke (ccc779@nyu.edu)

Created: 3/16/2021

# 0. Setup

In [None]:
!pip install transformers
!pip install datasets
!pip install bert_score
!pip install optuna

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 5.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 18.4MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 26.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=098d5

# 1. Preprocessing

In [None]:
from transformers import AutoTokenizer
import datasets
from datasets import load_dataset, load_metric
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# [TODO] update this for whichever model type is desired

# Checkpoint name; passed below to both the tokenizer and the classifier loader.
model_checkpoint = "bert-base-cased"
# Per-device batch size; reused for training, evaluation and the hyperparameter search space.
batch_size = 8

In [None]:
# [TODO] update this for whichever dataset is desired

# Load the Social Bias Frames dataset (downloaded on first use, then reused from
# the local cache). Later cells use its 'train', 'validation' and 'test' splits.
dataset = load_dataset("social_bias_frames")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1527.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=970.0, style=ProgressStyle(description_…

Using custom data configuration default



Downloading and preparing dataset social_bias_frames/default (download: 6.03 MiB, generated: 42.41 MiB, post-processed: Unknown size, total: 48.45 MiB) to /root/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=6326977.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset social_bias_frames downloaded and prepared to /root/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d. Subsequent calls will reuse this data.


In [None]:
# Load the BERTScore metric.
# NOTE(review): in the cells visible here `metric` is only referenced from
# commented-out code, so it looks unused by the active training path - confirm.
metric = load_metric('bertscore')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2474.0, style=ProgressStyle(description…




In [None]:
# Tokenizer that matches `model_checkpoint`; use_fast=True asks for the fast
# tokenizer implementation. Used by preprocess_function and handed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




In [None]:
# Name of the dataset column whose text is tokenized as model input.
sentence1_key = 'post'

In [None]:
def preprocess_function(examples, sentence1_key, sentence2_key=None):
    """Tokenize one text column, or a pair of text columns, from a batch.

    Args:
        examples: Mapping from column name to that column's values.
        sentence1_key: Column holding the first (or only) text.
        sentence2_key: Optional column holding the second text. When None,
            only ``sentence1_key`` is tokenized.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``truncation=True`` and ``padding=True``.
    """
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    return tokenizer(*text_columns, truncation=True, padding=True)

In [None]:
# Dataset column that holds the raw offensiveness annotation.
label_colname = 'offensiveYN'

# Raw annotation string -> integer class id (None marks a missing annotation).
relabel_dict = dict([
    ('0.0', 0),    # not offensive
    ('0.5', 1),    # maybe offensive
    ('1.0', 2),    # offensive
    ('', None),    # missing value
])


def relabel_func(column):
    """Translate a column of raw annotation strings into class ids.

    Each element is looked up in ``relabel_dict``; a value that is not a key
    of that dict raises ``KeyError``.
    """
    return list(map(relabel_dict.__getitem__, column))

In [None]:
# Following a design pattern found here: https://huggingface.co/transformers/custom_datasets.html#using-the-nlp-datasets-metrics-library

# Step 1: tokenize the text column, batch by batch.
encoded_dataset = dataset.map(
    lambda batch: preprocess_function(batch, sentence1_key),
    batched=True,
)

# Step 2: add an integer 'labels' column derived from the raw annotation column.
encoded_dataset = encoded_dataset.map(
    lambda batch: {'labels': relabel_func(batch[label_colname])},
    batched=True,
)

# Step 3: re-type 'labels' as a three-way ClassLabel in each split, using the
# train split's schema as the template.
new_features = encoded_dataset['train'].features.copy()
new_features["labels"] = datasets.ClassLabel(names=['no', 'maybe', 'yes'])

for split_name in ('train', 'validation', 'test'):
    encoded_dataset[split_name] = encoded_dataset[split_name].cast(new_features)



HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




In [None]:
# Keep only rows that received a label; a missing annotation was mapped to None.
encoded_dataset = encoded_dataset.filter(lambda example: example['labels'] is not None)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




In [None]:
# Spot-check the relabelling on the first few training rows only. Displaying the
# two full columns prints one line per training example and buries the notebook.
preview = encoded_dataset['train'][:10]  # slicing yields a dict: column name -> list of values
preview['offensiveYN'], preview['labels']

(['1.0',
  '0.5',
  '0.5',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '0.0',
  '0.0',
  '0.0',
  '1.0',
  '1.0',
  '0.5',
  '0.5',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '0.0',
  '1.0',
  '1.0',
  '0.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '0.5',
  '0.5',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '0.0',
  '1.0',
  '0.0',
  '1.0',
  '1.0',
  '0.0',
  '0.5',
  '0.0',
  '1.0',
  '1.0',
  '0.0',
  '0.5',
  '0.5',
  '1.0',
  '0.0',
  '1.0',
  '0.5',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '0.5',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '0.0',
  '0.0',
  '1.0',
  '0.0',
  '1.0',
  '0.5',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
  '0.5',
  '0.0',
  '0.0',
  '0.0',
  '0.0',
  '0.5',
  '1.0',
  '1.0',
  '1.0',
  '1.0',
 

# 2. Fine-tuning

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# num_labels = 3 # The original 'offensiveYN' outcome is a categorical variable with three levels
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
# model.eval()
# model.to('cuda')

In [None]:
# [TODO] update this with whatever metric is desired

# Key of the compute_metrics result used to decide which checkpoint is best.
metric_name = "accuracy"

args = TrainingArguments(
    output_dir="test-SBIC-bert",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation / checkpoint selection
    evaluation_strategy="epoch",
    metric_for_best_model=metric_name,
    load_best_model_at_end=True,
)

In [None]:
# Reference: https://github.com/armandalewis/ling-ga-1012/blob/main/eval_metrics.py

def compute_metrics(pred):
    """Score a set of predictions by accuracy.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose argmax over the last axis is
            taken as the predicted class).

    Returns:
        dict with a single key, ``'accuracy'``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    # Only accuracy is reported; precision / recall / F1 are not computed here.
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# def compute_metrics(eval_pred, metric):
#     predictions, labels = eval_pred

#     predictions = np.argmax(predictions, axis=1)
#     return metric.compute(predictions=predictions, references=labels)

In [None]:
# validation_key = "validation"
# trainer = Trainer(
#     model,
#     args,
#     train_dataset=encoded_dataset['train'],
#     eval_dataset=encoded_dataset['validation'],
#     tokenizer=tokenizer,
#     compute_metrics=lambda x: compute_metrics(x, metric)
# )

# Hyperparameter Tuning

In [None]:
def model_init():
    """Build a fresh, untrained-head classifier; called by the Trainer for each run.

    The checkpoint is loaded exactly once. The previous version loaded it
    twice: one copy was put in eval mode and moved to CUDA, then thrown away,
    and a second, untouched copy was returned. That doubled the load time,
    briefly held an extra model on the GPU, and raised on machines without
    CUDA. The model that was actually returned never had ``.eval()`` or
    ``.to('cuda')`` applied, so neither call is kept here; device placement
    and train/eval mode are left to the Trainer.

    Returns:
        A sequence-classification model loaded from ``model_checkpoint`` with
        a three-way output head.
    """
    num_labels = 3  # The original 'offensiveYN' outcome is a categorical variable with three levels
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
trainer = Trainer(
    args=args,
    model_init=model_init,  # a factory, not a model instance, so every run starts from a fresh model
    tokenizer=tokenizer,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    compute_metrics=compute_metrics,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
def my_hp_space(trial):
    """Define the hyperparameter search space for ``trainer.hyperparameter_search``.

    Args:
        trial: Trial object supplied by the search backend; must provide
            ``suggest_float``, ``suggest_int`` and ``suggest_categorical``.

    Returns:
        dict mapping a training-argument name to the value sampled for this trial.
    """
    return {
        # Sampled log-uniformly around the 2e-5 baseline used in `args`. The
        # earlier 1e-3..1e-2 range was 50-500x that baseline, and every logged
        # trial ended on the identical accuracy (0.5730738922228283), so the
        # search could not tell the candidates apart.
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        # Fixed at a single epoch per trial to bound search time.
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 1),
        "seed": trial.suggest_int("seed", 1, 40),
        # Single-choice categorical: keeps the batch size fixed while still recording it per trial.
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [batch_size]),
    }

In [None]:
# Run five trials over my_hp_space and keep the one with the highest objective value.
best_run = trainer.hyperparameter_search(
    hp_space=my_hp_space,
    direction="maximize",
    n_trials=5,
)

[32m[I 2021-03-31 12:40:43,616][0m A new study created in memory with name: no-name-9a9d8dc1-63d3-439b-833a-c64f96296739[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForS

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9552,0.900342,0.573074,94.1624,175.197


[32m[I 2021-03-31 13:38:21,701][0m Trial 0 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.0017083372941272326, 'num_train_epochs': 1, 'seed': 14, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification f

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9859,0.904427,0.573074,94.6797,174.24


[32m[I 2021-03-31 14:36:17,091][0m Trial 1 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.009984450997581298, 'num_train_epochs': 1, 'seed': 33, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification fr

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.995,0.91227,0.573074,94.7487,174.113


[32m[I 2021-03-31 15:34:30,563][0m Trial 2 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.009845643561621797, 'num_train_epochs': 1, 'seed': 18, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification fr

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.979,0.894108,0.573074,94.4195,174.72


[32m[I 2021-03-31 16:32:15,496][0m Trial 3 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.005465576677694313, 'num_train_epochs': 1, 'seed': 26, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification fr

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9407,0.901132,0.573074,94.3046,174.933


[32m[I 2021-03-31 17:30:17,072][0m Trial 4 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.0019008891007900582, 'num_train_epochs': 1, 'seed': 15, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m


In [None]:
# Copy the winning trial's hyperparameters onto the trainer's arguments, then retrain with them.
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)

trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9552,0.900342,0.573074,94.3258,174.894


TrainOutput(global_step=13861, training_loss=1.0214273050813376, metrics={'train_runtime': 3475.5147, 'train_samples_per_second': 3.988, 'total_flos': 1.8257329918607868e+16, 'epoch': 1.0, 'train_mem_cpu_alloc_delta': 1960809, 'train_mem_gpu_alloc_delta': 13911040, 'train_mem_cpu_peaked_delta': 89718129, 'train_mem_gpu_peaked_delta': 6059090432})

In [None]:
# Evaluate the trained model and display the resulting metrics dict.
# No dataset is passed, so presumably the Trainer's eval_dataset (the validation split) is used - confirm.
trainer.evaluate()

{'epoch': 1.0,
 'eval_accuracy': 0.5730738922228283,
 'eval_loss': 0.9003417491912842,
 'eval_mem_cpu_alloc_delta': 405831,
 'eval_mem_cpu_peaked_delta': 1172198,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 203021312,
 'eval_runtime': 94.2749,
 'eval_samples_per_second': 174.988}

In [None]:
# Display the trainer's bookkeeping: step/epoch counters and the logged loss / learning-rate history.
# NOTE(review): the repr includes the full log history, so this output is long.
trainer.state

TrainerState(epoch=1.0, global_step=13861, max_steps=13861, num_train_epochs=1, total_flos=1.8257329918607868e+16, log_history=[{'loss': 1.2008, 'learning_rate': 0.001646713410780893, 'epoch': 0.04, 'step': 500}, {'loss': 1.0669, 'learning_rate': 0.0015850895274345529, 'epoch': 0.07, 'step': 1000}, {'loss': 1.1307, 'learning_rate': 0.0015234656440882132, 'epoch': 0.11, 'step': 1500}, {'loss': 1.0691, 'learning_rate': 0.0014618417607418733, 'epoch': 0.14, 'step': 2000}, {'loss': 1.0785, 'learning_rate': 0.0014002178773955337, 'epoch': 0.18, 'step': 2500}, {'loss': 1.0436, 'learning_rate': 0.0013385939940491936, 'epoch': 0.22, 'step': 3000}, {'loss': 1.0641, 'learning_rate': 0.0012769701107028539, 'epoch': 0.25, 'step': 3500}, {'loss': 1.0455, 'learning_rate': 0.001215346227356514, 'epoch': 0.29, 'step': 4000}, {'loss': 1.0324, 'learning_rate': 0.0011537223440101743, 'epoch': 0.32, 'step': 4500}, {'loss': 1.0252, 'learning_rate': 0.0010920984606638342, 'epoch': 0.36, 'step': 5000}, {'los