# Fine-Tuning BERT

In [1]:
from datasets import Dataset
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
import torch
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

In [3]:
pd.set_option('display.max_colwidth', 100)

In [4]:
df = pd.read_pickle('p4k_reviews_dataset_processed_balanced.pkl')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,url,score,pub_year,text
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,9411,http://pitchfork.com/reviews/albums/12589-johnny-cash-remixed/,0.0,2009,"I've been scouring the book of Revelation for some mention of this album, figuring it had to be ..."
0.0,9832,http://pitchfork.com/reviews/albums/12135-angles/,0.0,2008,"If you're an American reader familiar with the British hip-hop duo Dan Le Sac vs. Scroobius Pip,..."
0.0,9461,http://pitchfork.com/reviews/albums/12544-the-effects-of-333/,0.0,2009,Credit where credit's due: Black Rebel Motorcycle Club have made far better music than either th...
1.0,3914,http://pitchfork.com/reviews/albums/18560-pixies-ep-1/,1.0,2013,The sad spoils of a job in music criticism: I am finally given the chance to review a new releas...
1.0,6094,http://pitchfork.com/reviews/albums/15996-lou-reed-metallica/,1.0,2011,"When Metallica announced last June that they had recorded a new album with Lou Reed, fans of bot..."
...,...,...,...,...,...
10.0,7004,http://pitchfork.com/reviews/albums/15069-nowhere-20th-anniversary-edition/,10.0,2011,"Along with the 20th anniversary reissue of Ride's debut LP, Nowhere, comes a thick booklet of ol..."
10.0,857,http://pitchfork.com/reviews/albums/21845-sign-o-the-times/,10.0,2016,"In 1987, Prince Rogers Nelson was in transition. He’d disbanded the Revolution, the band that ha..."
10.0,8835,http://pitchfork.com/reviews/albums/13130-reckoning-deluxe-edition/,10.0,2009,"Given their vast and varied catalog, it's sometimes easier to imagine R.E.M. as a discography th..."
10.0,4639,http://pitchfork.com/reviews/albums/17499-rumours/,10.0,2013,"Fleetwood Mac's Rumours would never be just an album. Upon its release in 1977, it became the fa..."


In [5]:
dataX = df['text']
dataY = df['score']
dataX.head(5), dataY.head(5)

(score      
 0.0    9411    I've been scouring the book of Revelation for some mention of this album, figuring it had to be ...
        9832    If you're an American reader familiar with the British hip-hop duo Dan Le Sac vs. Scroobius Pip,...
        9461    Credit where credit's due: Black Rebel Motorcycle Club have made far better music than either th...
 1.0    3914    The sad spoils of a job in music criticism: I am finally given the chance to review a new releas...
        6094    When Metallica announced last June that they had recorded a new album with Lou Reed, fans of bot...
 Name: text, dtype: object,
 score      
 0.0    9411    0.0
        9832    0.0
        9461    0.0
 1.0    3914    1.0
        6094    1.0
 Name: score, dtype: float64)

In [6]:
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

In [7]:
raw_train_df = pd.concat([x_train, y_train], axis=1)
raw_test_df = pd.concat([x_test, y_test], axis=1)
raw_val_df = pd.concat([x_val, y_val], axis=1)

In [8]:
raw_train_ds = Dataset.from_pandas(raw_train_df)
raw_test_ds = Dataset.from_pandas(raw_test_df)
raw_val_ds = Dataset.from_pandas(raw_test_df)

In [10]:
print(raw_train_ds, raw_val_ds, raw_test_ds)

Dataset({
    features: ['text', 'score', '__index_level_0__', '__index_level_1__'],
    num_rows: 2456
}) Dataset({
    features: ['text', 'score', '__index_level_0__', '__index_level_1__'],
    num_rows: 308
}) Dataset({
    features: ['text', 'score', '__index_level_0__', '__index_level_1__'],
    num_rows: 308
})


In [11]:
BASE_MODEL = "bert-base-uncased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 512
BATCH_SIZE = 12
EPOCHS = 20

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
ds = {"train": raw_train_ds, "validation": raw_val_ds, "test": raw_test_ds}

def preprocess_function(examples):
    label = examples["score"] 
    examples = tokenizer(examples["text"], padding='max_length', truncation=True, max_length=MAX_LENGTH)
    
    # Change this to real number
    examples["label"] = float(label)
    return examples

for split in ds:
    ds[split] = ds[split].map(preprocess_function, remove_columns=["text", "score"])

                                                                

In [13]:
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    
    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    single_squared_errors = ((logits - labels).flatten()**2).tolist()
    
    # Compute accuracy 
    # Based on the fact that the rounded score = true score only if |single_squared_errors| < 0.5
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
    
    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

In [14]:
training_args = TrainingArguments(
    output_dir="./models/bert-base-uncased-fine-tuned-regression-2",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

In [15]:
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [16]:
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, __index_level_1__. If __index_level_0__, __index_level_1__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2456
  Num Epochs = 20
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 4100
  Number of trainable parameters = 109483009


Epoch,Training Loss,Validation Loss,Mse,Mae,R2,Accuracy
1,No log,2.159673,2.159673,1.159783,0.318851,0.253247
2,No log,1.743036,1.743036,1.014747,0.450256,0.311688
3,3.727400,2.043428,2.043428,1.096771,0.355514,0.295455
4,3.727400,1.73111,1.73111,1.013685,0.454018,0.347403
5,0.663800,1.637751,1.637751,0.957066,0.483462,0.399351
6,0.663800,1.535734,1.535734,0.929292,0.515638,0.399351
7,0.663800,1.509741,1.509741,0.91632,0.523836,0.396104
8,0.272000,1.460881,1.460881,0.921639,0.539246,0.363636
9,0.272000,1.510837,1.510837,0.911221,0.54349,0.38961
10,0.172200,1.426096,1.426096,0.911429,0.558678,0.402597


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, __index_level_1__. If __index_level_0__, __index_level_1__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 308
  Batch size = 12
Saving model checkpoint to ./models/bert-base-uncased-fine-tuned-regression-2\checkpoint-205
Configuration saved in ./models/bert-base-uncased-fine-tuned-regression-2\checkpoint-205\config.json
Model weights saved in ./models/bert-base-uncased-fine-tuned-regression-2\checkpoint-205\pytorch_model.bin
Deleting older checkpoint [models\bert-base-uncased-fine-tuned-regression-2\checkpoint-2149] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, __index_level_1__. I

***** Running Evaluation *****
  Num examples = 308
  Batch size = 12
Saving model checkpoint to ./models/bert-base-uncased-fine-tuned-regression-2\checkpoint-2255
Configuration saved in ./models/bert-base-uncased-fine-tuned-regression-2\checkpoint-2255\config.json
Model weights saved in ./models/bert-base-uncased-fine-tuned-regression-2\checkpoint-2255\pytorch_model.bin
Deleting older checkpoint [models\bert-base-uncased-fine-tuned-regression-2\checkpoint-1845] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, __index_level_1__. If __index_level_0__, __index_level_1__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 308
  Batch size = 12
Saving model checkpoint to ./models/bert-base-uncased-fine-tuned-regression-2\checkpoint-2460
Configuration save

TrainOutput(global_step=4100, training_loss=0.6366599191107402, metrics={'train_runtime': 3551.6947, 'train_samples_per_second': 13.83, 'train_steps_per_second': 1.154, 'total_flos': 1.292389899976704e+16, 'train_loss': 0.6366599191107402, 'epoch': 20.0})

In [17]:
trainer.eval_dataset=ds["test"]
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, __index_level_1__. If __index_level_0__, __index_level_1__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 308
  Batch size = 12


{'eval_loss': 1.4995033740997314,
 'eval_mse': 1.2365033740997314,
 'eval_mae': 0.8236053466796875,
 'eval_r2': 0.7110650978080474,
 'eval_accuracy': 0.40584415584415584,
 'eval_runtime': 7.3482,
 'eval_samples_per_second': 41.915,
 'eval_steps_per_second': 3.538,
 'epoch': 20.0}

In [28]:
nb_batches = math.ceil(len(raw_test_ds)/BATCH_SIZE)
y_preds = []

for i in range(nb_batches):
    input_texts = raw_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["text"]
    input_labels = raw_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["score"]
    encoded = tokenizer(input_texts, truncation=True, padding="max_length", max_length=MAX_LENGTH, return_tensors="pt").to("cuda")
    y_preds += model(**encoded).logits.reshape(-1).tolist()

# pd.set_option('display.max_rows', 500)
df1 = pd.DataFrame([raw_test_ds["text"], raw_test_ds["score"], y_preds], ["Text", "Score", "Prediction"]).T
# incorrect_cases = df[df["Score"] != df["Rounded Prediction"]]
# incorrect_cases

In [29]:
df1

Unnamed: 0,Text,Score,Prediction
0,"Nisennenmondai are a band of small attactive Japanese women who play pummeling, dense instrument...",7.0,6.443225
1,Prince had a curious approach to live albums. After Purple Rain put his crowd-stunning skills on...,9.0,8.661754
2,"Kylesa have always been a moving target. Since their inception, the Savannah, Ga., group has tra...",7.0,5.1486
3,"Among other qualities, jazz has often provided a meeting ground for complexity and catchiness. I...",8.0,8.125548
4,"In 2015, Atlanta rap is just too big to be confined to a singular style. Outsiders might focus o...",6.0,6.391431
5,"When details of Death From Above 1979's breakup surfaced in 2005, it was surprising to learn of ...",6.0,5.123636
6,"No one could accuse Bloc Party of being averse to the dancefloor, or electronic music in general...",5.0,5.422135
7,If you had to single out a band as the embodiment of everything supposedly small-stakes and emot...,4.0,6.564582
8,"On paper, pairing recent Young Money signees Rich Homie Quan and Young Thug together for an 84-m...",8.0,7.05863
9,The Black Ryder is a band out of time. It’s not just that their own music is heavily indebted to...,6.0,7.126931
