# Fine-tuning transformer models on a custom dataset for a downstream classification task

Today, we will return to the dataset that we used on day 1 of our course: the IMDb data. Go back to the code you wrote, and inspect the `recall`, `precision`, and `f1` scores.

In this notebook, we will try to improve the performance of our classifier by using `transfer learning`. We will use a `BERT`-family model (`DistilBERT`), but feel free to check the HuggingFace library for alternatives that you might want to use.


If your system does not have a GPU, it is advised to run this notebook in Colab.


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/annekroon/gesis-machine-learning/blob/main/day5/imdb.ipynb)

In [None]:
!pip3 install gdown
!pip3 install transformers

### Import packages

In [None]:
from collections import defaultdict
import gdown
import gzip
import json
import random
import pickle

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score
from pathlib import Path
from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

### Define constants

In [None]:

MODEL = 'distilbert-base-cased'  # Name of the pretrained checkpoint to fine-tune; browse alternatives at https://huggingface.co/models
# Fall back to the CPU when no CUDA device is available, so that moving the model
# with `.to(DEVICE)` does not fail on machines without a GPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MAX_LENGTH = 512   # Maximum number of tokens per text that is passed to the tokenizer
CACHED_DIR = 'my-awesome-model'  # directory that we'll use for saving the model

### Read IMDb data

In [None]:

def read_imdb_split(split_dir):
    """Read one split of the review corpus from disk.

    ``split_dir`` must contain a ``pos`` and a ``neg`` sub-directory; every
    entry inside them is read as one review text.

    Args:
        split_dir: Location of the split (``str`` or ``Path``),
            e.g. ``'aclImdb/train'``.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: the review
        texts and their labels (``1`` for ``pos``, ``0`` for ``neg``). All
        ``pos`` reviews come before the ``neg`` reviews.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` tests object identity, which only happens to
        # work for string literals because of interning.
        label = 0 if label_dir == "neg" else 1
        for text_file in (split_dir / label_dir).iterdir():
            # Decode explicitly so the result does not depend on the
            # platform's default encoding.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)
    return texts, labels

Create train and test samples

In [None]:
# Load the texts and their 0/1 labels for the train and the test split
# from the corresponding sub-directories of `aclImdb`.
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

Split train samples in train and validation samples

In [None]:
# Hold out 20% of the training reviews for validation. A fixed `random_state`
# makes the split reproducible across notebook runs, and `stratify` keeps the
# label proportions the same in the training and the validation part.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=42, stratify=train_labels
)

### Run a simple traditional classifier

In [None]:
# Baseline features: fit the TF-IDF vectorizer on the training texts only,
# then reuse the fitted vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [None]:
# Fit a logistic-regression baseline on the TF-IDF features and predict the test labels.
# NOTE(review): the name `model` is reassigned to the transformer model further down the notebook.
model = LogisticRegression(max_iter=1000).fit(X_train, train_labels)
predictions = model.predict(X_test)

In [None]:
# Print the classification report of the baseline predictions against the true test labels.
print(classification_report(test_labels, predictions))

### Let's start with our transformer-based approach

First, we need to tokenize the data using a tokenizer provided by HuggingFace. In particular, you need a tokenizer that belongs to the particular language model you will be using.

In [None]:
# Load the tokenizer that belongs to the checkpoint named in MODEL.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL) 

Tokenize the train, validation and test datasets, applying padding and truncation.

In [None]:
# Encode each split with the same settings: truncation is enabled with MAX_LENGTH
# as the token limit, and padding is switched on.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
test_encodings  = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

### Use the `PyTorch` Dataset class to transform the data 

In [None]:

class IMDbDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from a field name to a sequence holding one entry
            per sample (anything offering ``.items()``).
        labels: Sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings of each split together with the matching labels.
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

### Inspect the results of the tokenization process

In [None]:
# Show the first 100 tokens of the first training text, joined by spaces.
' '.join(train_encodings[0].tokens[0:100])

In [None]:
# Show the first 100 tokens of the first test text, joined by spaces.
' '.join(test_encodings[0].tokens[0:100])

In [None]:
# A notebook cell only displays the value of its last expression, so print
# both previews explicitly; otherwise the first one is silently discarded.
print(' '.join(train_dataset.encodings[0].tokens[0:100]))
print(' '.join(test_dataset.encodings[1].tokens[0:100]))

### You can customize the evaluation metrics that the model will provide

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'f1': f1_score(true_labels, predicted_labels, average='weighted'),
    }

In [None]:
# Initialize a ForSequenceClassification model.
# The label mapping is defined here because nothing earlier in the notebook
# provides `id2label`; it mirrors the encoding produced by `read_imdb_split`
# (0 for the "neg" directory, 1 for the "pos" directory).
id2label = {0: 'neg', 1: 'pos'}
model = DistilBertForSequenceClassification.from_pretrained(MODEL, num_labels=len(id2label)).to(DEVICE)

### If needed, tweak the `Trainer` class parameter settings, and start training

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyper-parameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # logging interval in steps
)


# Bundle the model, the arguments, the train/validation datasets and the metric function.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # function that turns predictions into the reported metrics
)

# NOTE(review): CACHED_DIR is defined among the constants, but this cell does not save the trained model there.
trainer.train()

### Evaluate the model on the validation data

In [None]:
# Run the evaluation; the Trainer was constructed with `val_dataset` as its `eval_dataset`.
trainer.evaluate()

In [None]:
# Run the fine-tuned model over the validation dataset and keep the prediction output.
predicted_validation = trainer.predict(val_dataset)

In [None]:
# Turn the per-class scores of the validation predictions into a flat list of class ids.
predicted_val_labels = (
    predicted_validation.predictions
    .argmax(-1)   # index of the highest-scoring class along the last axis
    .flatten()    # collapse to one dimension
    .tolist()     # plain Python list
)

In [None]:
# Print the classification report of the transformer predictions against the true validation labels.
print(classification_report(val_labels, predicted_val_labels))

### Evaluation on the test data

In [None]:
# Run the fine-tuned model over the test dataset, then take the arg-max over the last axis.
predicted_test = trainer.predict(test_dataset)
predicted_test_labels = predicted_test.predictions.argmax(-1) # Get the highest probability prediction

In [None]:
# Flatten the predicted class ids into a plain 1D Python list.
# NOTE(review): this cell rebinds the name to a list, so running it a second time will fail on `.flatten()`.
predicted_test_labels = predicted_test_labels.flatten().tolist()     

In [None]:
# Print the classification report of the transformer predictions against the true test labels.
print(classification_report(test_labels, predicted_test_labels))