<a href="https://colab.research.google.com/github/brownsloth/transformers_concepts_notebooks/blob/main/transformers_2_task_specific_fine_tuning_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
We can add a task-specific head, compatible with the pre-trained transformer, on top of it.
Ex: BERT can help us:
  - classify sentences, by adding an appropriate head on the pooled output of the final layer
  - answer questions, by adding an appropriate head on the hidden-state output of the final layer

It cannot help us with seq-to-seq tasks like language translation, no matter which head we choose.
"""

In [None]:
!pip install datasets

In [None]:
!pip install evaluate

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from evaluate import load as load_metric
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification,  Trainer, TrainingArguments, AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import get_scheduler
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
import os
from tqdm.auto import tqdm

## 1. Prepare dataset

In [None]:
! pip install -q kaggle
from google.colab import files
files.upload()

#Choose the kaggle.json file that you downloaded
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
#Make directory named kaggle and copy kaggle.json file there.
! chmod 600 ~/.kaggle/kaggle.json
#Change the permissions of the file.
! kaggle datasets list

In [None]:
!kaggle datasets download 'rmisra/news-headlines-dataset-for-sarcasm-detection'
! mkdir train
! unzip news-headlines-dataset-for-sarcasm-detection.zip -d train

In [None]:
!cat train/Sarcasm_Headlines_Dataset_v2.json | head -5

In [None]:
# Path of the extracted file; it is read as JSON-lines (one record per line).
train_path = 'train/Sarcasm_Headlines_Dataset_v2.json'

# Quick look at the raw records through pandas.
df = pd.read_json(path_or_buf=train_path, lines=True)
df.head()

In [None]:
# Load the same file with the Hugging Face JSON loader; the rows are read from
# the 'train' split further down.
dataset_hf = load_dataset(path="json", data_files=train_path)
print(dataset_hf)

In [None]:
## Convert to pandas for some pre-processing on the dataset.
# With the 'pandas' format set, indexing the dataset returns DataFrames ...
dataset_hf.set_format(type='pandas')
# ... so slicing the whole 'train' split yields every row as one DataFrame.
dataset_df = dataset_hf['train'][:]
dataset_df.head()

In [None]:
# Clean the frame in a single chain that returns a new object at every step:
#   1. drop the link column if present,
#   2. keep one row per distinct headline,
#   3. keep only the text and the target column,
#   4. rename them to the names used by the rest of the notebook,
#   5. renumber the rows 0..n-1 (reset_index returns a new frame, so its result
#      has to be kept; drop=True avoids adding the old index as a column).
dataset_df = (
    dataset_df
    .drop(columns=['article_link'], errors='ignore')
    .drop_duplicates(subset=['headline'])
    .loc[:, ['headline', 'is_sarcastic']]
    .rename(columns={'headline': 'input', 'is_sarcastic': 'label'})
    .reset_index(drop=True)
)
dataset_df.head()

In [None]:
## Back to a Hugging Face dataset (the pandas index is not preserved)
dataset_hf = Dataset.from_pandas(dataset_df, preserve_index=False)

## Train/valid/test split: hold out 20% first, then cut the held-out part in half
train_testval_split = dataset_hf.train_test_split(test_size=0.2, seed=42)
test_val_split = train_testval_split['test'].train_test_split(test_size=0.5, seed=42)

dataset_hf_with_splits = DatasetDict()
dataset_hf_with_splits['train'] = train_testval_split['train']
dataset_hf_with_splits['valid'] = test_val_split['train']
dataset_hf_with_splits['test'] = test_val_split['test']
print(dataset_hf_with_splits)

## 2. Prepare the dataset for the specific model

In [None]:
# The Auto* classes pick the right tokenizer/model classes from the checkpoint name.
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Inspect the tokenizer, its maximum sequence length, and the model architecture.
for item in (tokenizer, tokenizer.model_max_length, model):
    print(item)

In [None]:
# Look inside the model: the attention sub-module of the first transformer layer.
first_layer = model.transformer.layer[0]
first_layer.attention

In [None]:
## First step: tokenize the HF dataset with the tokenizer that belongs to the
## pretrained checkpoint. The function below is written to be passed to map().

def tokenize(batch):
  """Tokenize the 'input' entries of `batch`, truncating to the tokenizer's maximum length."""
  encoded = tokenizer(
      batch['input'],
      truncation=True,
      max_length=tokenizer.model_max_length,
  )
  return encoded

# Tokenize every split (train / valid / test) in batched mode.
tokenized_dataset = dataset_hf_with_splits.map(tokenize, batched=True)
print(tokenized_dataset)

# Return only the model inputs and the label, as torch tensors.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# The padding collator only needs the tokenizer that produced the encodings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 3. Write class for the model load and forward pass

In [None]:
# Width of the encoder's hidden states, read from the model config instead of
# inferring it from the word-embedding table (the two are not guaranteed to be
# equal for every architecture).
PRETRAINED_OUTPUT_DIM = model.config.hidden_size

In [None]:
class SarcasmDetector(nn.Module):
  """Pretrained transformer encoder with a linear classification head on the first token.

  Args:
    checkpoint: Model name or path accepted by `AutoModel.from_pretrained`.
    num_labels: Number of classes the head predicts.
    output_attentions: If True, ask the encoder to also return attention weights.
    output_hidden_states: If True, ask the encoder to also return every layer's hidden states.

  Both flags default to False, so `hidden_states` and `attentions` in the
  returned object are None unless explicitly requested. The config options are
  spelled in the plural (`output_attentions`, `output_hidden_states`); the
  singular spellings are not config fields.
  """

  def __init__(self, checkpoint, num_labels, output_attentions=False, output_hidden_states=False):
    super().__init__()
    self.num_labels = num_labels

    config = AutoConfig.from_pretrained(
        checkpoint,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
    )
    self.model = AutoModel.from_pretrained(checkpoint, config=config)

    ## Define New layers here
    self.dropout = nn.Dropout(0.1)
    # Size the head from the loaded encoder itself so the class does not depend
    # on a notebook-level constant.
    self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

  def forward(self, input_ids=None, attention_mask=None, labels=None):
    """Run the encoder and the head; also compute cross-entropy loss when `labels` is given.

    Returns:
      A `TokenClassifierOutput` carrying `loss` (None without labels), `logits`
      of shape (batch, num_labels), and the encoder's `hidden_states` /
      `attentions` (None unless requested at construction time).
    """
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    # Index 0 is the last hidden state (one vector per token). Not every encoder
    # exposes a pooled output (DistilBERT does not), so the first token's vector
    # is used as the sentence representation.
    last_hidden_state = outputs[0]
    first_token_state = last_hidden_state[:, 0, :]

    ## Apply new layers here
    logits = self.classifier(self.dropout(first_token_state))

    loss = None
    if labels is not None:
      loss_func = nn.CrossEntropyLoss()
      loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))

    # Wrapped in a TokenClassifierOutput so callers can use .loss / .logits.
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)


## 4. Do other things required for training

In [None]:
## Create dataloader instances

# Training batches are reshuffled every epoch.
train_dl = DataLoader(
    tokenized_dataset['train'], shuffle=True, batch_size=32, collate_fn=data_collator
)
# Evaluation does not benefit from shuffling, and without an explicit batch_size
# the DataLoader would fall back to its default of one example per batch.
val_dl = DataLoader(
    tokenized_dataset['valid'], shuffle=False, batch_size=32, collate_fn=data_collator
)

In [None]:
# is_available must be *called*: the bare function object is always truthy and
# would select 'cuda' even on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build the two-class detector, then move its parameters to the chosen device.
sarcasm_model = SarcasmDetector(checkpoint, num_labels=2)
sarcasm_model = sarcasm_model.to(device)

In [None]:
# Total number of optimizer updates: one per training batch, for every epoch.
num_epoch = 3
num_training_steps = num_epoch * len(train_dl)
print(num_training_steps)

# AdamW over all model parameters, with a 'linear' schedule, no warm-up,
# spanning the whole run.
optimizer = AdamW(sarcasm_model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# F1 metric object; predictions are accumulated batch by batch and scored with compute().
metric = load_metric(path='f1')

## 5. Write the training loop

In [None]:
# One bar counts optimizer steps, the other validation batches, across all epochs.
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(val_dl)))

for epoch in range(num_epoch):
    # ---- training pass ----
    sarcasm_model.train()
    for batch in train_dl:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = sarcasm_model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update()

    # ---- validation pass ----
    sarcasm_model.eval()
    for batch in val_dl:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            outputs = sarcasm_model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch['labels'])
        progress_bar_eval.update()

    # F1 over everything accumulated during this epoch's validation pass.
    print(metric.compute())

In [None]:
# Final evaluation on the test split.
sarcasm_model.eval()

# Bind the loader to `test_dl` only: also assigning it to `val_dl` would make a
# re-run of the training cell validate on the test split.
test_dl = DataLoader(
    tokenized_dataset['test'], batch_size=32, collate_fn=data_collator
)

for batch in test_dl:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = sarcasm_model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

print(metric.compute())

We achieved an F1 score of 93%.