In [None]:
## this notebook was developed in Google Colab, and it may require some adjustments to run in other environments.

In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime (the value is the cell's output).
torch.cuda.is_available()

In [None]:
!pip install datasets transformers huggingface_hub

In [None]:
!apt-get install git-lfs

# 1. Preprocessing data

In [None]:
from datasets import load_dataset
# Load the dataset registered as "imdb"; its "train" and "test" splits are subsetted in the cells below.
imdb = load_dataset("imdb")

In [None]:
## subsetting for faster training and testing

In [None]:
# Shuffle each split with a fixed seed (reproducible subset), then keep the first N rows:
# 3000 reviews for training and 300 for testing.
# `select` accepts a range directly, so no intermediate index list has to be built.
small_train_dataset = imdb["train"].shuffle(seed=42).select(range(3000))
small_test_dataset = imdb["test"].shuffle(seed=42).select(range(300))

In [None]:
# Quick sanity check: print the training subset's summary representation.
print(small_train_dataset)

In [None]:
## loading the tokenizer of the "distilbert-base-uncased" checkpoint (the same checkpoint is used for the model below)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
## preparing text inputs for the models for both train and test splits using map method

def preprocess_function(examples):
  """Tokenize the "text" field of `examples` with truncation enabled.

  Relies on the notebook-level `tokenizer`; returns whatever the tokenizer returns.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, truncation=True)
  return encoded

# Tokenize both subsets; batched=True passes preprocess_function a batch of examples per call.
# NOTE: despite their names, tokenizer_train / tokenizer_test hold the tokenized datasets, not tokenizers.
tokenizer_train = small_train_dataset.map(preprocess_function, batched=True)
tokenizer_test = small_test_dataset.map(preprocess_function, batched=True)

In [None]:
## convert training samples to PyTorch tensors and batch them together with the correct amount of padding

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is handed to the Trainer as `data_collator`
# so that examples are padded when they are assembled into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Training the model

In [None]:
## defining distilBERT as base model

from transformers import AutoModelForSequenceClassification

# Same checkpoint as the tokenizer; num_labels=2 sizes the classification head for the two sentiment classes.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
## metrics used to evaluate the fine-tuned model's performance

import numpy as np
!pip install evaluate
import evaluate

# Metric objects are loaded on first use and reused afterwards, so repeated
# evaluations do not call evaluate.load() again.
_METRIC_CACHE = {}


def _load_metric(name):
  """Return the `evaluate` metric called `name`, loading it only the first time."""
  if name not in _METRIC_CACHE:
    _METRIC_CACHE[name] = evaluate.load(name)
  return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
  """Compute accuracy and binary F1 for one evaluation pass.

  Args:
    eval_pred: a pair `(logits, labels)`; `logits` is reduced with argmax over
      axis 1 to obtain the predicted class of each example.

  Returns:
    A dict with the keys "accuracy" and "f1".
  """
  accuracy_metric = _load_metric("accuracy")
  f1_metric = _load_metric("f1")

  logits, labels = eval_pred
  # Predicted class = index of the largest logit in each row.
  predictions = np.argmax(logits, axis=1)

  accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
  f1 = f1_metric.compute(predictions=predictions, references=labels, average="binary")["f1"]

  return {"accuracy": accuracy, "f1": f1}

In [None]:
## create access tokens at Hugging Face and at Weights & Biases (wandb) before running the next two cells

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; the model is pushed to the Hub later on.
notebook_login()

In [None]:
!pip install wandb
import wandb

# Authenticate this session with Weights & Biases.
wandb.login()

## Fine tuning distilBERT for IMDB

In [None]:
## throwing away the pretraining head of the DistilBERT model and replacing it with a classification head fine-tuned for sentiment analysis.
## This enables us to transfer the knowledge from DistilBERT to our custom model

## For training, I will be using the Trainer API, which is optimized for fine-tuning Transformers models such as DistilBERT, BERT and RoBERTa

In [None]:
from transformers import TrainingArguments, Trainer

# Used as the output directory below; the pipeline call at the end of the notebook
# loads a Hub repo with this same name under the author's account.
repo_name = "finetuning-sentiment-model-3000-samples"

# Hyperparameters for the fine-tuning run.
# No evaluation strategy is set here: metrics are computed by the explicit
# trainer.evaluate() call after training.
training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    push_to_hub=True,  # requires the Hugging Face login performed earlier
)

# Wire together the model, the training arguments, the tokenized subsets,
# the padding collator and the metric function defined in the cells above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_train,  # tokenized 3000-review training subset
    eval_dataset=tokenizer_test,  # tokenized 300-review test subset
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 3. Training and testing

In [None]:
# Run fine-tuning with the settings in training_args (2 epochs over the training subset).
trainer.train()

In [None]:
## compute evaluation metrics (accuracy and F1 from compute_metrics) on the test subset

trainer.evaluate()

{'eval_loss': 0.7492803931236267,
 'eval_accuracy': 0.8566666666666667,
 'eval_f1': 0.86084142394822,
 'eval_runtime': 5.2781,
 'eval_samples_per_second': 56.839,
 'eval_steps_per_second': 3.6,
 'epoch': 2.0}

# 4. Analyzing new data with the model

In [None]:
## upload the fine-tuned model to the Hugging Face Hub (requires the login done earlier)

trainer.push_to_hub()

In [None]:
## use the pipeline class to load the uploaded model and analyze two new movie reviews,
## and see how the model predicts their sentiment with just two lines of code
## NOTE: the repo id below is hard-coded to one Hub account; replace the user name with your own.

from transformers import pipeline

sentiment_model = pipeline(model="bioinfo1aditi/finetuning-sentiment-model-3000-samples")
sentiment_model(["I love this move", "This movie sucks!"])