<a href="https://colab.research.google.com/github/ayadassouki/ayadassouki-ACL-Task-A-Sentiment-Analysis/blob/main/distillbert_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import DistilBertModel, DistilBertTokenizerFast
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the dataset from the CSV file in the current working directory.
DATA_PATH = 'eng.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_train_track_a_00001,But not very happy.,0,0,1,1,0
1,eng_train_track_a_00002,Well she's not gon na last the whole song like...,0,0,1,0,0
2,eng_train_track_a_00003,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0
3,eng_train_track_a_00004,"Yes, the Oklahoma city bombing.",1,1,0,1,1
4,eng_train_track_a_00005,They were dancing to Bolero.,0,0,1,0,0


In [None]:
# Collapse the five per-emotion indicator columns into one list-valued
# `labels` column, keeping the column order listed below.
emotion_columns = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
df['labels'] = df[emotion_columns].to_numpy().tolist()

In [None]:
# Show the first five rows again to confirm the new `labels` column.
df.head(n=5)

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise,labels
0,eng_train_track_a_00001,But not very happy.,0,0,1,1,0,"[0, 0, 1, 1, 0]"
1,eng_train_track_a_00002,Well she's not gon na last the whole song like...,0,0,1,0,0,"[0, 0, 1, 0, 0]"
2,eng_train_track_a_00003,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0,"[0, 0, 0, 0, 0]"
3,eng_train_track_a_00004,"Yes, the Oklahoma city bombing.",1,1,0,1,1,"[1, 1, 0, 1, 1]"
4,eng_train_track_a_00005,They were dancing to Bolero.,0,0,1,0,0,"[0, 0, 1, 0, 0]"


In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
texts = df['text'].tolist()
label_vectors = df['labels'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    label_vectors,
    test_size=0.2,
    random_state=42,
)

In [None]:
from transformers import DistilBertTokenizerFast

# Fetch the pretrained uncased DistilBERT tokenizer.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits share the same tokenization settings: truncation and padding
# enabled, with a maximum length of 128 tokens.
tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
import torch

class EmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    `encodings` is a mapping from field name to a per-example sequence, and
    `labels` is a per-example sequence of label vectors. Example `i` is built
    from position `i` of every field and of `labels`.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of label vectors.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors; targets are float under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Float targets, as required by the BCE-with-logits loss used for training.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each split's encodings and labels in a dataset for the data loaders.
train_dataset = EmotionDataset(encodings=train_encodings, labels=train_labels)
val_dataset = EmotionDataset(encodings=val_encodings, labels=val_labels)


In [None]:
from transformers import DistilBertForSequenceClassification

# Pretrained DistilBERT with a sequence-classification head: five outputs,
# configured for multi-label classification.
model_options = {
    "num_labels": 5,
    "problem_type": "multi_label_classification",
}
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", **model_options
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler

# Data loaders: only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer. `transformers.AdamW` is deprecated in favour of PyTorch's own
# implementation, so build `torch.optim.AdamW` instead. `eps` and
# `weight_decay` are set explicitly to the values the transformers class used
# by default, because PyTorch's defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Scheduler: linear schedule with warmup over the first 10% of steps.
# `num_epochs` must match the epoch count used by the training loop; otherwise
# the learning rate reaches zero too early or never finishes decaying.
num_epochs = 3
num_training_steps = len(train_loader) * num_epochs
num_warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3
loss_fn = BCEWithLogitsLoss()


def _batch_loss(raw_batch):
    """Move one batch to `device`, run the model, and return the BCE loss tensor."""
    on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
    return loss_fn(model(**on_device).logits, on_device['labels'])


for epoch in range(epochs):
    # Training pass: one optimizer step and one scheduler step per batch.
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        step_loss = _batch_loss(batch)

        optimizer.zero_grad()
        step_loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_loss += step_loss.item()
    print(f"Epoch {epoch+1} - Training Loss: {train_loss/len(train_loader)}")

    # Validation pass: same loss, no gradient tracking and no parameter updates.
    model.eval()
    with torch.no_grad():
        val_loss = sum(_batch_loss(batch).item() for batch in val_loader)
    print(f"Epoch {epoch+1} - Validation Loss: {val_loss/len(val_loader)}")


100%|██████████| 139/139 [18:52<00:00,  8.15s/it]


Epoch 1 - Training Loss: 0.5289282406405579
Epoch 1 - Validation Loss: 0.42864853313991


100%|██████████| 139/139 [18:49<00:00,  8.12s/it]


Epoch 2 - Training Loss: 0.3582545094352832
Epoch 2 - Validation Loss: 0.39908541100365774


100%|██████████| 139/139 [18:28<00:00,  7.98s/it]


Epoch 3 - Training Loss: 0.2518387146347718
Epoch 3 - Validation Loss: 0.38298131568091254


In [None]:
# Write the fine-tuned model and its tokenizer to the same directory.
OUTPUT_DIR = "distilbert-emotion"
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

('distilbert-emotion/tokenizer_config.json',
 'distilbert-emotion/special_tokens_map.json',
 'distilbert-emotion/vocab.txt',
 'distilbert-emotion/added_tokens.json',
 'distilbert-emotion/tokenizer.json')

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Score the validation set, collecting sigmoid probabilities and targets
# one batch at a time.
model.eval()
prob_batches, label_batches = [], []
with torch.no_grad():
    for batch in val_loader:
        batch = {key: val.to(device) for key, val in batch.items()}
        batch_probs = torch.sigmoid(model(**batch).logits)
        prob_batches.append(batch_probs.cpu().numpy())
        label_batches.append(batch['labels'].cpu().numpy())

# Stack the per-batch arrays row-wise, then binarize the probabilities at 0.5.
all_labels = np.vstack(label_batches)
all_preds = (np.vstack(prob_batches) > 0.5).astype(int)

# Per-emotion precision / recall / F1 on the validation split.
emotion_names = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]
print(classification_report(all_labels, all_preds, target_names=emotion_names))


              precision    recall  f1-score   support

       Anger       0.65      0.43      0.51        61
        Fear       0.78      0.84      0.81       314
         Joy       0.68      0.57      0.62       134
     Sadness       0.72      0.64      0.67       171
    Surprise       0.75      0.60      0.67       172

   micro avg       0.74      0.68      0.71       852
   macro avg       0.72      0.61      0.66       852
weighted avg       0.74      0.68      0.70       852
 samples avg       0.65      0.62      0.61       852



In [None]:
def predict_emotions(text, model, tokenizer):
    """Return the predicted probability of each emotion for `text`.

    Args:
        text: Input handed to `tokenizer`. Only the first row of the model
            output is reported, so pass one string per call.
        model: A `torch.nn.Module` whose output exposes `.logits` with five
            columns, assumed to be in the order Anger, Fear, Joy, Sadness,
            Surprise.
        tokenizer: Callable returning a mapping of PyTorch tensors when
            invoked with `return_tensors="pt"`.

    Returns:
        dict mapping each emotion name to its sigmoid probability.
    """
    # Follow the model's own device rather than a notebook-level `device`
    # variable, so the function does not depend on hidden global state.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    encodings = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt")
    encodings = {key: val.to(target_device) for key, val in encodings.items()}

    # Dropout must be disabled for deterministic predictions. Restore the
    # caller's train/eval mode afterwards so inference leaves no side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encodings).logits
            probs = torch.sigmoid(logits).cpu().numpy()
    finally:
        model.train(was_training)

    emotions = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]
    return dict(zip(emotions, probs[0]))

# Example usage: score one sentence and print the per-emotion probabilities.
new_text = "I am so happy and excited!"
emotion_probs = predict_emotions(new_text, model, tokenizer)
print(emotion_probs)

{'Anger': 0.048570856, 'Fear': 0.051257703, 'Joy': 0.947756, 'Sadness': 0.03778019, 'Surprise': 0.21391396}
