<a href="https://colab.research.google.com/github/adith-ds/MAD1_project/blob/main/DLproj_t32025_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Required library installations

In [1]:
!pip install transformers datasets accelerate wandb kaggle



# Kaggle linking

In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"adithsenthil","key":"cdd1d2b9ba3fea71a1eedf9c91dfc18d"}'}

In [3]:
!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [4]:
!kaggle competitions download -c "2025-sep-dl-gen-ai-project"

2025-sep-dl-gen-ai-project.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
!unzip 2025-sep-dl-gen-ai-project.zip

Archive:  2025-sep-dl-gen-ai-project.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sample_submission.csv   
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv               


# Imports

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# WandB login

In [7]:
import os
from google.colab import userdata
os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')

In [8]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33m23f2000934[0m ([33m23f2000934-iitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Hugging Face login

In [9]:
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [10]:
from huggingface_hub import login
login(token=os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Setup

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device being used is {device}")

device being used is cpu


# Baseline model

## Data Loading

In [12]:
df = pd.read_csv("/content/train.csv")
df.head()

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise,emotions
0,0,the dentist that did the work apparently did a...,1,0,0,1,0,['anger' 'sadness']
1,1,i'm gonna absolutely ~~suck~~ be terrible duri...,0,1,0,1,0,['fear' 'sadness']
2,2,"bridge: so leave me drowning calling houston, ...",0,1,0,1,0,['fear' 'sadness']
3,3,after that mess i went to see my now ex-girlfr...,1,1,0,1,0,['anger' 'fear' 'sadness']
4,4,"as he stumbled i ran off, afraid it might some...",0,1,0,0,0,['fear']


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6827 entries, 0 to 6826
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6827 non-null   int64 
 1   text      6827 non-null   object
 2   anger     6827 non-null   int64 
 3   fear      6827 non-null   int64 
 4   joy       6827 non-null   int64 
 5   sadness   6827 non-null   int64 
 6   surprise  6827 non-null   int64 
 7   emotions  6827 non-null   object
dtypes: int64(6), object(2)
memory usage: 426.8+ KB


In [14]:
MODEL_CHECKPOINT = 'bert-base-uncased'
LABEL_COLUMNS = ['anger', 'fear', 'joy', 'sadness', 'surprise']
MAX_LEN = 128
BATCH_SIZE = 64

In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [16]:
X = df['text'].tolist()
y = df[LABEL_COLUMNS].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [17]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        labels = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

In [18]:
train_set = EmotionDataset(
    texts=X_train,
    labels=y_train,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

val_set = EmotionDataset(
    texts=X_val,
    labels=y_val,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

In [19]:
train_dl = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_dl = DataLoader(
    val_set,
    batch_size=BATCH_SIZE,
    shuffle=False
)

## Training


In [20]:
MAX_LEN = 128
BATCH_SIZE = 64

In [21]:
wandb.init(
    project="DLproject-MultiLabel-Emotion",
    name="BERT-Initial-Run-v1",
    config={
        "model_checkpoint": "bert-base-uncased",
        "max_len": 128,
        "batch_size": BATCH_SIZE,
        "learning_rate": 2e-5,
        "epochs": 3,
        "num_labels": 5,
    }
)

In [22]:
from transformers import AutoModelForSequenceClassification

NUM_LABELS = 5

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def compute_metrics(p):
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids

    probabilities = 1 / (1 + np.exp(-logits))
    predictions = (probabilities > 0.5).astype(int)

    macro_f1 = f1_score(y_true=labels, y_pred=predictions, average='macro', zero_division=0)

    metrics = precision_recall_fscore_support(
        y_true=labels,
        y_pred=predictions,
        average='macro',
        zero_division=0
    )

    return {
        'macro_f1': macro_f1,
        'macro_precision': metrics[0],
        'macro_recall': metrics[1],
        'macro_support': metrics[3]
    }

In [24]:
from transformers import TrainingArguments, Trainer

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    report_to="wandb",
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics
)

# Training
trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1,Macro Precision,Macro Recall,Macro Support
1,No log,0.586531,0.144673,0.113324,0.2,
2,0.640100,0.397261,0.607317,0.771311,0.560242,
3,0.444700,0.32343,0.736394,0.801179,0.694637,


TrainOutput(global_step=258, training_loss=0.49168677662694177, metrics={'train_runtime': 359.8263, 'train_samples_per_second': 45.53, 'train_steps_per_second': 0.717, 'total_flos': 1077666131996928.0, 'train_loss': 0.49168677662694177, 'epoch': 3.0})

In [26]:
from huggingface_hub import HfFolder

REPO_ID = "adith-ds/emotion-classifier-v1"
tokenizer.push_to_hub(REPO_ID)
trainer.push_to_hub(REPO_ID)

print(f"Model successfully uploaded to: https://huggingface.co/{REPO_ID}")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...results/model.safetensors:   0%|          | 14.2kB /  438MB            

  ...results/training_args.bin:   2%|1         |  89.0B / 5.78kB            

Model successfully uploaded to: https://huggingface.co/adith-ds/emotion-classifier-v1
