In [1]:
import torch
import time
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

In [2]:
# Load the labelled corpus; later cells read its 'Comment' and 'Emotion' columns.
data_path = '/content/classification_data.csv'
df = pd.read_csv(data_path, encoding='utf-8')

* Define the labels and label mapping

In [3]:
# Distinct emotion names; each name's position in this list is used as its
# integer class id.
labels = list(df['Emotion'].unique())
label2id = dict(zip(labels, range(len(labels))))

* Split the dataset into train and validation sets

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

* Initialize the BERT tokenizer and model

In [5]:
# Checkpoint name shared by the tokenizer and the classifier.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# The classification head gets one output per distinct emotion label.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* Tokenize the text data

In [6]:
def tokenize_text(df, tokenizer, max_length):
    """Tokenize every entry of ``df['Comment']`` and stack the results.

    Each text is passed to ``tokenizer`` on its own, padded/truncated to
    ``max_length`` and returned as PyTorch tensors.

    Args:
        df: Frame with a ``'Comment'`` column holding the texts.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors for one text.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``. Each is the per-text tensors
        stacked along a new leading axis, so a per-text shape of
        ``(1, max_length)`` yields ``(len(df), 1, max_length)``.
    """
    encoded = [
        tokenizer(comment, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
        for comment in df['Comment']
    ]
    stacked_ids = torch.stack([item['input_ids'] for item in encoded])
    stacked_masks = torch.stack([item['attention_mask'] for item in encoded])
    return stacked_ids, stacked_masks

* Adjusting the max_length of the sequence to 50

In [7]:
# Every comment is padded or truncated to this many tokens.
max_length = 50
train_input_ids, train_attention_masks = tokenize_text(
    df=train_df, tokenizer=tokenizer, max_length=max_length
)
val_input_ids, val_attention_masks = tokenize_text(
    df=val_df, tokenizer=tokenizer, max_length=max_length
)

* Create DataLoader for training and validation

In [8]:
def _encode_labels(frame):
    """Map the 'Emotion' column of ``frame`` to a 1-D long tensor of class ids."""
    return torch.tensor([label2id[label] for label in frame['Emotion']], dtype=torch.long)


# Training batches are reshuffled every epoch.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, _encode_labels(train_df))
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True)

# Validation batches keep the dataset order.
val_dataset = TensorDataset(val_input_ids, val_attention_masks, _encode_labels(val_df))
val_dataloader = DataLoader(val_dataset, batch_size=512)


* Training hyperparameters

In [9]:
# Fine-tuning a pretrained BERT with AdamW needs a very small step size; the
# commonly recommended range is 2e-5 to 5e-5. The previous value of 0.1
# overwrites the pretrained weights within a few steps and training diverges.
learning_rate = 2e-5
# Number of full passes over the training set.
epochs = 3

* Define the optimizer and learning rate scheduler

In [10]:
# Use PyTorch's AdamW: the implementation exported by `transformers` is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are pinned to
# the values the transformers version defaulted to, so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# Linear decay of the learning rate to zero over every optimizer step of the
# run (batches per epoch * epochs), with no warmup phase.
total_training_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_training_steps)



In [11]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

* Training the model

In [None]:
model.to(device)

# Class ids and display names for the report. Index i corresponds to labels[i],
# matching how label2id was built. Passing both explicitly keeps report rows
# aligned with names even if a class is missing from the validation split.
class_ids = list(range(len(labels)))
class_names = [str(name) for name in labels]

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    start_time = time.time()
    for num, batch in enumerate(train_dataloader):
        # Batch tensors get their own names: unpacking into `labels` would
        # overwrite the global list of class names used for the report below.
        batch_input_ids, batch_attention_masks, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # Inputs carry a singleton axis at dim 1 (added when the per-text
        # encodings were stacked); drop it to get (batch, seq_len).
        batch_input_ids = batch_input_ids.squeeze(dim=1)
        batch_attention_masks = batch_attention_masks.squeeze(dim=1)
        # Labels are already 1-D; squeezing them would turn a final batch of
        # size 1 into a 0-d tensor and break the loss computation.
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        elapsed_time = time.time() - start_time

        if num % 2 == 0:
            print(f"{num} : Time taken : {round(elapsed_time, 2)} secs\n")

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1} - Average Training Loss: {avg_train_loss:.2f}')

    # ---- Validation pass ----
    model.eval()
    all_preds = []
    all_labels = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in val_dataloader:
            batch_input_ids, batch_attention_masks, batch_labels = (t.to(device) for t in batch)
            batch_input_ids = batch_input_ids.squeeze(dim=1)
            batch_attention_masks = batch_attention_masks.squeeze(dim=1)
            outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
            # Predicted class = index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    classification_rep = classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,  # classes that were never predicted report 0 instead of warning
    )
    print(f'Epoch {epoch + 1} - Validation Accuracy: {accuracy:.2f}')
    print(f'Epoch {epoch + 1} - Classification Report:\n{classification_rep}')
