<a href="https://colab.research.google.com/github/avinash064/Avinash_kashyap_21064/blob/main/21064_AvinashKashyap_nlpassignment3_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### NLP Assignment-2

Load data

In [1]:
!wget https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/train_split.csv
!wget https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/test_split.csv

--2024-11-15 12:24:49--  https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/train_split.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 144474 (141K) [text/plain]
Saving to: ‘train_split.csv’


2024-11-15 12:24:49 (7.11 MB/s) - ‘train_split.csv’ saved [144474/144474]

--2024-11-15 12:24:49--  https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/test_split.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35259 (34K) [text/plain]
Saving to: ‘test_split.csv’


2024-11-15 12:24:50 (4.88 MB/s) - ‘te

In [2]:
import torch
from transformers import RobertaTokenizer, RobertaModel, AdamW
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from sklearn.metrics import f1_score
from tqdm import tqdm

In [3]:
import pandas as pd

# Load the train and test splits downloaded by the wget cell.
# wget saves into the current working directory, so relative paths are used
# here instead of the Colab-only '/content/...' prefix; the notebook then
# also runs outside Colab.
train_data = pd.read_csv('train_split.csv')
test_data = pd.read_csv('test_split.csv')
print(train_data.head())

                                                text  Joy  Fear  Anger  \
0  The light would come all the way up to your ve...    0     1      0   
1                   Well, my birthday is in January.    0     0      0   
2  Tears in my eyes, too much on my mind, and dee...    0     1      0   
3  My eyes scanned quickly into every room, every...    0     1      1   
4  I felt the afterglow of the late afternoon sun...    1     0      0   

   Sadness  Surprise  
0        0         1  
1        0         0  
2        1         0  
3        0         0  
4        0         0  


In [4]:
# Label columns to extract; this order defines the column order of every
# label matrix built from it.
emotions = ['Joy', 'Sadness', 'Surprise', 'Fear', 'Anger']

def get_texts_and_labels(train_data, emotion_columns):
    """Split a DataFrame into its raw texts and a label matrix.

    Args:
        train_data: DataFrame holding a 'text' column plus the columns named
            in ``emotion_columns``. Despite the name, any split can be passed.
        emotion_columns: Column names to extract; their order determines the
            column order of the returned label array.

    Returns:
        A ``(texts, labels)`` tuple: the 'text' column as a Python list and
        the selected label columns as a NumPy array.
    """
    label_matrix = train_data[emotion_columns].to_numpy()
    text_list = train_data['text'].tolist()
    return text_list, label_matrix

In [5]:
# Pull texts and label matrices out of both splits, using the same emotion
# column order for each so the label columns line up.
train_texts, train_labels = get_texts_and_labels(train_data=train_data, emotion_columns=emotions)
test_texts, test_labels = get_texts_and_labels(train_data=test_data, emotion_columns=emotions)

In [6]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing each text with its label vector.

    Texts are tokenized on access (inside ``__getitem__``), not up front.

    Args:
        texts: Sequence of input strings.
        labels: Sequence (or 2-D array) with one label vector per text; must
            have the same length as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``
            whose result exposes 'input_ids' and 'attention_mask' tensors.
        max_length: Length forwarded to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail fast on misaligned inputs: otherwise too few labels only
        # surface as an IndexError in the middle of an epoch, and surplus
        # labels are silently ignored because __len__ follows `texts`.
        if len(texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output
            flattened to 1-D) and 'labels' as a float tensor.
        """
        text = self.texts[idx]
        labels = self.labels[idx]

        # Tokenize text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            # The tokenizer returns a leading batch dimension for the single
            # text; flatten it away so the DataLoader can stack items.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float labels, as consumed by the BCE-with-logits loss in train().
            'labels': torch.tensor(labels, dtype=torch.float)
        }

In [7]:
def preprocess(texts, labels, tokenizer, batch_size=16, max_length=128, shuffle=True):
    """Wrap texts and labels in an ``EmotionDataset`` and return a ``DataLoader``.

    Args:
        texts: Sequence of input strings.
        labels: Per-sample label vectors aligned with ``texts``.
        tokenizer: Tokenizer handed to ``EmotionDataset``.
        batch_size: Number of samples per batch.
        max_length: Padding/truncation length handed to ``EmotionDataset``.
        shuffle: Whether the loader reshuffles samples each epoch. Defaults
            to ``True``; pass ``False`` for validation/test loaders to get a
            stable sample order.

    Returns:
        A ``DataLoader`` over the constructed dataset.
    """
    dataset = EmotionDataset(texts, labels, tokenizer, max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [8]:
class EmotionClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and a linear multi-label head.

    ``forward`` returns raw logits (no sigmoid applied), one per label.
    """

    def __init__(self, model_name, num_labels):
        """Load the pretrained encoder and build the classification head.

        Args:
            model_name: Checkpoint name/path given to
                ``RobertaModel.from_pretrained``.
            num_labels: Number of output logits.
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.roberta.config.hidden_size
        self.out = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the label logits for a batch of token ids and attention masks."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE: for the roberta-base checkpoint the load log reports the pooler
        # weights as newly initialised, so this pooled representation is
        # learned during fine-tuning rather than pretrained.
        features = self.dropout(encoded.pooler_output)
        logits = self.out(features)
        return logits


In [9]:
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Seed PyTorch before the model is built. The linear head (and, for
# roberta-base, the pooler reported as newly initialised in the load log) are
# randomly initialised, and later shuffling/dropout draw from torch's RNG too.
# Without a seed, every Restart & Run All yields different scores.
# NOTE: GPU kernels can still introduce small run-to-run differences.
torch.manual_seed(42)

# One output logit per emotion column, derived from the label list rather
# than a hard-coded 5, so the head cannot drift out of sync with the labels.
model = EmotionClassifier(model_name=model_name, num_labels=len(emotions))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def train(model, train_dataloader, val_dataloader, optimizer, device, epochs=3):
    """Fine-tune ``model`` with a BCE-with-logits loss, evaluating after each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits.
        train_dataloader: Loader yielding dict batches with 'input_ids',
            'attention_mask' and 'labels'.
        val_dataloader: Loader passed to ``evaluate`` after every epoch.
        optimizer: Optimizer over the model's parameters.
        device: Device the model and batches are moved to.
        epochs: Number of passes over ``train_dataloader``.

    Prints the mean training loss per epoch; returns nothing.
    """
    loss_fn = nn.BCEWithLogitsLoss()
    model = model.to(device)

    for epoch in range(epochs):
        # evaluate() switches the model to eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        running_loss = 0

        for batch in tqdm(train_dataloader):
            input_ids, attention_mask, labels = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)
            running_loss += loss.item()

            loss.backward()
            optimizer.step()

        # Mean of the per-batch losses for this epoch.
        avg_loss = running_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}")

        evaluate(model, val_dataloader, device)


In [11]:
def evaluate(model, val_dataloader, device):
    """Compute, print and return the macro-averaged F1 of ``model``.

    Predictions are the sigmoid of each logit thresholded at 0.5. The model
    is switched to eval mode and left there.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits.
        val_dataloader: Loader yielding dict batches with 'input_ids',
            'attention_mask' and 'labels'.
        device: Device the input tensors are moved to.

    Returns:
        The macro F1 score (also printed), so callers can track it or use it
        for model selection.
    """
    model.eval()
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.sigmoid(outputs).cpu().numpy() > 0.5

            # Gold labels are only consumed on the host by scikit-learn, so
            # they are not copied to the device and back.
            true_labels.extend(batch['labels'].cpu().numpy())
            pred_labels.extend(preds)

    # Calculate F1 Score. zero_division=0 scores a label with no positive
    # predictions/targets as 0 without emitting a warning, matching the
    # classification_report call used elsewhere in this notebook.
    f1 = f1_score(true_labels, pred_labels, average="macro", zero_division=0)
    print(f"Validation Macro F1 Score: {f1:.4f}")
    return f1


In [12]:
# Build both loaders with preprocess' default batch size and max length.
# The held-out test split doubles as the validation set passed to train().
train_dataloader = preprocess(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataloader = preprocess(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

In [13]:
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are pinned
# to the defaults of the deprecated class (1e-6 and 0.0) so the optimizer
# hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


train(model, train_dataloader, val_dataloader, optimizer, device, epochs=3)

100%|██████████| 100/100 [00:32<00:00,  3.08it/s]


Epoch 1/3, Training Loss: 0.5706
Validation Macro F1 Score: 0.4838


100%|██████████| 100/100 [00:31<00:00,  3.14it/s]


Epoch 2/3, Training Loss: 0.4092
Validation Macro F1 Score: 0.6024


100%|██████████| 100/100 [00:32<00:00,  3.11it/s]


Epoch 3/3, Training Loss: 0.2997
Validation Macro F1 Score: 0.6437


### Evaluation
An example of evaluation:

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Column order of the toy label matrices below.
# NOTE: this illustrative order differs from the `emotions` list used for
# training (Joy, Sadness, Surprise, Fear, Anger); keep that in mind when
# reusing this snippet on real predictions.
label_names = ['joy', 'sadness', 'fear', 'anger', 'surprise']

# Gold labels: one row per sample, one 0/1 column per emotion.
y_true = np.array([
    [1, 0, 0, 1, 0],
    [0, 1, 1, 0, 0],
    [1, 0, 1, 0, 1],
])

# Predicted labels, laid out the same way as the gold labels.
y_pred = np.array([
    [1, 0, 0, 1, 0],
    [0, 1, 0, 1, 0],
    [1, 0, 1, 0, 0],
])

# Per-label precision/recall/F1 plus the averaged rows; zero_division=0
# reports undefined metrics as 0 instead of warning.
report = classification_report(
    y_true,
    y_pred,
    target_names=label_names,
    zero_division=0,
)

print(report)

              precision    recall  f1-score   support

         joy       1.00      1.00      1.00         2
     sadness       1.00      1.00      1.00         1
        fear       1.00      0.50      0.67         2
       anger       0.50      1.00      0.67         1
    surprise       0.00      0.00      0.00         1

   micro avg       0.83      0.71      0.77         7
   macro avg       0.70      0.70      0.67         7
weighted avg       0.79      0.71      0.71         7
 samples avg       0.83      0.72      0.77         7

