In [None]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install torch
# !pip install transformers


from transformers import RobertaModel, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, hamming_loss, roc_auc_score, average_precision_score
from collections import defaultdict
from torch.amp import autocast, GradScaler
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import time

## Hyperparameters
# Every text is padded/truncated to this many tokens by the dataset class.
MAX_LEN = 128

# The three data loaders all use the same batch size.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 32

# Number of passes over the training set and the AdamW learning rate.
EPOCHS = 12
LEARNING_RATE = 1e-5

# Decision threshold for a sigmoid output.
# NOTE(review): nothing in the visible code reads this value (predictions
# are taken with an arg-max over the logits) -- confirm before relying on it.
THRESHOLD = 0.5


## Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: table providing a ``'text'`` column and the ``target_column``.
        tokenizer: object exposing ``encode_plus`` (the tokenizer's output is
            indexed by ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'``).
        max_len: length every encoded sequence is padded/truncated to.
        target_column: name of the column that holds the label.
    """

    def __init__(self, df, tokenizer, max_len, target_column):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Pull both columns out as plain Python lists once, up front.
        self.texts = df['text'].tolist()
        self.labels = df[target_column].tolist()

    def __len__(self):
        """Return the number of samples (one per row of the source table)."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the encoded text and label tensor for row ``index``.

        The result is a dict with the flattened ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` tensors plus a ``targets``
        tensor of dtype ``torch.long``.
        """
        # Collapse any run of whitespace (spaces, tabs, newlines) to one space.
        cleaned = " ".join(str(self.texts[index]).split())

        encoding = self.tokenizer.encode_plus(
            cleaned,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten
        # drops it so the DataLoader can stack samples itself.
        sample = {
            key: encoding[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['targets'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample

## Data
# Locations of the three pre-split CSV files.
train_file_path = '/kaggle/input/emotion/train.csv'
val_file_path = '/kaggle/input/emotion/val.csv'
test_file_path = '/kaggle/input/emotion/test.csv'

# Read the splits in the same order as the paths above.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, val_file_path, test_file_path)
)

# Column holding the class label, and the display names used in reports.
target_column = 'label'
target_names = ["0", "1", "2", "3"]

## Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# One dataset per split; all three share the tokenizer, MAX_LEN and the
# label column.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN, target_column)
    for split_df in (train_df, val_df, test_df)
)

#print(train_dataset[0])

## Data Loader
# Only the training batches are shuffled; validation and test keep the
# order of their source tables. num_workers=0 loads in the main process.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0
)
## Device
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

## Model

class RobertaInceptionAttentionImproved(nn.Module):
    """RoBERTa encoder followed by a multi-kernel Conv1d block, self-attention
    and a small classification head.

    Pipeline of ``forward``:
        1. RoBERTa produces one hidden vector per token.
        2. Three parallel 1-D convolutions (kernel sizes 2, 3 and 5, 16
           channels each) run over the token axis; their outputs are
           zero-padded back to the input length and concatenated.
        3. The convolution features are appended to the RoBERTa vectors and
           passed through multi-head self-attention (padding keys masked).
        4. The attended vectors of the *real* tokens are mean-pooled, sent
           through Linear -> ReLU -> LayerNorm, dropout and a final Linear.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Only ``last_hidden_state`` is consumed in ``forward``, so the model
        # is not asked to return every intermediate layer's hidden states.
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        hidden_size = self.roberta.config.hidden_size  # 768 for roberta-base

        # Dropout layer after RoBERTa output
        self.dropout = nn.Dropout(0.3)

        # Inception block with 16 channels for each kernel size. No padding
        # here: ``forward`` pads the outputs back to the input length.
        inception_channels = 16
        self.conv2 = nn.Conv1d(in_channels=hidden_size, out_channels=inception_channels, kernel_size=2, padding=0)
        self.conv3 = nn.Conv1d(in_channels=hidden_size, out_channels=inception_channels, kernel_size=3, padding=0)
        self.conv5 = nn.Conv1d(in_channels=hidden_size, out_channels=inception_channels, kernel_size=5, padding=0)

        # RoBERTa features + the three convolution branches (768 + 48 = 816
        # for roberta-base).
        feature_dim = hidden_size + 3 * inception_channels

        # Self-attention layer after Inception block
        self.attention = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=4, batch_first=True)

        # Additional dense layer with LayerNorm for refined feature interaction
        self.dense = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.LayerNorm(512)
        )

        # Final dropout and classification layer
        self.final_dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch of token sequences.

        Args:
            input_ids: token ids, shape ``(batch_size, seq_length)``.
            attention_mask: same shape; non-zero marks a real token, zero
                marks padding.

        Returns:
            Logits of shape ``(batch_size, num_classes)``.
        """
        # RoBERTa branch
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        token_states = self.dropout(outputs.last_hidden_state)  # (batch, seq, hidden)

        # Conv1d expects channels first: (batch, hidden, seq).
        channels_first = token_states.permute(0, 2, 1)

        # An unpadded convolution with kernel k shortens the sequence by
        # k - 1; zero-pad each output so all three are seq_length long again.
        conv2_output = F.pad(self.conv2(channels_first), (0, 1))
        conv3_output = F.pad(self.conv3(channels_first), (1, 1))
        conv5_output = F.pad(self.conv5(channels_first), (2, 2))

        # Concatenate along the channel dimension, then go back to
        # (batch, seq, 48).
        inception_output = torch.cat([conv2_output, conv3_output, conv5_output], dim=1)
        inception_output = inception_output.permute(0, 2, 1)

        # Concatenate Inception outputs with the RoBERTa embeddings.
        concatenated_features = torch.cat([token_states, inception_output], dim=2)

        # Self-attention; True in key_padding_mask means "ignore this key".
        key_padding_mask = ~attention_mask.bool()  # (batch, seq)
        attn_output, _ = self.attention(
            concatenated_features,
            concatenated_features,
            concatenated_features,
            key_padding_mask=key_padding_mask
        )  # (batch, seq, feature_dim)

        # Mean-pool over real tokens only. key_padding_mask hides padding
        # *keys*, but padding positions still act as queries and produce
        # output rows; a plain average over the sequence axis would mix those
        # rows in and make the result depend on how much padding was added.
        # NOTE: weights trained with an unmasked average will give different
        # logits under this pooling.
        token_weights = attention_mask.unsqueeze(-1).to(attn_output.dtype)  # (batch, seq, 1)
        summed = (attn_output * token_weights).sum(dim=1)
        token_counts = token_weights.sum(dim=1).clamp(min=1.0)  # avoid 0-division
        pooled_output = summed / token_counts  # (batch, feature_dim)

        # dense layer
        dense_output = self.dense(pooled_output)

        # Final dropout and classification layer
        dense_output = self.final_dropout(dense_output)
        logits = self.fc(dense_output)  # (batch, num_classes)

        return logits

## Setting the model
# One output logit per entry of target_names. nn.Module.to() moves the
# parameters in place and returns the module, so the calls can be chained.
model = RobertaInceptionAttentionImproved(num_classes=len(target_names)).to(device)

## Loss & Optimizer
def loss_fn(outputs, targets):
    """Return the mean cross-entropy between logits and class-index targets.

    ``targets`` is cast to ``torch.long`` before the loss is evaluated.
    """
    criterion = torch.nn.CrossEntropyLoss()
    class_indices = targets.to(torch.long)
    return criterion(outputs, class_indices)

# AdamW over every parameter of the model, with decoupled weight decay.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-3,
)

## Training function
def train_model(training_loader, model, optimizer):
    """Train ``model`` for one pass over ``training_loader``.

    Each batch is moved to the module-level ``device``, scored with
    ``loss_fn``, back-propagated with the gradient norm clipped to 1.0, and
    applied with ``optimizer``.

    Args:
        training_loader: iterable of dict batches with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'``.
        model: module called as ``model(input_ids, attention_mask)`` and
            returning one row of class logits per sample.
        optimizer: optimizer holding ``model``'s parameters.

    Returns:
        ``(model, train_accuracy, average_loss)``. For a loader that yields
        no batches the accuracy is ``0.0`` and the loss is ``nan``.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0

    model.train()

    for data in training_loader:
        # token_type_ids is present in the batch but the model does not take
        # it, so it is not copied to the device.
        ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
        mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
        targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # Keep the running count as a Python int rather than a device tensor.
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += (preds == targets).sum().item()
        num_samples += targets.size(0)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    # Guard the empty-loader case instead of dividing by zero.
    train_accuracy = correct_predictions / num_samples if num_samples else 0.0
    average_loss = np.mean(losses) if losses else float('nan')
    print(f"Training Accuracy: {train_accuracy:.4f} | Training Loss: {average_loss:.4f}")

    return model, train_accuracy, average_loss


## Evaluator Function
def eval_model(validation_loader, model):
    """Evaluate ``model`` on ``validation_loader`` and print a metric summary.

    Runs in eval mode without gradient tracking, collects the arg-max
    prediction for every sample, then prints accuracy, weighted
    F1/precision/recall, Hamming loss, mean loss and a per-class report.

    Args:
        validation_loader: iterable of dict batches with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'``.
        model: module called as ``model(input_ids, attention_mask)`` and
            returning one row of class logits per sample.

    Returns:
        ``(accuracy, average_loss)``.
    """
    model.eval()
    final_targets = []
    final_outputs = []
    losses = []

    with torch.no_grad():
        for data in validation_loader:
            # token_type_ids is present in the batch but the model does not
            # take it, so it is not copied to the device.
            ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
            mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
            targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

            outputs = model(ids, mask)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            _, preds = torch.max(outputs, dim=1)
            final_outputs.extend(preds.cpu().tolist())
            final_targets.extend(targets.cpu().tolist())

            # No per-batch torch.cuda.empty_cache(): batch tensors are freed
            # when rebound on the next iteration, and flushing the caching
            # allocator on every step only forces re-allocation.

    acc = accuracy_score(final_targets, final_outputs)
    f1 = f1_score(final_targets, final_outputs, average='weighted')
    precision = precision_score(final_targets, final_outputs, average='weighted')
    recall = recall_score(final_targets, final_outputs, average='weighted')
    hamming = hamming_loss(final_targets, final_outputs)

    average_loss = np.mean(losses)

    # Pin the label set to one integer id per display name. Without it the
    # report infers the labels from the data and raises when a split (or the
    # predictions) lacks one of the classes, because the inferred count then
    # no longer matches len(target_names).
    report_labels = list(range(len(target_names)))
    report = classification_report(
        final_targets,
        final_outputs,
        labels=report_labels,
        target_names=target_names,
        zero_division=0,
    )

    print(f"Validation Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Hamming Loss: {hamming}")
    print(f"Average Loss: {average_loss}")
    print("\nClassification Report:\n", report)

    return acc, average_loss


#Learning Rate Scheduler
# The half-cosine spans the whole run. The scheduler is stepped once per
# epoch, so a T_max smaller than EPOCHS would bring the learning rate to
# exactly 0 for one epoch and then let it rise again for the remaining ones.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Single source of truth for the checkpoint written during training and
# read back for testing.
BEST_MODEL_PATH = "emotion_RIncDNet16_32.bin"

# Training & Evaluation Loop
# recording starting time
start = time.time()

history = defaultdict(list)
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint, even if its validation accuracy is 0.
best_acc = float('-inf')

for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    scheduler.step()

    # Save the best model based on accuracy
    if val_acc > best_acc:
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        best_acc = val_acc

# recording end time
end = time.time()
print(f"Total training and evaluation time: {end - start} seconds")


## Testing
# Loading pretrained model (best model)
print("\n\nTesting\n\n")
model = RobertaInceptionAttentionImproved(num_classes=len(target_names))
# map_location lets a checkpoint written on one device load on another;
# weights_only restricts unpickling to tensors and plain containers, which is
# all a state_dict holds.
best_state = torch.load(BEST_MODEL_PATH, map_location=device, weights_only=True)
model.load_state_dict(best_state)
model = model.to(device)

# recording starting time
start = time.time()
# Evaluate the model using the test data
eval_model(test_data_loader, model)
# recording end time
end = time.time()
print(f"Total test-set evaluation time: {end - start} seconds")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/12
Training Accuracy: 0.6199 | Training Loss: 0.9395
Validation Accuracy: 0.7487
F1 Score: 0.7475889680107217
Precision: 0.748288567493113
Recall: 0.7486631016042781
Hamming Loss: 0.25133689839572193
Average Loss: 0.6866984913746516

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84       160
           1       0.76      0.69      0.72        97
           2       0.56      0.50      0.53        28
           3       0.65      0.70      0.67        89

    accuracy                           0.75       374
   macro avg       0.70      0.69      0.69       374
weighted avg       0.75      0.75      0.75       374

Epoch 2/12
Training Accuracy: 0.8078 | Training Loss: 0.5517
Validation Accuracy: 0.7807
F1 Score: 0.7783009887285698
Precision: 0.778021959559413
Recall: 0.7807486631016043
Hamming Loss: 0.2192513368983957
Average Loss: 0.5791585718592008

Classification Report:
               precision    recall 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("emotion_RIncDNet16_32_best.bin"))


Validation Accuracy: 0.8304
F1 Score: 0.8292335693350108
Precision: 0.8303014903371018
Recall: 0.8304011259676284
Hamming Loss: 0.16959887403237156
Average Loss: 0.5741800043318007

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87       558
           1       0.86      0.82      0.84       358
           2       0.71      0.68      0.69       123
           3       0.84      0.77      0.80       382

    accuracy                           0.83      1421
   macro avg       0.81      0.80      0.80      1421
weighted avg       0.83      0.83      0.83      1421

Total test-set evaluation time: 6.0712549686431885 seconds
