In [None]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install torch
# !pip install transformers
!pip install emoji


from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, hamming_loss, roc_auc_score, average_precision_score
from collections import defaultdict
from torch.amp import autocast, GradScaler
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import time

## Hyperparameters
# Token budget per example: handed to the dataset, which pads/truncates to it.
MAX_LEN = 128
# The three DataLoaders (train / validation / test) share one batch size.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 32
# Number of full passes over the training split.
EPOCHS = 12
# Step size given to the AdamW optimizer.
LEARNING_RATE = 1e-5
# Sigmoid decision threshold.
# NOTE(review): not referenced anywhere in the visible code, which predicts
# with argmax over the logits -- confirm whether it is still needed.
THRESHOLD = 0.5


## Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each row's text with its integer label.

    Args:
        df: DataFrame containing a ``'text'`` column and the column named by
            ``target_column``.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: length every encoded example is padded or truncated to.
        target_column: name of the column holding the class label.
    """

    # Encoded fields copied from the tokenizer output into each sample.
    _ENCODED_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, df, tokenizer, max_len, target_column):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists so __getitem__ indexes by position.
        self.texts = df['text'].tolist()
        self.labels = df[target_column].tolist()

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize example ``index`` and return its tensors plus the label.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` and a scalar long tensor ``targets``.
        """
        # Collapse every run of whitespace (incl. newlines) to one space.
        raw_text = str(self.texts[index])
        cleaned = " ".join(raw_text.split())

        encoding = self.tokenizer.encode_plus(
            cleaned,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis; flatten removes it.
        sample = {key: encoding[key].flatten() for key in self._ENCODED_KEYS}
        sample['targets'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample

## Data
# Locations of the pre-split CSV files (train / validation / test).
train_file_path, val_file_path, test_file_path = (
    '/content/train.csv',
    '/content/val.csv',
    '/content/test.csv',
)

# Each split is read into its own DataFrame.
train_df = pd.read_csv(train_file_path)
val_df = pd.read_csv(val_file_path)
test_df = pd.read_csv(test_file_path)

# Column that holds the class label in every split.
target_column = 'label'

# Display names for the four classes, in label order ("0" .. "3").
target_names = [str(class_id) for class_id in range(4)]

## Tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=True)


# One dataset per split; all three share the tokenizer, sequence length and
# label column.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN, target_column)
    for split_df in (train_df, val_df, test_df)
)

#print(train_dataset[0])

## Data Loader
# Only the training loader shuffles; validation and test keep file order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0
)
## Device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression kept so a notebook cell ending here displays the device.
device

## Model

class InceptionBlock(nn.Module):
    """Inception-style block for 1D sequences.

    Four parallel branches (kernel sizes 2, 3, 5 and 7) each apply
    Conv1d -> BatchNorm1d -> ReLU. The convolutions are unpadded, so every
    branch output is ``kernel_size - 1`` steps shorter than the input; the
    branch outputs are zero-padded back to the input length and concatenated
    along the channel axis.

    Args:
        in_channels: number of input channels.
        branch_out: number of output channels per branch; the block emits
            ``4 * branch_out`` channels.
    """

    def __init__(self, in_channels=768, branch_out=32):
        super().__init__()

        def conv_bn_relu(kernel_size):
            # Shared recipe for every branch; only the kernel width varies.
            return nn.Sequential(
                nn.Conv1d(in_channels, branch_out, kernel_size=kernel_size),
                nn.BatchNorm1d(branch_out),
                nn.ReLU(),
            )

        # Attribute names are kept so state_dict keys stay the same.
        self.branch2 = conv_bn_relu(2)
        self.branch3 = conv_bn_relu(3)
        self.branch5 = conv_bn_relu(5)
        self.branch7 = conv_bn_relu(7)

    def forward(self, x):
        """Map ``(batch, in_channels, seq_len)`` to ``(batch, 4*branch_out, seq_len)``."""
        # (left, right) zero padding that restores seq_len for each branch:
        # the even kernel (2) pads on the right only, odd kernels pad evenly.
        branch_padding = (
            (self.branch2, (0, 1)),
            (self.branch3, (1, 1)),
            (self.branch5, (2, 2)),
            (self.branch7, (3, 3)),
        )
        restored = [F.pad(branch(x), pad) for branch, pad in branch_padding]

        # Stack the branches channel-wise.
        return torch.cat(restored, dim=1)


class BERTweetInceptionAttention(nn.Module):
    """BERTweet encoder with Inception conv features, self-attention and an MLP head.

    Forward pipeline:
      1. BERTweet produces per-token hidden states (dropout applied).
      2. An ``InceptionBlock`` extracts multi-width conv features per token.
      3. Hidden states and conv features are concatenated per token.
      4. One multi-head self-attention layer mixes the fused sequence,
         ignoring padded positions as keys.
      5. The attention output is mean-pooled over the real (non-padding)
         tokens, passed through a two-layer dense block, dropout and a
         final linear classifier.

    Args:
        num_classes: size of the output logit vector.
        num_transformer_layers: accepted for backward compatibility; this
            implementation does not use it.

    NOTE: pooling is a masked mean over real tokens. Checkpoints trained with
    an unmasked mean over all positions share the same parameter names but
    will produce different outputs; retrain rather than reuse them.
    """

    def __init__(self, num_classes, num_transformer_layers=3):
        super().__init__()

        # BERTweet
        self.bert = AutoModel.from_pretrained('vinai/bertweet-base')
        hidden_size = self.bert.config.hidden_size  # Typically 768 for BERTweet-base

        self.dropout = nn.Dropout(0.3)

        # Inception block: 4 branches, each emitting `branch_out` channels
        self.branch_out = 32
        self.inception = InceptionBlock(
            in_channels=hidden_size,
            branch_out=self.branch_out
        )

        # Per-token width after concatenating encoder and conv features.
        self.fused_dim = hidden_size + 4 * self.branch_out  # 768 + 128 = 896

        # Self-attention layer after Inception block
        self.attention = nn.MultiheadAttention(
            embed_dim=self.fused_dim,
            num_heads=8,
            batch_first=True
        )

        # Dense block
        self.dense = nn.Sequential(
            nn.Linear(self.fused_dim, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.GELU()
        )

        # Final dropout and classification
        self.final_dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(256, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Compute class logits for a batch of tokenized texts.

        Args:
            input_ids: token ids, ``(batch, seq_len)``.
            attention_mask: 1 for real tokens, 0 for padding, ``(batch, seq_len)``.
            token_type_ids: optional segment ids forwarded to the encoder.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalized logits.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # shape => (batch, seq_len, hidden_size)
        hidden_states = outputs.last_hidden_state
        hidden_states = self.dropout(hidden_states)

        # Conv1d expects channels first: (batch, hidden_size, seq_len)
        x = hidden_states.permute(0, 2, 1)
        inception_out = self.inception(x)   # (batch, 4*branch_out, seq_len)
        # Back to tokens first: (batch, seq_len, 4*branch_out)
        inception_out = inception_out.permute(0, 2, 1)

        # (batch, seq_len, fused_dim)
        fused_features = torch.cat([hidden_states, inception_out], dim=2)

        # key padding mask: True => mask out
        # Our attention_mask is 1 for valid, 0 for pad => invert it
        key_padding_mask = ~(attention_mask.bool())

        attn_output, _ = self.attention(
            fused_features,
            fused_features,
            fused_features,
            key_padding_mask=key_padding_mask
        )

        # Instead of using the CLS token, average over the sequence -- but
        # only over real tokens. key_padding_mask hides padding as *keys*;
        # padded *query* rows still produce outputs, and with max_length
        # padding they would otherwise dominate a plain mean.
        token_mask = attention_mask.unsqueeze(-1).to(attn_output.dtype)  # (batch, seq_len, 1)
        summed = (attn_output * token_mask).sum(dim=1)                   # (batch, fused_dim)
        token_counts = token_mask.sum(dim=1).clamp(min=1.0)              # guard against /0
        pooled_features = summed / token_counts

        # Dense block -> final
        dense_out = self.dense(pooled_features)      # (batch, 256)
        dense_out = self.final_dropout(dense_out)
        logits = self.fc(dense_out)                  # (batch, num_classes)

        return logits

## Setting the model
# One output logit per entry in target_names. Module.to() moves the
# parameters and returns the module itself, so the calls can be chained.
model = BERTweetInceptionAttention(num_classes=len(target_names)).to(device)

## Loss
def loss_fn(outputs, targets):
    """Return the mean cross-entropy between class logits and integer labels.

    Args:
        outputs: unnormalized logits, ``(batch, num_classes)``.
        targets: class indices, ``(batch,)``; cast to int64 before use.
    """
    class_indices = targets.long()
    return F.cross_entropy(outputs, class_indices)

# define the optimizer
# AdamW over every model parameter (encoder and head share one learning rate).
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-3,
)

## Training function
def train_model(training_loader, model, optimizer):
    """Train ``model`` for one epoch and report accuracy and mean loss.

    Each batch is moved to the module-level ``device``, run through the
    model, scored with ``loss_fn`` and back-propagated; gradients are clipped
    to a global norm of 1.0 before the optimizer step.

    Args:
        training_loader: iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and
            ``targets`` tensors.
        model: the network to train (updated in place).
        optimizer: optimizer bound to ``model``'s parameters.

    Returns:
        Tuple ``(model, train_accuracy, average_loss)`` where accuracy is the
        fraction of correctly classified training samples and loss is the
        mean of the per-batch losses.

    Raises:
        ValueError: if ``training_loader`` yields no samples.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0

    model.train()

    for data in training_loader:
        ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
        mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long, non_blocking=True)
        targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

        # Forward pass
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # Calculate training accuracy; keep the running count as a Python int
        # instead of accumulating a device tensor.
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += int((preds == targets).sum().item())
        num_samples += targets.size(0)

        # Backward pass and optimizer step
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # Clear gradients after the step so none are held between batches
        # (or during the evaluation that follows the epoch).
        optimizer.zero_grad()

    if num_samples == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("training_loader yielded no samples")

    train_accuracy = correct_predictions / num_samples
    average_loss = np.mean(losses)
    print(f"Training Accuracy: {train_accuracy:.4f} | Training Loss: {average_loss:.4f}")

    return model, train_accuracy, average_loss


## Evaluator Function
def eval_model(validation_loader, model):
    """Evaluate ``model`` on a loader and print classification metrics.

    Runs the model in eval mode without gradients, collects argmax
    predictions and softmax probabilities, and prints accuracy, weighted and
    macro precision/recall/F1, one-vs-rest macro AUC-ROC, macro AUPR and a
    per-class classification report (class names come from the module-level
    ``target_names``).

    The printed labels say "Validation" regardless of which split the loader
    holds; the function is also used for the test set.

    Args:
        validation_loader: iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and
            ``targets`` tensors.
        model: the network to evaluate.

    Returns:
        Tuple ``(accuracy, average_loss)`` where loss is the mean of the
        per-batch losses.
    """
    model.eval()
    final_targets = []
    final_outputs = []
    final_probs = []
    losses = []

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
            mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long, non_blocking=True)
            targets = data['targets'].to(device, dtype=torch.long, non_blocking=True)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # Predictions and probabilities
            probs = torch.softmax(outputs, dim=1)  # Softmax for probabilities
            _, preds = torch.max(outputs, dim=1)  # Predicted class indices
            final_outputs.extend(preds.cpu().numpy())
            final_probs.extend(probs.cpu().numpy())
            final_targets.extend(targets.cpu().numpy())
            # No per-batch torch.cuda.empty_cache(): under no_grad the
            # allocator reuses its cached blocks, and flushing the cache every
            # step only forces extra device allocations.

    # Convert to numpy arrays
    final_targets = np.array(final_targets)
    final_outputs = np.array(final_outputs)
    final_probs = np.array(final_probs)

    # Accuracy
    acc = accuracy_score(final_targets, final_outputs)

    # Weighted metrics (per-class scores weighted by class support)
    f1 = f1_score(final_targets, final_outputs, average='weighted')
    precision = precision_score(final_targets, final_outputs, average='weighted')
    recall = recall_score(final_targets, final_outputs, average='weighted')

    # Macro-averaged metrics (unweighted mean of per-class scores)
    macro_f1 = f1_score(final_targets, final_outputs, average='macro')
    macro_precision = precision_score(final_targets, final_outputs, average='macro')
    macro_recall = recall_score(final_targets, final_outputs, average='macro')

    # AUC-ROC and AUPR
    auc_roc = roc_auc_score(final_targets, final_probs, multi_class='ovr', average='macro')
    aupr = average_precision_score(final_targets, final_probs, average='macro')

    # Average Loss
    average_loss = np.mean(losses)

    # Print metrics
    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Weighted F1 Score: {f1}")
    print(f"Macro F1 Score: {macro_f1}")
    print(f"Weighted Precision: {precision}")
    print(f"Macro Precision: {macro_precision}")
    print(f"Weighted Recall: {recall}")
    print(f"Macro Recall: {macro_recall}")
    print(f"AUC-ROC: {auc_roc}")
    print(f"AUPR: {aupr}")
    print("\nClassification Report:\n", classification_report(final_targets, final_outputs, target_names=target_names))

    return acc, average_loss


#Learning Rate Scheduler
# The scheduler is stepped once per epoch, so the cosine half-period must span
# the whole run. A T_max shorter than EPOCHS lets the learning rate reach its
# minimum (0 by default) before training ends and then climb again.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Training & Evaluation Loop
# Wall-clock timer covering every epoch (training + validation).
start = time.time()

# Per-epoch metric curves, keyed by metric name.
history = defaultdict(list)
# Best validation accuracy seen so far; decides when to checkpoint.
best_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    # Record this epoch's numbers.
    for metric_name, metric_value in (
        ('train_acc', train_acc),
        ('train_loss', train_loss),
        ('val_acc', val_acc),
        ('val_loss', val_loss),
    ):
        history[metric_name].append(metric_value)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Keep only the weights of the best-scoring epoch (strict improvement).
    improved = val_acc > best_acc
    if improved:
        torch.save(model.state_dict(), "emotion_BERTweetIncDNet32.bin")
        best_acc = val_acc

# Stop the timer and report the elapsed time.
end = time.time()
print(f"Total training and evaluation time: {end - start} seconds")


## Testing
# Rebuild the network and restore the best checkpoint saved during training.
print("\n\nTesting\n\n")
model = BERTweetInceptionAttention(num_classes=len(target_names))
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written during a GPU run can still be restored on a CPU-only
# runtime.
model.load_state_dict(torch.load("emotion_BERTweetIncDNet32.bin", map_location=device))
model = model.to(device)

# recording starting time
start = time.time()
# Evaluate the model using the test data
eval_model(test_data_loader, model)
# recording end time
end = time.time()
print(f"Total test-set evaluation time: {end - start} seconds")

Epoch 1/12
Training Accuracy: 0.6049 | Training Loss: 0.9917
Validation Accuracy: 0.7487
Weighted F1 Score: 0.7409529291293261
Macro F1 Score: 0.6732205385142342
Weighted Precision: 0.7440696558343617
Macro Precision: 0.7216549368234761
Weighted Recall: 0.7486631016042781
Macro Recall: 0.6533608061259949
AUC-ROC: 0.9192559587335395
AUPR: 0.7833998784315904

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.86      0.81       160
           1       0.77      0.72      0.74        97
           2       0.64      0.32      0.43        28
           3       0.71      0.71      0.71        89

    accuracy                           0.75       374
   macro avg       0.72      0.65      0.67       374
weighted avg       0.74      0.75      0.74       374

Epoch 2/12
Training Accuracy: 0.8050 | Training Loss: 0.5760
Validation Accuracy: 0.7647
Weighted F1 Score: 0.7619387216637196
Macro F1 Score: 0.7183018821180436
Weighted Precision:

  model.load_state_dict(torch.load("emotion_BERTweetIncDNet32_32_branch_gelu.bin"))


Validation Accuracy: 0.8445
Weighted F1 Score: 0.8441708155866173
Macro F1 Score: 0.8123442241404708
Weighted Precision: 0.8450476808969742
Macro Precision: 0.8125281157864257
Weighted Recall: 0.844475721323012
Macro Recall: 0.8132921317209217
AUC-ROC: 0.9568285055229864
AUPR: 0.8797865664421204

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89       558
           1       0.82      0.88      0.85       358
           2       0.69      0.69      0.69       123
           3       0.85      0.79      0.82       382

    accuracy                           0.84      1421
   macro avg       0.81      0.81      0.81      1421
weighted avg       0.85      0.84      0.84      1421

Total test-set evaluation time: 2.8951077461242676 seconds
