In [None]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install torch
# !pip install transformers
!pip install emoji


from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, hamming_loss, roc_auc_score, average_precision_score
from collections import defaultdict
from torch.amp import autocast, GradScaler
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import time

## Hyperparameters
# Every text is padded/truncated to this many tokens by the dataset.
MAX_LEN = 128

# The train / validation / test loaders all use the same batch size.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 32

# Optimisation schedule.
EPOCHS = 12
LEARNING_RATE = 1e-5

# Sigmoid probabilities strictly above this value are predicted as class 1.
THRESHOLD = 0.5


## Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item, on demand.

    Args:
        df: Table with a ``'text'`` column and the label column.
        tokenizer: Object exposing ``encode_plus``; it is called with
            ``return_tensors='pt'``.
        max_len: Length every encoded sequence is padded/truncated to.
        target_column: Name of the column in ``df`` that holds the labels.
    """

    def __init__(self, df, tokenizer, max_len, target_column):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Materialise both columns as plain lists for cheap integer indexing.
        self.texts = df['text'].tolist()
        self.labels = df[target_column].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return the encoded text and float label for row ``index`` as a dict."""
        # str() guards against non-string cells; split/join collapses every
        # run of whitespace into a single space.
        cleaned = " ".join(str(self.texts[index]).split())

        encoded = self.tokenizer.encode_plus(
            cleaned,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The encoder output is flattened to 1-D per field so the DataLoader
        # can stack items into (batch, length) tensors.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['targets'] = torch.tensor(self.labels[index], dtype=torch.float)
        return sample

## Data
train_file_path = '/content/train.csv'
val_file_path = '/content/val.csv'
test_file_path = '/content/test.csv'

# Read the three splits in order: train, validation, test.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, val_file_path, test_file_path)
)

# Column of each CSV that holds the label, and the display names used for
# the two classes in the classification report.
target_column = 'label'
target_names = ["0", "1"]

## Tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

## Datasets (one per split, all sharing the tokenizer and MAX_LEN)
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN, target_column)
    for split_df in (train_df, val_df, test_df)
)

## Data Loaders
# Only the training loader shuffles; validation and test keep file order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0
)

## Device
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Model

class BERTweetInceptionAttention(nn.Module):
    """BERTweet encoder with parallel Conv1d features and a self-attention head.

    ``forward`` runs: BERTweet token embeddings -> dropout -> three parallel
    Conv1d branches (kernel sizes 2, 3 and 5) whose outputs are zero-padded
    back to the input length -> concatenation with the embeddings ->
    multi-head self-attention -> mean over the sequence axis -> dense +
    LayerNorm -> dropout -> a single logit per example.

    Args:
        num_classes: Accepted so existing call sites keep working, but not
            used: the head always emits ONE logit per example, which is what
            a sigmoid / ``BCEWithLogitsLoss`` binary setup expects.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Pretrained BERTweet encoder.
        self.bertweet = AutoModel.from_pretrained('vinai/bertweet-base')

        # Derive the layer widths from the encoder instead of repeating magic
        # numbers: hidden_size is 768 and feature_dim is 816 for bertweet-base.
        hidden_size = self.bertweet.config.hidden_size
        conv_channels = 16
        feature_dim = hidden_size + 3 * conv_channels
        dense_dim = 512

        # Dropout applied to the encoder's token embeddings.
        self.dropout = nn.Dropout(0.3)

        # Three parallel unpadded convolutions over the token axis.
        self.conv2 = nn.Conv1d(in_channels=hidden_size, out_channels=conv_channels, kernel_size=2, padding=0)
        self.conv3 = nn.Conv1d(in_channels=hidden_size, out_channels=conv_channels, kernel_size=3, padding=0)
        self.conv5 = nn.Conv1d(in_channels=hidden_size, out_channels=conv_channels, kernel_size=5, padding=0)

        # Self-attention over [embeddings ; conv features].
        self.attention = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=4, batch_first=True)

        # Dense projection with LayerNorm on the pooled features.
        self.dense = nn.Sequential(
            nn.Linear(feature_dim, dense_dim),
            nn.ReLU(),
            nn.LayerNorm(dense_dim)
        )

        # Final dropout and single-logit classification layer.
        self.final_dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(dense_dim, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return raw (pre-sigmoid) logits of shape ``(batch_size, 1)``.

        Args:
            input_ids: Token ids, ``(batch_size, seq_length)``.
            attention_mask: Non-zero for real tokens, zero for padding; same
                shape as ``input_ids``.
            token_type_ids: Optional segment ids forwarded to the encoder.
        """
        outputs = self.bertweet(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # (batch_size, seq_length, hidden_size)
        embeddings = self.dropout(outputs.last_hidden_state)

        # Conv1d expects channels first: (batch_size, hidden_size, seq_length)
        channels_first = embeddings.permute(0, 2, 1)

        # Each unpadded conv shortens the sequence by kernel_size - 1, so the
        # outputs are zero-padded back to seq_length before concatenation.
        conv2_output = F.pad(self.conv2(channels_first), (0, 1))
        conv3_output = F.pad(self.conv3(channels_first), (1, 1))
        conv5_output = F.pad(self.conv5(channels_first), (2, 2))

        # (batch_size, 48, seq_length) -> (batch_size, seq_length, 48)
        inception_output = torch.cat([conv2_output, conv3_output, conv5_output], dim=1)
        inception_output = inception_output.permute(0, 2, 1)

        # (batch_size, seq_length, hidden_size + 48)
        concatenated_features = torch.cat([embeddings, inception_output], dim=2)

        # True marks padded positions, which attention must ignore as keys.
        key_padding_mask = ~attention_mask.bool()

        # The attention weights are never used, so skip computing and
        # averaging them; this also lets PyTorch pick its optimized kernel.
        attn_output, _ = self.attention(
            concatenated_features,
            concatenated_features,
            concatenated_features,
            key_padding_mask=key_padding_mask,
            need_weights=False
        )  # (batch_size, seq_length, hidden_size + 48)

        # Mean over the sequence axis -> (batch_size, hidden_size + 48).
        # NOTE(review): key_padding_mask only hides padded positions as keys;
        # the rows produced for padded *query* positions are still averaged
        # in here. A mask-weighted mean may be preferable, but it would change
        # the outputs of already-trained checkpoints, so it is left as is.
        pooled_output = F.adaptive_avg_pool1d(attn_output.permute(0, 2, 1), output_size=1).squeeze(-1)

        dense_output = self.dense(pooled_output)
        dense_output = self.final_dropout(dense_output)

        # One logit per example: (batch_size, 1)
        logits = self.fc(dense_output)

        return logits

## Setting the model
# Build the network and move it to the selected device in one expression
# (nn.Module.to returns the module itself).
model = BERTweetInceptionAttention(num_classes=len(target_names)).to(device)

## Loss
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between logits and 0/1 targets.

    ``outputs`` holds raw (pre-sigmoid) logits and is flattened to 1-D;
    ``targets`` is cast to float and must hold one value per logit.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    flat_logits = outputs.view(-1)
    float_targets = targets.float()
    return criterion(flat_logits, float_targets)

# AdamW over every parameter of the model (encoder and head share one
# learning rate and weight decay).
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-4,
)

## Training function
def train_model(training_loader, model, optimizer):
    """Run one training epoch and report its accuracy and mean loss.

    Relies on the module-level ``device``, ``loss_fn`` and ``THRESHOLD``.

    Args:
        training_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'``, ``'token_type_ids'`` and
            ``'targets'`` tensors.
        model: Module called as ``model(ids, mask, token_type_ids)``; its
            output is flattened and compared element-wise with ``targets``,
            so it must yield one logit per example.
        optimizer: Optimizer stepping ``model``'s parameters.

    Returns:
        ``(model, train_accuracy, average_loss)`` where ``train_accuracy`` is
        the fraction of correct thresholded predictions over the epoch and
        ``average_loss`` is the mean of the per-batch losses.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0

    # Training mode enables dropout.
    model.train()

    for data in training_loader:
        ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
        mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long, non_blocking=True)
        targets = data['targets'].to(device, dtype=torch.float, non_blocking=True)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # Accuracy bookkeeping only: work on detached logits so none of this
        # is recorded by autograd, and accumulate a plain Python int.
        probs = torch.sigmoid(outputs.detach()).view(-1)
        preds = (probs > THRESHOLD).float()
        correct_predictions += (preds == targets).sum().item()
        num_samples += targets.size(0)

        # Backward pass, gradient clipping, parameter update.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    train_accuracy = correct_predictions / num_samples
    average_loss = np.mean(losses)
    print(f"Training Accuracy: {train_accuracy:.4f} | Training Loss: {average_loss:.4f}")

    return model, train_accuracy, average_loss


## Evaluator Function
def eval_model(validation_loader, model):
    """Evaluate ``model`` on a loader, print a metric report, return accuracy and loss.

    Relies on the module-level ``device``, ``loss_fn``, ``THRESHOLD`` and
    ``target_names``.

    Args:
        validation_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'``, ``'token_type_ids'`` and
            ``'targets'`` tensors.
        model: Module called as ``model(ids, mask, token_type_ids)`` whose
            output is flattened to one logit per example.

    Returns:
        ``(acc, average_loss)``: accuracy of the thresholded predictions and
        the mean of the per-batch losses.
    """
    model.eval()
    final_targets = []
    final_outputs = []
    final_probs = []
    losses = []

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long, non_blocking=True)
            mask = data['attention_mask'].to(device, dtype=torch.long, non_blocking=True)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long, non_blocking=True)
            targets = data['targets'].to(device, dtype=torch.float, non_blocking=True)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # Sigmoid -> probability of class 1; threshold -> hard 0/1 label.
            probs = torch.sigmoid(outputs).view(-1)
            preds = (probs > THRESHOLD).float()
            final_outputs.extend(preds.cpu().numpy())
            final_probs.extend(probs.cpu().numpy())
            final_targets.extend(targets.cpu().numpy())

            # No torch.cuda.empty_cache() here: emptying the allocator cache
            # after every batch frees nothing PyTorch could not already
            # reuse and only forces slow re-allocations.

    # Convert to numpy arrays
    final_targets = np.array(final_targets)
    final_outputs = np.array(final_outputs)
    final_probs = np.array(final_probs)

    # Accuracy
    acc = accuracy_score(final_targets, final_outputs)

    # Weighted metrics
    f1 = f1_score(final_targets, final_outputs, average='weighted')
    precision = precision_score(final_targets, final_outputs, average='weighted')
    recall = recall_score(final_targets, final_outputs, average='weighted')

    # Macro-averaged metrics (unweighted mean over the classes)
    macro_f1 = f1_score(final_targets, final_outputs, average='macro')
    macro_precision = precision_score(final_targets, final_outputs, average='macro')
    macro_recall = recall_score(final_targets, final_outputs, average='macro')

    # Hamming Loss
    hamming = hamming_loss(final_targets, final_outputs)

    # AUC-ROC and AUPR are computed from the probabilities, not the hard labels
    auc_roc = roc_auc_score(final_targets, final_probs, multi_class='ovr', average='macro')
    aupr = average_precision_score(final_targets, final_probs, average='macro')

    # Average Loss
    average_loss = np.mean(losses)

    # Print metrics
    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Weighted F1 Score: {f1}")
    print(f"Macro F1 Score: {macro_f1}")
    print(f"Weighted Precision: {precision}")
    print(f"Macro Precision: {macro_precision}")
    print(f"Weighted Recall: {recall}")
    print(f"Macro Recall: {macro_recall}")
    print(f"Hamming Loss: {hamming}")
    print(f"AUC-ROC: {auc_roc}")
    print(f"AUPR: {aupr}")
    print("\nClassification Report:\n", classification_report(final_targets, final_outputs, target_names=target_names))

    return acc, average_loss


# Learning Rate Scheduler
# The scheduler is stepped once per epoch, so the cosine half-period must span
# the whole run. With a T_max shorter than EPOCHS the learning rate reaches
# its minimum (0) before training ends and then climbs again, leaving one
# epoch with a zero learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Training & Evaluation Loop
# Wall-clock timer for the whole train + validate run.
start = time.time()

history = defaultdict(list)
best_acc = 0.0  # best validation accuracy seen so far

for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    # Record this epoch's four curves in one go.
    epoch_stats = {
        'train_acc': train_acc,
        'train_loss': train_loss,
        'val_acc': val_acc,
        'val_loss': val_loss,
    }
    for metric_name, metric_value in epoch_stats.items():
        history[metric_name].append(metric_value)

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    # Checkpoint only when validation accuracy strictly improves.
    if val_acc > best_acc:
        torch.save(model.state_dict(), "irony_inceptiveBERTweet16_32_best.bin")
        best_acc = val_acc

end = time.time()
print(f"Total training and evaluation time: {end - start} seconds")


## Testing
# Rebuild the network and restore the best checkpoint saved during training.
print("\n\nTesting\n\n")
model = BERTweetInceptionAttention(num_classes=len(target_names))
# map_location lets a checkpoint written on a GPU load on a CPU-only host;
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds.
state_dict = torch.load(
    "irony_inceptiveBERTweet16_32_best.bin",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
model = model.to(device)

# Time the test-set evaluation on its own.
start = time.time()
eval_model(test_data_loader, model)
end = time.time()
print(f"Total test-set evaluation time: {end - start} seconds")

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch 1/12


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Training Accuracy: 0.6080 | Training Loss: 0.6390
Validation Accuracy: 0.6963
Weighted F1 Score: 0.6964883275486999
Macro F1 Score: 0.6961588421671625
Weighted Precision: 0.6972522113371576
Macro Precision: 0.696266293573797
Weighted Recall: 0.6963350785340314
Macro Recall: 0.6966630629680414
AUC-ROC: 0.788177231656295
AUPR: 0.7974561044014072

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.69      0.70       499
           1       0.67      0.70      0.69       456

    accuracy                           0.70       955
   macro avg       0.70      0.70      0.70       955
weighted avg       0.70      0.70      0.70       955

Epoch 2/12
Training Accuracy: 0.7540 | Training Loss: 0.4944
Validation Accuracy: 0.7487
Weighted F1 Score: 0.7403742358791795
Macro F1 Score: 0.7379900683146314
Weighted Precision: 0.7727858784427298
Macro Precision: 0.7762394634278982
Weighted Recall: 0.7486910994764397
Macro Recall: 0.7410940301655

  model.load_state_dict(torch.load("irony_BERTweetIncDNet16_32_best_colab_final.bin"))


Validation Accuracy: 0.8533
Weighted F1 Score: 0.8548792590629325
Macro F1 Score: 0.8513131313131312
Weighted Precision: 0.8704078675699396
Macro Precision: 0.8509090672292985
Weighted Recall: 0.8533163265306123
Macro Recall: 0.8663215570042759
AUC-ROC: 0.9375403628749923
AUPR: 0.9094544084427847

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.80      0.87       473
           1       0.76      0.93      0.83       311

    accuracy                           0.85       784
   macro avg       0.85      0.87      0.85       784
weighted avg       0.87      0.85      0.85       784

Total test-set evaluation time: 1.629324197769165 seconds
