In [1]:
!pip install nlpaug



In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AdamW
import spacy
import re
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from torch.cuda.amp import GradScaler, autocast
import nlpaug.augmenter.word as naw
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

2025-02-12 15:19:56.056051: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-12 15:19:56.068929: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-12 15:19:56.086426: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-12 15:19:56.091687: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-12 15:19:56.104578: I tensorflow/core/platform/cpu_feature_guar

True

In [4]:
aug = naw.SynonymAug(aug_src='wordnet')

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

In [6]:
# cd drive/MyDrive/CodaBench_Sem_Eval/val/

In [7]:
# !ls

In [8]:
# cd val

In [None]:
# GPU/CPU Device Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Data
train = pd.read_csv('../public_data_test/track_a/train/eng.csv')
val = pd.read_csv('../public_data_test/track_a/dev/eng.csv')
test = pd.read_csv('../public_data_test/track_a/test/eng.csv')
emotions = ["anger", "fear", "joy", "sadness", "surprise"]

# Initialize BERT Tokenizer & Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
nlp = spacy.load("en_core_web_sm")

Using device: cuda


In [None]:
# Preprocessing Function
def pre_process(text):
    text = re.sub(r"[.,;:!?'\"“”()]", "", text)  # Remove punctuation
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
    return encoded_input['input_ids'].squeeze(0).to(device)

# Convert Text to BERT Embeddings
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        input_ids = pre_process(text).unsqueeze(0)
        with torch.no_grad():
            outputs = bert_model(input_ids)
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())  # Extract [CLS] token
    return np.vstack(embeddings)

X_train = get_bert_embeddings(train["text"])
X_val = get_bert_embeddings(val["text"])

# POS Feature Extraction
def get_pos_features(texts):
    return [[token.pos_ for token in nlp(text)] for text in texts]

train_pos_tags = get_pos_features(train["text"])
val_pos_tags = get_pos_features(val["text"])

# Convert POS Tags to Indices
pos_vocab = {pos: idx for idx, pos in enumerate(set(tag for tags in train_pos_tags for tag in tags))}
train_pos_indices = [[pos_vocab[tag] for tag in tags] for tags in train_pos_tags]
val_pos_indices = [[pos_vocab.get(tag, 0) for tag in tags] for tags in val_pos_tags]

# Pad POS Sequences to Fixed Length
max_length = max(max(len(seq) for seq in train_pos_indices), max(len(seq) for seq in val_pos_indices))
train_pos_indices = [seq + [0] * (max_length - len(seq)) for seq in train_pos_indices]
val_pos_indices = [seq + [0] * (max_length - len(seq)) for seq in val_pos_indices]

# Convert to PyTorch Tensors
train_pos_indices = torch.tensor(train_pos_indices, dtype=torch.long).to(device)
val_pos_indices = torch.tensor(val_pos_indices, dtype=torch.long).to(device)

# Trainable POS Embedding Layer with LSTM
class POSEmbedding(nn.Module):
    def __init__(self, num_pos_tags, embedding_dim):
        super(POSEmbedding, self).__init__()
        self.embedding = nn.Embedding(num_pos_tags, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)

    def forward(self, pos_indices):
        pos_embeds = self.embedding(pos_indices)
        lstm_out, _ = self.lstm(pos_embeds)
        return lstm_out[:, -1, :]  # Use the last hidden state

pos_embedding_layer = POSEmbedding(len(pos_vocab), embedding_dim=16).to(device)

# Model Definition
class EmotionClassifier(nn.Module):
    def __init__(self, bert_dim=768, pos_dim=16, hidden_dim=128, output_dim=5):
        super(EmotionClassifier, self).__init__()
        self.fc1 = nn.Linear(bert_dim + pos_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, bert_embeddings, pos_indices):
        pos_embeds = pos_embedding_layer(pos_indices)
        combined_features = torch.cat((bert_embeddings, pos_embeds), dim=1)
        x = self.relu(self.fc1(combined_features))
        return self.fc2(x)

# Initialize Model
model = EmotionClassifier().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# Data Augmentation
aug = naw.SynonymAug(aug_src='wordnet')
train["text"] = train["text"].apply(lambda x: aug.augment(x))

# Prepare Training Data
y_train = torch.tensor(train[emotions].values, dtype=torch.float32).to(device)
y_val = torch.tensor(val[emotions].values, dtype=torch.float32).to(device)

train_features = torch.tensor(X_train, dtype=torch.float32).to(device)
val_features = torch.tensor(X_val, dtype=torch.float32).to(device)

dataset = TensorDataset(train_features, train_pos_indices, y_train)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Training Loop with Adversarial Training
scaler = GradScaler()
epochs = 10

for epoch in tqdm(range(epochs), desc="Training Loop"):
    model.train()
    for features, pos_indices, labels in data_loader:
        optimizer.zero_grad()
        with autocast():
            outputs = model(features, pos_indices)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    if epoch % 1 == 0:
        print(f"Epoch {epoch}: Loss: {round(loss.item(), 3)}")

# Dynamic Thresholding
def find_optimal_thresholds(y_true, y_pred_probs):
    thresholds = {}
    for i, emotion in enumerate(emotions):
        best_threshold = 0.5
        best_f1 = 0
        for threshold in np.arange(0.1, 1.0, 0.05):
            y_pred = (y_pred_probs[:, i] > threshold).astype(int)
            f1 = f1_score(y_true[:, i], y_pred)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        thresholds[emotion] = best_threshold
    return thresholds

# Generate Predictions
model.eval()
with torch.no_grad():
    y_pred_probs = torch.sigmoid(model(val_features, val_pos_indices)).cpu().numpy()

thresholds = find_optimal_thresholds(y_val.cpu().numpy(), y_pred_probs)
y_pred = np.zeros_like(y_pred_probs)
for i, emotion in enumerate(emotions):
    y_pred[:, i] = (y_pred_probs[:, i] > thresholds[emotion]).astype(int)

# Evaluation
print(classification_report(y_val.cpu().numpy(), y_pred, target_names=emotions))

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
