# Installations

In [1]:
!pip install ftfy

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


# Imports

In [2]:
import ftfy
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report, confusion_matrix
from tqdm import tqdm
import os
import re
from torch.nn import BCEWithLogitsLoss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
from transformers import get_linear_schedule_with_warmup
import optuna
from sklearn.metrics import accuracy_score

2025-05-25 15:55:49.998524: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748188550.174274      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748188550.226452      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Determinism

In [3]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.use_deterministic_algorithms(True)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    

# Baseline Performance

In [4]:
def train(model, train_loader, val_loader, epochs=3):
    loss_fn = BCEWithLogitsLoss()
    best_val_f1 = 0
    train_losses = []
    val_accuracies = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze()
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)
        print(f"Train Loss: {avg_loss:.4f}")

        val_metrics = evaluate(model, val_loader, return_metrics=True)
        val_accuracies.append(val_metrics['accuracy'])

    plot_learning_curves(train_losses, val_accuracies)

def evaluate(model, data_loader, return_metrics=False, plot_roc=False):
    model.eval()
    all_preds, all_labels, all_logits = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze().cpu().numpy()
            probs = torch.sigmoid(torch.tensor(logits)).numpy()
            preds = (probs >= 0.5).astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels)
            all_logits.extend(probs)

    acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    roc_auc = roc_auc_score(all_labels, all_logits)

    print(f"Val Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {roc_auc:.4f}")
    print("Confusion Matrix:")
    cm = confusion_matrix(all_labels, all_preds)
    print(cm)
    print(classification_report(all_labels, all_preds))

    if plot_roc:
        # ROC Curve
        fpr, tpr, _ = roc_curve(all_labels, all_logits)
        plt.figure()
        plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Validation ROC Curve")
        plt.legend()
        plt.grid(True)
        plt.savefig("roc_curve.png")
        plt.close()

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.grid(False)
    plt.savefig("confusion_matrix.png")
    plt.close()

    if return_metrics:
        return {
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': roc_auc
        }

def plot_learning_curves(train_losses, val_accuracies):
    epochs = range(1, len(train_losses) + 1)

    plt.figure()
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training Loss over Epochs")
    plt.grid(True)
    plt.legend()
    plt.savefig("train_loss_curve.png")
    plt.close()

    plt.figure()
    plt.plot(epochs, val_accuracies, label="Val Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy over Epochs")
    plt.grid(True)
    plt.legend()
    plt.savefig("val_accuracy_curve.png")
    plt.close()
    
class TweetDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128, is_test=False):
        self.texts = [clean_tweet(t) for t in df["Text"].tolist()]
        self.ids = df["ID"].tolist()
        self.labels = df["Label"].tolist() if not is_test else None
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        item["ID"] = self.ids[idx]
        if not self.is_test:
            item["Label"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [5]:
# set_seed(42)
        
# def clean_tweet(text):
#     return text.lower()

# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
# test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)
# test_dataset = TweetDataset(test_df, tokenizer, max_len, is_test=True)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3)
# evaluate(model, val_loader, plot_roc=True)

In [6]:
slang_dict = {
    "luv": "love",
     "luvv": "love",
    "xoxo": "kiss",
    "bc" : "because",
    "bcuz": "because",
    "cuze": "because",
    "cuz": "because",
    "lil": "little",
    "fam": "family",
    "bro": "brother",
    "sis": "sister",
    "thang": "thing",
    "aint": "is not",
    "tryna": "try to",
    "neva": "never",
    "bday": "birthday",
    "gr8": "great",
    "4ever": "forever",
    "nvm": "never mind",
    "r" : "are",
    "tryin": "trying",
    "2morow" : "tomorrow",
    "2moro" : "tomorrow",
    "morow" : "tomorrow",
    "tmrw" : "tomorrow",
    "tmrow" : "tomorrow",
    "2morow" : "tomorrow",
    "2morro" : "tomorrow",
    "morrow" : "tomorrow",
    "tmrrw" : "tomorrow",
    "tmrrow" : "tomorrow",
    "b4" : "before",
    "every1" : "everyone",
    "2nd" : "second",
     "h8" : "hate",
    "ppl" : "people",
    "ly" : "love you",
    "2nite" : "tonight",
    "2night" : "tonight",
    "tonite" : "tonight",
    "bday" : "birthday",
    "2day" : "today",
   "1st" : "first",
    "3rd" : "third",
    "str8" : "straight",
    "fk" : "fuck",
    "fkin" : "fucking",
    "fck" : "fuck",
    "fcking": "fucking",
    "fuckin": "fucking",
    "wit": "with",
    "fri":"friday",
    "friggin": "fucking",
    "frigging": "fucking",
    "lovin": "loving",
    "luving": "loving",
   "missin": "missing",
   "freakin":"freaking",
   "killin":"killing",
    "wat":"what",
   "em":"them",
   "hatin" : "hating",
    "recieve": "receive",
    "seperated": "separated",
    "wierd": "weird",
    "loosing": "losing",
    "thier": "their",
    "thx": "thanks",
    "ty": "thank you",
    "pls": "please",
    "plz": "please",
    "skool":"school",
    "frnd":"friend",
    "frnds":"friends",
    "belive":"believe",
    "seein":"seeing",
    "kno":"know",
    "icant":"i cant",
    "bein":"being",
    "bout":"about",
    "wen":"when",
    "jst":"just",
    "xx":"kiss"
}

def remove_apostrophes(text):
    return text.replace("'", "")
    
def replace_slang(text):
    words = text.split()
    replaced_words = [slang_dict.get(word, word) for word in words]
    return " ".join(replaced_words)
def expand_contractions(text):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)

def fix_mojibake(text): return ftfy.fix_text(text)
def replace_urls(text): return re.sub(r'https?://\S+|www\.\S+', ' url ', text)
def replace_mentions(text): return re.sub(r'\@\w+', ' username ', text)
def reduce_repeated_letters_to_two(word): return re.sub(r"(.)\1{2,}", r"\1\1", word)
def remove_extra_spaces(text): return re.sub(r'\s+', ' ', text).strip()

def replace_emoticon(text):
    text = re.sub(r"<3", " love ", text)
    text = re.sub(r"</3", " heartbroken ", text)
    text = re.sub(r"<33", " love ", text)
    text = re.sub(r"</33", " heartbroken ", text)
    text = re.sub(r":'\(", " sad ", text)
    text = re.sub(r";\)", " wink ", text)
    text = re.sub(r":-d", " laugh ", text)
    text = re.sub(r":P", " playful ", text)
    text = re.sub(r":\*", " kiss ", text)
    text = text.replace("♥", " heart ")
    text = text.replace("♫", " music ")
    text = text.replace("☺", " smile ")
    return text

def clean_tweet(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_urls(text)
    text = replace_mentions(text)
    text = reduce_repeated_letters_to_two(text)
    text = replace_slang(text)
    text = replace_emoticon(text)
    text = remove_extra_spaces(text)
    return text

# Preprocessing Choices

# Baseline After Preprocessing

In [7]:
# def clean_tweet(text):
#     text = fix_mojibake(text)
#     text = text.lower()
#     text = replace_urls(text)
#     text = replace_mentions(text)
#     text = reduce_repeated_letters_to_two(text)
#     text = replace_slang(text)
#     text = replace_emoticon(text)
#     text = remove_extra_spaces(text)
#     return text
    
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
# test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)
# test_dataset = TweetDataset(test_df, tokenizer, max_len, is_test=True)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3)
# evaluate(model, val_loader, plot_roc=True)

# Max Length computation

In [8]:
train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")

train_df["Text"] = train_df["Text"].apply(clean_tweet)
val_df["Text"] = val_df["Text"].apply(clean_tweet)
test_df["Text"] = test_df["Text"].apply(clean_tweet)

idx_train_max = train_df['Text'].str.len().idxmax()
print("Train - Max Length:", len(train_df.loc[idx_train_max, 'Text']))
print("Train - Text:", train_df.loc[idx_train_max, 'Text'])

idx_val_max = val_df['Text'].str.len().idxmax()
print("Validation - Max Length:", len(val_df.loc[idx_val_max, 'Text']))
print("Validation - Text:", val_df.loc[idx_val_max, 'Text'])

idx_test_max = test_df['Text'].str.len().idxmax()
print("Test - Max Length:", len(test_df.loc[idx_test_max, 'Text']))
print("Test - Text:", test_df.loc[idx_test_max, 'Text'])


Train - Max Length: 195
Train - Text: " i could have discussed the abortion ban in south dakota, or how i love brokeback mountain, or how i'd really like to go to a strip club to better understand the sexual politics of the industry.
Validation - Max Length: 197
Validation - Text: - aimee has been here for over a week, and it's been really fun..-the da vinci code sucked ass!.-sean has been a really awesome and supportive friend..-indian reservations are full of white people.
Test - Max Length: 155
Test - Text: username currently obsessed with..watermelons too heart heart one of my favorite things about summer is the sweet watermelon ..xoxo heart heart heart heart


# Max Length to 200

### Max length to 200

In [9]:
# set_seed(42)

# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
# test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")


# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# max_len = 200
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)
# test_dataset = TweetDataset(test_df, tokenizer, max_len, is_test=True)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3)
# evaluate(model, val_loader, plot_roc=True)

## warm up scheduler

In [10]:

# set_seed(42)

# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
# test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")


# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)
# test_dataset = TweetDataset(test_df, tokenizer, max_len, is_test=True)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# num_warmup_steps = int(0.1 * num_training_steps)
# lr_scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=num_warmup_steps,  
#     num_training_steps=num_training_steps
# )

# train(model, train_loader, val_loader, epochs=3)
# evaluate(model, val_loader, plot_roc=True)

## add special tokens

In [11]:
# set_seed(42)

# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
# test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")

# # Load tokenizer and add custom tokens
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# tokenizer.add_special_tokens({"additional_special_tokens": ["url", "username"]})

# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)
# test_dataset = TweetDataset(test_df, tokenizer, max_len, is_test=True)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
# model.resize_token_embeddings(len(tokenizer))


# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3)
# evaluate(model, val_loader, plot_roc=True)

## Gradient clipping + saving best model

In [12]:
def train_grad(model, train_loader, val_loader, epochs=3):
    loss_fn = BCEWithLogitsLoss()
    best_val_f1 = 0
    train_losses = []
    val_accuracies = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze()
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            lr_scheduler.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)
        print(f"Train Loss: {avg_loss:.4f}")
        if epoch == 2:
            torch.save(model.state_dict(), "best_model.pt")
        val_metrics = evaluate(model, val_loader, return_metrics=True)
        val_accuracies.append(val_metrics['accuracy'])

    plot_learning_curves(train_losses, val_accuracies)
           

# set_seed(42)

# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
# test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# tokenizer.add_special_tokens({"additional_special_tokens": ["url", "username"]})

# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)
# test_dataset = TweetDataset(test_df, tokenizer, max_len, is_test=True)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
# model.resize_token_embeddings(len(tokenizer))


# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train_grad(model, train_loader, val_loader, epochs=3)
# evaluate(model, val_loader, plot_roc=True)

# Best Parameters

In [13]:
set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")

def objective(trial):
    set_seed(42)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.add_special_tokens({"additional_special_tokens": ["url", "username"]})
    max_len = 128
    batch_size = 64
    train_dataset = TweetDataset(train_df, tokenizer, max_len)
    val_dataset = TweetDataset(val_df, tokenizer, max_len)
    
    generator = torch.Generator().manual_seed(42)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    lr = trial.suggest_float("lr", 1e-5, 5e-5, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)
    
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    total_steps = len(train_loader) * 2
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    loss_fn = BCEWithLogitsLoss()
    
    for epoch in range(2):
        model.train()
        for batch in tqdm(train_loader, desc=f"Trial Epoch {epoch+1}", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].float().to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze()
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

    # Validation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze().cpu().numpy()
            probs = 1 / (1 + np.exp(-logits))
            preds = (probs >= 0.5).astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels)

    acc = accuracy_score(all_labels, all_preds)
    trial.report(acc, step=epoch)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
    return acc


# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=15)

In [14]:

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")

def objective(trial):
    set_seed(42)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.add_special_tokens({"additional_special_tokens": ["url", "username"]})
    max_len = 128
    batch_size = 64
    train_dataset = TweetDataset(train_df, tokenizer, max_len)
    val_dataset = TweetDataset(val_df, tokenizer, max_len)
    
    generator = torch.Generator().manual_seed(42)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    lr = trial.suggest_float("lr", 1e-5, 5e-5, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)
    
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    total_steps = len(train_loader) * 2
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    loss_fn = BCEWithLogitsLoss()
    
    for epoch in range(2):
        model.train()
        for batch in tqdm(train_loader, desc=f"Trial Epoch {epoch+1}", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].float().to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze()
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze().cpu().numpy()
            probs = 1 / (1 + np.exp(-logits))
            preds = (probs >= 0.5).astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels)

    acc = accuracy_score(all_labels, all_preds)
    trial.report(acc, step=epoch)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
    return acc
    
# study = optuna.create_study(
#     direction="maximize",
#     pruner=optuna.pruners.MedianPruner(n_startup_trials=1, n_warmup_steps=1),
#     storage="sqlite:///bert_study.db",
#     study_name="bert_finetune",
#     load_if_exists=True
# )

# # Enqueue the best-known config from previous study that got interrupted
# study.enqueue_trial({
#     "lr": 3.9459916207335216e-05,
#     "weight_decay": 0.004411994057969546
# })

# Run more trials
#study.optimize(objective, n_trials=10)


# Final Evaluation

In [15]:
set_seed(42)

train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens({"additional_special_tokens": ["url", "username"]})

max_len = 128
batch_size = 64

train_dataset = TweetDataset(train_df, tokenizer, max_len)
val_dataset = TweetDataset(val_df, tokenizer, max_len)
test_dataset = TweetDataset(test_df, tokenizer, max_len, is_test=True)

generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
model.resize_token_embeddings(len(tokenizer))


if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
    model = torch.nn.DataParallel(model)

model.to(device)

optimizer = AdamW(model.parameters(), lr=4.7063673868730246e-05, weight_decay=0.00032752350789883044)
num_training_steps = len(train_loader) * 2
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

train(model, train_loader, val_loader, epochs=2)
evaluate(model, val_loader, plot_roc=True)

predictions = []
ids = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.view(-1).cpu()
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).long().numpy()

        predictions.extend(preds)

        ids.extend([int(id) if torch.is_tensor(id) else id for id in batch["ID"]])

submission_df = pd.DataFrame({"ID": ids, "Label": predictions})
submission_df.to_csv("submission.csv", index=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Using 2 GPUs with DataParallel


Epoch 1: 100%|██████████| 2319/2319 [31:03<00:00,  1.24it/s]


Train Loss: 0.3744
Val Accuracy: 0.8525, Precision: 0.8743, Recall: 0.8234, F1: 0.8481, AUC: 0.9315
Confusion Matrix:
[[18688  2509]
 [ 3744 17455]]
              precision    recall  f1-score   support

         0.0       0.83      0.88      0.86     21197
         1.0       0.87      0.82      0.85     21199

    accuracy                           0.85     42396
   macro avg       0.85      0.85      0.85     42396
weighted avg       0.85      0.85      0.85     42396



Epoch 2: 100%|██████████| 2319/2319 [31:11<00:00,  1.24it/s]


Train Loss: 0.2502
Val Accuracy: 0.8577, Precision: 0.8610, Recall: 0.8532, F1: 0.8571, AUC: 0.9326
Confusion Matrix:
[[18278  2919]
 [ 3113 18086]]
              precision    recall  f1-score   support

         0.0       0.85      0.86      0.86     21197
         1.0       0.86      0.85      0.86     21199

    accuracy                           0.86     42396
   macro avg       0.86      0.86      0.86     42396
weighted avg       0.86      0.86      0.86     42396

Val Accuracy: 0.8577, Precision: 0.8610, Recall: 0.8532, F1: 0.8571, AUC: 0.9326
Confusion Matrix:
[[18278  2919]
 [ 3113 18086]]
              precision    recall  f1-score   support

         0.0       0.85      0.86      0.86     21197
         1.0       0.86      0.85      0.86     21199

    accuracy                           0.86     42396
   macro avg       0.86      0.86      0.86     42396
weighted avg       0.86      0.86      0.86     42396

