# Installations

In [1]:
!pip install ftfy

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


# Imports

In [2]:
import ftfy
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report, confusion_matrix
from tqdm import tqdm
import os
import re
from torch.nn import BCEWithLogitsLoss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
from transformers import get_linear_schedule_with_warmup
import optuna
from sklearn.metrics import accuracy_score

2025-06-09 19:47:33.303231: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749498453.740771      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749498453.860150      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Determinism

In [3]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.use_deterministic_algorithms(True)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    

# Baseline Performance

In [4]:
def train(model, train_loader, val_loader, epochs=3, plot_lc=False):
    loss_fn = BCEWithLogitsLoss()
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze()
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            total_loss += loss.item()

            probs = torch.sigmoid(logits).detach().cpu().numpy()
            preds = (probs >= 0.5).astype(int)
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(train_loader)
        train_acc = accuracy_score(all_labels, all_preds)
        train_losses.append(avg_loss)
        train_accuracies.append(train_acc)

        print(f"Train Loss: {avg_loss:.4f}, Train Accuracy: {train_acc:.4f}")

        val_metrics = evaluate(model, val_loader, return_metrics=True)
        val_accuracies.append(val_metrics['accuracy'])
        val_losses.append(val_metrics['val_loss'])

    if plot_lc:
        plot_learning_curves(train_losses, val_losses, train_accuracies, val_accuracies)



def evaluate(model, data_loader, return_metrics=False, plot=False):
    model.eval()
    all_preds, all_labels, all_logits = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze().cpu().numpy()
            probs = torch.sigmoid(torch.tensor(logits)).numpy()
            preds = (probs >= 0.5).astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels)
            all_logits.extend(probs)

    acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    roc_auc = roc_auc_score(all_labels, all_logits)

    print(f"Val Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {roc_auc:.4f}")
    print("Confusion Matrix:")
    cm = confusion_matrix(all_labels, all_preds)
    print(cm)
    print(classification_report(all_labels, all_preds))

    if plot:
        fpr, tpr, _ = roc_curve(all_labels, all_logits)
        plt.figure()
        plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Validation ROC Curve")
        plt.legend()
        plt.grid(True)
        plt.savefig("roc_curve.png")
        plt.close()
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
        disp.plot(cmap=plt.cm.Blues)
        plt.title("Confusion Matrix")
        plt.grid(False)
        plt.savefig("confusion_matrix.png")
        plt.close()

    if return_metrics:
        return {
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': roc_auc
        }
def evaluate(model, data_loader, return_metrics=False, plot=False):
    model.eval()
    all_preds, all_labels, all_logits = [], [], []
    total_loss = 0
    loss_fn = BCEWithLogitsLoss()

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze()

            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
            all_logits.extend(probs)

    avg_val_loss = total_loss / len(data_loader)
    acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    roc_auc = roc_auc_score(all_labels, all_logits)

    print(f"Val Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {roc_auc:.4f}")
    print("Confusion Matrix:")
    cm = confusion_matrix(all_labels, all_preds)
    print(cm)
    print(classification_report(all_labels, all_preds))

    if plot:
        fpr, tpr, _ = roc_curve(all_labels, all_logits)
        plt.figure()
        plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Validation ROC Curve")
        plt.legend()
        plt.grid(True)
        plt.savefig("roc_curve.png")
        plt.close()
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
        disp.plot(cmap=plt.cm.Blues)
        plt.title("Confusion Matrix")
        plt.grid(False)
        plt.savefig("confusion_matrix.png")
        plt.close()


    if return_metrics:
        return {
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': roc_auc,
            'val_loss': avg_val_loss
        }

def plot_learning_curves(train_losses, val_losses, train_accuracies, val_accuracies):
    epochs = range(1, len(train_losses) + 1)

    # Loss curve
    plt.figure()
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, val_losses, label="Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.grid(True)
    plt.legend()
    plt.savefig("loss_curve.png")
    plt.close()

    # Accuracy curve
    plt.figure()
    plt.plot(epochs, train_accuracies, label="Train Accuracy")
    plt.plot(epochs, val_accuracies, label="Val Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Training and Validation Accuracy")
    plt.grid(True)
    plt.legend()
    plt.savefig("accuracy_curve.png")
    plt.close()


    
class TweetDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128, is_test=False):
        self.texts = [clean_tweet(t) for t in df["Text"].tolist()]
        self.ids = df["ID"].tolist()
        self.labels = df["Label"].tolist() if not is_test else None
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        item["ID"] = self.ids[idx]
        if not self.is_test:
            item["Label"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [5]:
# set_seed(42)
        
# def clean_tweet(text):
#     return text.lower()
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Text Preprocessing Choices

## Helping Functions

In [6]:
slang_dict = {
    "luv": "love",
     "luvv": "love",
    "xoxo": "kiss",
    "bc" : "because",
    "bcuz": "because",
    "cuze": "because",
    "cuz": "because",
    "lil": "little",
    "fam": "family",
    "bro": "brother",
    "sis": "sister",
    "thang": "thing",
    "aint": "is not",
    "tryna": "try to",
    "neva": "never",
    "bday": "birthday",
    "gr8": "great",
    "4ever": "forever",
    "nvm": "never mind",
    "r" : "are",
    "tryin": "trying",
    "2morow" : "tomorrow",
    "2moro" : "tomorrow",
    "morow" : "tomorrow",
    "tmrw" : "tomorrow",
    "tmrow" : "tomorrow",
    "2morow" : "tomorrow",
    "2morro" : "tomorrow",
    "morrow" : "tomorrow",
    "tmrrw" : "tomorrow",
    "tmrrow" : "tomorrow",
    "b4" : "before",
    "every1" : "everyone",
    "2nd" : "second",
     "h8" : "hate",
    "ppl" : "people",
    "ly" : "love you",
    "2nite" : "tonight",
    "2night" : "tonight",
    "tonite" : "tonight",
    "bday" : "birthday",
    "2day" : "today",
   "1st" : "first",
    "3rd" : "third",
    "str8" : "straight",
    "fk" : "fuck",
    "fkin" : "fucking",
    "fck" : "fuck",
    "fcking": "fucking",
    "fuckin": "fucking",
    "wit": "with",
    "fri":"friday",
    "friggin": "fucking",
    "frigging": "fucking",
    "lovin": "loving",
    "luving": "loving",
   "missin": "missing",
   "freakin":"freaking",
   "killin":"killing",
    "wat":"what",
   "em":"them",
   "hatin" : "hating",
    "recieve": "receive",
    "seperated": "separated",
    "wierd": "weird",
    "loosing": "losing",
    "thier": "their",
    "thx": "thanks",
    "ty": "thank you",
    "pls": "please",
    "plz": "please",
    "skool":"school",
    "frnd":"friend",
    "frnds":"friends",
    "belive":"believe",
    "seein":"seeing",
    "kno":"know",
    "icant":"i cant",
    "bein":"being",
    "bout":"about",
    "wen":"when",
    "jst":"just",
    "xx":"kiss"
}
    
def replace_slang(text):
    words = text.split()
    replaced_words = [slang_dict.get(word, word) for word in words]
    return " ".join(replaced_words)

def fix_mojibake(text): return ftfy.fix_text(text)
def replace_urls(text): return re.sub(r'https?://\S+|www\.\S+', ' url ', text)
def replace_mentions(text): return re.sub(r'\@\w+', ' username ', text)
def reduce_repeated_letters_to_two(word): return re.sub(r"(.)\1{2,}", r"\1\1", word)
def remove_extra_spaces(text): return re.sub(r'\s+', ' ', text).strip()

def replace_emoticon(text):
    text = re.sub(r"<3", " love ", text)
    text = re.sub(r"</3", " heartbroken ", text)
    text = re.sub(r"<33", " love ", text)
    text = re.sub(r"</33", " heartbroken ", text)
    text = re.sub(r":'\(", " sad ", text)
    text = re.sub(r";\)", " wink ", text)
    text = re.sub(r":-d", " laugh ", text)
    text = re.sub(r":P", " playful ", text)
    text = re.sub(r":\*", " kiss ", text)
    text = text.replace("♥", " heart ")
    text = text.replace("♫", " music ")
    text = text.replace("☺", " smile ")
    return text

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Experiments

In [7]:
# def clean_tweet_modular(text, steps):
#     if "mojibake" in steps: text = fix_mojibake(text)
#     if "lowercase" in steps: text = text.lower()
#     if "urls" in steps: text = replace_urls(text)
#     if "mentions" in steps: text = replace_mentions(text)
#     if "repeated_letters" in steps: text = reduce_repeated_letters_to_two(text)
#     if "slang" in steps: text = replace_slang(text)
#     if "emoticons" in steps: text = replace_emoticon(text)
#     return text


# def evaluate_model(steps, *_):
#     global clean_tweet
#     clean_tweet = lambda text: clean_tweet_modular(text, steps)
#     set_seed(42)
#     tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
#     train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
#     val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")
#     train_dataset = TweetDataset(train_df, tokenizer, max_len=128, is_test=False)
#     val_dataset = TweetDataset(val_df, tokenizer, max_len=128, is_test=False)

#     generator = torch.Generator().manual_seed(42)
#     train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=generator)
#     val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    

#     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
#     if torch.cuda.device_count() > 1:
#         print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#         model = torch.nn.DataParallel(model)

#     model.to(device)

#     global optimizer, lr_scheduler
#     optimizer = AdamW(model.parameters(), lr=2e-5)
#     num_training_steps = len(train_loader) * 3
#     lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

#     train(model, train_loader, val_loader, epochs=2)
#     metrics = evaluate(model, val_loader, return_metrics=True)
#     return metrics["accuracy"]


# def search_best_preprocessing(train_df=None, val_df=None):
#     all_steps = [
#         "mojibake", "urls", "mentions", "repeated_letters", "slang",
#         "emoticons"
#     ]
#     kept_steps = []  # lowercase is already assumed applied
#     best_acc = 0.8473  # from baseline run

#     for step in all_steps:
#         test_steps = ["lowercase"] + kept_steps + [step]
#         acc = evaluate_model(test_steps)
#         print(f"Trying {test_steps} → Accuracy: {acc:.4f}")
#         if acc > best_acc:
#             best_acc = acc
#             kept_steps.append(step)
#             print(f" Keeping '{step}'")
#         else:
#             print(f" Discarding '{step}'")

#     final_steps = ["lowercase"] + kept_steps
#     print("\n Final Preprocessing Steps:", final_steps)
#     return final_steps
    
#search_best_preprocessing()

## Test Letter Repetition Reduction to 2 Before Emoticons Replacement

In [8]:
# def clean_tweet_modular(text, steps):
#     if "mojibake" in steps: text = fix_mojibake(text)
#     if "lowercase" in steps: text = text.lower()
#     if "urls" in steps: text = replace_urls(text)
#     if "mentions" in steps: text = replace_mentions(text)
#     if "repeated_letters" in steps: text = reduce_repeated_letters_to_two(text)
#     if "slang" in steps: text = replace_slang(text)
#     if "emoticons" in steps: text = replace_emoticon(text)
#     return text


# def evaluate_model(steps, *_):
#     global clean_tweet
#     clean_tweet = lambda text: clean_tweet_modular(text, steps)
#     set_seed(42)
#     tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
#     train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
#     val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")
#     train_dataset = TweetDataset(train_df, tokenizer, max_len=128, is_test=False)
#     val_dataset = TweetDataset(val_df, tokenizer, max_len=128, is_test=False)

#     generator = torch.Generator().manual_seed(42)
#     train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=generator)
#     val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    

#     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
#     if torch.cuda.device_count() > 1:
#         print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#         model = torch.nn.DataParallel(model)

#     model.to(device)

#     global optimizer, lr_scheduler
#     optimizer = AdamW(model.parameters(), lr=2e-5)
#     num_training_steps = len(train_loader) * 3
#     lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

#     train(model, train_loader, val_loader, epochs=2)
#     metrics = evaluate(model, val_loader, return_metrics=True)
#     return metrics["accuracy"]


# test_steps =["lowercase","mojibake", "mentions", "repeated_letters","emoticons"]
# acc = evaluate_model(test_steps)


## Final clean_tweet Function

In [9]:
def clean_tweet(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_mentions(text)
    text = reduce_repeated_letters_to_two(text)
    text = replace_emoticon(text)
    text = remove_extra_spaces(text)
    return text


# Baseline Model Performance After Text Preprocessing

In [10]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Experiments with max_len 

## Tweets Length Statistics Computation

In [11]:
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")
# test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/test_dataset.csv")

# train_df["Text"] = train_df["Text"].apply(clean_tweet)
# val_df["Text"] = val_df["Text"].apply(clean_tweet)
# test_df["Text"] = test_df["Text"].apply(clean_tweet)

# # Max lengths
# idx_train_max = train_df['Text'].str.len().idxmax()
# print("Train - Max Length:", len(train_df.loc[idx_train_max, 'Text']))

# idx_val_max = val_df['Text'].str.len().idxmax()
# print("Validation - Max Length:", len(val_df.loc[idx_val_max, 'Text']))

# idx_test_max = test_df['Text'].str.len().idxmax()
# print("Test - Max Length:", len(test_df.loc[idx_test_max, 'Text']))

# # Average lengths
# print("Train - Average Length:", train_df["Text"].str.len().mean())
# print("Validation - Average Length:", val_df["Text"].str.len().mean())
# print("Test - Average Length:", test_df["Text"].str.len().mean())


## max_len = 100

In [12]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 100
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

## max_len = 150

In [13]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 150
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Experiments with Batch Size

## batch_size = 128

In [14]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 128

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

## batch_size = 32

In [15]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 32

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Scheduler Warm Up 

## 10% of total steps

In [16]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# num_warmup_steps = int(0.1 * num_training_steps)
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

## 5% of total steps

In [17]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# num_warmup_steps = int(0.05 * num_training_steps)
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Special Token - username

In [18]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# tokenizer.add_special_tokens({"additional_special_tokens": ["username"]})
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
# model.resize_token_embeddings(len(tokenizer))

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Best Parameters - Optuna

## First Broad Study

In [19]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# set_seed(42)

# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64
# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# def objective(trial):
#     set_seed(42)
#     generator = torch.Generator().manual_seed(42)
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#     lr = trial.suggest_float("lr", 2e-5, 4e-5, log=True)
#     weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-5, log=True)

#     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
#     model.to(device)
    
#     if torch.cuda.device_count() > 1:
#         model = torch.nn.DataParallel(model)

#     optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
#     total_steps = len(train_loader) * 3
#     lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

#     loss_fn = BCEWithLogitsLoss()
#     best_epoch_acc = 0

#     for epoch in range(3):
#         model.train()
#         for batch in tqdm(train_loader, desc=f"Trial Epoch {epoch+1}", leave=False):
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["Label"].float().to(device)

#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             logits = outputs.logits.squeeze()
#             loss = loss_fn(logits, labels)

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             lr_scheduler.step()

#         # Validation
#         model.eval()
#         all_preds, all_labels = [], []
#         with torch.no_grad():
#             for batch in val_loader:
#                 input_ids = batch["input_ids"].to(device)
#                 attention_mask = batch["attention_mask"].to(device)
#                 labels = batch["Label"].cpu().numpy()

#                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#                 logits = outputs.logits.squeeze().cpu().numpy()
#                 probs = 1 / (1 + np.exp(-logits))
#                 preds = (probs >= 0.5).astype(int)

#                 all_preds.extend(preds)
#                 all_labels.extend(labels)

#         acc = accuracy_score(all_labels, all_preds)
#         trial.report(acc, step=epoch)

#         if acc > best_epoch_acc:
#             best_epoch_acc = acc

#         if trial.should_prune():
#             raise optuna.exceptions.TrialPruned()

#     return best_epoch_acc

# # save in case study gets stuck
# storage_path = "sqlite:///optuna_study.db"
# study_name = "distilbert_accuracy"

# study = optuna.create_study(
#     direction="maximize",
#     study_name=study_name,
#     storage=storage_path,
#     load_if_exists=True
# )

# study.optimize(objective, n_trials=10)

In [20]:
# import shutil

# shutil.copy("/kaggle/input/optunadb/optuna_study (1).db", "/kaggle/working/optuna_study.db")
# storage_path = "sqlite:///optuna_study.db" 
# study_name = "distilbert_accuracy"
# study = optuna.load_study(
#     study_name=study_name,
#     storage=storage_path,
# )

# # continue previous study that got stuck
# study.optimize(objective, n_trials=8)

## Refined Second Study

In [21]:

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# set_seed(42)

# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64
# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# storage_path = "sqlite:///study_refined.db"
# study_name = "distilbert_refined"

# study = optuna.create_study(
#     direction="maximize",
#     study_name=study_name,
#     storage=storage_path,
#     load_if_exists=True
# )

# def objective(trial):
#     set_seed(42)
#     generator = torch.Generator().manual_seed(42)
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#     lr = trial.suggest_float("lr", 2.5e-5, 4.5e-5, log=True)
#     weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-5, log=True)

#     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
#     model.to(device)
#     if torch.cuda.device_count() > 1:
#         model = torch.nn.DataParallel(model)

#     optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
#     total_steps = len(train_loader) * 3
#     lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)
#     loss_fn = BCEWithLogitsLoss()

#     best_acc = 0
#     for epoch in range(3):
#         model.train()
#         for batch in tqdm(train_loader, desc=f"Trial Epoch {epoch+1}", leave=False):
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["Label"].float().to(device)

#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             logits = outputs.logits.squeeze()
#             loss = loss_fn(logits, labels)

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             lr_scheduler.step()

#         model.eval()
#         all_preds, all_labels = [], []
#         with torch.no_grad():
#             for batch in val_loader:
#                 input_ids = batch["input_ids"].to(device)
#                 attention_mask = batch["attention_mask"].to(device)
#                 labels = batch["Label"].cpu().numpy()

#                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#                 logits = outputs.logits.squeeze().cpu().numpy()
#                 probs = 1 / (1 + np.exp(-logits))
#                 preds = (probs >= 0.5).astype(int)

#                 all_preds.extend(preds)
#                 all_labels.extend(labels)

#         acc = accuracy_score(all_labels, all_preds)
#         trial.report(acc, step=epoch)
#         if acc > best_acc:
#             best_acc = acc

#     return best_acc
    
#study.optimize(objective, n_trials=10)

# Best Parameters Full Evaluation

In [22]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=3.401062711560044e-05,  weight_decay= 1.7126212382046062e-06)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Gradient Clipping at Best Parameters Model

In [23]:
def train_with_grad_norm(model, train_loader, val_loader, epochs=3, plot_lc=False):
    loss_fn = BCEWithLogitsLoss()
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["Label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze()
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) 
            optimizer.step()
            lr_scheduler.step()
            total_loss += loss.item()

            probs = torch.sigmoid(logits).detach().cpu().numpy()
            preds = (probs >= 0.5).astype(int)
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(train_loader)
        train_acc = accuracy_score(all_labels, all_preds)
        train_losses.append(avg_loss)
        train_accuracies.append(train_acc)

        print(f"Train Loss: {avg_loss:.4f}, Train Accuracy: {train_acc:.4f}")

        val_metrics = evaluate(model, val_loader, return_metrics=True)
        val_accuracies.append(val_metrics['accuracy'])
        val_losses.append(val_metrics['val_loss'])

    if plot_lc:
        plot_learning_curves(train_losses, val_losses, train_accuracies, val_accuracies)

In [24]:
# set_seed(42)
        
# train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
# val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# max_len = 128
# batch_size = 64

# train_dataset = TweetDataset(train_df, tokenizer, max_len)
# val_dataset = TweetDataset(val_df, tokenizer, max_len)

# generator = torch.Generator().manual_seed(42)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     model = torch.nn.DataParallel(model)

# model.to(device)

# optimizer = AdamW(model.parameters(), lr=3.401062711560044e-05,  weight_decay= 1.7126212382046062e-06)
# num_training_steps = len(train_loader) * 3
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# train_with_grad_norm(model, train_loader, val_loader, epochs=3, plot_lc=True)
# evaluate(model, val_loader, plot=True)

# Final Submission

In [25]:
set_seed(42)
        
train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/train_dataset.csv")
val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/val_dataset.csv")
test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3-distil-bert/test_dataset.csv")

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
max_len = 128
batch_size = 64

train_dataset = TweetDataset(train_df, tokenizer, max_len)
val_dataset = TweetDataset(val_df, tokenizer, max_len)

generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
    model = torch.nn.DataParallel(model)

model.to(device)

optimizer = AdamW(model.parameters(), lr=3.401062711560044e-05,  weight_decay= 1.7126212382046062e-06)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

train_with_grad_norm(model, train_loader, val_loader, epochs=2, plot_lc=False)

test_dataset = TweetDataset(test_df, tokenizer, max_len=128, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model.eval()
predictions = []
ids = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.view(-1).cpu()
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).long().numpy()

        predictions.extend(preds)

        ids.extend([int(id) if torch.is_tensor(id) else int(id) for id in batch["ID"]])

submission_df = pd.DataFrame({"ID": ids, "Label": predictions})
submission_df.to_csv("submission.csv", index=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs with DataParallel


Epoch 1: 100%|██████████| 2319/2319 [15:38<00:00,  2.47it/s]


Train Loss: 0.3923, Train Accuracy: 0.8239
Val Accuracy: 0.8397, Precision: 0.8633, Recall: 0.8073, F1: 0.8344, AUC: 0.9225
Confusion Matrix:
[[18487  2710]
 [ 4085 17114]]
              precision    recall  f1-score   support

         0.0       0.82      0.87      0.84     21197
         1.0       0.86      0.81      0.83     21199

    accuracy                           0.84     42396
   macro avg       0.84      0.84      0.84     42396
weighted avg       0.84      0.84      0.84     42396



Epoch 2: 100%|██████████| 2319/2319 [15:42<00:00,  2.46it/s]


Train Loss: 0.2938, Train Accuracy: 0.8763
Val Accuracy: 0.8497, Precision: 0.8540, Recall: 0.8436, F1: 0.8488, AUC: 0.9270
Confusion Matrix:
[[18139  3058]
 [ 3315 17884]]
              precision    recall  f1-score   support

         0.0       0.85      0.86      0.85     21197
         1.0       0.85      0.84      0.85     21199

    accuracy                           0.85     42396
   macro avg       0.85      0.85      0.85     42396
weighted avg       0.85      0.85      0.85     42396

