# Installations

In [1]:
!pip install -q ftfy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h

# Seeds - Determinism

In [2]:
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" 

import random
import numpy as np
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Using device: cuda


# Imports

In [3]:
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random, re, os
import ftfy
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from gensim.models import KeyedVectors
import optuna
import torch.nn.functional as F
from optuna.exceptions import TrialPruned
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
import seaborn as sns

# From Tweet to Vector Function

In [4]:
EMBEDDING_DIM = 300

def get_tweet_vector(tokens, w2v_model, embedding_dim, agg="mean", seed=42):
    np.random.seed(seed)
    random.seed(seed)
    vectors = []
    for token in tokens:
        if token in w2v_model.wv:
            vectors.append(w2v_model.wv[token])
    
    if not vectors:
        # If no tokens found in the model, return a random vector
        return np.random.normal(0, 1, embedding_dim)
    
    vectors = np.array(vectors)
    
    if agg == "mean":
        return vectors.mean(axis=0)
    elif agg == "sum":
        return vectors.sum(axis=0)
    elif agg == "max":
        return vectors.max(axis=0)
    else:
        raise ValueError(f"Unknown aggregation method: {agg}")

# Baseline Model

In [5]:
class BaselineNet(nn.Module):
    def __init__(self, D_in=EMBEDDING_DIM, H1=128, H2=64, H3=32):
        super(BaselineNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, 1) 
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.relu(self.linear3(x))
        return self.linear4(x)

# Preprocessing Choice

## Help Functions

In [6]:
contractions_dict = {
    "dont": "do not", "doesnt": "does not", "didnt": "did not",
    "cant": "can not", "wont": "will not", "shouldnt": "should not",
    "couldnt": "could not", "isnt": "is not", "arent": "are not",
    "hasnt": "has not", "havent": "have not", "hadnt": "had not",
    "wouldnt": "would not", "mustnt": "must not", "wasnt": "was not",
    "werent": "were not", "mightnt": "might not", "shant": "shall not",

    "im": "i am", "youre": "you are", "ure":"you are", "hes": "he is", "shes": "she is",
    "its": "it is", "were": "we are", "theyre": "they are",
    "thats": "that is", "whats": "what is", "wheres": "where is",
    "whos": "who is", "hows": "how is", "heres": "here is",
    "theres": "there is",

    "ive": "i have", "youve": "you have", "uve":"you have","weve": "we have", "theyve": "they have",
    "whove": "who have", "whatve": "what have", "whereve": "where have",
    "howve": "how have",

    "ill": "i will", "youll": "you will", "ull":"you will", "hell": "he will",
    "shell": "she will", "itll": "it will", "well": "we will",
    "theyll": "they will", "wholl": "who will", "whatll": "what will",
    "wherell": "where will", "howll": "how will",

    "id": "i would", "youd": "you would", "ud":"you would","hed": "he would",
    "shed": "she would", "itd": "it would", "wed": "we would",
    "theyd": "they would", "whod": "who would", "whatd": "what would",
    "whered": "where would", "howd": "how would",

    "gimme": "give me", "gonna": "going to", "gota": "got to",
    "lemme": "let me", "wanna": "want to", "hafta": "have to",
     "dunno": "do not know", "yall": "you all", "cmon": "come on",
     "aint": "is not"
}


def replace_mentions(text):
    """Replaces @mentions with the keyword 'username', even if after punctuation."""
    return  re.sub(r'\@\w+', ' username ', text)

def replace_emoticon(text):
    text = re.sub(r"<3", " love ", text)
    text = re.sub(r"<33", " love ", text)
    text = re.sub(r"</33", " heartbroken ", text)
    text = re.sub(r"</3", " heartbroken ", text)
    text = re.sub(r";\)", " wink ", text)
    text = re.sub(r";-\)", " playful ", text)
    text = re.sub(r":-d", " laugh ", text)
    text = re.sub(r"\(:", " smile ", text)
    text = re.sub(r":P", " playful ", text)
    text = re.sub(r":\*", " kiss ", text)
    text = re.sub(r":'\(", " sad ", text)
    text = re.sub(r":\|", " neutral ", text)
    text = re.sub(r" :o ", " wow ", text)
    text = re.sub(r"&", " and ", text)
    text = text.replace("♥", " heart ")
    text = text.replace("♫", " music ")
    text = text.replace("☺", " smile ")
    text = re.sub(r"\bw\/out\b", " without ", text)
    text = re.sub(r"\bw\/o\b", " without ", text)
    text = re.sub(r"\bw\/(?=\s|\w)", " with ", text)
    text = re.sub(r"\bw\b", " with ", text)
    text = re.sub(r"\bb\/c\b", " because ", text)
    text = re.sub(r"\bh\/w\b", " homework ", text)
    text = re.sub(r"\bno\s*-\s*one\b", " noone ", text)
    text = re.sub(r"\bb\s*-\s*day\b", " birthday ", text)
    text = re.sub(r"\bhell\b", " xhellx ", text)
    return text

def remove_specific_punctuation(text):
    return re.sub(r"[\'@*]", "", text)

def replace_punctuation_with_space(text):
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"[_]", " ", text)  # Replace underscore with space
    return text

def fix_mojibake(text):
    return ftfy.fix_text(text)

slang_dict = {
    "luv": "love",
     "luvv": "love",
    "xoxo": "kiss",
    "bc" : "because",
    "bcuz": "because",
    "cuze": "because",
    "cuz": "because",
    "lil": "little",
    "fam": "family",
    "bro": "brother",
    "sis": "sister",
    "thang": "thing",
    "aint": "is not",
    "tryna": "try to",
    "neva": "never",
    "bday": "birthday",
    "gr8": "great",
    "4ever": "forever",
    "nvm": "never mind",
    "r" : "are",
    "tryin": "trying",
    "2morow" : "tomorrow",
    "2moro" : "tomorrow",
    "morow" : "tomorrow",
    "tmrw" : "tomorrow",
    "tmrow" : "tomorrow",
    "2morow" : "tomorrow",
    "2morro" : "tomorrow",
    "morrow" : "tomorrow",
    "tmrrw" : "tomorrow",
    "tmrrow" : "tomorrow",
    "b4" : "before",
    "every1" : "everyone",
    "2nd" : "second",
     "h8" : "hate",
    "ppl" : "people",
    "ly" : "love you",
    "2nite" : "tonight",
    "2night" : "tonight",
    "tonite" : "tonight",
    "bday" : "birthday",
    "2day" : "today",
   "1st" : "first",
    "3rd" : "third",
    "str8" : "straight",
    "fk" : "fuck",
    "fkin" : "fucking",
    "fck" : "fuck",
    "fcking": "fucking",
    "fuckin": "fucking",
    "wit": "with",
    "fri":"friday",
    "friggin": "fucking",
    "frigging": "fucking",
    "lovin": "loving",
    "luving": "loving",
   "missin": "missing",
   "freakin":"freaking",
   "killin":"killing",
    "wat":"what",
   "em":"them",
   "hatin" : "hating",
    "recieve": "receive",
    "seperated": "separated",
    "wierd": "weird",
    "loosing": "losing",
    "thier": "their",
    "thx": "thanks",
    "ty": "thank you",
    "pls": "please",
    "plz": "please",
    "skool":"school",
    "frnd":"friend",
    "frnds":"friends",
    "belive":"believe",
    "seein":"seeing",
    "kno":"know",
    "icant":"i cant",
    "bein":"being",
    "bout":"about",
    "wen":"when",
    "jst":"just",
    "xx":"kiss"
}
def replace_slang(text):
    words = text.split()
    replaced_words = [slang_dict.get(word, word) for word in words]
    return " ".join(replaced_words)

def remove_short_words(text):
    return ' '.join(word for word in text.split() if len(word) > 1)

def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

def replace_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', ' url ', text)

def reduce_repeated_letters_to_two(word):
    return re.sub(r"(.)\1{2,}", r"\1\1", word)

def expand_contractions(text):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)

def handle_negations(text):
    negation_words = {'not', 'never', 'no'}
    skip_words = {'a', 'an'}  # words to skip after negation
    tokens = text.split()
    result = []
    i = 0

    while i < len(tokens):
        token = tokens[i]
        result.append(token)
        if token.lower() in negation_words:
            j = i + 1
            # Skip "a", "an", etc.
            while j < len(tokens) and tokens[j].lower() in skip_words:
                result.append(tokens[j])
                j += 1
            if j < len(tokens):
                result.append('NOT_' + tokens[j])
                i = j  
        i += 1

    return ' '.join(result)


## Cleaning Text Function Choice

In [7]:
# 1. NO PREPROCESSING
def clean_text_1(text):
    return text

# 2. ONLY FIX_MOJ AND LOWER
def clean_text_2(text):
    text = fix_mojibake(text)
    text = text.lower()
    return text

# 3. ONLY FIX_MOJ AND LOWER + URL + MENTION
def clean_text_3(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_urls(text)
    text = replace_mentions(text)
    return text

# 4. ONLY FIX_MOJ AND LOWER + URL + MENTION + REPETITION TO 2
def clean_text_4(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_urls(text)
    text = replace_mentions(text)
    text = reduce_repeated_letters_to_two(text)
    return text

# 5. + EMOTICON + PUNCT REMOV/REPLACE + EXTRA SPACES
def clean_text_5(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_urls(text)
    text = replace_mentions(text)
    text = reduce_repeated_letters_to_two(text)
    text = replace_emoticon(text)
    text = remove_specific_punctuation(text)
    text = replace_punctuation_with_space(text)
    text = remove_extra_spaces(text)
    return text

# 6. + SLANG 
def clean_text_6(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_urls(text)
    text = replace_mentions(text)
    text = reduce_repeated_letters_to_two(text)
    text = replace_emoticon(text)
    text = remove_specific_punctuation(text)
    text = replace_punctuation_with_space(text)
    text = replace_slang(text)  
    text = remove_extra_spaces(text)
    return text

# 7. + SHORT WORD REMOVAL
def clean_text_7(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_urls(text)
    text = replace_mentions(text)
    text = reduce_repeated_letters_to_two(text)
    text = replace_emoticon(text)
    text = remove_specific_punctuation(text)
    text = replace_punctuation_with_space(text)
    text = replace_slang(text) 
    text = remove_short_words(text)
    text = remove_extra_spaces(text)
    return text

# 8. + CONTRACTIONS + NEGATION
def clean_text(text):
    text = fix_mojibake(text)
    text = text.lower()
    text = replace_urls(text)
    text = replace_mentions(text)
    text = reduce_repeated_letters_to_two(text)
    text = replace_emoticon(text)
    text = remove_specific_punctuation(text)
    text = replace_punctuation_with_space(text)
    text = replace_slang(text)  
    text = remove_short_words(text)
    text = expand_contractions(text)
    text = handle_negations(text)
    text = remove_extra_spaces(text)
    return text



clean_text_versions = {
    "no_preprocessing": clean_text_1,
    "fixmoj_lower": clean_text_2,
    "fixmoj_lower_url_mention": clean_text_3,
    "fixmoj_lower_url_mention_repetition": clean_text_4,
    "fixmoj_lower_url_mention_repetition_emotic_punct": clean_text_5,
    "fixmoj_lower_url_mention_repetition_emotic_punct_slang": clean_text_6,
    "fixmoj_lower_url_mention_repetition_emotic_punct_slang_short": clean_text_7,
    "fixmoj_lower_url_mention_repetition_emotic_punct_slang_short_contraction_negation": clean_text
}



# for name, clean_text_func in clean_text_versions.items():
#     print(f"\n=== Running with {name} ===\n")
    
#     train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/train_dataset.csv")
#     val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/val_dataset.csv")

#     train_df["Text"] = train_df["Text"].apply(clean_text_func)
#     val_df["Text"] = val_df["Text"].apply(clean_text_func)

#     train_tokens = train_df["Text"].astype(str).apply(lambda x: x.split())
#     val_tokens   = val_df["Text"].astype(str).apply(lambda x: x.split())
#     w2v_model = Word2Vec(sentences=train_tokens, vector_size=EMBEDDING_DIM, sg=1, min_count=1, seed=SEED, workers=1)

#     train_df["vector"] = train_tokens.apply(lambda tokens: get_tweet_vector(tokens, w2v_model, EMBEDDING_DIM))
#     val_df["vector"]   = val_tokens.apply(lambda tokens: get_tweet_vector(tokens, w2v_model, EMBEDDING_DIM))

#     X_train = np.vstack(train_df["vector"].values)
#     y_train = train_df["Label"].values
#     X_val = np.vstack(val_df["vector"].values)
#     y_val = val_df["Label"].values

#     x_train_tensor = torch.tensor(X_train, dtype=torch.float32)
#     y_train_tensor = torch.tensor(y_train, dtype=torch.float32) 
#     x_val_tensor = torch.tensor(X_val, dtype=torch.float32)
#     y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

#     train_dataset = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
#     val_dataset = torch.utils.data.TensorDataset(x_val_tensor, y_val_tensor)

#     g = torch.Generator()
#     g.manual_seed(SEED)
#     train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, generator=g)
#     val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64)

#     torch.manual_seed(SEED)
#     model = BaselineNet().to(DEVICE)
#     criterion = nn.BCEWithLogitsLoss()
#     optimizer = torch.optim.Adam(model.parameters())

#     for epoch in range(5):
#         model.train()
#         batch_losses = []

#         for x_batch, y_batch in train_loader:
#             x_batch = x_batch.to(DEVICE)
#             y_batch = y_batch.to(DEVICE).unsqueeze(1)

#             y_pred = model(x_batch)
#             loss = criterion(y_pred, y_batch)
#             batch_losses.append(loss.item())

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#     model.eval()
#     val_preds, val_labels = [], []

#     with torch.no_grad():
#         for x_batch, y_batch in val_loader:
#             x_batch = x_batch.to(DEVICE)
#             outputs = model(x_batch)
#             preds = (torch.sigmoid(outputs).cpu().numpy() > 0.5).astype(int)
#             val_preds.extend(preds.flatten())
#             val_labels.extend(y_batch.cpu().numpy().astype(int))

#     acc = accuracy_score(val_labels, val_preds)
#     prec, rec, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')
#     print(f"Validation → Acc: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")

# From Text to Tokens

In [8]:
train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/train_dataset.csv")
val_df   = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/val_dataset.csv")
test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/test_dataset.csv")

for df in [train_df, val_df, test_df]:
    df["Text"] = df["Text"].apply(clean_text)

train_tokens = train_df["Text"].astype(str).apply(lambda x: x.split())
val_tokens   = val_df["Text"].astype(str).apply(lambda x: x.split())
test_tokens = test_df["Text"].astype(str).apply(lambda x: x.split())
w2v_model = Word2Vec(sentences=train_tokens, vector_size=EMBEDDING_DIM, sg=1, min_count=1, workers=1, seed=SEED)

# Word Embeddings

## Aggregation function Choice

In [9]:
def evaluate_aggregations_binary(aggregations=["mean", "sum", "max"]):

    for agg in aggregations:
        train_df["vector"] = train_tokens.apply(lambda tokens: get_tweet_vector(tokens, w2v_model, EMBEDDING_DIM, agg=agg))
        val_df["vector"]   = val_tokens.apply(lambda tokens: get_tweet_vector(tokens, w2v_model, EMBEDDING_DIM, agg=agg))

        X_train = np.vstack(train_df["vector"].values)
        y_train = train_df["Label"].values
        X_val   = np.vstack(val_df["vector"].values)
        y_val   = val_df["Label"].values

        x_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
        x_val_tensor   = torch.tensor(X_val, dtype=torch.float32)
        y_val_tensor   = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

        train_dataset = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
        val_dataset   = torch.utils.data.TensorDataset(x_val_tensor, y_val_tensor)
        g = torch.Generator()
        g.manual_seed(SEED)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, generator=g)
        val_loader   = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

        torch.manual_seed(SEED)
        model = BaselineNet().to(DEVICE)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

        for epoch in range(5):
            model.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                y_pred = model(xb)
                loss = criterion(y_pred, yb)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(DEVICE)
                probs = torch.sigmoid(model(xb))
                preds = (probs > 0.5).int().cpu().numpy()
                val_preds.extend(preds.flatten())
                val_labels.extend(yb.cpu().numpy().flatten().astype(int))

        acc = accuracy_score(val_labels, val_preds)
        prec, rec, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')

        print(f"Results with {agg.upper()} pooling:")
        print(f"  Accuracy : {acc:.4f}")
        print(f"  Precision: {prec:.4f}")
        print(f"  Recall   : {rec:.4f}")
        print(f"  F1 Score : {f1:.4f}")
        
#evaluate_aggregations_binary()


## Features Scaling

In [10]:
def evaluate_scaler_settings():
    results = []
    train_vectors = np.vstack([get_tweet_vector(tokens, w2v_model, EMBEDDING_DIM, agg="sum") for tokens in train_tokens])
    val_vectors   = np.vstack([get_tweet_vector(tokens, w2v_model, EMBEDDING_DIM, agg="sum") for tokens in val_tokens])

    for with_mean in [True, False]:
        for with_std in [True, False]:
            print(f"\n Testing with_mean={with_mean}, with_std={with_std}")
            
            scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
            train_scaled = scaler.fit_transform(train_vectors)
            val_scaled   = scaler.transform(val_vectors)

            x_train_tensor = torch.tensor(train_scaled, dtype=torch.float32)
            y_train_tensor = torch.tensor(train_df["Label"].values, dtype=torch.float32)
            x_val_tensor   = torch.tensor(val_scaled, dtype=torch.float32)
            y_val_tensor   = torch.tensor(val_df["Label"].values, dtype=torch.float32)

            train_dataset = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
            val_dataset   = torch.utils.data.TensorDataset(x_val_tensor, y_val_tensor)

            g = torch.Generator()
            g.manual_seed(SEED)
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, generator=g)
            val_loader   = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

            torch.manual_seed(SEED)
            model = BaselineNet().to(DEVICE)
            criterion = nn.BCEWithLogitsLoss()
            optimizer = torch.optim.Adam(model.parameters())

            for epoch in range(5):
                model.train()
                for xb, yb in train_loader:
                    xb = xb.to(DEVICE)
                    yb = yb.to(DEVICE).unsqueeze(1)
                    pred = model(xb)
                    loss = criterion(pred, yb)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            model.eval()
            val_preds, val_labels = [], []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb = xb.to(DEVICE)
                    probs = torch.sigmoid(model(xb))
                    preds = (probs > 0.5).int().cpu().numpy()
                    val_preds.extend(preds.flatten())
                    val_labels.extend(yb.cpu().numpy().flatten().astype(int))

            acc = accuracy_score(val_labels, val_preds)
            prec, rec, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')

            print(f"   Acc: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
            results.append(((with_mean, with_std), acc, prec, rec, f1))

    return results
    
# evaluate_scaler_settings()
# for (mean, std), acc, prec, rec, f1 in results:
#     print(f"with_mean={mean}, with_std={std} → F1: {f1:.4f} | Acc: {acc:.4f}")

# 2 Layers Model

In [11]:
class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H1, H2, dropout=0.0, activation="relu"):
        super(TwoLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, 1)
        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def forward(self, x):
        x = self.activate(self.linear1(x))
        x = self.dropout(x)
        x = self.activate(self.linear2(x))
        x = self.dropout(x)
        return self.linear3(x)

    def activate(self, x):
        if self.activation == "relu":
            return F.relu(x)
        elif self.activation == "leaky_relu":
            return F.leaky_relu(x)
        else:
            raise ValueError(f"Unsupported activation function: {self.activation}")

## Parameters Turing - Broad Study

In [12]:
def objective(trial):
    set_seed(42)

    vector_size = trial.suggest_categorical("vector_size", [200, 300])
    window = trial.suggest_int("window", 3, 7)
    min_count = trial.suggest_int("min_count", 1, 5)
    sg = 1

    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        workers=1,
        seed=SEED,
    )

    X_train = np.vstack(train_tokens.apply(lambda tokens: get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum")).values)
    X_val   = np.vstack(val_tokens.apply(lambda tokens: get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum")).values)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val   = scaler.transform(X_val)

    y_train = train_df["Label"].values
    y_val   = val_df["Label"].values

    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long)
    )
    val_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.long)
    )
    g = torch.Generator()
    g.manual_seed(SEED)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    H1 = trial.suggest_int("H1", 128, 512)
    H2 = trial.suggest_int("H2", 64, H1)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    activation = trial.suggest_categorical("activation", ["relu", "leaky_relu"])
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "adamw"])
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    torch.manual_seed(SEED)
    model = TwoLayerNet(vector_size, H1, H2, dropout=dropout, activation=activation).to(DEVICE)

    if optimizer_name == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(5):
        model.train()
        for xb, yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE).unsqueeze(1).float()

            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            outputs = model(xb)
            preds = (torch.sigmoid(outputs).cpu().numpy() > 0.5).astype(int)
            val_preds.extend(preds.flatten())
            val_labels.extend(yb.numpy())

    acc = accuracy_score(val_labels, val_preds)
    trial.report(acc, step=0)

    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

    return acc
    
# pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# study = optuna.create_study(direction="maximize", pruner=pruner)
# study.optimize(objective, n_trials=40, n_jobs=4)

## Parameters Turing - Refined Study

In [13]:
def objective(trial):
    set_seed(42)
    vector_size = 300
    window = trial.suggest_int("window", 6, 7)
    min_count = trial.suggest_int("min_count", 3, 4)
    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=1,
        seed=SEED
    )

    X_train = np.vstack([get_tweet_vector(tokens, w2v_model, vector_size, agg="sum") for tokens in train_tokens])
    X_val = np.vstack([get_tweet_vector(tokens, w2v_model, vector_size, agg="sum") for tokens in val_tokens])

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    y_train = train_df["Label"].values.astype(np.float32)
    y_val = val_df["Label"].values.astype(np.float32)

    batch_size = trial.suggest_categorical("batch_size", [64, 128])
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train).unsqueeze(1))
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val).unsqueeze(1))

    g = torch.Generator().manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    H1 = trial.suggest_int("H1", 350, 480)
    H2 = trial.suggest_int("H2", 200, H1)
    dropout = trial.suggest_float("dropout", 0.25, 0.3)
    activation = trial.suggest_categorical("activation", ["relu", "leaky_relu"])
    lr = trial.suggest_float("lr", 1e-4, 3e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "adamw"])
    
    torch.manual_seed(SEED)
    model = TwoLayerNet(
        D_in=vector_size, H1=H1, H2=H2,
        dropout=dropout, activation=activation
    ).to(DEVICE)

    optimizer = (
        torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        if optimizer_name == "adam"
        else torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    )
    criterion = nn.BCEWithLogitsLoss()

    model.train()
    for _ in range(5):
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            preds = torch.sigmoid(model(xb)).cpu().numpy() > 0.5
            val_preds.extend(preds.flatten())
            val_labels.extend(yb.cpu().numpy().flatten())

    acc = accuracy_score(val_labels, val_preds)
    trial.report(acc, step=0)
    
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

    return acc
        
# pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# study = optuna.create_study(direction="maximize", pruner=pruner)
# study.optimize(objective, n_trials=40, n_jobs=4)

# 3 Layers Model

In [14]:
class ThreeLayerNet(nn.Module):
    def __init__(self, D_in, H1, H2, H3, dropout=0.0, activation='relu'):
        super(ThreeLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, 1)
        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def forward(self, x):
        x = self.activate(self.linear1(x))
        x = self.dropout(x)
        x = self.activate(self.linear2(x))
        x = self.dropout(x)
        x = self.activate(self.linear3(x))
        x = self.dropout(x)
        x = self.linear4(x) 
        return x

    def activate(self, x):
        if self.activation == 'relu':
            return F.relu(x)
        elif self.activation == 'leaky_relu':
            return F.leaky_relu(x)
        else:
            raise ValueError(f"Unsupported activation: {self.activation}")


## Parameters Turing - Broad Study

In [15]:
def objective(trial):
    set_seed(42)
    vector_size = trial.suggest_categorical("vector_size", [200, 300])
    window = trial.suggest_int("window", 4, 8)
    min_count = trial.suggest_int("min_count", 1, 5)

    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=1,
        seed=SEED,
    )

    EMBEDDING_DIM = vector_size

    X_train = np.vstack([
        get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum")
        for tokens in train_tokens
    ])
    X_val = np.vstack([
        get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum")
        for tokens in val_tokens
    ])

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    y_train = train_df["Label"].values
    y_val = val_df["Label"].values

    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.long)
    )
    g = torch.Generator()
    g.manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    H1 = trial.suggest_int("H1", 128, 512)
    H2 = trial.suggest_int("H2", 64, H1)
    H3 = trial.suggest_int("H3", 32, H2)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    activation = trial.suggest_categorical("activation", ["relu", "leaky_relu"])
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "adamw"])
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    
    torch.manual_seed(SEED)
    model = ThreeLayerNet(
        D_in=EMBEDDING_DIM,
        H1=H1,
        H2=H2,
        H3=H3,
        dropout=dropout,
        activation=activation
    ).to(DEVICE)

    if optimizer_name == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    criterion = nn.BCEWithLogitsLoss()

    num_epochs = 5
    for epoch in range(num_epochs):
        model.train()
        for xb, yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE).unsqueeze(1).float()

            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            preds = model(xb)
            preds = (torch.sigmoid(preds) > 0.5).long().squeeze(1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(yb.numpy())

    acc = accuracy_score(val_labels, val_preds)

    trial.report(acc, step=0)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

    return acc


# pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# study = optuna.create_study(direction="maximize", pruner=pruner)
# study.optimize(objective, n_trials=40, n_jobs=4)

## Parameters Turing - Refined Study

In [16]:
def objective(trial):
    set_seed(42)
    vector_size = 300  
    window = trial.suggest_int("window", 6, 7)
    min_count = trial.suggest_int("min_count", 3, 4)

    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=1,
        seed=SEED,
        epochs=5
    )

    X_train = np.vstack([
        get_tweet_vector(tokens, w2v_model, vector_size, agg="sum")
        for tokens in train_tokens
    ])
    X_val = np.vstack([
        get_tweet_vector(tokens, w2v_model, vector_size, agg="sum")
        for tokens in val_tokens
    ])

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    y_train = train_df["Label"].values.astype(np.float32)
    y_val = val_df["Label"].values.astype(np.float32)

    batch_size = trial.suggest_categorical("batch_size", [64, 128])
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
    )
    g = torch.Generator().manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    H1 = trial.suggest_int("H1", 440, 480)
    H2 = trial.suggest_int("H2", 360, H1)
    H3 = trial.suggest_int("H3", 290, H2)
    dropout = trial.suggest_float("dropout", 0.13, 0.21)
    activation = trial.suggest_categorical("activation", ["relu"])
    optimizer_name = trial.suggest_categorical("optimizer", ["adamw"])
    lr = trial.suggest_float("lr", 1e-4, 3e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)

    model = ThreeLayerNet(
        D_in=vector_size,
        H1=H1, H2=H2, H3=H3,
        dropout=dropout,
        activation=activation
    ).to(DEVICE)

    if optimizer_name == "adamw":
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        raise ValueError("Only AdamW supported in refined search")

    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(5):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            preds = (torch.sigmoid(model(xb)) > 0.5).float().cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(yb.cpu().numpy())

    acc = accuracy_score(val_labels, val_preds)

    trial.report(acc, step=0)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

    return acc

# pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# study = optuna.create_study(direction="maximize", pruner=pruner)
# study.optimize(objective, n_trials=40, n_jobs=4)


# 4 Layers Model - Parameters

In [17]:
class FourLayerNet(nn.Module):
    def __init__(self, D_in, H1, H2, H3, H4, D_out=1, dropout=0.0, activation='relu'):
        super(FourLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, H4)
        self.linear5 = nn.Linear(H4, D_out)
        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def activate(self, x):
        if self.activation == 'relu':
            return F.relu(x)
        elif self.activation == 'leaky_relu':
            return F.leaky_relu(x)
        else:
            raise ValueError(f"Unsupported activation: {self.activation}")

    def forward(self, x):
        x = self.dropout(self.activate(self.linear1(x)))
        x = self.dropout(self.activate(self.linear2(x)))
        x = self.dropout(self.activate(self.linear3(x)))
        x = self.dropout(self.activate(self.linear4(x)))
        return self.linear5(x) 

## Parameters Turing - Broad Study

In [18]:
def objective(trial):
    
    set_seed(42)
    vector_size = trial.suggest_categorical("vector_size", [200, 300])
    window = trial.suggest_int("window", 4, 8)
    min_count = trial.suggest_int("min_count", 1, 5)

    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=1,
        epochs=5,
        seed=SEED,
    )

    EMBEDDING_DIM = vector_size

    X_train = np.vstack([
        get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum", seed=SEED)
        for tokens in train_tokens
    ])
    X_val = np.vstack([
        get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum", seed=SEED)
        for tokens in val_tokens
    ])

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    y_train = train_df["Label"].values.astype(np.float32)
    y_val = val_df["Label"].values.astype(np.float32)

    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
    )

    g = torch.Generator().manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    H1 = trial.suggest_int("H1", 256, 512)
    H2 = trial.suggest_int("H2", 128, H1)
    H3 = trial.suggest_int("H3", 64, H2)
    H4 = trial.suggest_int("H4", 32, H3)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    activation = trial.suggest_categorical("activation", ["relu", "leaky_relu"])
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "adamw"])

    torch.manual_seed(SEED)
    model = FourLayerNet(
        D_in=EMBEDDING_DIM,
        H1=H1, H2=H2, H3=H3, H4=H4,
        dropout=dropout,
        activation=activation
    ).to(DEVICE)

    if optimizer_name == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    criterion = nn.BCEWithLogitsLoss()

    for _ in range(5):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            preds = torch.sigmoid(model(xb)) > 0.5
            val_preds.extend(preds.cpu().squeeze().int().numpy())
            val_labels.extend(yb.cpu().squeeze().int().numpy())

    acc = accuracy_score(val_labels, val_preds)
    trial.report(acc, step=0)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
    return acc


# pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# study = optuna.create_study(direction="maximize", pruner=pruner)
# study.optimize(objective, n_trials=40, n_jobs=4)

## Parameters Turing - Refined Study

In [19]:
def objective(trial):
    set_seed(42)
    vector_size = 300
    window = trial.suggest_int("window", 6, 8)
    min_count = trial.suggest_int("min_count", 2, 5)

    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=1,
        epochs=5,
        seed=SEED,
    )

    EMBEDDING_DIM = vector_size

    X_train = np.vstack([
        get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum", seed=SEED)
        for tokens in train_tokens
    ])
    X_val = np.vstack([
        get_tweet_vector(tokens, w2v_model, embedding_dim=vector_size, agg="sum", seed=SEED)
        for tokens in val_tokens
    ])

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    y_train = train_df["Label"].values.astype(np.float32)
    y_val = val_df["Label"].values.astype(np.float32)

    batch_size = trial.suggest_categorical("batch_size", [64, 128])
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
    )

    g = torch.Generator().manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    H1 = trial.suggest_int("H1", 340, 500)
    H2 = trial.suggest_int("H2", 200, H1)
    H3 = trial.suggest_int("H3", 160, H2)
    H4 = trial.suggest_int("H4", 120, H3)
    dropout = trial.suggest_float("dropout", 0.15, 0.3)
    lr = trial.suggest_float("lr", 8e-5, 3e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    activation = trial.suggest_categorical("activation", ["relu", "leaky_relu"])
    optimizer_name = "adamw"

    torch.manual_seed(SEED)
    model = FourLayerNet(
        D_in=EMBEDDING_DIM,
        H1=H1, H2=H2, H3=H3, H4=H4,
        dropout=dropout,
        activation=activation
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()

    for _ in range(5):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            preds = torch.sigmoid(model(xb)) > 0.5
            val_preds.extend(preds.cpu().squeeze().int().numpy())
            val_labels.extend(yb.cpu().squeeze().int().numpy())

    acc = accuracy_score(val_labels, val_preds)
    trial.report(acc, step=0)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
    return acc
    
# pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# study = optuna.create_study(direction="maximize", pruner=pruner)
# study.optimize(objective, n_trials=40, n_jobs=4)

# 2 Layers Model - Evaluation

## Learning Curve

In [20]:
# set_seed(42)
# params = {
#     'window': 7,
#     'min_count': 4,
#     'batch_size': 64,
#     'H1': 375,
#     'H2': 374,
#     'dropout': 0.27041264306882673,
#     'activation': 'leaky_relu',
#     'lr': 1.0702e-4,
#     'weight_decay': 3.5708e-6,
#     'optimizer': 'adamw',
#     'vector_size': 300,
#     'batch_size' : 64
# }


# w2v_model = Word2Vec(
#     sentences=train_tokens,
#     vector_size=params['vector_size'],
#     window=params['window'],
#     min_count=params['min_count'],
#     sg=1,
#     workers=1,
#     epochs=5,
#     seed=SEED
# )

# X_train = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in train_tokens])
# X_val = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in val_tokens])
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

# y_train = train_df["Label"].values.astype(np.float32)
# y_val = val_df["Label"].values.astype(np.float32)

# train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train).unsqueeze(1))
# val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val).unsqueeze(1))
# g = torch.Generator().manual_seed(SEED)
# train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, generator=g)
# val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)

# torch.manual_seed(SEED)
# model = TwoLayerNet(
#     D_in=params['vector_size'],
#     H1=params['H1'],
#     H2=params['H2'],
#     dropout=params['dropout'],
#     activation=params['activation']
# ).to(DEVICE)

# optimizer_cls = torch.optim.AdamW if params['optimizer'] == 'adamw' else torch.optim.Adam
# optimizer = optimizer_cls(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
# criterion = nn.BCEWithLogitsLoss()

# EPOCHS = 20
# train_losses, val_losses = [], []
# train_accuracies, val_accuracies = [], []

# for epoch in range(EPOCHS):
#     model.train()
#     train_preds, train_labels, train_loss_total = [], [], 0
#     for xb, yb in train_loader:
#         xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         loss.backward()
#         optimizer.step()
#         train_loss_total += loss.item() * xb.size(0)
#         train_preds += (torch.sigmoid(preds) > 0.5).cpu().numpy().astype(int).tolist()
#         train_labels += yb.cpu().numpy().astype(int).tolist()
#     train_losses.append(train_loss_total / len(train_loader.dataset))
#     train_accuracies.append(accuracy_score(train_labels, train_preds))

#     model.eval()
#     val_preds, val_labels, val_loss_total = [], [], 0
#     with torch.no_grad():
#         for xb, yb in val_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             preds = model(xb)
#             loss = criterion(preds, yb)
#             val_loss_total += loss.item() * xb.size(0)
#             val_preds += (torch.sigmoid(preds) > 0.5).cpu().numpy().astype(int).tolist()
#             val_labels += yb.cpu().numpy().astype(int).tolist()
#     val_losses.append(val_loss_total / len(val_loader.dataset))
#     val_accuracies.append(accuracy_score(val_labels, val_preds))

#     print(f"Epoch {epoch+1}/{EPOCHS} | Train Acc: {train_accuracies[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}")

# plt.figure(figsize=(10, 6))
# plt.plot(epochs, train_losses, label='Train Loss', color='blue', linewidth=2)
# plt.plot(epochs, val_losses, label='Validation Loss', color='green', linewidth=2)
# plt.plot(epochs, train_accuracies, label='Train Accuracy', color='orange', linewidth=2)
# plt.plot(epochs, val_accuracies, label='Validation Accuracy', color='red', linewidth=2)

# plt.title('Learning Curves', fontsize=16)
# plt.xlabel('Epoch', fontsize=14)
# plt.ylabel('Loss / Accuracy', fontsize=14)
# plt.xticks(ticks=range(0, EPOCHS + 1, 5))
# plt.yticks(fontsize=12)
# plt.ylim(0.35, 0.9)
# plt.grid(True, linestyle='--', linewidth=0.6)

# plt.legend(
#     loc='center left',
#     bbox_to_anchor=(1, 0.5),
#     fontsize=12,
#     frameon=False
# )

# plt.tight_layout()
# plt.savefig("learning_curves_2layer.png", dpi=300, bbox_inches='tight')
# plt.show()


## Confusion Matrix - ROC - Classification Report

In [21]:
# set_seed(42)
# EPOCHS = 50
# PATIENCE = 5
# best_val_loss = float("inf")
# patience_counter = 0

# model = TwoLayerNet(
#     D_in=300,
#     H1=375,
#     H2=374,
#     dropout=0.2704,
#     activation='leaky_relu'
# ).to(DEVICE)

# optimizer = torch.optim.AdamW(model.parameters(), lr=1.07e-4, weight_decay=3.57e-6)
# criterion = nn.BCEWithLogitsLoss()

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=torch.Generator().manual_seed(SEED))
# val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# train_losses, val_losses = [], []

# for epoch in range(EPOCHS):
#     model.train()
#     running_loss = 0.0
#     for xb, yb in train_loader:
#         xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#     train_losses.append(running_loss / len(train_loader))

#     model.eval()
#     val_loss = 0.0
#     all_probs, all_labels = [], []
#     with torch.no_grad():
#         for xb, yb in val_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             logits = model(xb)
#             val_loss += criterion(logits, yb).item()
#             all_probs.extend(torch.sigmoid(logits).cpu().numpy())
#             all_labels.extend(yb.cpu().numpy())

#     val_losses.append(val_loss / len(val_loader))
#     print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")

#     if val_losses[-1] < best_val_loss:
#         best_val_loss = val_losses[-1]
#         patience_counter = 0
#         torch.save(model.state_dict(), "best_2layer_model.pt")
#     else:
#         patience_counter += 1
#         if patience_counter >= PATIENCE:
#             print("Early stopping triggered.")
#             break

# model.load_state_dict(torch.load("best_2layer_model.pt"))

# model.eval()
# all_probs, all_labels = [], []
# with torch.no_grad():
#     for xb, yb in val_loader:
#         xb = xb.to(DEVICE)
#         probs = torch.sigmoid(model(xb)).cpu().numpy()
#         all_probs.extend(probs)
#         all_labels.extend(yb.numpy())

# y_true = np.array(all_labels).astype(int).flatten()
# y_scores = np.array(all_probs).flatten()
# y_pred = (y_scores >= 0.5).astype(int)

# fpr, tpr, _ = roc_curve(y_true, y_scores)
# roc_auc = auc(fpr, tpr)

# plt.figure(figsize=(6, 6))
# plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}", color="darkorange", lw=2)
# plt.plot([0, 1], [0, 1], "k--", lw=1)
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve (2-layer MLP)")
# plt.legend(loc="lower right")
# plt.grid(True)
# plt.tight_layout()
# plt.savefig("roc_curve_2layer.png", dpi=300)
# plt.show()

# cm = confusion_matrix(y_true, y_pred)
# plt.figure(figsize=(5, 4))
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1], yticklabels=[0, 1])
# plt.xlabel("Predicted Label")
# plt.ylabel("True Label")
# plt.title("Confusion Matrix (2-layer MLP)")
# plt.tight_layout()
# plt.savefig("confusion_matrix_2layer.png", dpi=300)
# plt.show()

# print("Classification Report (2-layer MLP):")
# print(classification_report(y_true, y_pred, digits=4))


# 3 Layers Model - Evaluation

## Learning Curve

In [22]:
# set_seed(42)
# params = {
#     'vector_size': 300,
#     'window': 7,
#     'min_count': 3,
#     'batch_size': 64,
#     'H1': 476,
#     'H2': 362,
#     'H3': 293,
#     'dropout': 0.191,
#     'activation': 'relu',
#     'lr': 2.698e-4,
#     'weight_decay': 2.351e-6,
#     'optimizer': 'adamw'
# }


# w2v_model = Word2Vec(
#     sentences=train_tokens,
#     vector_size=params['vector_size'],
#     window=params['window'],
#     min_count=params['min_count'],
#     sg=1,
#     workers=1,
#     epochs=5,
#     seed=SEED
# )

# X_train = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in train_tokens])
# X_val = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in val_tokens])
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

# y_train = train_df["Label"].values.astype(np.float32)
# y_val = val_df["Label"].values.astype(np.float32)

# train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train).unsqueeze(1))
# val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val).unsqueeze(1))
# g = torch.Generator().manual_seed(SEED)
# train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, generator=g)
# val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)

# torch.manual_seed(SEED)
# model = ThreeLayerNet(
#     D_in=params['vector_size'],
#     H1=params['H1'],
#     H2=params['H2'],
#     H3=params['H3'],
#     dropout=params['dropout'],
#     activation=params['activation']
# ).to(DEVICE)

# optimizer_cls = torch.optim.AdamW if params['optimizer'] == 'adamw' else torch.optim.Adam
# optimizer = optimizer_cls(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
# criterion = nn.BCEWithLogitsLoss()

# EPOCHS = 20
# train_losses, val_losses = [], []
# train_accuracies, val_accuracies = [], []

# for epoch in range(EPOCHS):
#     model.train()
#     train_preds, train_labels, train_loss_total = [], [], 0
#     for xb, yb in train_loader:
#         xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         loss.backward()
#         optimizer.step()
#         train_loss_total += loss.item() * xb.size(0)
#         train_preds += (torch.sigmoid(preds) > 0.5).cpu().numpy().astype(int).tolist()
#         train_labels += yb.cpu().numpy().astype(int).tolist()
#     train_losses.append(train_loss_total / len(train_loader.dataset))
#     train_accuracies.append(accuracy_score(train_labels, train_preds))

#     model.eval()
#     val_preds, val_labels, val_loss_total = [], [], 0
#     with torch.no_grad():
#         for xb, yb in val_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             preds = model(xb)
#             loss = criterion(preds, yb)
#             val_loss_total += loss.item() * xb.size(0)
#             val_preds += (torch.sigmoid(preds) > 0.5).cpu().numpy().astype(int).tolist()
#             val_labels += yb.cpu().numpy().astype(int).tolist()
#     val_losses.append(val_loss_total / len(val_loader.dataset))
#     val_accuracies.append(accuracy_score(val_labels, val_preds))

#     print(f"Epoch {epoch+1}/{EPOCHS} | Train Acc: {train_accuracies[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}")

# epochs = list(range(1, EPOCHS + 1))
# plt.figure(figsize=(10, 6))
# plt.plot(epochs, train_losses, label='Train Loss', color='blue', linewidth=2)
# plt.plot(epochs, val_losses, label='Validation Loss', color='green', linewidth=2)
# plt.plot(epochs, train_accuracies, label='Train Accuracy', color='orange', linewidth=2)
# plt.plot(epochs, val_accuracies, label='Validation Accuracy', color='red', linewidth=2)
# plt.title('3-Layer MLP Learning Curves', fontsize=16)
# plt.xlabel('Epoch', fontsize=14)
# plt.ylabel('Loss / Accuracy', fontsize=14)
# plt.xticks(ticks=range(0, EPOCHS + 1, 5))
# plt.yticks(fontsize=12)
# plt.ylim(0.2, 0.9)
# plt.grid(True, linestyle='--', linewidth=0.6)
# plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12, frameon=False)
# plt.tight_layout()
# plt.savefig("learning_curves_3layer.png", dpi=300, bbox_inches='tight')
# plt.show()

## Confusion Matrix - ROC - Classification Report

In [23]:
# set_seed(42)

# EPOCHS = 50
# PATIENCE = 5
# best_val_loss = float("inf")
# patience_counter = 0

# model = ThreeLayerNet(
#     D_in=300,
#     H1=453,
#     H2=395,
#     H3=302,
#     dropout=0.1602,
#     activation='relu'
# ).to(DEVICE)

# optimizer = torch.optim.AdamW(model.parameters(), lr=1.1958e-4, weight_decay=6.7527e-6)
# criterion = torch.nn.BCEWithLogitsLoss()

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=torch.Generator().manual_seed(SEED))
# val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# train_losses, val_losses = [], []

# for epoch in range(EPOCHS):
#     model.train()
#     running_loss = 0.0
#     for xb, yb in train_loader:
#         xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#     train_losses.append(running_loss / len(train_loader))

#     model.eval()
#     val_loss = 0.0
#     all_probs, all_labels = [], []
#     with torch.no_grad():
#         for xb, yb in val_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             logits = model(xb)
#             val_loss += criterion(logits, yb).item()
#             all_probs.extend(torch.sigmoid(logits).cpu().numpy())
#             all_labels.extend(yb.cpu().numpy())
#     val_losses.append(val_loss / len(val_loader))
#     print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")

#     if val_losses[-1] < best_val_loss:
#         best_val_loss = val_losses[-1]
#         patience_counter = 0
#         torch.save(model.state_dict(), "best_3layer_model.pt")
#     else:
#         patience_counter += 1
#         if patience_counter >= PATIENCE:
#             print("Early stopping triggered.")
#             break

# model.load_state_dict(torch.load("best_3layer_model.pt"))

# model.eval()
# all_probs, all_labels = [], []
# with torch.no_grad():
#     for xb, yb in val_loader:
#         xb = xb.to(DEVICE)
#         probs = torch.sigmoid(model(xb)).cpu().numpy()
#         all_probs.extend(probs)
#         all_labels.extend(yb.numpy())

# y_true = np.array(all_labels).astype(int).flatten()
# y_scores = np.array(all_probs).flatten()
# y_pred = (y_scores >= 0.5).astype(int)

# fpr, tpr, _ = roc_curve(y_true, y_scores)
# roc_auc = auc(fpr, tpr)

# plt.figure(figsize=(6, 6))
# plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}", color="darkorange", lw=2)
# plt.plot([0, 1], [0, 1], "k--", lw=1)
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve (3-layer MLP)")
# plt.legend(loc="lower right")
# plt.grid(True)
# plt.tight_layout()
# plt.savefig("roc_curve_3layer.png", dpi=300)
# plt.show()

# cm = confusion_matrix(y_true, y_pred)
# plt.figure(figsize=(5, 4))
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1], yticklabels=[0, 1])
# plt.xlabel("Predicted Label")
# plt.ylabel("True Label")
# plt.title("Confusion Matrix (3-layer MLP)")
# plt.tight_layout()
# plt.savefig("confusion_matrix_3layer.png", dpi=300)
# plt.show()

# print("Classification Report (3-layer MLP):")
# print(classification_report(y_true, y_pred, digits=4))



# 4 Layers Model - Evaluation

## Learning Curve

In [24]:
# set_seed(42)
# params = {
#     'window': 7,
#     'min_count': 4,
#     'batch_size': 64,
#     'H1': 387,
#     'H2': 359,
#     'H3': 320,
#     'H4': 257,
#     'dropout': 0.1775,
#     'activation': 'relu',
#     'lr': 1.3767e-4,
#     'weight_decay': 1.0213e-4,
#     'optimizer': 'adamw',
#     'vector_size': 300
# }


# w2v_model = Word2Vec(
#     sentences=train_tokens,
#     vector_size=params['vector_size'],
#     window=params['window'],
#     min_count=params['min_count'],
#     sg=1,
#     workers=1,
#     epochs=5,
#     seed=SEED
# )

# X_train = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in train_tokens])
# X_val = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in val_tokens])
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

# y_train = train_df["Label"].values.astype(np.float32)
# y_val = val_df["Label"].values.astype(np.float32)

# train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train).unsqueeze(1))
# val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val).unsqueeze(1))
# g = torch.Generator().manual_seed(SEED)
# train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, generator=g)
# val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)

# model = FourLayerNet(
#     D_in=params['vector_size'],
#     H1=params['H1'],
#     H2=params['H2'],
#     H3=params['H3'],
#     H4=params['H4'],
#     dropout=params['dropout'],
#     activation=params['activation']
# ).to(DEVICE)

# optimizer_cls = torch.optim.AdamW if params['optimizer'] == 'adamw' else torch.optim.Adam
# optimizer = optimizer_cls(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
# criterion = nn.BCEWithLogitsLoss()

# EPOCHS = 20
# train_losses, val_losses = [], []
# train_accuracies, val_accuracies = [], []

# for epoch in range(EPOCHS):
#     model.train()
#     train_preds, train_labels, train_loss_total = [], [], 0
#     for xb, yb in train_loader:
#         xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         loss.backward()
#         optimizer.step()
#         train_loss_total += loss.item() * xb.size(0)
#         train_preds += (torch.sigmoid(preds) > 0.5).cpu().numpy().astype(int).tolist()
#         train_labels += yb.cpu().numpy().astype(int).tolist()
#     train_losses.append(train_loss_total / len(train_loader.dataset))
#     train_accuracies.append(accuracy_score(train_labels, train_preds))

#     model.eval()
#     val_preds, val_labels, val_loss_total = [], [], 0
#     with torch.no_grad():
#         for xb, yb in val_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             preds = model(xb)
#             loss = criterion(preds, yb)
#             val_loss_total += loss.item() * xb.size(0)
#             val_preds += (torch.sigmoid(preds) > 0.5).cpu().numpy().astype(int).tolist()
#             val_labels += yb.cpu().numpy().astype(int).tolist()
#     val_losses.append(val_loss_total / len(val_loader.dataset))
#     val_accuracies.append(accuracy_score(val_labels, val_preds))

#     print(f"Epoch {epoch+1}/{EPOCHS} | Train Acc: {train_accuracies[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}")

# epochs = list(range(1, EPOCHS + 1))
# plt.figure(figsize=(10, 6))
# plt.plot(epochs, train_losses, label='Train Loss', color='blue', linewidth=2)
# plt.plot(epochs, val_losses, label='Validation Loss', color='green', linewidth=2)
# plt.plot(epochs, train_accuracies, label='Train Accuracy', color='orange', linewidth=2)
# plt.plot(epochs, val_accuracies, label='Validation Accuracy', color='red', linewidth=2)

# plt.title('Learning Curves (4-layer MLP)', fontsize=16)
# plt.xlabel('Epoch', fontsize=14)
# plt.ylabel('Loss / Accuracy', fontsize=14)
# plt.xticks(ticks=range(0, EPOCHS + 1, 5))
# plt.yticks(fontsize=12)
# plt.ylim(0.3, 0.9)
# plt.grid(True, linestyle='--', linewidth=0.6)

# plt.legend(
#     loc='center left',
#     bbox_to_anchor=(1, 0.5),
#     fontsize=12,
#     frameon=False
# )

# plt.tight_layout()
# plt.savefig("learning_curves_4layer.png", dpi=300, bbox_inches='tight')
# plt.show()

## Confusion Matrix - ROC - Classification Report

In [25]:
# set_seed(42)

# train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, generator=torch.Generator().manual_seed(SEED))
# val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)

# torch.manual_seed(SEED)
# model = FourLayerNet(
#     D_in=params['vector_size'],
#     H1=params['H1'],
#     H2=params['H2'],
#     H3=params['H3'],
#     H4=params['H4'],
#     dropout=params['dropout'],
#     activation=params['activation']
# ).to(DEVICE)

# optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
# criterion = nn.BCEWithLogitsLoss()

# EPOCHS = 50
# PATIENCE = 5
# best_val_loss = float("inf")
# patience_counter = 0

# train_losses, val_losses = [], []

# for epoch in range(EPOCHS):
#     model.train()
#     total_train_loss = 0
#     for xb, yb in train_loader:
#         xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         loss.backward()
#         optimizer.step()
#         total_train_loss += loss.item()
#     train_losses.append(total_train_loss / len(train_loader))

#     model.eval()
#     val_loss = 0.0
#     all_probs, all_labels = [], []
#     with torch.no_grad():
#         for xb, yb in val_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             logits = model(xb)
#             val_loss += criterion(logits, yb).item()
#             all_probs.extend(torch.sigmoid(logits).cpu().numpy())
#             all_labels.extend(yb.cpu().numpy())

#     val_losses.append(val_loss / len(val_loader))
#     print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")

#     if val_losses[-1] < best_val_loss:
#         best_val_loss = val_losses[-1]
#         patience_counter = 0
#         torch.save(model.state_dict(), "best_4layer_model.pt")
#     else:
#         patience_counter += 1
#         if patience_counter >= PATIENCE:
#             print("Early stopping triggered.")
#             break

# model.load_state_dict(torch.load("best_4layer_model.pt"))
# model.eval()

# all_probs, all_labels = [], []
# with torch.no_grad():
#     for xb, yb in val_loader:
#         xb = xb.to(DEVICE)
#         probs = torch.sigmoid(model(xb)).cpu().numpy()
#         all_probs.extend(probs)
#         all_labels.extend(yb.numpy())

# y_true = np.array(all_labels).astype(int).flatten()
# y_scores = np.array(all_probs).flatten()
# y_pred = (y_scores >= 0.5).astype(int)

# fpr, tpr, _ = roc_curve(y_true, y_scores)
# roc_auc = auc(fpr, tpr)

# plt.figure(figsize=(6, 6))
# plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}", color="darkorange", lw=2)
# plt.plot([0, 1], [0, 1], "k--", lw=1)
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve (4-layer MLP)")
# plt.legend(loc="lower right")
# plt.grid(True)
# plt.tight_layout()
# plt.savefig("roc_curve_4layer.png", dpi=300)
# plt.show()

# cm = confusion_matrix(y_true, y_pred)
# plt.figure(figsize=(5, 4))
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1], yticklabels=[0, 1])
# plt.xlabel("Predicted Label")
# plt.ylabel("True Label")
# plt.title("Confusion Matrix (4-layer MLP)")
# plt.tight_layout()
# plt.savefig("confusion_matrix_4layer.png", dpi=300)
# plt.show()

# print("Classification Report (4-layer MLP):")
# print(classification_report(y_true, y_pred, digits=4))

# Best Model - Submission

In [26]:
set_seed(42)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Parameters ---
params = {
    'vector_size': 300,
    'window': 7,
    'min_count': 4,
    'batch_size': 64,
    'H1': 375,
    'H2': 374,
    'dropout': 0.2704,
    'activation': 'leaky_relu',
    'lr': 1.0702e-4,
    'weight_decay': 3.5708e-6,
    'optimizer': 'adamw'
}

# --- Train Word2Vec ---
w2v_model = Word2Vec(
    sentences=train_tokens,
    vector_size=params['vector_size'],
    window=params['window'],
    min_count=params['min_count'],
    sg=1,
    workers=1,
    epochs=5,
    seed=SEED
)

X_train = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in train_tokens])
X_val = np.vstack([get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in val_tokens])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

y_train = train_df["Label"].values.astype(np.float32)
y_val = val_df["Label"].values.astype(np.float32)

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train).unsqueeze(1))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val).unsqueeze(1))

g = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, generator=g)
val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)

X_test = np.vstack([
    get_tweet_vector(tokens, w2v_model, params['vector_size']) for tokens in test_tokens
])
X_test = scaler.transform(X_test)  # Use the same scaler as train/val

test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

torch.manual_seed(SEED)
model = TwoLayerNet(
    D_in=params['vector_size'],
    H1=params['H1'],
    H2=params['H2'],
    dropout=params['dropout'],
    activation=params['activation']
).to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
criterion = nn.BCEWithLogitsLoss()

EPOCHS = 50
PATIENCE = 5
best_val_loss = float("inf")
patience_counter = 0
train_losses, val_losses = [], []

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    train_losses.append(epoch_loss / len(train_loader))

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds = model(xb)
            loss = criterion(preds, yb)
            val_loss += loss.item()
    val_losses.append(val_loss / len(val_loader))

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")

    if val_losses[-1] < best_val_loss:
        best_val_loss = val_losses[-1]
        patience_counter = 0
        torch.save(model.state_dict(), "best_2layer_model.pt")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping triggered.")
            break

print("✅ Training complete. Best model saved as best_2layer_model.pt")
model.load_state_dict(torch.load("best_2layer_model.pt"))
model.eval()

preds = []
with torch.no_grad():
    for (xb,) in test_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        probs = torch.sigmoid(logits).cpu().numpy().flatten()
        preds.extend((probs >= 0.5).astype(int))

# --- Save submission ---
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "Label": preds
})
submission_df.to_csv("/kaggle/working/submission.csv", index=False)
print("✅ submission.csv saved and ready for upload!")

Epoch 1/50 | Train Loss: 0.4870 | Val Loss: 0.4612
Epoch 2/50 | Train Loss: 0.4626 | Val Loss: 0.4527
Epoch 3/50 | Train Loss: 0.4544 | Val Loss: 0.4494
Epoch 4/50 | Train Loss: 0.4484 | Val Loss: 0.4470
Epoch 5/50 | Train Loss: 0.4437 | Val Loss: 0.4468
Epoch 6/50 | Train Loss: 0.4400 | Val Loss: 0.4445
Epoch 7/50 | Train Loss: 0.4357 | Val Loss: 0.4476
Epoch 8/50 | Train Loss: 0.4319 | Val Loss: 0.4424
Epoch 9/50 | Train Loss: 0.4295 | Val Loss: 0.4411
Epoch 10/50 | Train Loss: 0.4263 | Val Loss: 0.4419
Epoch 11/50 | Train Loss: 0.4229 | Val Loss: 0.4407
Epoch 12/50 | Train Loss: 0.4201 | Val Loss: 0.4419
Epoch 13/50 | Train Loss: 0.4174 | Val Loss: 0.4413
Epoch 14/50 | Train Loss: 0.4148 | Val Loss: 0.4422
Epoch 15/50 | Train Loss: 0.4125 | Val Loss: 0.4423
Epoch 16/50 | Train Loss: 0.4096 | Val Loss: 0.4419
Early stopping triggered.
✅ Training complete. Best model saved as best_2layer_model.pt


  model.load_state_dict(torch.load("best_2layer_model.pt"))


✅ submission.csv saved and ready for upload!
