In [1]:
import os, time, duckdb, torch, timm, gc, copy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from PIL import Image
from sklearn.model_selection import ParameterGrid

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

import torchvision.transforms as T
from pathlib import Path

from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import duckdb, torch
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer

from scipy.sparse import load_npz, hstack, save_npz
import torch.nn as nn

In [3]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


# CROSS-ATTENTION TRAIN

In [4]:
# Aggregate the image tokens related to the same post and compute the mean
def aggregate_image_tokens_per_post(ids_txt, ids_img, img_tokens):
    img_tokens_post = []

    for pid in ids_txt:
        mask = (ids_img == pid)
        tokens_p = img_tokens[mask]
        
        if tokens_p.shape[0] == 0:
            agg = np.zeros((50, img_tokens.shape[-1]), dtype=img_tokens.dtype)
        else:
            agg = tokens_p.mean(axis=0)
        
        img_tokens_post.append(agg)

    return np.stack(img_tokens_post, axis=0)


In [5]:
def simple_batch_loader(txt, img, y, batch_size=32, shuffle=True):
    N = len(y)
    idxs = np.arange(N)

    if shuffle:
        np.random.shuffle(idxs)

    for start in range(0, N, batch_size):
        end = start + batch_size
        batch_idx = idxs[start:end]

        yield txt[batch_idx], img[batch_idx], y[batch_idx]

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

dim_txt = 512
dim_img_in = 768
dim = 512
num_heads = 8

# Image projection to get 512 dimension
W_img = nn.Linear(dim_img_in, dim).to(device)

# Cross-attention layers
attn_t2i = nn.MultiheadAttention(dim, num_heads, batch_first=True).to(device)
attn_i2t = nn.MultiheadAttention(dim, num_heads, batch_first=True).to(device)

# Feed-forward 
ff_t = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
).to(device)

ff_i = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
).to(device)

# Layer Norm 
norm_t1 = nn.LayerNorm(dim).to(device)
norm_t2 = nn.LayerNorm(dim).to(device)
norm_i1 = nn.LayerNorm(dim).to(device)
norm_i2 = nn.LayerNorm(dim).to(device)

# CLASSIFIER
classifier = nn.Linear(2*dim, 5).to(device)

# OPTIMIZER: it defines the parameters to train during the training of the last layer (the ones defined before)
params = (
    list(W_img.parameters()) +
    list(attn_t2i.parameters()) +
    list(attn_i2t.parameters()) +
    list(ff_t.parameters()) +
    list(ff_i.parameters()) +
    list(norm_t1.parameters()) +
    list(norm_t2.parameters()) +
    list(norm_i1.parameters()) +
    list(norm_i2.parameters()) +
    list(classifier.parameters())
)

optimizer = torch.optim.Adam(params, lr=1e-4)
criterion = nn.CrossEntropyLoss() # Loss function to minimize

In [7]:
def multimodal_forward(txt, img):

    # Image projection
    img = W_img(img)

    # TEXT to IMAGE
    t2i, _ = attn_t2i(txt, img, img)
    txt2 = norm_t1(txt + t2i)
    txt3 = norm_t2(txt2 + ff_t(txt2))

    # IMAGE to TEXT
    i2t, _ = attn_i2t(img, txt3, txt3)
    img2 = norm_i1(img + i2t)
    img3 = norm_i2(img2 + ff_i(img2))

    # Pooling
    txt_repr = txt3.mean(dim=1)
    img_repr = img3.mean(dim=1)

    fused = torch.cat([txt_repr, img_repr], dim=-1)  

    logits = classifier(fused)

    return logits, fused

In [8]:
def train_one_epoch(txt, img, y, batch_size=32):
    model_loss = 0
    N = len(y)
    idx = torch.randperm(N)

    for start in range(0, N, batch_size):
        end = start + batch_size
        b = idx[start:end]

        txt_b = txt[b].to(device)
        img_b = img[b].to(device)
        y_b   = y[b].to(device)

        optimizer.zero_grad()

        logits, _ = multimodal_forward(txt_b, img_b)
        loss = criterion(logits, y_b)

        loss.backward()
        optimizer.step()

        model_loss += loss.item()

    return model_loss / (N // batch_size)

In [9]:
def evaluate(txt, img, y, batch_size=32):
    total_loss = 0
    correct = 0
    N = len(y)
    with torch.no_grad():
        for start in range(0, N, batch_size):
            end = start + batch_size
            txt_b = txt[start:end].to(device)
            img_b = img[start:end].to(device)
            y_b   = y[start:end].to(device)

            logits, _ = multimodal_forward(txt_b, img_b)
            loss = criterion(logits, y_b)
            total_loss += loss.item()

            preds = logits.argmax(dim=-1)
            correct += (preds == y_b).sum().item()

    return total_loss / (N // batch_size), correct / N

In [10]:
import copy
import torch

def get_best_state():
    return {
        "W_img": copy.deepcopy(W_img.state_dict()),
        "attn_t2i": copy.deepcopy(attn_t2i.state_dict()),
        "attn_i2t": copy.deepcopy(attn_i2t.state_dict()),
        "ff_t": copy.deepcopy(ff_t.state_dict()),
        "ff_i": copy.deepcopy(ff_i.state_dict()),
        "norm_t1": copy.deepcopy(norm_t1.state_dict()),
        "norm_t2": copy.deepcopy(norm_t2.state_dict()),
        "norm_i1": copy.deepcopy(norm_i1.state_dict()),
        "norm_i2": copy.deepcopy(norm_i2.state_dict()),
        "classifier": copy.deepcopy(classifier.state_dict()),
    }

def load_best_state(state):
    W_img.load_state_dict(state["W_img"])
    attn_t2i.load_state_dict(state["attn_t2i"])
    attn_i2t.load_state_dict(state["attn_i2t"])
    ff_t.load_state_dict(state["ff_t"])
    ff_i.load_state_dict(state["ff_i"])
    norm_t1.load_state_dict(state["norm_t1"])
    norm_t2.load_state_dict(state["norm_t2"])
    norm_i1.load_state_dict(state["norm_i1"])
    norm_i2.load_state_dict(state["norm_i2"])
    classifier.load_state_dict(state["classifier"])


In [11]:
text_npz_tr  = "D:/dataset/clip_cross_attention_emb25/clip-vit-base-patch32_TOKENS_train_ids_y.npz"
image_npz_tr = "D:/dataset/clip_cross_attention_emb25/clip_vit_b32_IMG_TOKENS_train_ids_y.npz"

t_tr = np.load(text_npz_tr, allow_pickle = True)
i_tr = np.load(image_npz_tr, allow_pickle = True)

ids_txt_tr = t_tr["ids"]
txt_tokens_tr = t_tr["embeddings"]

ids_img_tr = i_tr["ids"]
img_tokens_tr = i_tr["embeddings"]

img_tokens_post_tr = aggregate_image_tokens_per_post(ids_txt_tr, ids_img_tr, img_tokens_tr)
print(txt_tokens_tr.shape, img_tokens_tr.shape, img_tokens_post_tr.shape)
set_txt = set(t_tr["ids"])
set_img = set(i_tr["ids"])

print("Post solo nel testo:", set_txt - set_img)
print("Post solo nelle immagini:", set_img - set_txt)

(24993, 77, 512) (30893, 50, 768) (24993, 50, 768)
Post solo nel testo: set()
Post solo nelle immagini: set()


In [12]:
y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_5.npy", allow_pickle = True)
y_tr

array([0, 0, 0, ..., 2, 2, 1])

In [13]:
# del t_tr, i_tr, 
del img_tokens_tr, set_txt, set_img
gc.collect()

684

In [14]:
device = torch.device("cpu")

txt_tr = torch.tensor(txt_tokens_tr, dtype=torch.float32)
img_tr = torch.tensor(img_tokens_post_tr, dtype=torch.float32)
print(txt_tr.shape, img_tr.shape)

torch.Size([24993, 77, 512]) torch.Size([24993, 50, 768])


In [15]:
del txt_tokens_tr, img_tokens_post_tr
gc.collect()

0

In [16]:
text_npz_va  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_validation_ids_y.npz"
image_npz_va = "D:/dataset/clip_cross_attention_emb/clip_vit_b32_IMG_TOKENS_validation_ids_y.npz"

t_va = np.load(text_npz_va, allow_pickle = True)
i_va = np.load(image_npz_va, allow_pickle = True)

ids_txt_va = t_va["ids"]
txt_tokens_va = t_va["embeddings"] 

ids_img_va = i_va["ids"]
img_tokens_va = i_va["embeddings"]

img_tokens_post_va = aggregate_image_tokens_per_post(ids_txt_va, ids_img_va, img_tokens_va)
print(txt_tokens_va.shape, img_tokens_va.shape, img_tokens_post_va.shape)
set_txt = set(t_va["ids"])
set_img = set(i_va["ids"])

print("Post solo nel testo:", set_txt - set_img)
print("Post solo nelle immagini:", set_img - set_txt)

(5000, 77, 512) (6665, 50, 768) (5000, 50, 768)
Post solo nel testo: set()
Post solo nelle immagini: set()


In [17]:
# del t_va, i_va, 
del img_tokens_va, set_txt, set_img
gc.collect()

66

In [18]:
device = torch.device("cpu")

txt_va = torch.tensor(txt_tokens_va, dtype=torch.float32)
img_va = torch.tensor(img_tokens_post_va, dtype=torch.float32)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_5.npy", allow_pickle = True)
print(txt_va.shape, img_va.shape, y_va.shape)

torch.Size([5000, 77, 512]) torch.Size([5000, 50, 768]) (5000,)


In [19]:
del txt_tokens_va, img_tokens_post_va
gc.collect()

11

In [20]:
y_va

array([3, 3, 4, ..., 2, 2, 1])

In [21]:
y_tr = torch.tensor(y_tr, dtype=torch.long)
y_va = torch.tensor(y_va, dtype=torch.long)

In [22]:
epochs = 10
patience = 3
min_delta = 1e-4

best_val = float("inf")
best_state = None
bad_epochs = 0

for ep in range(1, epochs + 1):
    train_loss = train_one_epoch(txt_tr, img_tr, y_tr, batch_size=32)
    val_loss, val_acc = evaluate(txt_va, img_va, y_va, batch_size=32)

    print(f"Epoch {ep}/{epochs} - train_loss: {train_loss:.4f} | val_loss: {val_loss:.4f} | val_acc: {val_acc:.3f}")

    if val_loss < best_val - min_delta:
        best_val = val_loss
        best_state = get_best_state()
        bad_epochs = 0
    else:
        bad_epochs += 1

    if bad_epochs >= patience:
        print(f"Early stopping: val_loss non migliora da {patience} epoche. Best val_loss = {best_val:.4f}")
        break

if best_state is not None:
    load_best_state(best_state)

Epoch 1/10 - train_loss: 1.6389 | val_loss: 1.6689 | val_acc: 0.207
Epoch 2/10 - train_loss: 1.6002 | val_loss: 1.6823 | val_acc: 0.187
Epoch 3/10 - train_loss: 1.5642 | val_loss: 1.6725 | val_acc: 0.185
Epoch 4/10 - train_loss: 1.4788 | val_loss: 1.7656 | val_acc: 0.205
Early stopping: val_loss non migliora da 3 epoche. Best val_loss = 1.6689


In [23]:
def generate_fused_features(txt, img, batch_size=32):
    fused_all = []
    with torch.no_grad():
        for start in range(0, len(txt), batch_size):
            end = start + batch_size
            txt_b = txt[start:end].to(device)
            img_b = img[start:end].to(device)

            _, fused = multimodal_forward(txt_b, img_b)
            fused_all.append(fused.cpu().numpy())

    return np.vstack(fused_all)

In [24]:
# During the optimization prosses, the pytorch modules have been updated to the correct version, and uses it 
fused_train = generate_fused_features(txt_tr, img_tr)
np.save("D:/dataset/clip_cross_attention_emb/fused_train_finetuned.npy", fused_train)

In [25]:
fused_train.shape

(24993, 1024)

In [26]:
fused_val = generate_fused_features(txt_va, img_va)
np.save("D:/dataset/clip_cross_attention_emb/fused_val_finetuned.npy", fused_val)

In [27]:
fused_val.shape

(5000, 1024)

# CROSS-ATTENTION TEST

In [28]:
del txt_tr, img_tr, txt_va, img_va, y_tr, y_va
gc.collect()

0

In [29]:
text_npz_te  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_test_ids_y.npz"
image_npz_te = "D:/dataset/clip_cross_attention_emb/clip_vit_b32_IMG_TOKENS_test_ids_y.npz"

t_te = np.load(text_npz_te, allow_pickle = True)
i_te = np.load(image_npz_te, allow_pickle = True)

ids_txt_te = t_te["ids"]
txt_tokens_te = t_te["embeddings"]     

ids_img_te = i_te["ids"]
img_tokens_te = i_te["embeddings"] 

del t_te, i_te
gc.collect()

44

In [30]:
img_tokens_post_te = aggregate_image_tokens_per_post(ids_txt_te, ids_img_te, img_tokens_te)

In [31]:
print(txt_tokens_te.shape, img_tokens_post_te.shape)

(5000, 77, 512) (5000, 50, 768)


In [32]:
y_te = np.load("D:/dataset/clip_cross_attention_emb/y_te_5.npy", allow_pickle = True)

In [33]:
device = torch.device("cpu")

txt_te = torch.tensor(txt_tokens_te, dtype=torch.float32)
img_te = torch.tensor(img_tokens_post_te, dtype=torch.float32)
y_te = torch.tensor(y_te, dtype=torch.long)

In [34]:
print("Unique classes:", np.unique(y_te))
print("shape:", y_te.shape)
print(txt_te.shape, img_te.shape, y_te.shape)

del txt_tokens_te, img_tokens_post_te
gc.collect()

Unique classes: [0 1 2 3 4]
shape: torch.Size([5000])
torch.Size([5000, 77, 512]) torch.Size([5000, 50, 768]) torch.Size([5000])


11

In [35]:
fused_test  = generate_fused_features(txt_te, img_te)
np.save("D:/dataset/clip_cross_attention_emb/fused_test_finetuned.npy", fused_test)

In [36]:
fused_test.shape

(5000, 1024)

# FUSION

In [37]:
len(ids_txt_tr)

24993

In [38]:
# METADATA TRAIN
meta_train_final = pd.read_csv("D:/dataset/meta_classification/meta_train_final.csv")
# Filtra dati che sono in ids_txt_tr
meta_train_final = meta_train_final[meta_train_final["post_id"].isin(ids_txt_tr)]
# Riordina seguento l'ordine degli embeddings
meta_train_final = meta_train_final.set_index("post_id").loc[ids_txt_tr].reset_index()
# Check allineamento
assert (meta_train_final["post_id"].to_numpy() == ids_txt_tr).all()
print("Aligned")
# Remove post_id
X_meta_train = meta_train_final.drop(["post_id"], axis=1)
X_meta_train = X_meta_train.to_numpy(dtype=np.float32)

fused_train = np.load("D:/dataset/clip_cross_attention_emb/fused_train_finetuned.npy", allow_pickle = True)
X_tr = np.hstack([fused_train, X_meta_train])

Aligned


In [39]:
X_tr.shape

(24993, 1052)

In [40]:
np.save("D:/dataset/clip_cross_attention_emb/X_train_ft5.npy", X_tr)

In [41]:
# METADATA VALIDATION
meta_val_final = pd.read_csv("D:/dataset/meta_classification/meta_val_final.csv")
# Filtra dati che sono in ids_txt_tr
meta_val_final = meta_val_final[meta_val_final["post_id"].isin(ids_txt_va)]
# Riordina seguento l'ordine degli embeddings
meta_val_final = meta_val_final.set_index("post_id").loc[ids_txt_va].reset_index()
# Check allineamento
assert (meta_val_final["post_id"].to_numpy() == ids_txt_va).all()
print("Aligned")
# Remove post_id
X_meta_val = meta_val_final.drop(["post_id"], axis=1)
X_meta_val = X_meta_val.to_numpy(dtype=np.float32)

fused_val = np.load("D:/dataset/clip_cross_attention_emb/fused_val_finetuned.npy", allow_pickle = True)
X_va = np.hstack([fused_val, X_meta_val])

Aligned


In [42]:
X_va.shape

(5000, 1052)

In [43]:
np.save("D:/dataset/clip_cross_attention_emb/X_val_ft5.npy", X_va)

In [44]:
# METADATA TEST
meta_test_final = pd.read_csv("D:/dataset/meta_classification/meta_test_final.csv")
# Filtra dati che sono in ids_txt_tr
meta_test_final = meta_test_final[meta_test_final["post_id"].isin(ids_txt_te)]
# Riordina seguento l'ordine degli embeddings
meta_test_final = meta_test_final.set_index("post_id").loc[ids_txt_te].reset_index()
# Check allineamento
assert (meta_test_final["post_id"].to_numpy() == ids_txt_te).all()
print("Aligned")
# Remove post_id
X_meta_test = meta_test_final.drop(["post_id"], axis=1)
X_meta_test = X_meta_test.to_numpy(dtype=np.float32)

fused_test = np.load("D:/dataset/clip_cross_attention_emb/fused_test_finetuned.npy", allow_pickle = True)
X_te = np.hstack([fused_test, X_meta_test])

Aligned


In [45]:
X_te.shape

(5000, 1052)

In [46]:
np.save("D:/dataset/clip_cross_attention_emb/X_test_ft5.npy", X_te)

# CLASSIFICATION

In [49]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train_ft5.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val_ft5.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_5.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_5.npy", allow_pickle = True)

In [50]:
print(X_tr.shape, y_tr.shape, X_va.shape, y_va.shape)

(24993, 1052) (24993,) (5000, 1052) (5000,)


In [51]:
y_tr

array([0, 0, 0, ..., 2, 2, 1])

In [52]:
y_va

array([3, 3, 4, ..., 2, 2, 1])

In [53]:
X_va

array([[ 1.3042749 , -0.07988649,  0.18901303, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.11272   , -0.25869903,  0.4618878 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.80691516, -0.40040484, -0.19250205, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.8263543 , -0.32952374,  0.0795309 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.94892555, -0.8995705 , -0.93573654, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.508135  , -0.97229767, -0.6595417 , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [13]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.18562670582595545 | accuracy (val): 0.2058

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.18370460551107254 | accuracy (val): 0.193

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.1786325284442139 | accuracy (val): 0.191

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.18989751165887273 | accuracy (val): 0.194

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.17252566575132527 | accuracy (val): 0.1856

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.18551386517307436 | accuracy (val): 0.1916

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.1757454917038634 | accuracy (val): 0.1908

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.18596168038440036 | accuracy (val): 0.193

Best hyperparameter configuration:
{'alpha': 0.0001, 'class_weight': 

In [54]:
# NAIVE BAYES - GAUSSIAN

param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.1897 | accuracy (val): 0.1948

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.1897 | accuracy (val): 0.1948

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.1897 | accuracy (val): 0.1948

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.1895 | accuracy (val): 0.1946

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.18969039937255408

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.189690        0.1948
1   1.000000e-08      0.189690        0.1948
2   1.000000e-07      0.189690        0.1948
3   1.000000e-06      0.189529        0.1946


In [56]:
# RANDOM FOREST
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1599 | accuracy (val): 0.1902

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.1624 | accuracy (val): 0.1982

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.1498 | accuracy (val): 0.1878

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.1645 | accuracy (val): 0.1994

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.1542 | accuracy (val): 0.1916

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.1545 | accuracy (val): 0.1964

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1580 | accuracy (val

In [57]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150], 
    "max_depth": [4, 6], 
    "learning_rate": [0.1], 
    "subsample": [0.8], 
    "colsample_bytree": [0.5], 
    "gamma": [0, 1], 
    "reg_lambda": [1], 
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    clf.fit(X_tr, y_tr_enc)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1822 | accuracy (val): 0.1964

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1818 | accuracy (val): 0.1936

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1896 | accuracy (val): 0.1992

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1889 | accuracy (val): 0.1978

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1871 | accuracy (val): 0.2006

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON TEST 5 CLASSES

In [2]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train_ft5.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val_ft5.npy", allow_pickle = True).astype(np.float32)
X_te = np.load("D:/dataset/clip_cross_attention_emb/X_test_ft5.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_5.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_5.npy", allow_pickle = True)
y_te = np.load("D:/dataset/clip_cross_attention_emb/y_te_5.npy", allow_pickle = True)

X_trva = np.concatenate((X_tr, X_va), axis = 0).astype(np.float32)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

del X_tr, X_va, y_tr, y_va
gc.collect()

675

In [3]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 0.0001,
        average = True,
        class_weight = 'balanced',
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_trva, y_trva)
y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2497 | accuracy (test): 0.2594


In [3]:
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=10, max_features=0.05, min_samples_leaf=5, n_estimators=30, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 1, learning_rate = 0.1, max_depth= 6, n_estimators= 100, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_trva, y_trva_enc)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_trva, y_trva)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (train): {macro_f1:.4f} | accuracy (train): {acc:.4f}")

del X_trva, X_te, y_trva, y_te
gc.collect()


Configuration: GaussianNB()
macro-F1 (train): 0.2221 | accuracy (train): 0.2250

Configuration: RandomForestClassifier(max_depth=10, max_features=0.05, min_samples_leaf=5,
                       n_estimators=30, n_jobs=-1, random_state=42)
macro-F1 (train): 0.2162 | accuracy (train): 0.2242

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy

581