In [1]:
import os, time, duckdb, torch, timm, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from PIL import Image
from sklearn.model_selection import ParameterGrid

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

import torchvision.transforms as T
from pathlib import Path
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import duckdb, torch
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer

from scipy.sparse import load_npz, hstack, save_npz

In [2]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


# TEXT TOKENS

In [4]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [5]:
def load_split(split_name):
    print(f"Loading {split_name}...")
    df = con.sql(f"""
        SELECT post_id, caption_bert_clip, er_bins
        FROM clip_full_sample
        WHERE split = '{split_name}'
    """).df()
    ids = df["post_id"].to_numpy()
    texts = df["caption_bert_clip"].tolist()
    y = df["er_bins"]
    del df; gc.collect()
    print(f"{split_name} done.")
    return ids, texts, y

In [6]:
train_ids, Xtr_text, y_tr = load_split("train")
val_ids, Xva_text, y_val  = load_split("validation")
test_ids, Xte_text, y_te  = load_split("test")

con.close()

Loading train...
train done.
Loading validation...
validation done.
Loading test...
test done.


In [7]:
print(len(Xtr_text), len(Xva_text), len(Xte_text))

24993 5000 5000


In [8]:
clip_name = "openai/clip-vit-base-patch32"

tokenizer = CLIPTokenizer.from_pretrained(clip_name)
model = CLIPModel.from_pretrained(clip_name)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [9]:
def embed_clip_text_tokens(texts, bs=64):
    all_tokens = []

    with torch.no_grad():
        for i in range(0, len(texts), bs):
            # print(f" processing batch {i}/{len(texts)}")
            batch = texts[i:i+bs]

            inputs = tokenizer(
                batch,
                padding="max_length",
                max_length=77,
                truncation=True,
                return_tensors="pt"
            ).to(device)

            # text_model is used for text embeddings
            out = model.text_model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"]
            )

            # last_hidden_state allows to extract the last layer tokens, before they are pooled together (for the cross-attention fusion)
            tokens = out.last_hidden_state

            all_tokens.append(tokens.cpu())

    return torch.cat(all_tokens, dim=0)

In [10]:
def embed_sharded_cached_tokens(texts, ids, y, split_name, emb_dir, shard=5000):
    
    prefix = f"{clip_name.split('/')[-1]}_TOKENS_{split_name}"
    out_files = []

    n = len(texts)
    assert len(ids) == n
    assert len(y) == n

    for i in range(0, n, shard):
        f = emb_dir / f"{prefix}_{i:07d}.npy"
        
        if f.exists():
            print(f"[skip] {f.name}")
            out_files.append(f)
            continue

        part = texts[i:i+shard]
        t0 = time.time()

        T = embed_clip_text_tokens(part, bs=1)
        T = T.numpy().astype("float32")

        np.save(f, T)
        out_files.append(f)

        dt = time.time() - t0
        print(f"[saved] {f.name}  [{i+len(part)}/{n}]  ({len(part)} samples in {dt:.1f}s)")

    arrays = [np.load(f) for f in out_files]
    E_all = np.concatenate(arrays, axis=0).astype("float32")

    assert E_all.shape[0] == n, f"Rows {E_all.shape[0]} != {n}"

    np.save(emb_dir / f"{prefix}_ALL.npy", E_all)

    npz_path = emb_dir / f"{prefix}_ids_y.npz"
    np.savez(
        npz_path,
        ids=np.asarray(ids),
        embeddings=E_all,
        y=np.asarray(y)
    )

    print(f"[done] Saved ids + embeddings + y → {npz_path.name}")

    return E_all


In [11]:
emb_dir = Path('D:/dataset/clip_cross_attention_emb')

Etr = embed_sharded_cached_tokens(
    texts=Xtr_text,
    ids=train_ids,
    y=y_tr,
    split_name="train",
    emb_dir=emb_dir,
    shard = 1000
)


[saved] clip-vit-base-patch32_TOKENS_train_0000000.npy  [1000/24993]  (1000 samples in 83.7s)
[saved] clip-vit-base-patch32_TOKENS_train_0001000.npy  [2000/24993]  (1000 samples in 80.6s)
[saved] clip-vit-base-patch32_TOKENS_train_0002000.npy  [3000/24993]  (1000 samples in 83.0s)
[saved] clip-vit-base-patch32_TOKENS_train_0003000.npy  [4000/24993]  (1000 samples in 84.5s)
[saved] clip-vit-base-patch32_TOKENS_train_0004000.npy  [5000/24993]  (1000 samples in 86.8s)
[saved] clip-vit-base-patch32_TOKENS_train_0005000.npy  [6000/24993]  (1000 samples in 85.7s)
[saved] clip-vit-base-patch32_TOKENS_train_0006000.npy  [7000/24993]  (1000 samples in 83.5s)
[saved] clip-vit-base-patch32_TOKENS_train_0007000.npy  [8000/24993]  (1000 samples in 85.8s)
[saved] clip-vit-base-patch32_TOKENS_train_0008000.npy  [9000/24993]  (1000 samples in 88.5s)
[saved] clip-vit-base-patch32_TOKENS_train_0009000.npy  [10000/24993]  (1000 samples in 85.9s)
[saved] clip-vit-base-patch32_TOKENS_train_0010000.npy  [11

In [12]:
Eva = embed_sharded_cached_tokens(
    texts=Xva_text,
    ids=val_ids,
    y=y_val,
    split_name="validation",
    emb_dir=emb_dir,
    shard = 500)

[saved] clip-vit-base-patch32_TOKENS_validation_0000000.npy  [500/5000]  (500 samples in 45.8s)
[saved] clip-vit-base-patch32_TOKENS_validation_0000500.npy  [1000/5000]  (500 samples in 42.6s)
[saved] clip-vit-base-patch32_TOKENS_validation_0001000.npy  [1500/5000]  (500 samples in 42.0s)
[saved] clip-vit-base-patch32_TOKENS_validation_0001500.npy  [2000/5000]  (500 samples in 42.4s)
[saved] clip-vit-base-patch32_TOKENS_validation_0002000.npy  [2500/5000]  (500 samples in 39.4s)
[saved] clip-vit-base-patch32_TOKENS_validation_0002500.npy  [3000/5000]  (500 samples in 41.3s)
[saved] clip-vit-base-patch32_TOKENS_validation_0003000.npy  [3500/5000]  (500 samples in 45.1s)
[saved] clip-vit-base-patch32_TOKENS_validation_0003500.npy  [4000/5000]  (500 samples in 42.5s)
[saved] clip-vit-base-patch32_TOKENS_validation_0004000.npy  [4500/5000]  (500 samples in 42.9s)
[saved] clip-vit-base-patch32_TOKENS_validation_0004500.npy  [5000/5000]  (500 samples in 45.5s)
[done] Saved ids + embeddings +

In [13]:
Ete = embed_sharded_cached_tokens(
    texts=Xte_text,
    ids=test_ids,
    y=y_te,
    split_name="test",
    emb_dir=emb_dir,
    shard = 500)

[saved] clip-vit-base-patch32_TOKENS_test_0000000.npy  [500/5000]  (500 samples in 39.4s)
[saved] clip-vit-base-patch32_TOKENS_test_0000500.npy  [1000/5000]  (500 samples in 41.7s)
[saved] clip-vit-base-patch32_TOKENS_test_0001000.npy  [1500/5000]  (500 samples in 45.2s)
[saved] clip-vit-base-patch32_TOKENS_test_0001500.npy  [2000/5000]  (500 samples in 47.1s)
[saved] clip-vit-base-patch32_TOKENS_test_0002000.npy  [2500/5000]  (500 samples in 52.3s)
[saved] clip-vit-base-patch32_TOKENS_test_0002500.npy  [3000/5000]  (500 samples in 56.0s)
[saved] clip-vit-base-patch32_TOKENS_test_0003000.npy  [3500/5000]  (500 samples in 61.0s)
[saved] clip-vit-base-patch32_TOKENS_test_0003500.npy  [4000/5000]  (500 samples in 58.1s)
[saved] clip-vit-base-patch32_TOKENS_test_0004000.npy  [4500/5000]  (500 samples in 51.0s)
[saved] clip-vit-base-patch32_TOKENS_test_0004500.npy  [5000/5000]  (500 samples in 47.1s)
[done] Saved ids + embeddings + y → clip-vit-base-patch32_TOKENS_test_ids_y.npz


# IMAGES PATCHES

In [3]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [4]:
con.execute("""CREATE OR REPLACE TABLE img_splits_sample AS
SELECT m.post_id, m.split, i.full_image_file, m.er_bins
FROM clip_full_sample m
JOIN images_manifest1718_clean i ON m.post_id = i.post_id
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x1d9e2d211b0>

In [5]:
MODEL_NAME = "openai/clip-vit-base-patch32"
MODEL_TAG  = "clip_vit_b32"
IMG_DIR = r"D:/dataset/images_224_rgb"
BATCH_SIZE = 1
SHARD_SIZE = 50
OUT_DIR    = "D:/dataset/clip_cross_attention_emb"
device = torch.device("cpu")

num_threads = max(1, (os.cpu_count() or 4) // 2)
torch.set_num_threads(num_threads)
print(f"Uso dispositivo: {device} | PyTorch threads: {num_threads}")

print(f"Carico CLIP: {MODEL_NAME} ...")
clip_model = CLIPModel.from_pretrained(MODEL_NAME)
clip_processor = CLIPProcessor.from_pretrained(MODEL_NAME)

for p in clip_model.parameters():
    p.requires_grad = False

clip_model.eval()
clip_model.to(device)

# To undesrtand the image embedding shape
with torch.no_grad():
    dummy = torch.randn(1, 3, 224, 224)  
    # vision_model for image embeddings
    out = clip_model.vision_model(pixel_values=dummy.to(device))
    num_tokens = out.last_hidden_state.shape[1] # 50 to get the last layer of image patches
    feat_dim  = out.last_hidden_state.shape[-1] # 768
print("CLIP patch tokens:", num_tokens, "token dim:", feat_dim)

Uso dispositivo: cpu | PyTorch threads: 4
Carico CLIP: openai/clip-vit-base-patch32 ...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIP patch tokens: 50 token dim: 768


In [6]:
df = con.sql("""
    SELECT *
    FROM img_splits_sample
""").df()
print("Tabella img_splits caricata. Righe:", len(df))

if "full_image_file" not in df.columns:
    if "image_file" not in df.columns:
        raise ValueError("Non trovo né 'full_image_file' né 'image_file' in img_splits.")
    df["full_image_file"] = df["image_file"].apply(lambda x: os.path.join(IMG_DIR, x))
else:
    df["full_image_file"] = df["full_image_file"].apply(lambda x: os.path.join(IMG_DIR, x))

print(df[["post_id", "split", "full_image_file"]].head())

# Split
train_df = df[df["split"] == "train"].reset_index(drop=True)
val_df   = df[df["split"] == "validation"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

print(f"train = {len(train_df)} | val = {len(val_df)} | test = {len(test_df)}")

Tabella img_splits caricata. Righe: 44688
                                post_id       split  \
0  coffeethentravel-1821049604449171679       train   
1      coffeevschai-1910450713893530118  validation   
2        cohlab_nyc-1761057847632557771       train   
3        cohlab_nyc-1805534068554420444       train   
4        cohlab_nyc-1827671707013133135       train   

                                     full_image_file  
0  D:/dataset/images_224_rgb\coffeethentravel-182...  
1  D:/dataset/images_224_rgb\coffeevschai-1910450...  
2  D:/dataset/images_224_rgb\cohlab_nyc-176105784...  
3  D:/dataset/images_224_rgb\cohlab_nyc-180553406...  
4  D:/dataset/images_224_rgb\cohlab_nyc-182767170...  
train = 31090 | val = 6665 | test = 6933


In [7]:
def embed_clip_image_tokens(image_paths, bs=1):
    all_tokens = []

    with torch.no_grad():
        for i in range(0, len(image_paths), bs):
            # print(f"processing batch {i}/{len(image_paths)}")

            batch_paths = image_paths[i:i+bs]

            images = [Image.open(p).convert("RGB") for p in batch_paths]
            inputs = clip_processor(images=images, return_tensors="pt").pixel_values.to(device)

            # vision_model for images
            out = clip_model.vision_model(pixel_values=inputs)

            # Patch image tokens with last_hidden_state
            tokens = out.last_hidden_state
            all_tokens.append(tokens.cpu())

    return torch.cat(all_tokens, dim=0)

In [8]:
def embed_sharded_cached_image_tokens(image_paths, ids, y, split_name, emb_dir, shard=500):
    
    prefix = f"{MODEL_TAG}_IMG_TOKENS_{split_name}"
    out_files = []
    
    n = len(image_paths)
    assert len(ids) == n
    assert len(y) == n

    for i in range(0, n, shard):
        f = emb_dir / f"{prefix}_{i:07d}.npy"

        if f.exists():
            print(f"[skip] {f.name}")
            out_files.append(f)
            continue

        part_paths = image_paths[i:i+shard]
        t0 = time.time()

        T = embed_clip_image_tokens(part_paths, bs=1)
        T = T.numpy().astype("float32")

        np.save(f, T)
        out_files.append(f)

        dt = time.time() - t0
        print(f"[saved] {f.name}  [{i+len(part_paths)}/{n}]  ({len(part_paths)} images in {dt:.1f}s)")

    arrays = [np.load(f) for f in out_files]
    E_all = np.concatenate(arrays, axis=0).astype("float32")
    assert E_all.shape[0] == n, f"Embeddings rows {E_all.shape[0]} != n={n}"

    np.save(emb_dir / f"{prefix}_ALL.npy", E_all)

    npz_path = emb_dir / f"{prefix}_ids_y.npz"
    np.savez(
        npz_path,
        ids=np.asarray(ids),
        embeddings=E_all,
        y=np.asarray(y)
    )

    print(f"[done] Saved final {split_name} → {npz_path.name}")
    
    return E_all

In [9]:
train_df.head()

Unnamed: 0,post_id,split,full_image_file,er_bins
0,coffeethentravel-1821049604449171679,train,D:/dataset/images_224_rgb\coffeethentravel-182...,high
1,cohlab_nyc-1761057847632557771,train,D:/dataset/images_224_rgb\cohlab_nyc-176105784...,medium
2,cohlab_nyc-1805534068554420444,train,D:/dataset/images_224_rgb\cohlab_nyc-180553406...,medium
3,cohlab_nyc-1827671707013133135,train,D:/dataset/images_224_rgb\cohlab_nyc-182767170...,medium
4,cohlab_nyc-1870432573244539951,train,D:/dataset/images_224_rgb\cohlab_nyc-187043257...,high


In [10]:
emb_dir = Path("D:/dataset/clip_cross_attention_emb")
image_paths = train_df["full_image_file"].tolist()
train_ids = train_df["post_id"].tolist()
train_y = train_df["er_bins"].tolist()

Etr_img = embed_sharded_cached_image_tokens(
    image_paths=image_paths,
    ids=train_ids,
    y=train_y,
    split_name="train",
    emb_dir=emb_dir,
    shard=1000
)


[saved] clip_vit_b32_IMG_TOKENS_train_0000000.npy  [1000/31090]  (1000 images in 159.4s)
[saved] clip_vit_b32_IMG_TOKENS_train_0001000.npy  [2000/31090]  (1000 images in 158.9s)
[saved] clip_vit_b32_IMG_TOKENS_train_0002000.npy  [3000/31090]  (1000 images in 154.9s)
[saved] clip_vit_b32_IMG_TOKENS_train_0003000.npy  [4000/31090]  (1000 images in 155.4s)
[saved] clip_vit_b32_IMG_TOKENS_train_0004000.npy  [5000/31090]  (1000 images in 162.0s)
[saved] clip_vit_b32_IMG_TOKENS_train_0005000.npy  [6000/31090]  (1000 images in 163.4s)
[saved] clip_vit_b32_IMG_TOKENS_train_0006000.npy  [7000/31090]  (1000 images in 168.1s)
[saved] clip_vit_b32_IMG_TOKENS_train_0007000.npy  [8000/31090]  (1000 images in 166.6s)
[saved] clip_vit_b32_IMG_TOKENS_train_0008000.npy  [9000/31090]  (1000 images in 157.5s)
[saved] clip_vit_b32_IMG_TOKENS_train_0009000.npy  [10000/31090]  (1000 images in 155.1s)
[saved] clip_vit_b32_IMG_TOKENS_train_0010000.npy  [11000/31090]  (1000 images in 153.0s)
[saved] clip_vit_b3

In [11]:
emb_dir = Path("D:/dataset/clip_cross_attention_emb")
image_paths = val_df["full_image_file"].tolist()
val_ids = val_df["post_id"].tolist()
val_y = val_df["er_bins"].tolist()

Eva_img = embed_sharded_cached_image_tokens(
    image_paths=image_paths,
    ids=val_ids,
    y=val_y,
    split_name="validation",
    emb_dir=emb_dir,
    shard=500
)


[saved] clip_vit_b32_IMG_TOKENS_validation_0000000.npy  [500/6665]  (500 images in 85.1s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0000500.npy  [1000/6665]  (500 images in 83.4s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0001000.npy  [1500/6665]  (500 images in 82.6s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0001500.npy  [2000/6665]  (500 images in 81.6s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0002000.npy  [2500/6665]  (500 images in 81.9s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0002500.npy  [3000/6665]  (500 images in 82.3s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0003000.npy  [3500/6665]  (500 images in 81.1s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0003500.npy  [4000/6665]  (500 images in 79.3s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0004000.npy  [4500/6665]  (500 images in 81.0s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0004500.npy  [5000/6665]  (500 images in 81.4s)
[saved] clip_vit_b32_IMG_TOKENS_validation_0005000.npy  [5500/6665]  (500 images in 80.2s)


In [12]:
emb_dir = Path("D:/dataset/clip_cross_attention_emb")
image_paths = test_df["full_image_file"].tolist()
test_ids = test_df["post_id"].tolist()
test_y = test_df["er_bins"].tolist()

Ete_img = embed_sharded_cached_image_tokens(
    image_paths=image_paths,
    ids=test_ids,
    y=test_y,
    split_name="test",
    emb_dir=emb_dir,
    shard=500
)


[saved] clip_vit_b32_IMG_TOKENS_test_0000000.npy  [500/6933]  (500 images in 81.5s)
[saved] clip_vit_b32_IMG_TOKENS_test_0000500.npy  [1000/6933]  (500 images in 78.3s)
[saved] clip_vit_b32_IMG_TOKENS_test_0001000.npy  [1500/6933]  (500 images in 81.0s)
[saved] clip_vit_b32_IMG_TOKENS_test_0001500.npy  [2000/6933]  (500 images in 78.4s)
[saved] clip_vit_b32_IMG_TOKENS_test_0002000.npy  [2500/6933]  (500 images in 74.5s)
[saved] clip_vit_b32_IMG_TOKENS_test_0002500.npy  [3000/6933]  (500 images in 77.1s)
[saved] clip_vit_b32_IMG_TOKENS_test_0003000.npy  [3500/6933]  (500 images in 80.2s)
[saved] clip_vit_b32_IMG_TOKENS_test_0003500.npy  [4000/6933]  (500 images in 78.9s)
[saved] clip_vit_b32_IMG_TOKENS_test_0004000.npy  [4500/6933]  (500 images in 77.3s)
[saved] clip_vit_b32_IMG_TOKENS_test_0004500.npy  [5000/6933]  (500 images in 76.4s)
[saved] clip_vit_b32_IMG_TOKENS_test_0005000.npy  [5500/6933]  (500 images in 77.1s)
[saved] clip_vit_b32_IMG_TOKENS_test_0005500.npy  [6000/6933]  (50

# CROSS-ATTENTION TRAIN

In [5]:
# Define required functions

# Aggregate the image tokens related to the same post and compute the mean
def aggregate_image_tokens_per_post(ids_txt, ids_img, img_tokens):
    img_tokens_post = []

    for pid in ids_txt:
        mask = (ids_img == pid)
        tokens_p = img_tokens[mask]
        
        if tokens_p.shape[0] == 0:
            agg = np.zeros((50, img_tokens.shape[-1]), dtype=img_tokens.dtype)
        else:
            agg = tokens_p.mean(axis=0)
        
        img_tokens_post.append(agg)

    return np.stack(img_tokens_post, axis=0)


# Define a loader to load the required data in the correct format
def simple_batch_loader(txt, img, y, batch_size=32, shuffle=True):
    N = len(y)
    idxs = np.arange(N)

    if shuffle:
        np.random.shuffle(idxs)

    for start in range(0, N, batch_size):
        end = start + batch_size
        batch_idx = idxs[start:end]

        yield txt[batch_idx], img[batch_idx], y[batch_idx]

# Defines the computations required to compute cross attention between text and images
def cross_attention_encode(txt, img):

    # Image projection to aligne the image dimension to the text one (for the attention mechanism)
    # It is needed as CLIP image produced embeddings with dimension 768 instead of 512
    img = img_proj(img)

    # TEXT looks at the IMAGE
    t2i, _ = attn_t2i(query=txt, key=img, value=img) # cross-attention: each text token looks at the image
    # t2i is the additional information that text got from looking at the image
    txt2 = norm_t1(txt + t2i) # the new info is added to the original text, then the norm is computed for robustness
    txt2 = norm_t2(txt2 + ff_t(txt2)) #ff_t adds non-linearity with the feed-forward step 

    # IMAGE looks at the TEXT
    i2t, _ = attn_i2t(query=img, key=txt2, value=txt2) # cross-attention: each image patch looks at the text
    img2 = norm_i1(img + i2t)
    img2 = norm_i2(img2 + ff_i(img2))

    # Pooling to get a single vector for caption/image
    txt_repr = txt2.mean(dim=1)
    img_repr = img2.mean(dim=1)

    fused = torch.cat([txt_repr, img_repr], dim=-1)

    return fused

# Fuses together text and image
def generate_fused_features(txt, img, batch_size=32):
    fused_list = []

    with torch.no_grad():
        for txt_b, img_b, _ in simple_batch_loader(
            txt, img, torch.zeros(len(txt)), batch_size=batch_size, shuffle=False
        ):
            fused = cross_attention_encode(txt_b, img_b)
            fused_list.append(fused.cpu().numpy())

    return np.vstack(fused_list)

In [4]:
text_npz_tr  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_train_ids_y.npz"
image_npz_tr = "D:/dataset/clip_cross_attention_emb/clip_vit_b32_IMG_TOKENS_train_ids_y.npz"

t_tr = np.load(text_npz_tr, allow_pickle = True)
i_tr = np.load(image_npz_tr, allow_pickle = True)

ids_txt_tr = t_tr["ids"]
txt_tokens_tr = t_tr["embeddings"]

ids_img_tr = i_tr["ids"]
img_tokens_tr = i_tr["embeddings"] 

In [5]:
img_tokens_post_tr = aggregate_image_tokens_per_post(ids_txt_tr, ids_img_tr, img_tokens_tr)

In [6]:
print(txt_tokens_tr.shape, img_tokens_tr.shape, img_tokens_post_tr.shape)

(24993, 77, 512) (31090, 50, 768) (24993, 50, 768)


In [7]:
set_txt = set(t_tr["ids"])
set_img = set(i_tr["ids"])

print("Post solo nel testo:", set_txt - set_img)
print("Post solo nelle immagini:", set_img - set_txt)

Post solo nel testo: set()
Post solo nelle immagini: set()


In [8]:
dim_text = 512 # token text
dim_img  = 768 # patch images
dim = dim_text # common space must be 512

# Projection to get the image to a 512 dimension
img_proj = nn.Linear(dim_img, dim)

num_heads = 8

# Attention layers text to image and image to text
attn_t2i = nn.MultiheadAttention(dim, num_heads, batch_first=True)
attn_i2t = nn.MultiheadAttention(dim, num_heads, batch_first=True)

# Feed-forward operations for text and images
ff_t = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
)

ff_i = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
)

# Layer norms
norm_t1 = nn.LayerNorm(dim)
norm_t2 = nn.LayerNorm(dim)
norm_i1 = nn.LayerNorm(dim)
norm_i2 = nn.LayerNorm(dim)

In [9]:
device = torch.device("cpu")

txt_tr = torch.tensor(txt_tokens_tr, dtype=torch.float32)


In [10]:
del txt_tokens_tr, img_tokens_tr
gc.collect()

126

In [11]:
img_tr = torch.tensor(img_tokens_post_tr, dtype=torch.float32)

fused_train = generate_fused_features(txt_tr, img_tr, batch_size=32)
np.save("D:/dataset/clip_cross_attention_emb/fused_train.npy", fused_train)

In [12]:
fused_train.shape

(24993, 1024)

In [13]:
# target

# 5 classes
y_df = con.execute("""
    SELECT post_id, er_bins, er_bins3, er_bins2
    FROM clip_full_sample
    WHERE split = 'train'""").df()


y_tr_aligned5 = (
    y_df.set_index("post_id")
        .loc[ids_txt_tr, "er_bins"]
        .to_numpy()
)


# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "very_low": 0,
    "low": 1,
    "medium": 2,
    "high": 3,
    "very_high": 4
}
y_tr_int = np.array([class_to_int[c] for c in y_tr_aligned5])
y_tr_5 = torch.tensor(y_tr_int, dtype=torch.long)

print("Unique classes:", np.unique(y_tr_int))
print("shape:", y_tr_5.shape)
print(txt_tr.shape, img_tr.shape, y_tr_5.shape)
np.save("D:/dataset/clip_cross_attention_emb/y_tr_5.npy", y_tr_5)

# 3 classes

y_tr_aligned3 = (
    y_df.set_index("post_id")
        .loc[ids_txt_tr, "er_bins3"]
        .to_numpy()
)

# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "low": 0,
    "medium": 1,
    "high": 2
}
y_tr_int = np.array([class_to_int[c] for c in y_tr_aligned3])
y_tr_3 = torch.tensor(y_tr_int, dtype=torch.long)

print("Unique classes:", np.unique(y_tr_int))
print("shape:", y_tr_3.shape)
print(txt_tr.shape, img_tr.shape, y_tr_3.shape)

np.save("D:/dataset/clip_cross_attention_emb/y_tr_3.npy", y_tr_3)

y_tr_aligned2 = (
    y_df.set_index("post_id")
        .loc[ids_txt_tr, "er_bins2"]
        .to_numpy()
)

# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "low": 0,
    "high": 1
}
y_tr_int = np.array([class_to_int[c] for c in y_tr_aligned2])
y_tr_2 = torch.tensor(y_tr_int, dtype=torch.long)

print("Unique classes:", np.unique(y_tr_int))
print("shape:", y_tr_2.shape)
print(txt_tr.shape, img_tr.shape, y_tr_2.shape)


np.save("D:/dataset/clip_cross_attention_emb/y_tr_2.npy", y_tr_2)

Unique classes: [0 1 2 3 4]
shape: torch.Size([24993])
torch.Size([24993, 77, 512]) torch.Size([24993, 50, 768]) torch.Size([24993])
Unique classes: [0 1 2]
shape: torch.Size([24993])
torch.Size([24993, 77, 512]) torch.Size([24993, 50, 768]) torch.Size([24993])
Unique classes: [0 1]
shape: torch.Size([24993])
torch.Size([24993, 77, 512]) torch.Size([24993, 50, 768]) torch.Size([24993])


# CROSS-ATTENTION VALIDATION

In [3]:
text_npz_va  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_validation_ids_y.npz"
image_npz_va = "D:/dataset/clip_cross_attention_emb/clip_vit_b32_IMG_TOKENS_validation_ids_y.npz"

t_va = np.load(text_npz_va, allow_pickle = True)
i_va = np.load(image_npz_va, allow_pickle = True)

ids_txt_va = t_va["ids"]
txt_tokens_va = t_va["embeddings"]
y_va = t_va["y"]

ids_img_va = i_va["ids"]
img_tokens_va = i_va["embeddings"]

In [6]:
img_tokens_post_va = aggregate_image_tokens_per_post(ids_txt_va, ids_img_va, img_tokens_va)

In [7]:
print(txt_tokens_va.shape, img_tokens_va.shape, img_tokens_post_va.shape, y_va.shape)

(5000, 77, 512) (6665, 50, 768) (5000, 50, 768) (5000,)


In [8]:
set_txt = set(t_va["ids"])
set_img = set(i_va["ids"])

print("Post solo nel testo:", set_txt - set_img)
print("Post solo nelle immagini:", set_img - set_txt)

Post solo nel testo: set()
Post solo nelle immagini: set()


In [9]:
# target

# 5 classes
y_df = con.execute("""
    SELECT post_id, er_bins, er_bins3, er_bins2
    FROM clip_full_sample
    WHERE split = 'validation'""").df()


y_va_aligned5 = (
    y_df.set_index("post_id")
        .loc[ids_txt_va, "er_bins"]
        .to_numpy()
)


# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "very_low": 0,
    "low": 1,
    "medium": 2,
    "high": 3,
    "very_high": 4
}
y_va_int = np.array([class_to_int[c] for c in y_va_aligned5])
y_va_5 = torch.tensor(y_va_int, dtype=torch.long)

print("Unique classes:", np.unique(y_va_int))
print("shape:", y_va_5.shape)
# print(txt_va.shape, img_va.shape, y_va_5.shape)
np.save("D:/dataset/clip_cross_attention_emb/y_va_5.npy", y_va_5)

# 3 classes

y_va_aligned3 = (
    y_df.set_index("post_id")
        .loc[ids_txt_va, "er_bins3"]
        .to_numpy()
)

# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "low": 0,
    "medium": 1,
    "high": 2
}
y_va_int = np.array([class_to_int[c] for c in y_va_aligned3])
y_va_3 = torch.tensor(y_va_int, dtype=torch.long)

print("Unique classes:", np.unique(y_va_int))
print("shape:", y_va_3.shape)
# print(txt_va.shape, img_va.shape, y_va_3.shape)

np.save("D:/dataset/clip_cross_attention_emb/y_va_3.npy", y_va_3)

y_va_aligned2 = (
    y_df.set_index("post_id")
        .loc[ids_txt_va, "er_bins2"]
        .to_numpy()
)

# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "low": 0,
    "high": 1
}
y_va_int = np.array([class_to_int[c] for c in y_va_aligned2])
y_va_2 = torch.tensor(y_va_int, dtype=torch.long)

print("Unique classes:", np.unique(y_va_int))
print("shape:", y_va_2.shape)
# print(txt_va.shape, img_va.shape, y_va_2.shape)


np.save("D:/dataset/clip_cross_attention_emb/y_va_2.npy", y_va_2)

Unique classes: [0 1 2 3 4]
shape: torch.Size([5000])
Unique classes: [0 1 2]
shape: torch.Size([5000])
Unique classes: [0 1]
shape: torch.Size([5000])


In [10]:
dim_text = 512 # token text
dim_img  = 768 # patch images
dim = dim_text # common space must be 512

# Projection to get the image to a 512 dimension
img_proj = nn.Linear(dim_img, dim)

num_heads = 8

# Attention layers text to image and image to text
attn_t2i = nn.MultiheadAttention(dim, num_heads, batch_first=True)
attn_i2t = nn.MultiheadAttention(dim, num_heads, batch_first=True)

# Feed-forward operations for text and images
ff_t = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
)

ff_i = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
)

# Layer norms
norm_t1 = nn.LayerNorm(dim)
norm_t2 = nn.LayerNorm(dim)
norm_i1 = nn.LayerNorm(dim)
norm_i2 = nn.LayerNorm(dim)

In [11]:
device = torch.device("cpu")

txt_va = torch.tensor(txt_tokens_va, dtype=torch.float32)
img_va = torch.tensor(img_tokens_post_va, dtype=torch.float32)

fused_val = generate_fused_features(txt_va, img_va, batch_size=32)
np.save("D:/dataset/clip_cross_attention_emb/fused_val.npy", fused_val)

In [12]:
fused_val.shape

(5000, 1024)

# CROSS-ATTENTION TEST

In [13]:
text_npz_te  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_test_ids_y.npz"
image_npz_te = "D:/dataset/clip_cross_attention_emb/clip_vit_b32_IMG_TOKENS_test_ids_y.npz"

t_te = np.load(text_npz_te, allow_pickle = True)
i_te = np.load(image_npz_te, allow_pickle = True)

ids_txt_te = t_te["ids"]
txt_tokens_te = t_te["embeddings"]

ids_img_te = i_te["ids"]
img_tokens_te = i_te["embeddings"]

In [14]:
img_tokens_post_te = aggregate_image_tokens_per_post(ids_txt_te, ids_img_te, img_tokens_te)

In [15]:
print(txt_tokens_te.shape, img_tokens_te.shape, img_tokens_post_te.shape)

(5000, 77, 512) (6933, 50, 768) (5000, 50, 768)


In [16]:
set_txt = set(t_te["ids"])
set_img = set(i_te["ids"])

print("Post solo nel testo:", set_txt - set_img)
print("Post solo nelle immagini:", set_img - set_txt)

Post solo nel testo: set()
Post solo nelle immagini: set()


In [17]:
device = torch.device("cpu")

txt_te = torch.tensor(txt_tokens_te, dtype=torch.float32)
img_te = torch.tensor(img_tokens_post_te, dtype=torch.float32)

# target

# 5 classes
y_df = con.execute("""
    SELECT post_id, er_bins, er_bins3, er_bins2
    FROM clip_full_sample
    WHERE split = 'test'""").df()


y_te_aligned5 = (
    y_df.set_index("post_id")
        .loc[ids_txt_te, "er_bins"]
        .to_numpy()
)


# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "very_low": 0,
    "low": 1,
    "medium": 2,
    "high": 3,
    "very_high": 4
}
y_te_int = np.array([class_to_int[c] for c in y_te_aligned5])
y_te_5 = torch.tensor(y_te_int, dtype=torch.long)

print("Unique classes:", np.unique(y_te_int))
print("shape:", y_te_5.shape)
# print(txt_va.shape, img_va.shape, y_va_5.shape)
np.save("D:/dataset/clip_cross_attention_emb/y_te_5.npy", y_te_5)

# 3 classes

y_te_aligned3 = (
    y_df.set_index("post_id")
        .loc[ids_txt_te, "er_bins3"]
        .to_numpy()
)

# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "low": 0,
    "medium": 1,
    "high": 2
}
y_te_int = np.array([class_to_int[c] for c in y_te_aligned3])
y_te_3 = torch.tensor(y_te_int, dtype=torch.long)

print("Unique classes:", np.unique(y_te_int))
print("shape:", y_te_3.shape)
# print(txt_va.shape, img_va.shape, y_va_3.shape)

np.save("D:/dataset/clip_cross_attention_emb/y_te_3.npy", y_te_3)

y_te_aligned2 = (
    y_df.set_index("post_id")
        .loc[ids_txt_te, "er_bins2"]
        .to_numpy()
)

# Since I am using a pytorch tensor I need the target into numerical variable
class_to_int = {
    "low": 0,
    "high": 1
}
y_te_int = np.array([class_to_int[c] for c in y_te_aligned2])
y_te_2 = torch.tensor(y_te_int, dtype=torch.long)

print("Unique classes:", np.unique(y_te_int))
print("shape:", y_te_2.shape)
# print(txt_va.shape, img_va.shape, y_va_2.shape)


np.save("D:/dataset/clip_cross_attention_emb/y_te_2.npy", y_te_2)

Unique classes: [0 1 2 3 4]
shape: torch.Size([5000])
Unique classes: [0 1 2]
shape: torch.Size([5000])
Unique classes: [0 1]
shape: torch.Size([5000])


In [18]:
dim_text = 512 # token text
dim_img  = 768 # patch images
dim = dim_text # common space must be 512

# Projection to get the image to a 512 dimension
img_proj = nn.Linear(dim_img, dim)

num_heads = 8

# Attention layers text to image and image to text
attn_t2i = nn.MultiheadAttention(dim, num_heads, batch_first=True)
attn_i2t = nn.MultiheadAttention(dim, num_heads, batch_first=True)

# Feed-forward operations for text and images
ff_t = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
)

ff_i = nn.Sequential(
    nn.Linear(dim, 4*dim),
    nn.ReLU(),
    nn.Linear(4*dim, dim)
)

# Layer norms
norm_t1 = nn.LayerNorm(dim)
norm_t2 = nn.LayerNorm(dim)
norm_i1 = nn.LayerNorm(dim)
norm_i2 = nn.LayerNorm(dim)

In [19]:
fused_test = generate_fused_features(txt_te, img_te, batch_size=32)
np.save("D:/dataset/clip_cross_attention_emb/fused_test.npy", fused_test)

In [20]:
fused_test.shape

(5000, 1024)

# FUSION

In [21]:
text_npz_tr  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_train_ids_y.npz"
t_tr = np.load(text_npz_tr, allow_pickle = True)
ids_txt_tr = t_tr["ids"]

In [22]:
len(ids_txt_tr)

24993

In [23]:
# METADATA TRAIN
meta_train_final = pd.read_csv("D:/dataset/meta_classification/meta_train_final.csv")
# Filtra dati che sono in ids_txt_tr
meta_train_final = meta_train_final[meta_train_final["post_id"].isin(ids_txt_tr)]
# Riordina seguento l'ordine degli embeddings
meta_train_final = meta_train_final.set_index("post_id").loc[ids_txt_tr].reset_index()
# Check allineamento
assert (meta_train_final["post_id"].to_numpy() == ids_txt_tr).all()
print("Aligned")
# Remove post_id
X_meta_train = meta_train_final.drop(["post_id"], axis=1)
X_meta_train = X_meta_train.to_numpy(dtype=np.float32)

fused_train = np.load("D:/dataset/clip_cross_attention_emb/fused_train.npy", allow_pickle = True)
X_tr = np.hstack([fused_train, X_meta_train])

Aligned


In [24]:
X_tr.shape

(24993, 1052)

In [25]:
np.save("D:/dataset/clip_cross_attention_emb/X_train.npy", X_tr)

In [26]:
# METADATA VALIDATION
text_npz_va  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_validation_ids_y.npz"
t_va = np.load(text_npz_va, allow_pickle = True)
ids_txt_va = t_va["ids"]

meta_val_final = pd.read_csv("D:/dataset/meta_classification/meta_val_final.csv")
# Filtra dati che sono in ids_txt_tr
meta_val_final = meta_val_final[meta_val_final["post_id"].isin(ids_txt_va)]
# Riordina seguento l'ordine degli embeddings
meta_val_final = meta_val_final.set_index("post_id").loc[ids_txt_va].reset_index()
# Check allineamento
assert (meta_val_final["post_id"].to_numpy() == ids_txt_va).all()
print("Aligned")
# Remove post_id
X_meta_val = meta_val_final.drop(["post_id"], axis=1)
X_meta_val = X_meta_val.to_numpy(dtype=np.float32)

fused_val = np.load("D:/dataset/clip_cross_attention_emb/fused_val.npy", allow_pickle = True)
X_va = np.hstack([fused_val, X_meta_val])

Aligned


In [27]:
X_va.shape

(5000, 1052)

In [28]:
np.save("D:/dataset/clip_cross_attention_emb/X_val.npy", X_va)

In [30]:
# METADATA TEST

text_npz_te  = "D:/dataset/clip_cross_attention_emb/clip-vit-base-patch32_TOKENS_test_ids_y.npz"
t_te = np.load(text_npz_te, allow_pickle = True)
ids_txt_te = t_te["ids"]

meta_test_final = pd.read_csv("D:/dataset/meta_classification/meta_test_final.csv")
# Filtra dati che sono in ids_txt_tr
meta_test_final = meta_test_final[meta_test_final["post_id"].isin(ids_txt_te)]
# Riordina seguento l'ordine degli embeddings
meta_test_final = meta_test_final.set_index("post_id").loc[ids_txt_te].reset_index()
# Check allineamento
assert (meta_test_final["post_id"].to_numpy() == ids_txt_te).all()
print("Aligned")
# Remove post_id
X_meta_test = meta_test_final.drop(["post_id"], axis=1)
X_meta_test = X_meta_test.to_numpy(dtype=np.float32)

fused_test = np.load("D:/dataset/clip_cross_attention_emb/fused_test.npy", allow_pickle = True)
X_te = np.hstack([fused_test, X_meta_test])

Aligned


In [31]:
X_te.shape

(5000, 1052)

In [32]:
np.save("D:/dataset/clip_cross_attention_emb/X_test.npy", X_te)

# CLASSIFICATION 5

In [2]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_5.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_5.npy", allow_pickle = True)

In [3]:
print(X_tr.shape, y_tr.shape, X_va.shape, y_va.shape)

(24993, 1052) (24993,) (5000, 1052) (5000,)


In [4]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.1647556846408762 | accuracy (val): 0.2346

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.16843089580730625 | accuracy (val): 0.2324

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.15993931310424248 | accuracy (val): 0.2416

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.15375959452643725 | accuracy (val): 0.2328

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.14564667869311454 | accuracy (val): 0.2426

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.15169951567450124 | accuracy (val): 0.2472

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.15987081430319183 | accuracy (val): 0.2584

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.15535904820075885 | accuracy (val): 0.258

Best hyperparameter configuration:
{'alpha': 1e-05, 'class_weight

In [4]:
# NAIVE BAYES - GAUSSIAN

param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.0672 | accuracy (val): 0.1978

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.0672 | accuracy (val): 0.1978

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.0672 | accuracy (val): 0.1978

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.0672 | accuracy (val): 0.1978

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.06716980704235488

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09       0.06717        0.1978
1   1.000000e-08       0.06717        0.1978
2   1.000000e-07       0.06717        0.1978
3   1.000000e-06       0.06717        0.1978


In [5]:
# RANDOM FOREST
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)

# Among these the best is 
# Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
# macro-F1 (val): 0.2244 | accuracy (val): 0.2418


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.2559 | accuracy (val): 0.2606

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2667 | accuracy (val): 0.2688

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2772 | accuracy (val): 0.2810

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.2714 | accuracy (val): 0.2690

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2675 | accuracy (val): 0.2734

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.2671 | accuracy (val): 0.2776

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.2272 | accuracy (val

In [6]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150], 
    "max_depth": [4, 6], 
    "learning_rate": [0.1], 
    "subsample": [0.8], 
    "colsample_bytree": [0.5], 
    "gamma": [0, 1], 
    "reg_lambda": [1], 
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    clf.fit(X_tr, y_tr_enc)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2647 | accuracy (val): 0.2956

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2554 | accuracy (val): 0.2912

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2637 | accuracy (val): 0.2902

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2620 | accuracy (val): 0.2898

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2602 | accuracy (val): 0.2970

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON TEST

In [9]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val.npy", allow_pickle = True).astype(np.float32)
X_te = np.load("D:/dataset/clip_cross_attention_emb/X_test.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_5.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_5.npy", allow_pickle = True)
y_te = np.load("D:/dataset/clip_cross_attention_emb/y_te_5.npy", allow_pickle = True)

X_trva = np.concatenate((X_tr, X_va), axis = 0).astype(np.float32)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

del X_tr, X_va, y_tr, y_va
gc.collect()

66

In [24]:
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=8, max_features=0.05, min_samples_leaf=2, n_estimators=80, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 4, n_estimators= 100, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_trva, y_trva_enc)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_trva, y_trva)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (train): {macro_f1:.4f} | accuracy (train): {acc:.4f}")

del X_trva, X_te, y_trva, y_te
gc.collect()


Configuration: GaussianNB()
macro-F1 (train): 0.1510 | accuracy (train): 0.2670

Configuration: RandomForestClassifier(max_depth=8, max_features=0.05, min_samples_leaf=2,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (train): 0.2883 | accuracy (train): 0.2968

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=

98

In [10]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 1e-05,
        average = True,
        class_weight = 'balanced',
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_trva, y_trva)
y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2303 | accuracy (test): 0.2380


# 3 classi

In [None]:
del X_tr, X_va, y_tr, y_va
gc.collect()

In [2]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_3.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_3.npy", allow_pickle = True)

In [3]:
print(X_tr.shape, y_tr.shape, X_va.shape, y_va.shape)

(24993, 1052) (24993,) (5000, 1052) (5000,)


In [4]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.35353245107982706 | accuracy (val): 0.3848

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.3397182866783806 | accuracy (val): 0.3856

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.3364996891219681 | accuracy (val): 0.3854

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.33820837381647556 | accuracy (val): 0.3962

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.32146032926155677 | accuracy (val): 0.3856

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.30364638333081906 | accuracy (val): 0.3818

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.30429447029801726 | accuracy (val): 0.3936

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.3227482384757346 | accuracy (val): 0.4142

Best hyperparameter configuration:
{'alpha': 1e-05, 'class_weight'

In [11]:
# NAIVE BAYES - GAUSSIAN

param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.1687 | accuracy (val): 0.3360

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.1687 | accuracy (val): 0.3360

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.1687 | accuracy (val): 0.3360

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.1687 | accuracy (val): 0.3360

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.1686771998419683

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.168677         0.336
1   1.000000e-08      0.168677         0.336
2   1.000000e-07      0.168677         0.336
3   1.000000e-06      0.168677         0.336


In [12]:
# RANDOM FOREST
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)

# Among these the best is 
# Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
# macro-F1 (val): 0.2244 | accuracy (val): 0.2418


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.4088 | accuracy (val): 0.4200

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.4178 | accuracy (val): 0.4306

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.4119 | accuracy (val): 0.4352

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.4025 | accuracy (val): 0.4264

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.4146 | accuracy (val): 0.4390

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.3947 | accuracy (val): 0.4254

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.4169 | accuracy (val

In [13]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150], 
    "max_depth": [4, 6], 
    "learning_rate": [0.1], 
    "subsample": [0.8], 
    "colsample_bytree": [0.5], 
    "gamma": [0, 1], 
    "reg_lambda": [1], 
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    clf.fit(X_tr, y_tr_enc)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4181 | accuracy (val): 0.4588

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3873 | accuracy (val): 0.4478

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4160 | accuracy (val): 0.4666

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3918 | accuracy (val): 0.4514

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4229 | accuracy (val): 0.4638

Combination: {'colsample_bytr

In [11]:
# PERFORMANCE ON TEST
del X_trva, X_te, y_trva, y_te
gc.collect()

28

In [12]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val.npy", allow_pickle = True).astype(np.float32)
X_te = np.load("D:/dataset/clip_cross_attention_emb/X_test.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_3.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_3.npy", allow_pickle = True)
y_te = np.load("D:/dataset/clip_cross_attention_emb/y_te_3.npy", allow_pickle = True)

X_trva = np.concatenate((X_tr, X_va), axis = 0).astype(np.float32)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

del X_tr, X_va, y_tr, y_va
gc.collect()

66

In [26]:
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=8, max_features='sqrt', min_samples_leaf=2, n_estimators=50, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 1, learning_rate = 0.1, max_depth= 6, n_estimators= 100, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_trva, y_trva_enc)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_trva, y_trva)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (train): {macro_f1:.4f} | accuracy (train): {acc:.4f}")

del X_trva, X_te, y_trva, y_te
gc.collect()


Configuration: GaussianNB()
macro-F1 (train): 0.3145 | accuracy (train): 0.4118

Configuration: RandomForestClassifier(max_depth=8, min_samples_leaf=2, n_estimators=50,
                       n_jobs=-1, random_state=42)
macro-F1 (train): 0.4504 | accuracy (train): 0.4580

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=

98

In [13]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 1e-05,
        average = True,
        class_weight = None,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_trva, y_trva)
y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.3278 | accuracy (test): 0.3708


# 2 classi

In [5]:
del X_tr, X_va, y_tr, y_va
gc.collect()

238

In [6]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_2.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_2.npy", allow_pickle = True)

In [7]:
print(X_tr.shape, y_tr.shape, X_va.shape, y_va.shape)

(24993, 1052) (24993,) (5000, 1052) (5000,)


In [8]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.5744943848469726 | accuracy (val): 0.5748

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.5803967103102088 | accuracy (val): 0.5804

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.5861664794848382 | accuracy (val): 0.5862

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.5776986230489807 | accuracy (val): 0.5786

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.5978638527859992 | accuracy (val): 0.598

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.5916889649859678 | accuracy (val): 0.592

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.6251065140966521 | accuracy (val): 0.6348

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.6309103858967295 | accuracy (val): 0.6374

Best hyperparameter configuration:
{'alpha': 0.01, 'class_weight': 'balan

In [18]:
# NAIVE BAYES - GAUSSIAN

param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.3333 | accuracy (val): 0.4968

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.3333 | accuracy (val): 0.4968

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.3333 | accuracy (val): 0.4968

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.3333 | accuracy (val): 0.4968

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.33331298295061895

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.333313        0.4968
1   1.000000e-08      0.333313        0.4968
2   1.000000e-07      0.333313        0.4968
3   1.000000e-06      0.333313        0.4968


In [19]:
# RANDOM FOREST
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)

# Among these the best is 
# Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
# macro-F1 (val): 0.2244 | accuracy (val): 0.2418


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.5820 | accuracy (val): 0.5972

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.5996 | accuracy (val): 0.6120

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.5971 | accuracy (val): 0.6152

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.5830 | accuracy (val): 0.6016

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.5925 | accuracy (val): 0.6124

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.5852 | accuracy (val): 0.6082

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.5511 | accuracy (val

In [20]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150], 
    "max_depth": [4, 6], 
    "learning_rate": [0.1], 
    "subsample": [0.8], 
    "colsample_bytree": [0.5], 
    "gamma": [0, 1], 
    "reg_lambda": [1], 
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    clf.fit(X_tr, y_tr_enc)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6058 | accuracy (val): 0.6248

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6121 | accuracy (val): 0.6280

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5836 | accuracy (val): 0.6106

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5749 | accuracy (val): 0.6034

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6049 | accuracy (val): 0.6244

Combination: {'colsample_bytr

In [22]:
del X_tr, y_tr, X_va, y_va
gc.collect()

653

In [14]:
# PERFORMANCE ON TEST
del X_trva, y_trva, X_te, y_te
gc.collect()

28

In [15]:
X_tr = np.load("D:/dataset/clip_cross_attention_emb/X_train.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/clip_cross_attention_emb/X_val.npy", allow_pickle = True).astype(np.float32)
X_te = np.load("D:/dataset/clip_cross_attention_emb/X_test.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/clip_cross_attention_emb/y_tr_2.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_cross_attention_emb/y_va_2.npy", allow_pickle = True)
y_te = np.load("D:/dataset/clip_cross_attention_emb/y_te_2.npy", allow_pickle = True)

X_trva = np.concatenate((X_tr, X_va), axis = 0).astype(np.float32)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

del X_tr, X_va, y_tr, y_va
gc.collect()

66

In [28]:
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=8, max_features=0.05, min_samples_leaf=2, n_estimators=50, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 4, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_trva, y_trva_enc)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_trva, y_trva)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (train): {macro_f1:.4f} | accuracy (train): {acc:.4f}")

del X_trva, X_te, y_trva, y_te
gc.collect()


Configuration: GaussianNB()
macro-F1 (train): 0.5414 | accuracy (train): 0.5784

Configuration: RandomForestClassifier(max_depth=8, max_features=0.05, min_samples_leaf=2,
                       n_estimators=50, n_jobs=-1, random_state=42)
macro-F1 (train): 0.6331 | accuracy (train): 0.6372

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=

98

In [16]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 0.01,
        average = True,
        class_weight = 'balanced',
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_trva, y_trva)
y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.6206 | accuracy (test): 0.6242
