In [1]:
from sklearn.model_selection import train_test_split
import duckdb, torch, time, os, gc
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.sparse import load_npz, hstack, save_npz

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from transformers import CLIPTokenizer, CLIPModel
import torch
import numpy as np
import gc, os, time
from pathlib import Path

# SET UP

In [3]:
# Check which model to use, depending on performance and time required on a sample set

TEXT_COLUMN = "caption_bert_clip"
TARGET_COLUMN = "er_bins"
TABLE_NAME = "md1718"
DB_PATH = "D:/db/meta.duckdb"

SAMPLE_SIZE = 5000 
TEST_SIZE = 0.2
RANDOM_STATE = 42

con = duckdb.connect(DB_PATH)

query = f"""
    SELECT
        {TEXT_COLUMN} AS {TEXT_COLUMN},
        {TARGET_COLUMN} AS {TARGET_COLUMN}
    FROM {TABLE_NAME}
    WHERE {TEXT_COLUMN} IS NOT NULL
      AND {TARGET_COLUMN} IS NOT NULL
    ORDER BY random()
    LIMIT {SAMPLE_SIZE}
"""

df = con.execute(query).df()

print(f"Campione estratto da DuckDB: {len(df)} righe")

texts = df[TEXT_COLUMN].astype(str).tolist()

# Numeric target for XGBoost
le = LabelEncoder()
y = le.fit_transform(df[TARGET_COLUMN])

# Fixed split on indexes to ensure the same are used
indices = np.arange(len(df))
train_idx, test_idx = train_test_split(
    indices,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Campione estratto da DuckDB: 5000 righe


In [4]:
# List of encoders to test
# Modelli CLIP da testare (puoi aggiungerne altri)
ENCODERS = {
    "clip-vit-b32": "openai/clip-vit-base-patch32",
    "clip-vit-b16": "openai/clip-vit-base-patch16",
    "clip-vit-l14": "openai/clip-vit-large-patch14"
}


# Function to extract embeddings
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np
import time

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def extract_clip_text_embeddings(model_name, model_path, texts, batch_size=32):
    print(f"\n Upload CLIP model: {model_name} ({model_path})")

    # Carico modello e processor
    model = CLIPModel.from_pretrained(model_path)
    processor = CLIPProcessor.from_pretrained(model_path)

    model.to(DEVICE)
    model.eval()

    all_embeddings = []
    start_time = time.time()

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Tokenizzazione
            inputs = processor(
                text=batch_texts,
                images=None,
                return_tensors="pt",
                padding=True,
                truncation=True
            )

            # Sposto su device
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

            # Estrazione embedding testuali
            text_features = model.get_text_features(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"]
            )

            # Normalizzazione L2 (tipica per CLIP)
            text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

            all_embeddings.append(text_features.cpu().numpy())

    end_time = time.time()
    extraction_time = end_time - start_time

    embeddings = np.vstack(all_embeddings)

    print(f" Time CLIP embedding extraction: {extraction_time:.2f} s")
    print(f" Shape embeddings: {embeddings.shape}")

    return embeddings, extraction_time

# Function classify ER based on the extracted embeddings
def benchmark_classifiers(X, y, train_idx, test_idx):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    results = []

    # 1) Linear SVM
    svm = LinearSVC(random_state=RANDOM_STATE)
    start_train = time.time()
    svm.fit(X_train, y_train)
    end_train = time.time()
    svm_train_time = end_train - start_train

    start_pred = time.time()
    y_pred_svm = svm.predict(X_test)
    end_pred = time.time()
    svm_infer_time = end_pred - start_pred

    svm_acc = accuracy_score(y_test, y_pred_svm)
    svm_f1 = f1_score(y_test, y_pred_svm, average="macro")

    results.append({
        "Classifier": "LinearSVM",
        "Train Time (s)": svm_train_time,
        "Inference Time (s)": svm_infer_time,
        "Accuracy": svm_acc,
        "F1-macro": svm_f1
    })

    # 2) Naive Bayes (GaussianNB)
    nb = GaussianNB()
    start_train = time.time()
    nb.fit(X_train, y_train)
    end_train = time.time()
    nb_train_time = end_train - start_train

    start_pred = time.time()
    y_pred_nb = nb.predict(X_test)
    end_pred = time.time()
    nb_infer_time = end_pred - start_pred

    nb_acc = accuracy_score(y_test, y_pred_nb)
    nb_f1 = f1_score(y_test, y_pred_nb, average="macro")

    results.append({
        "Classifier": "NaiveBayes",
        "Train Time (s)": nb_train_time,
        "Inference Time (s)": nb_infer_time,
        "Accuracy": nb_acc,
        "F1-macro": nb_f1
    })

    # 3) XGBoost
    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softmax" if len(np.unique(y)) > 2 else "binary:logistic",
        eval_metric="mlogloss",
        tree_method="auto",
        n_jobs=-1,
        random_state=RANDOM_STATE
    )

    start_train = time.time()
    xgb.fit(X_train, y_train)
    end_train = time.time()
    xgb_train_time = end_train - start_train

    start_pred = time.time()
    y_pred_xgb = xgb.predict(X_test)
    end_pred = time.time()
    xgb_infer_time = end_pred - start_pred

    xgb_acc = accuracy_score(y_test, y_pred_xgb)
    xgb_f1 = f1_score(y_test, y_pred_xgb, average="macro")

    results.append({
        "Classifier": "XGBoost",
        "Train Time (s)": xgb_train_time,
        "Inference Time (s)": xgb_infer_time,
        "Accuracy": xgb_acc,
        "F1-macro": xgb_f1
    })

    return results


all_results = []

for enc_name, enc_path in ENCODERS.items():
    # 1) embedding + time
    X, emb_time = extract_clip_text_embeddings(enc_name, enc_path, texts)

    # 2) classification + time
    clf_results = benchmark_classifiers(X, y, train_idx, test_idx)

    # 3) encoder info
    for r in clf_results:
        r_with_enc = {
            "Encoder": enc_name,
            "Embedding Time (s)": emb_time,
            **r
        }
        all_results.append(r_with_enc)

results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values(["F1-macro", "Accuracy"], ascending=False)

print("\n FINAL RESULTS")
print(results_df.to_string(index=False))
results_df.to_csv("classification_embedding_benchmark_results.csv", index=False)

con.close()


 Upload CLIP model: clip-vit-b32 (openai/clip-vit-base-patch32)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


 Time CLIP embedding extraction: 252.89 s
 Shape embeddings: (5000, 512)

 Upload CLIP model: clip-vit-b16 (openai/clip-vit-base-patch16)


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


vocab.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

 Time CLIP embedding extraction: 256.74 s
 Shape embeddings: (5000, 512)

 Upload CLIP model: clip-vit-l14 (openai/clip-vit-large-patch14)


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

 Time CLIP embedding extraction: 536.35 s
 Shape embeddings: (5000, 768)

 FINAL RESULTS
     Encoder  Embedding Time (s) Classifier  Train Time (s)  Inference Time (s)  Accuracy  F1-macro
clip-vit-b16          256.735876    XGBoost       86.038053            0.011756     0.248  0.247757
clip-vit-b32          252.894578    XGBoost       85.060797            0.015024     0.247  0.245733
clip-vit-b32          252.894578  LinearSVM        3.211219            0.015683     0.246  0.244110
clip-vit-l14          536.346948    XGBoost      140.691305            0.017699     0.242  0.241942
clip-vit-b16          256.735876  LinearSVM        3.098709            0.004016     0.241  0.237043
clip-vit-l14          536.346948  LinearSVM        4.337271            0.004008     0.238  0.236431
clip-vit-l14          536.346948 NaiveBayes        0.023740            0.058707     0.239  0.233847
clip-vit-b16          256.735876 NaiveBayes        0.021583            0.016173     0.234  0.226761
clip-vit-b3

# EMBEDDINGS EXTRACTION

In [11]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [12]:
def load_split(split_name):
    print(f"Loading {split_name}...")
    df = con.sql(f"""
        SELECT post_id, caption_bert_clip, er_bins
        FROM md1718
        WHERE split = '{split_name}'
    """).df()
    ids = df["post_id"].to_numpy()
    texts = df["caption_bert_clip"].tolist()
    y = df["er_bins"]
    del df; gc.collect()
    print(f"{split_name} done.")
    return ids, texts, y

In [13]:
train_ids, Xtr_text, y_tr = load_split("train")
val_ids, Xva_text, y_val  = load_split("validation")
test_ids, Xte_text, y_te  = load_split("test")

con.close()

Loading train...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

train done.
Loading validation...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

validation done.
Loading test...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

test done.


In [14]:
clip_name = "openai/clip-vit-base-patch32"

tokenizer = CLIPTokenizer.from_pretrained(clip_name)
model = CLIPModel.from_pretrained(clip_name)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [15]:
import torch
torch.cuda.is_available()

False

In [16]:
def embed_clip(texts, bs=64):
    all_emb = []

    with torch.no_grad():
        for i in range(0, len(texts), bs):
            batch = texts[i:i+bs]

            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)

            # Textual embeddings
            text_feat = model.get_text_features(
                input_ids=inputs["input_ids"], # token ids in CLIP vocabulary
                attention_mask=inputs["attention_mask"] # 1 valid token, 0 otherwise
            )

            # L2-normalization (important for CLIP)
            text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)

            all_emb.append(text_feat.cpu().numpy())

    return np.vstack(all_emb)


In [17]:
# CPU suggestions
torch.set_grad_enabled(False)
torch.set_num_threads(8)
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"

emb_dir = Path("D:/dataset/clip_text_emb_ALL")
emb_dir.mkdir(parents=True, exist_ok=True)

# Save shards on memory and skips the ones already executed
def embed_sharded_cached(texts, ids, y, split_name, shard=5000):
    prefix = f"{clip_name.split('/')[-1]}_{split_name}"
    out_files = []

    # Check for correct lengths
    n = len(texts)
    assert len(ids) == n, f"ids len {len(ids)} != texts len {n}"
    assert len(y) == n,   f"y len {len(y)} != texts len {n}"

    for i in range(0, n, shard):
        f = emb_dir / f"{prefix}_{i:07d}.npy"
        if f.exists():
            print(f"[skip] {f.name}")
            out_files.append(f)
            continue

        part = texts[i:i+shard]
        t0 = time.time()
        E = embed_clip(part, bs=64).astype("float32")
        np.save(f, E)
        dt = time.time() - t0
        print(f"[saved] {f.name}  [{i+len(part)}/{n}]  ({len(part)} cap in {dt:.1f}s)")

        out_files.append(f)

    # Concatenate shards and save ALL
    arrays = [np.load(f) for f in out_files]
    E_all = np.vstack(arrays).astype("float32")
    np.save(emb_dir / f"{prefix}_ALL.npy", E_all)

    # Safety check
    assert E_all.shape[0] == n, \
        f"Embeddings rows {E_all.shape[0]} != texts len {n}"

    npz_path = emb_dir / f"{prefix}_ids_y.npz"
    np.savez(
        npz_path,
        ids=np.asarray(ids),
        embeddings=E_all,
        y=np.asarray(y)
    )
    print(f"[done] Saved aligned ids + embeddings + y → {npz_path.name}")

    return E_all

In [9]:
print("Train…")
E_tr = embed_sharded_cached(Xtr_text, train_ids, y_tr, "train", shard=10000)

Train…
[saved] clip-vit-base-patch32_train_0000000.npy  [10000/773497]  (10000 cap in 385.7s)
[saved] clip-vit-base-patch32_train_0010000.npy  [20000/773497]  (10000 cap in 414.3s)
[saved] clip-vit-base-patch32_train_0020000.npy  [30000/773497]  (10000 cap in 404.4s)
[saved] clip-vit-base-patch32_train_0030000.npy  [40000/773497]  (10000 cap in 421.9s)
[saved] clip-vit-base-patch32_train_0040000.npy  [50000/773497]  (10000 cap in 422.4s)
[saved] clip-vit-base-patch32_train_0050000.npy  [60000/773497]  (10000 cap in 412.0s)
[saved] clip-vit-base-patch32_train_0060000.npy  [70000/773497]  (10000 cap in 419.1s)
[saved] clip-vit-base-patch32_train_0070000.npy  [80000/773497]  (10000 cap in 400.1s)
[saved] clip-vit-base-patch32_train_0080000.npy  [90000/773497]  (10000 cap in 427.6s)
[saved] clip-vit-base-patch32_train_0090000.npy  [100000/773497]  (10000 cap in 407.4s)
[saved] clip-vit-base-patch32_train_0100000.npy  [110000/773497]  (10000 cap in 424.1s)
[saved] clip-vit-base-patch32_trai

In [18]:
print("Val…")
E_va = embed_sharded_cached(Xva_text, val_ids, y_val, "val",   shard=20000)

Val…
[saved] clip-vit-base-patch32_val_0000000.npy  [20000/412325]  (20000 cap in 784.3s)
[saved] clip-vit-base-patch32_val_0020000.npy  [40000/412325]  (20000 cap in 796.0s)
[saved] clip-vit-base-patch32_val_0040000.npy  [60000/412325]  (20000 cap in 819.0s)
[saved] clip-vit-base-patch32_val_0060000.npy  [80000/412325]  (20000 cap in 811.2s)
[saved] clip-vit-base-patch32_val_0080000.npy  [100000/412325]  (20000 cap in 816.7s)
[saved] clip-vit-base-patch32_val_0100000.npy  [120000/412325]  (20000 cap in 797.7s)
[saved] clip-vit-base-patch32_val_0120000.npy  [140000/412325]  (20000 cap in 833.4s)
[saved] clip-vit-base-patch32_val_0140000.npy  [160000/412325]  (20000 cap in 828.6s)
[saved] clip-vit-base-patch32_val_0160000.npy  [180000/412325]  (20000 cap in 794.1s)
[saved] clip-vit-base-patch32_val_0180000.npy  [200000/412325]  (20000 cap in 820.0s)
[saved] clip-vit-base-patch32_val_0200000.npy  [220000/412325]  (20000 cap in 797.7s)
[saved] clip-vit-base-patch32_val_0220000.npy  [24000

In [19]:
print("Test…")
E_te = embed_sharded_cached(Xte_text, test_ids, y_te, "test",  shard=20000)

Test…
[saved] clip-vit-base-patch32_test_0000000.npy  [20000/423604]  (20000 cap in 779.3s)
[saved] clip-vit-base-patch32_test_0020000.npy  [40000/423604]  (20000 cap in 821.8s)
[saved] clip-vit-base-patch32_test_0040000.npy  [60000/423604]  (20000 cap in 818.5s)
[saved] clip-vit-base-patch32_test_0060000.npy  [80000/423604]  (20000 cap in 826.2s)
[saved] clip-vit-base-patch32_test_0080000.npy  [100000/423604]  (20000 cap in 832.0s)
[saved] clip-vit-base-patch32_test_0100000.npy  [120000/423604]  (20000 cap in 827.2s)
[saved] clip-vit-base-patch32_test_0120000.npy  [140000/423604]  (20000 cap in 834.9s)
[saved] clip-vit-base-patch32_test_0140000.npy  [160000/423604]  (20000 cap in 830.8s)
[saved] clip-vit-base-patch32_test_0160000.npy  [180000/423604]  (20000 cap in 825.9s)
[saved] clip-vit-base-patch32_test_0180000.npy  [200000/423604]  (20000 cap in 838.8s)
[saved] clip-vit-base-patch32_test_0200000.npy  [220000/423604]  (20000 cap in 797.9s)
[saved] clip-vit-base-patch32_test_022000

# LOAD DATA FOR CLASSIFICATION

In [2]:
train = np.load("D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_train_ids_y.npz", allow_pickle = True)

X_tr = train["embeddings"]
y_tr = train["y"]
ids_tr = train["ids"]

print(X_tr.shape, len(y_tr), len(ids_tr))

(773497, 512) 773497 773497


In [3]:
val = np.load("D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_val_ids_y.npz", allow_pickle = True)

X_va = val["embeddings"]
y_va = val["y"]
ids_va = val["ids"]

print(X_va.shape, len(y_va), len(ids_va))

(412325, 512) 412325 412325


In [4]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.25431075918002505 | accuracy (val): 0.25624446734978473

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.25232602965048356 | accuracy (val): 0.26416782877584427

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.2551364166993123 | accuracy (val): 0.25695264657733585

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.2520026516181246 | accuracy (val): 0.26392530164312134

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.2527830287365438 | accuracy (val): 0.25449584672285214

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.2521891009621885 | accuracy (val): 0.26207481962044504

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.24961141127044825 | accuracy (val): 0.25228157400109136

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.25216087584441305 | 

In [7]:
# NAIVE BAYES - GAUSSIAN
param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Metto i risultati in un DataFrame per ispezionarli meglio
results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.2097 | accuracy (val): 0.2442

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.2097 | accuracy (val): 0.2442

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.2097 | accuracy (val): 0.2442

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.2097 | accuracy (val): 0.2442

Best hyperparameter configuration:
{'var_smoothing': 1e-06}
Validation macro-F1: 0.20967730015944536

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
3   1.000000e-06      0.209677      0.244176
0   1.000000e-09      0.209675      0.244174
1   1.000000e-08      0.209675      0.244174
2   1.000000e-07      0.209675      0.244174


In [8]:
# RANDOM FOREST
param_grid_rf = {
    "n_estimators": [50, 80],
    "max_depth": [10, 12], 
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2063 | accuracy (val): 0.2310

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2058 | accuracy (val): 0.2316

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2076 | accuracy (val): 0.2313

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.2059 | accuracy (val): 0.2316

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2056 | accuracy (val): 0.2298

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2038 | accuracy (val): 0.2303

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2013 | ac

In [4]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150], 
    "max_depth": [4, 6], 
    "learning_rate": [0.1], 
    "subsample": [0.8], 
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    # Fit
    clf.fit(X_tr, y_tr_enc)

    # Validation
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2460 | accuracy (val): 0.2511

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2505 | accuracy (val): 0.2550

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2527 | accuracy (val): 0.2563

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2549 | accuracy (val): 0.2581

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2460 | accuracy (val): 0.2511

Combination: {'colsample_bytr

# PERFORMANCE SUL TEST SET

In [2]:
train = np.load("D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_train_ids_y.npz", allow_pickle = True)

X_tr = train["embeddings"]
y_tr = train["y"]

In [2]:
val = np.load("D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_val_ids_y.npz", allow_pickle = True)

X_va = val["embeddings"]
y_va = val["y"]

In [2]:
X_trva = np.concatenate((X_tr, X_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

del X_tr, y_tr, X_va, y_va
gc.collect()

In [3]:
test = np.load("D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_test_ids_y.npz", allow_pickle = True)

X_te = test["embeddings"]
y_te = test["y"]


In [6]:
# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)


cfgs = [
    GaussianNB(var_smoothing = 1e-06),
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=2, n_estimators=50, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_tr, y_tr_enc)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_tr, y_tr)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB(var_smoothing=1e-06)
macro-F1 (test): 0.2069 | accuracy (test): 0.2429

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=2,
                       n_estimators=50, n_jobs=-1, random_state=42)
macro-F1 (test): 0.2171 | accuracy (test): 0.2332

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
             

In [4]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 0.0001,
        average = True,
        class_weight = None,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_tr, y_tr)
y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2534 | accuracy (test): 0.2559
