In [1]:
import os, time, duckdb, torch, timm, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from PIL import Image
from sklearn.model_selection import ParameterGrid

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

import torchvision.transforms as T
from pathlib import Path

from torch.utils.data import Dataset, DataLoader
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import duckdb, torch
from transformers import CLIPModel, CLIPProcessor

In [3]:
# Open the project DuckDB database that holds the post metadata tables.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)

# The thread count is only a tuning hint: if DuckDB rejects the PRAGMA we keep
# going with its default, but report it instead of hiding the failure.
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    print(f"[warn] Could not set DuckDB threads=8, using the default: {exc}")

print("Set up ready")

Set up ready


# EMBEDDINGS EXTRACTION

In [3]:
# One-off setup, kept commented out for reference: it (re)creates the
# img_splits_clip table that the configuration cell below reads, by joining
# the post metadata (md1718) with the cleaned image manifest on post_id.
# con.execute("""CREATE OR REPLACE TABLE img_splits_clip AS
# SELECT m.post_id, m.split, i.full_image_file, m.er_bins
# FROM md1718 m
# JOIN images_manifest1718_clean i ON m.post_id = i.post_id
# """)

In [4]:
# --- Extraction settings ------------------------------------------------------
MODEL_NAME = "openai/clip-vit-base-patch32"   # Hugging Face checkpoint id
MODEL_TAG = "clip_vit_b32"                    # prefix of every output file
IMG_DIR = r"D:/dataset/images_224_rgb"
OUT_DIR = "D:/dataset/clip_img_emb_ALL"
BATCH_SIZE = 8
SHARD_SIZE = 20000
device = torch.device("cpu")

# Give PyTorch half of the reported cores (falling back to 4 when unknown).
num_threads = max(1, (os.cpu_count() or 4) // 2)
torch.set_num_threads(num_threads)
print(f"Uso dispositivo: {device} | PyTorch threads: {num_threads}")

# --- Load CLIP ----------------------------------------------------------------
print(f"Carico CLIP: {MODEL_NAME} ...")
clip_model = CLIPModel.from_pretrained(MODEL_NAME)          # model with the image encoder
clip_processor = CLIPProcessor.from_pretrained(MODEL_NAME)  # image preprocessing for CLIP

# Feature extraction only: freeze every weight and switch to eval mode so the
# embeddings are deterministic.
for param in clip_model.parameters():
    param.requires_grad = False
clip_model.eval()
clip_model.to(device)

# Probe the encoder with a random image to read the embedding size.
with torch.no_grad():
    dummy = torch.randn(1, 3, 224, 224)
    feats_dummy = clip_model.get_image_features(pixel_values=dummy.to(device))
    feat_dim = feats_dummy.shape[-1]

print(f"CLIP caricato. Embedding dim = {feat_dim}")

# --- Load the image table and resolve the file paths ---------------------------
df = con.sql("""
    SELECT *
    FROM img_splits_clip
""").df()
print("Tabella img_splits caricata. Righe:", len(df))

# Prefer 'full_image_file'; fall back to 'image_file'. Either way the stored
# name is joined onto IMG_DIR and written to 'full_image_file'.
if "full_image_file" in df.columns:
    source_col = "full_image_file"
elif "image_file" in df.columns:
    source_col = "image_file"
else:
    raise ValueError("Non trovo né 'full_image_file' né 'image_file' in img_splits.")
df["full_image_file"] = df[source_col].apply(lambda x: os.path.join(IMG_DIR, x))

# Quick look at the resolved paths.
print(df[["post_id", "split", "full_image_file"]].head())

# --- One frame per split --------------------------------------------------------
train_df, val_df, test_df = (
    df[df["split"] == split_name].reset_index(drop=True)
    for split_name in ("train", "validation", "test")
)

print(f"train = {len(train_df)} | val = {len(val_df)} | test = {len(test_df)}")

Uso dispositivo: cpu | PyTorch threads: 4
Carico CLIP: openai/clip-vit-base-patch32 ...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIP caricato. Embedding dim = 512


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Tabella img_splits caricata. Righe: 2105587
                             post_id       split  \
0  breemarieblog-1904832738906943213  validation   
1  breemarieblog-1905358465586947802  validation   
2  breemarieblog-1906849580570598483  validation   
3  breemarieblog-1908197204734451219  validation   
4  breemarieblog-1908389940821287367  validation   

                                     full_image_file  
0  D:/dataset/images_224_rgb\breemarieblog-190483...  
1  D:/dataset/images_224_rgb\breemarieblog-190535...  
2  D:/dataset/images_224_rgb\breemarieblog-190684...  
3  D:/dataset/images_224_rgb\breemarieblog-190819...  
4  D:/dataset/images_224_rgb\breemarieblog-190838...  
train = 960048 | val = 556982 | test = 588557


In [5]:
def build_samples(df, path_col="full_image_file"):
    """Return the rows of ``df`` as a list of ``(image_path, post_id)`` tuples.

    Args:
        df: frame with a ``post_id`` column and the column named by ``path_col``.
        path_col: name of the column holding the image path.
    """
    paths = df[path_col].tolist()
    post_ids = df["post_id"].tolist()
    return [(path, post_id) for path, post_id in zip(paths, post_ids)]

# Turn each split frame into the (path, post_id) list the extractor consumes.
train_samples, val_samples, test_samples = (
    build_samples(split_df) for split_df in (train_df, val_df, test_df)
)

print(
    "Samples creati: ",
    f"train={len(train_samples)}, val={len(val_samples)}, test={len(test_samples)}",
)

SHARD_SIZE = 20000


def _save_npz_atomic(target, **arrays):
    """Write a compressed ``.npz`` to ``target`` through a temp file and a rename.

    Resuming treats "shard file exists" as "shard is complete", so a write that
    is interrupted half-way must never leave a file at ``target``.
    """
    target = Path(target)
    # np.savez_compressed appends ".npz" when the name lacks it, so the temp
    # name must already end in ".npz" for the rename below to find the file.
    tmp_path = target.with_name(f"{target.stem}.tmp.npz")
    np.savez_compressed(tmp_path, **arrays)
    tmp_path.replace(target)


def extract_and_save_clip(split_name, samples, model, processor, out_dir=OUT_DIR, shard_size=SHARD_SIZE, batch_size=BATCH_SIZE):
    """Embed the images of one split with CLIP, shard by shard, and save them.

    Every block of ``shard_size`` samples is written to
    ``<out_dir>/<MODEL_TAG>_<split_name>_<start>.npz``. Shards already on disk
    are skipped, so an interrupted run is resumed by calling the function
    again. At the end all shards are concatenated into ``..._ALL.npz``.

    Relies on the notebook globals ``MODEL_TAG`` (file prefix / metadata) and
    ``device`` (where the pixel tensors are moved).

    Args:
        split_name: label used in file names and log lines.
        samples: sequence of ``(image_path, post_id)`` pairs.
        model: object exposing ``get_image_features(pixel_values=...)``.
        processor: callable turning a list of PIL images into ``pixel_values``.
        out_dir: output directory (created if missing).
        shard_size: number of samples per shard file.
        batch_size: number of images per forward pass.

    Returns:
        ``(F_all, I_all)``: the stacked L2-normalised features and the matching
        post ids, or ``(None, None)`` when ``samples`` is empty or no shard
        could be produced. Images that cannot be read are logged and left out,
        so the result may have fewer rows than ``samples``.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    prefix = f"{MODEL_TAG}_{split_name}"

    n = len(samples)
    print(f"\n[{split_name}] Totale esempi: {n}")

    if n == 0:
        print(f"[{split_name}] Nessun esempio, salto.")
        return None, None

    shard_files = []

    for start in range(0, n, shard_size):
        shard_samples = samples[start:start + shard_size]
        shard_path = out_dir / f"{prefix}_{start:07d}.npz"

        # Resume support: a shard that is already on disk is not recomputed.
        if shard_path.exists():
            print(f"[{split_name}] [skip] {shard_path.name}")
            shard_files.append(shard_path)
            continue

        print(f"[{split_name}] Elaboro shard {start} - {start + len(shard_samples) - 1}")

        feats_buf = []
        id_buf = []

        t0 = time.time()

        for batch_start in range(0, len(shard_samples), batch_size):
            batch = shard_samples[batch_start:batch_start + batch_size]

            pil_imgs = []
            pids = []

            for path, pid in batch:
                try:
                    # convert() returns an independent in-memory image, so the
                    # file handle can be closed right away instead of leaking.
                    with Image.open(path) as raw:
                        img = raw.convert("RGB")
                except Exception as e:
                    print(f"[{split_name}] [warn] Errore nel leggere {path}: {e}")
                    continue
                pil_imgs.append(img)
                pids.append(pid)

            if not pil_imgs:
                continue

            inputs = processor(images=pil_imgs, return_tensors="pt")
            pixel_values = inputs["pixel_values"].to(device)

            with torch.no_grad():
                feats = model.get_image_features(pixel_values=pixel_values)
                # L2-normalise each embedding.
                feats = feats / feats.norm(dim=-1, keepdim=True)

            feats = feats.float().cpu().numpy()

            feats_buf.append(feats)
            id_buf.extend(pids)

            del pixel_values, inputs, pil_imgs, pids, feats

        if not feats_buf:
            print(f"[{split_name}] [warn] Nessuna immagine valida nello shard starting {start}.")
            continue

        F = np.concatenate(feats_buf, axis=0)
        I = np.array(id_buf, dtype=object)

        dt = time.time() - t0
        rate = F.shape[0] / max(dt, 1e-9)

        # The embedding width is read from the data itself rather than from a
        # notebook global, so the saved metadata always matches the features.
        _save_npz_atomic(
            shard_path,
            feats=F,
            post_id=I,
            model=MODEL_TAG,
            feat_dim=int(F.shape[1]),
        )

        print(f"[{split_name}] [save] {shard_path.name} | {F.shape[0]} esempi | "
              f"{dt:.1f}s | {rate:.1f} img/s")

        shard_files.append(shard_path)

    if not shard_files:
        print(f"[{split_name}] Nessuno shard creato, salto concatenazione.")
        return None, None

    print(f"[{split_name}] Concateno {len(shard_files)} shard...")

    feats_all = []
    ids_all = []

    for fpath in shard_files:
        # allow_pickle is needed because post_id is stored as an object array;
        # only load shards produced by this function (pickle is unsafe on
        # untrusted files).
        with np.load(fpath, allow_pickle=True) as data:
            feats_all.append(data["feats"])
            ids_all.append(data["post_id"])

    F_all = np.concatenate(feats_all, axis=0)
    I_all = np.concatenate(ids_all, axis=0)

    all_path = out_dir / f"{prefix}_ALL.npz"
    _save_npz_atomic(
        all_path,
        feats=F_all,
        post_id=I_all,
        model=MODEL_TAG,
        feat_dim=int(F_all.shape[1]),
    )

    print(f"[{split_name}] File unico: {all_path.name} | {F_all.shape[0]} esempi totali")

    return F_all, I_all

Samples creati:  train=960048, val=556982, test=588557


In [5]:
# Embed the training images. Shards already on disk are skipped, so this cell
# can be re-run to resume an interrupted extraction.
F_train, I_train = extract_and_save_clip(
    "train",
    train_samples,
    clip_model,
    clip_processor,
    shard_size=20000,
)


[train] Totale esempi: 960048
[train] Elaboro shard 0 - 19999
[train] [save] clip_vit_b32_train_0000000.npz | 20000 esempi | 2000.4s | 10.0 img/s
[train] Elaboro shard 20000 - 39999
[train] [save] clip_vit_b32_train_0020000.npz | 20000 esempi | 1970.9s | 10.1 img/s
[train] Elaboro shard 40000 - 59999
[train] [save] clip_vit_b32_train_0040000.npz | 20000 esempi | 1968.6s | 10.2 img/s
[train] Elaboro shard 60000 - 79999
[train] [save] clip_vit_b32_train_0060000.npz | 20000 esempi | 1957.0s | 10.2 img/s
[train] Elaboro shard 80000 - 99999
[train] [save] clip_vit_b32_train_0080000.npz | 20000 esempi | 1964.6s | 10.2 img/s
[train] Elaboro shard 100000 - 119999
[train] [save] clip_vit_b32_train_0100000.npz | 20000 esempi | 1963.3s | 10.2 img/s
[train] Elaboro shard 120000 - 139999
[train] [save] clip_vit_b32_train_0120000.npz | 20000 esempi | 1969.6s | 10.2 img/s
[train] Elaboro shard 140000 - 159999
[train] [save] clip_vit_b32_train_0140000.npz | 20000 esempi | 2000.4s | 10.0 img/s
[train]

In [6]:
# Embed the validation images. Shards already on disk are skipped, so this
# cell can be re-run to resume an interrupted extraction.
F_val, I_val = extract_and_save_clip(
    "validation",
    val_samples,
    clip_model,
    clip_processor,
    shard_size=20000,
)


[validation] Totale esempi: 556982
[validation] Elaboro shard 0 - 19999
[validation] [save] clip_vit_b32_validation_0000000.npz | 20000 esempi | 2081.9s | 9.6 img/s
[validation] Elaboro shard 20000 - 39999
[validation] [save] clip_vit_b32_validation_0020000.npz | 20000 esempi | 2007.9s | 10.0 img/s
[validation] Elaboro shard 40000 - 59999
[validation] [save] clip_vit_b32_validation_0040000.npz | 20000 esempi | 2004.4s | 10.0 img/s
[validation] Elaboro shard 60000 - 79999
[validation] [save] clip_vit_b32_validation_0060000.npz | 20000 esempi | 2013.2s | 9.9 img/s
[validation] Elaboro shard 80000 - 99999
[validation] [save] clip_vit_b32_validation_0080000.npz | 20000 esempi | 2004.3s | 10.0 img/s
[validation] Elaboro shard 100000 - 119999
[validation] [save] clip_vit_b32_validation_0100000.npz | 20000 esempi | 1983.7s | 10.1 img/s
[validation] Elaboro shard 120000 - 139999
[validation] [save] clip_vit_b32_validation_0120000.npz | 20000 esempi | 1994.9s | 10.0 img/s
[validation] Elaboro 

In [7]:
# Embed the test images. Shards already on disk are skipped, so this cell can
# be re-run to resume an interrupted extraction.
F_test, I_test = extract_and_save_clip(
    "test",
    test_samples,
    clip_model,
    clip_processor,
    shard_size=20000,
)


[test] Totale esempi: 588557
[test] Elaboro shard 0 - 19999
[test] [save] clip_vit_b32_test_0000000.npz | 20000 esempi | 2235.6s | 8.9 img/s
[test] Elaboro shard 20000 - 39999
[test] [save] clip_vit_b32_test_0020000.npz | 20000 esempi | 2122.3s | 9.4 img/s
[test] Elaboro shard 40000 - 59999
[test] [save] clip_vit_b32_test_0040000.npz | 20000 esempi | 1930.0s | 10.4 img/s
[test] Elaboro shard 60000 - 79999
[test] [save] clip_vit_b32_test_0060000.npz | 20000 esempi | 1945.6s | 10.3 img/s
[test] Elaboro shard 80000 - 99999
[test] [save] clip_vit_b32_test_0080000.npz | 20000 esempi | 2084.5s | 9.6 img/s
[test] Elaboro shard 100000 - 119999
[test] [save] clip_vit_b32_test_0100000.npz | 20000 esempi | 2233.6s | 9.0 img/s
[test] Elaboro shard 120000 - 139999
[test] [save] clip_vit_b32_test_0120000.npz | 20000 esempi | 1963.2s | 10.2 img/s
[test] Elaboro shard 140000 - 159999
[test] [save] clip_vit_b32_test_0140000.npz | 20000 esempi | 1959.8s | 10.2 img/s
[test] Elaboro shard 160000 - 179999

# LOAD EMBEDDINGS

In [2]:
# Load the concatenated per-image training embeddings written by the
# extraction step. allow_pickle is needed because post_id was saved as an
# object array.
train = np.load(
    "D:/dataset/clip_img_emb_ALL/clip_vit_b32_train_ALL.npz",
    allow_pickle=True,
)

ids_tr = train["post_id"]
X_tr = train["feats"]

# One embedding row per image id is expected.
print(X_tr.shape, len(ids_tr))

(960048, 512) 960048


In [3]:
# Collapse several images of the same post into one vector: the mean of the
# per-image embeddings. The mean is computed with a vectorised groupby on the
# 2-D array; the previous version boxed every row into an object column and
# called np.mean in Python once per post.
df = pd.DataFrame(X_tr, index=pd.Index(ids_tr, name="post_id"))
agg = df.groupby(level="post_id", sort=True).mean()

# Rows come out sorted by post_id; keep the original embedding dtype.
X_tr = agg.to_numpy(dtype=X_tr.dtype)
post_ids_unique = agg.index.values

In [4]:
# Sanity check: after aggregation there is one row per distinct post_id.
X_tr.shape

(773497, 512)

In [5]:
# np.save always writes a .npy file: given "X_tr.npz" it appended the suffix
# and produced "X_tr.npz.npy", which the later np.load(".../X_tr.npy") cells
# cannot find. Save under the name that is actually loaded.
np.save("D:/dataset/clip_img_emb_ALL/X_tr.npy", X_tr)

In [5]:
ids_list = post_ids_unique.tolist()

DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
# Thread count is a tuning hint only; report a rejection instead of hiding it.
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    print(f"[warn] Could not set DuckDB threads=8, using the default: {exc}")

print("Set up ready")

# Ship the ids to DuckDB as a DataFrame instead of one giant string-built
# INSERT: no manual quoting (which breaks on ids containing a quote) and no
# multi-megabyte SQL text. OR REPLACE keeps the cell re-runnable.
ids_df = pd.DataFrame({"post_id": ids_list, "ord": np.arange(len(ids_list))})
con.register("ids_df_view", ids_df)
con.execute("""
    CREATE OR REPLACE TEMP TABLE tmp_ids AS
    SELECT CAST(post_id AS VARCHAR) AS post_id, CAST(ord AS INTEGER) AS ord
    FROM ids_df_view
""")
con.unregister("ids_df_view")

targets = con.execute("""
    SELECT md1718.er_bins, tmp_ids.ord
    FROM md1718
    JOIN tmp_ids USING(post_id)
    ORDER BY tmp_ids.ord
""").fetchall()

# The inner join drops ids missing from md1718 and repeats ids that occur more
# than once there; either case would silently shift the labels relative to
# the embedding rows, so require exactly one label per id, in order.
ords = np.array([t[1] for t in targets], dtype=np.int64)
if not np.array_equal(ords, np.arange(len(ids_list))):
    raise ValueError(
        f"Label lookup is not aligned with the embeddings: got {len(targets)} "
        f"rows for {len(ids_list)} post_ids (missing or duplicated ids in md1718)."
    )

y_tr = np.array([t[0] for t in targets])


Set up ready


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# Persist the training labels (one per entry of post_ids_unique, in that order).
np.save("D:/dataset/clip_img_emb_ALL/y_tr_5.npy", y_tr)

In [7]:
# Drop the training arrays and intermediates, then force a collection so the
# memory is released before the next split is processed.
del X_tr, ids_tr, y_tr, train, df, agg, post_ids_unique, ids_list, targets
gc.collect()

0

In [6]:
# Load the concatenated per-image validation embeddings written by the
# extraction step. allow_pickle is needed because post_id was saved as an
# object array.
val = np.load(
    "D:/dataset/clip_img_emb_ALL/clip_vit_b32_validation_ALL.npz",
    allow_pickle=True,
)

ids_va = val["post_id"]
X_va = val["feats"]

# One embedding row per image id is expected.
print(X_va.shape, len(ids_va))

(556982, 512) 556982


In [7]:
# Collapse several images of the same post into one vector: the mean of the
# per-image embeddings. The mean is computed with a vectorised groupby on the
# 2-D array; the previous version boxed every row into an object column and
# called np.mean in Python once per post.
df = pd.DataFrame(X_va, index=pd.Index(ids_va, name="post_id"))
agg = df.groupby(level="post_id", sort=True).mean()

# Rows come out sorted by post_id; keep the original embedding dtype.
X_va = agg.to_numpy(dtype=X_va.dtype)
post_ids_unique_va = agg.index.values

In [8]:
# np.save always writes a .npy file: given "X_va.npz" it appended the suffix
# and produced "X_va.npz.npy", which the later np.load(".../X_va.npy") cells
# cannot find. Save under the name that is actually loaded.
np.save("D:/dataset/clip_img_emb_ALL/X_va.npy", X_va)

In [10]:
ids_list = post_ids_unique_va.tolist()

DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
# Thread count is a tuning hint only; report a rejection instead of hiding it.
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    print(f"[warn] Could not set DuckDB threads=8, using the default: {exc}")

print("Set up ready")

# Ship the ids to DuckDB as a DataFrame instead of one giant string-built
# INSERT: no manual quoting (which breaks on ids containing a quote) and no
# multi-megabyte SQL text. OR REPLACE keeps the cell re-runnable.
ids_df = pd.DataFrame({"post_id": ids_list, "ord": np.arange(len(ids_list))})
con.register("ids_df_view", ids_df)
con.execute("""
    CREATE OR REPLACE TEMP TABLE tmp_ids AS
    SELECT CAST(post_id AS VARCHAR) AS post_id, CAST(ord AS INTEGER) AS ord
    FROM ids_df_view
""")
con.unregister("ids_df_view")

targets = con.execute("""
    SELECT md1718.er_bins, tmp_ids.ord
    FROM md1718
    JOIN tmp_ids USING(post_id)
    ORDER BY tmp_ids.ord
""").fetchall()

# The inner join drops ids missing from md1718 and repeats ids that occur more
# than once there; either case would silently shift the labels relative to
# the embedding rows, so require exactly one label per id, in order.
ords = np.array([t[1] for t in targets], dtype=np.int64)
if not np.array_equal(ords, np.arange(len(ids_list))):
    raise ValueError(
        f"Label lookup is not aligned with the embeddings: got {len(targets)} "
        f"rows for {len(ids_list)} post_ids (missing or duplicated ids in md1718)."
    )

y_va = np.array([t[0] for t in targets])


Set up ready


In [11]:
# The two counts must match: one label per aggregated validation post_id.
print(len(y_va), len(ids_list))

412325 412325


In [12]:
# Persist the validation labels (one per entry of post_ids_unique_va, in that order).
np.save("D:/dataset/clip_img_emb_ALL/y_va_5.npy", y_va)

In [None]:
# Drop the validation arrays and intermediates, then force a collection.
# NOTE(review): post_ids_unique_va is not in this list; the following cell
# removes it.
del X_va, ids_va, y_va, val, df, agg, ids_list, targets
gc.collect()

In [14]:
# Remove whatever is left of the validation intermediates. The preceding
# cleanup cell already deletes ids_list and targets, so a plain `del` raised
# NameError when both cells were run in order; pop() tolerates names that are
# already gone.
for _name in ("post_ids_unique_va", "ids_list", "targets"):
    globals().pop(_name, None)
gc.collect()

479

In [2]:
# Reload the aggregated (one row per post) features and labels saved above:
# first the training pair, then the validation pair.
X_tr = np.load("D:/dataset/clip_img_emb_ALL/X_tr.npy", allow_pickle=True)
y_tr = np.load("D:/dataset/clip_img_emb_ALL/y_tr_5.npy", allow_pickle=True)

X_va = np.load("D:/dataset/clip_img_emb_ALL/X_va.npy", allow_pickle=True)
y_va = np.load("D:/dataset/clip_img_emb_ALL/y_va_5.npy", allow_pickle=True)

In [17]:
# SGD: linear SVM (hinge loss, L2 penalty) tuned on the validation split.
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    # Everything outside the grid is fixed for every run.
    clf = SGDClassifier(
        loss="hinge",
        penalty="l2",
        average=True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
        **params,
    )
    clf.fit(X_tr, y_tr)

    # Score on the validation split.
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append(
        dict(
            alpha=params["alpha"],
            class_weight=params["class_weight"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Selection is driven by macro-F1; on a tie the earlier combination stays.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.2783466562020923 | accuracy (val): 0.28547868792821196

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.2687266830552627 | accuracy (val): 0.2903510580246165

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.27655811306035166 | accuracy (val): 0.28561692839386404

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.2680011234738303 | accuracy (val): 0.28998969259685925

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.2767911240842179 | accuracy (val): 0.2842272479233614

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.2689597151968819 | accuracy (val): 0.2890171587946401

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.27520197581030825 | accuracy (val): 0.28350209179651975

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.2678752656158571 | accur

In [3]:
# NAIVE BAYES - GAUSSIAN: tune var_smoothing on the validation split.
param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on the training split.
    clf = GaussianNB(**params)
    clf.fit(X_tr, y_tr)

    # Score on the validation split.
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(
        dict(
            var_smoothing=params["var_smoothing"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Keep the combination with the highest macro-F1 (earlier one wins ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the runs in a DataFrame, best first, for easier inspection.
results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.2233 | accuracy (val): 0.2765

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.2233 | accuracy (val): 0.2765

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.2233 | accuracy (val): 0.2765

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.2233 | accuracy (val): 0.2765

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.22334045982463513

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.223340      0.276532
1   1.000000e-08      0.223340      0.276532
3   1.000000e-06      0.223338      0.276527
2   1.000000e-07      0.223336      0.276529


In [4]:
# RANDOM FOREST: small grid tuned on the validation split.
param_grid_rf = {
    "n_estimators": [50, 80],          # light combinations
    "max_depth": [10, 12],             # not too deep
    "min_samples_leaf": [2, 5],        # regularisation
    "max_features": [0.05, "sqrt"],    # two feature-sampling strategies
}

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    # Fit on the training split.
    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(X_tr, y_tr)

    # Score on the validation split.
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    # Hyperparameter columns first (fixed order), then the two metrics.
    row = {
        key: params[key]
        for key in ("n_estimators", "max_depth", "min_samples_leaf", "max_features")
    }
    row.update(val_macro_f1=macro_f1, val_accuracy=acc)
    results.append(row)

    # Keep the combination with the highest macro-F1 (earlier one wins ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the runs in a DataFrame, best first, for easier inspection.
results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2534 | accuracy (val): 0.2636

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2523 | accuracy (val): 0.2646

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2533 | accuracy (val): 0.2638

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.2521 | accuracy (val): 0.2643

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2524 | accuracy (val): 0.2635

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2510 | accuracy (val): 0.2640

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2528 | ac

In [5]:
# XGBOOST: small grid tuned on the validation split.

# The classifier is trained on integer class ids, so encode the labels once
# (fitted on train, applied to validation).
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)

param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    # Settings outside the grid are the same for every run.
    clf = XGBClassifier(
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
        **params,
    )

    # Fit on the training split.
    clf.fit(X_tr, y_tr_enc)

    # Score on the validation split (against the encoded labels).
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Keep the combination with the highest macro-F1 (earlier one wins ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2714 | accuracy (val): 0.2778

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2746 | accuracy (val): 0.2806

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2766 | accuracy (val): 0.2817

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2782 | accuracy (val): 0.2833

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2714 | accuracy (val): 0.2778

Combination: {'colsample_bytr

# PERFORMANCE ON THE TEST SET

In [3]:
# Load the concatenated per-image test embeddings written by the extraction
# step. allow_pickle is needed because post_id was saved as an object array.
test = np.load(
    "D:/dataset/clip_img_emb_ALL/clip_vit_b32_test_ALL.npz",
    allow_pickle=True,
)

# The archive is indexed by the string keys used when it was saved.
ids_te = test["post_id"]
X_te = test["feats"]

# One embedding row per image id is expected.
print(X_te.shape, len(ids_te))

(588557, 512) 588557


In [4]:
# Collapse several images of the same post into one vector: the mean of the
# per-image embeddings. The mean is computed with a vectorised groupby on the
# 2-D array; the previous version boxed every row into an object column and
# called np.mean in Python once per post.
df = pd.DataFrame(X_te, index=pd.Index(ids_te, name="post_id"))
agg = df.groupby(level="post_id", sort=True).mean()

# Rows come out sorted by post_id; keep the original embedding dtype.
X_te = agg.to_numpy(dtype=X_te.dtype)
post_ids_unique_te = agg.index.values

# The evaluation cells load X_te.npy, so the aggregated matrix has to be
# written here (the save used to be commented out).
np.save("D:/dataset/clip_img_emb_ALL/X_te.npy", X_te)

In [5]:
ids_list = post_ids_unique_te.tolist()

# Ship the ids to DuckDB as a DataFrame instead of one giant string-built
# INSERT: no manual quoting (which breaks on ids containing a quote) and no
# multi-megabyte SQL text. OR REPLACE keeps the cell re-runnable.
ids_df = pd.DataFrame({"post_id": ids_list, "ord": np.arange(len(ids_list))})
con.register("ids_df_view", ids_df)
con.execute("""
    CREATE OR REPLACE TEMP TABLE tmp_ids AS
    SELECT CAST(post_id AS VARCHAR) AS post_id, CAST(ord AS INTEGER) AS ord
    FROM ids_df_view
""")
con.unregister("ids_df_view")

targets = con.execute("""
    SELECT md1718.er_bins, tmp_ids.ord
    FROM md1718
    JOIN tmp_ids USING(post_id)
    ORDER BY tmp_ids.ord
""").fetchall()

# The inner join drops ids missing from md1718 and repeats ids that occur more
# than once there; either case would silently shift the labels relative to
# the embedding rows, so require exactly one label per id, in order.
ords = np.array([t[1] for t in targets], dtype=np.int64)
if not np.array_equal(ords, np.arange(len(ids_list))):
    raise ValueError(
        f"Label lookup is not aligned with the embeddings: got {len(targets)} "
        f"rows for {len(ids_list)} post_ids (missing or duplicated ids in md1718)."
    )

y_te = np.array([t[0] for t in targets])


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# The two counts must match: one label per aggregated test post_id.
print(len(y_te), len(ids_list))

423604 423604


In [7]:
# Persist the test labels (one per entry of post_ids_unique_te, in that order).
np.save("D:/dataset/clip_img_emb_ALL/y_te_5.npy", y_te)

In [8]:
# Drop the test arrays and intermediates, then force a collection so the
# memory is released before the final training run.
del X_te, ids_te, y_te, test, df, agg, post_ids_unique_te, ids_list, targets
gc.collect()

0

In [4]:
# Merge train and validation into a single training set for the final models.
X_tr = np.load("D:/dataset/clip_img_emb_ALL/X_tr.npy", allow_pickle = True)
y_tr = np.load("D:/dataset/clip_img_emb_ALL/y_tr_5.npy", allow_pickle = True)

X_va = np.load("D:/dataset/clip_img_emb_ALL/X_va.npy", allow_pickle = True)
y_va = np.load("D:/dataset/clip_img_emb_ALL/y_va_5.npy", allow_pickle = True)

# Features and labels are concatenated in the same order (train, then val) so
# the rows stay aligned.
X_trva = np.concatenate((X_tr, X_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

np.save("D:/dataset/clip_img_emb_ALL/X_trva.npy", X_trva)
# The second save used to write the feature matrix again, under
# "X_trva_5.npy", so the labels loaded later from "y_trva_5.npy" were never
# produced by this notebook. Write the merged labels to that file.
np.save("D:/dataset/clip_img_emb_ALL/y_trva_5.npy", y_trva)

In [None]:
# Final-evaluation data. From here on X_tr / y_tr hold the merged
# train+validation set (the *_trva files), not the training split alone.
X_tr = np.load("D:/dataset/clip_img_emb_ALL/X_trva.npy", allow_pickle=True)
X_te = np.load("D:/dataset/clip_img_emb_ALL/X_te.npy", allow_pickle=True)

y_tr = np.load("D:/dataset/clip_img_emb_ALL/y_trva_5.npy", allow_pickle=True)
y_te = np.load("D:/dataset/clip_img_emb_ALL/y_te_5.npy", allow_pickle=True)

In [5]:
# Linear SVM with the combination that scored the best validation macro-F1 in
# the SGD grid search (alpha=1e-05, no class weighting), evaluated on the test
# set.
cfg = SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=1e-05,
    class_weight=None,
    average=True,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)

# fit() returns the estimator itself, so training and prediction can be chained.
y_te_pred = cfg.fit(X_tr, y_tr).predict(X_te)

acc = accuracy_score(y_te, y_te_pred)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.1582 | accuracy (test): 0.2080


In [3]:
# Integer-encoded labels for XGBoost (fitted on the training labels, applied
# to the test labels).
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)

# One configured estimator per model family.
cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=2, n_estimators=80, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 1, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the other models use the raw labels.
    # Fit and score against the same label representation.
    use_encoded = isinstance(cfg, XGBClassifier)
    y_fit = y_tr_enc if use_encoded else y_tr
    y_true = y_te_enc if use_encoded else y_te

    cfg.fit(X_tr, y_fit)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(y_true, y_te_pred, average="macro")
    acc = accuracy_score(y_true, y_te_pred)

    # These scores are computed on the test set; the message used to label
    # them "(train)".
    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB()
macro-F1 (train): 0.0793 | accuracy (train): 0.0866

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=2,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (train): 0.1908 | accuracy (train): 0.2854

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy