In [1]:
import os, time, duckdb, gc
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import torchvision.transforms as T
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

# LOAD EMBEDDINGS

In [2]:
# Directory holding the per-split embedding archives, and the backbone name
# used as the file-name prefix.
emb_dir = Path("D:/dataset/efficientnetb0_emb")
MODEL_NAME = "efficientnet_b0"

def load_effnet_split(split_name, emb_dir=emb_dir, model_name=MODEL_NAME):
    """Load the stored embeddings and post ids for one data split.

    Reads ``<emb_dir>/<model_name>_<split_name>_ALL.npz``, prints the split
    name, the feature shape and the number of ids, and returns both arrays.

    Parameters
    ----------
    split_name : str
        Split tag embedded in the file name (this notebook uses "train",
        "val" and "test").
    emb_dir : pathlib.Path
        Directory containing the archive.
    model_name : str
        File-name prefix of the archive.

    Returns
    -------
    tuple
        ``(F, P)``: the ``feats`` array and the ``post_id`` array.
    """
    all_path = emb_dir / f"{model_name}_{split_name}_ALL.npz"
    # NOTE: allow_pickle=True unpickles object arrays, so only load archives
    # from a trusted source.
    # The context manager closes the archive's file handle once both arrays
    # have been read into memory (the original left it open).
    with np.load(all_path, allow_pickle=True) as data:
        F = data["feats"]
        P = data["post_id"]
    print(split_name, F.shape, len(P))
    return F, P

In [3]:
# Per-image embeddings and their post ids for the train and validation splits.
X_train, ids_train = load_effnet_split(split_name="train")
X_val, ids_val = load_effnet_split(split_name="val")

train (960048, 1280) 960048
val (556982, 1280) 556982


In [4]:
# Number of distinct posts per split (a post can contribute several images).
unique_posts_tr = np.unique(ids_train).size
print("post_id unici in ids_train:", unique_posts_tr)
unique_posts_va = np.unique(ids_val).size
print("post_id unici in ids_val:", unique_posts_va)

post_id unici in ids_train: 773497
post_id unici in ids_val: 412325


In [3]:
def aggregate_by_post(F, P, agg="mean"):
    """Pool the rows of ``F`` that share a post id into one vector per post.

    Parameters
    ----------
    F : array-like
        One feature row per entry of ``P``.
    P : array-like
        Post id of each row of ``F``.
    agg : {"mean", "max"}
        Element-wise reduction applied across the rows of each post.

    Returns
    -------
    pandas.DataFrame
        Columns ``post_id`` and ``feat`` (the pooled vector), one row per post.

    Raises
    ------
    ValueError
        If ``agg`` is neither "mean" nor "max".
    """
    reducers = {"mean": np.mean, "max": np.max}

    # One row per image; each "feat" cell holds that image's feature vector.
    per_image = pd.DataFrame({"post_id": P, "feat": list(F)})

    if agg not in reducers:
        raise ValueError("agg deve essere 'mean' o 'max'")
    reduce_fn = reducers[agg]

    def _pool(vectors):
        # Stack the vectors of one post and reduce across them.
        return reduce_fn(np.stack(vectors), axis=0)

    pooled = per_image.groupby("post_id")["feat"].apply(_pool)
    return pooled.reset_index()

In [6]:
# Mean-pool image embeddings into a single vector per post.
df_train_img = aggregate_by_post(F=X_train, P=ids_train, agg="mean")
df_val_img = aggregate_by_post(F=X_val, P=ids_val, agg="mean")

In [7]:
# Expect one row per unique post and two columns (post_id, feat).
train_img_shape, val_img_shape = df_train_img.shape, df_val_img.shape
print(train_img_shape, val_img_shape)

(773497, 2) (412325, 2)


In [4]:
# Open the metadata database that stores the labels.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Best-effort tuning: continue with DuckDB's default thread count, but
    # report the failure instead of hiding it.
    print(f"Could not set DuckDB threads, using the default: {exc}")

print("Set up ready")

Set up ready


In [9]:
# Labels (er_bins3) keyed by post_id for the train and validation splits.
train_label_sql = """SELECT post_id, er_bins3 FROM md1718 WHERE split = 'train'"""
val_label_sql = """SELECT post_id, er_bins3 FROM md1718 WHERE split = 'validation'"""
y_tr_ids = con.sql(train_label_sql).df()
y_val_ids = con.sql(val_label_sql).df()

In [10]:
# Attach the label to every pooled post vector. An inner join silently drops
# posts without a label and multiplies posts whose id appears more than once in
# the label table, so both situations are checked explicitly below.
df_train = df_train_img.merge(
    y_tr_ids[["post_id", "er_bins3"]],
    on="post_id", how="inner"
)
assert df_train["post_id"].is_unique, "train: duplicate post_id in labels multiplied feature rows"
if len(df_train) < len(df_train_img):
    print(f"train: {len(df_train_img) - len(df_train)} posts dropped (no label found)")

df_val = df_val_img.merge(
    y_val_ids[["post_id", "er_bins3"]],
    on="post_id", how="inner"
)
assert df_val["post_id"].is_unique, "val: duplicate post_id in labels multiplied feature rows"
if len(df_val) < len(df_val_img):
    print(f"val: {len(df_val_img) - len(df_val)} posts dropped (no label found)")

In [11]:
# Turn the per-post vectors into dense 2-D matrices with aligned label arrays.
feat_col, label_col = "feat", "er_bins3"

X_tr, y_tr = np.stack(df_train[feat_col].values), df_train[label_col].values
X_va, y_va = np.stack(df_val[feat_col].values), df_val[label_col].values

In [12]:
# Feature matrices and label vectors must agree on their first dimension.
feature_shapes = (X_tr.shape, X_va.shape)
label_shapes = (y_tr.shape, y_va.shape)
print(*feature_shapes, *label_shapes)

(773497, 1280) (412325, 1280) (773497,) (412325,)


In [13]:
# Peek at the first post ids of the merged training frame.
df_train['post_id'].head()

0    100pintas-1769662389073991144
1    100pintas-1782702664733979876
2    100pintas-1797067212467389817
3    100pintas-1807955339238986900
4    100pintas-1808039696742034708
Name: post_id, dtype: object

In [14]:
# First five training labels (string class names).
y_tr[:5]

array(['medium', 'medium', 'medium', 'medium', 'high'], dtype=object)

In [15]:
# Keep the post ids in the same row order as X_tr / X_va.
ids_tr, ids_va = (frame["post_id"].values for frame in (df_train, df_val))

In [16]:
# Release the raw embeddings, intermediate frames and label tables.
del X_train, X_val, ids_train, ids_val
del df_train_img, df_val_img, df_train, df_val
del y_tr_ids, y_val_ids
gc.collect()

0

In [17]:
# Persist the per-post matrices, labels and ids so later sessions can skip
# the aggregation step.
train_out_path = "D:/dataset/efficientnetb0_emb/train_data_3.npz"
val_out_path = "D:/dataset/efficientnetb0_emb/val_data_3.npz"

np.savez_compressed(train_out_path, X=X_tr, y=y_tr, ids=ids_tr)
np.savez_compressed(val_out_path, X=X_va, y=y_va, ids=ids_va)

In [2]:
# Reload the cached per-post matrices; allow_pickle is enabled on both loads
# (the earlier cell's output shows the labels with dtype=object).
train_data = np.load("D:/dataset/efficientnetb0_emb/train_data_3.npz", allow_pickle = True)
X_tr, y_tr = train_data["X"], train_data["y"]

val_data = np.load("D:/dataset/efficientnetb0_emb/val_data_3.npz", allow_pickle = True)
X_va, y_va = val_data["X"], val_data["y"]

# Drop the archive handles once the arrays are in memory.
del train_data
del val_data
gc.collect()

579

In [5]:
# SGD: grid search on the validation set over regularisation strength and
# class weighting, selecting by macro-F1.
# NOTE(review): the features are passed unscaled although StandardScaler is
# imported; SGD is sensitive to feature scale - confirm whether the embeddings
# should be standardised first.
param_grid = dict(
    alpha=[1e-5, 1e-4, 1e-3, 1e-2],
    class_weight=[None, "balanced"],
)

# Settings shared by every candidate.
sgd_fixed_kwargs = dict(
    loss="hinge",
    penalty="l2",
    average=True,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(**sgd_fixed_kwargs, **params)
    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append(dict(
        alpha=params["alpha"],
        class_weight=params["class_weight"],
        val_macro_f1=macro_f1,
        val_accuracy=acc,
    ))

    # Strict '>' keeps the first combination when scores tie.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:", best_params, sep="\n")
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):", results_df, sep="\n")


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.40787615140160755 | accuracy (val): 0.4142169405202207

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.3984984384307299 | accuracy (val): 0.41779421572788455

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.4068502983466689 | accuracy (val): 0.4146995695143394

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.396013011089475 | accuracy (val): 0.41826714363669437

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.40344063465924057 | accuracy (val): 0.41383617292184566

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.3894171511357631 | accuracy (val): 0.4165379251803796

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.4010127675288108 | accuracy (val): 0.4119153580306797

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.38539996979673163 | accura

In [3]:
# NAIVE BAYES - GAUSSIAN: grid search on the validation set over var_smoothing.
# The recorded output below shows identical scores for all four values.
param_grid_nb = dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6])

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on TRAIN, evaluate on VALIDATION.
    clf = GaussianNB(**params)
    clf.fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(
        var_smoothing=params["var_smoothing"],
        val_macro_f1=macro_f1,
        val_accuracy=acc,
    ))

    # Track the best configuration by macro-F1 (first one wins on ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:", best_params, sep="\n")
print("Validation macro-F1:", best_score)

# Collect the results in a DataFrame for easier inspection.
results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):", results_df, sep="\n")


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.3726 | accuracy (val): 0.4114

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.3726 | accuracy (val): 0.4114

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.3726 | accuracy (val): 0.4114

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.3726 | accuracy (val): 0.4114

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.37261486210082356

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.372615      0.411362
1   1.000000e-08      0.372615      0.411362
2   1.000000e-07      0.372615      0.411362
3   1.000000e-06      0.372615      0.411362


In [4]:
# RANDOM FOREST: grid search on the validation set, selecting by macro-F1.
param_grid_rf = dict(
    n_estimators=[30, 50, 80],
    max_depth=[8, 10, 12],
    min_samples_leaf=[2, 5],
    max_features=[0.05, "sqrt"],
)
# Column order of the results table (ParameterGrid itself yields sorted keys).
rf_param_columns = ("n_estimators", "max_depth", "min_samples_leaf", "max_features")

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    # Fit on TRAIN, evaluate on VALIDATION.
    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    row = {name: params[name] for name in rf_param_columns}
    row.update(val_macro_f1=macro_f1, val_accuracy=acc)
    results.append(row)

    # Track the best configuration by macro-F1 (first one wins on ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):", best_params, sep="\n")
print("Validation macro-F1:", best_score)

# Collect the results in a DataFrame for easier inspection.
results_df_rf = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):", results_df_rf, sep="\n")


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.3989 | accuracy (val): 0.3981

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.4000 | accuracy (val): 0.3991

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.4005 | accuracy (val): 0.3996

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.3998 | accuracy (val): 0.3989

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.4005 | accuracy (val): 0.3995

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.4011 | accuracy (val): 0.4001

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.3947 | accuracy (val

In [3]:
# XGBOOST: grid search on the validation set, selecting by macro-F1.

# Convert the string labels into integer codes (fitted on the training labels).
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)

param_grid_xgb = dict(
    n_estimators=[100, 150],
    max_depth=[4, 6],
    learning_rate=[0.1],
    subsample=[0.8],
    colsample_bytree=[0.5],
    gamma=[0, 1],
    reg_lambda=[1],
)

# Settings shared by every candidate; the class count does not change per run.
xgb_fixed_kwargs = dict(
    objective="multi:softmax",
    num_class=len(np.unique(y_tr_enc)),
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0,
)

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    # Fit on the encoded training labels, then score the validation split.
    clf = XGBClassifier(**params, **xgb_fixed_kwargs)
    clf.fit(X_tr, y_tr_enc)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Strict '>' keeps the first combination when scores tie.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):", best_params, sep="\n")
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results:", results_df_xgb, sep="\n")


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4112 | accuracy (val): 0.4138

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4125 | accuracy (val): 0.4158

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4136 | accuracy (val): 0.4170

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4151 | accuracy (val): 0.4186

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4112 | accuracy (val): 0.4138

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON THE TEST SET

In [2]:
emb_dir = Path("D:/dataset/efficientnetb0_emb")
MODEL_NAME = "efficientnet_b0"  # same name as before

def load_effnet_split(split_name, emb_dir=emb_dir, model_name=MODEL_NAME):
    """Load the stored embeddings and post ids for one data split.

    Reads ``<emb_dir>/<model_name>_<split_name>_ALL.npz``, prints the split
    name, the feature shape and the number of ids, and returns both arrays.

    Parameters
    ----------
    split_name : str
        Split tag embedded in the file name (e.g. "train", "test").
    emb_dir : pathlib.Path
        Directory containing the archive.
    model_name : str
        File-name prefix of the archive.

    Returns
    -------
    tuple
        ``(F, P)`` where ``F`` has shape [N_images, feat_dim] and ``P`` has
        shape [N_images] with dtype=object (string ids).
    """
    all_path = emb_dir / f"{model_name}_{split_name}_ALL.npz"
    # NOTE: allow_pickle=True unpickles object arrays, so only load archives
    # from a trusted source.
    # The context manager closes the archive's file handle once both arrays
    # have been read into memory (the original left it open).
    with np.load(all_path, allow_pickle=True) as data:
        F = data["feats"]      # shape: [N_images, feat_dim]
        P = data["post_id"]    # shape: [N_images], dtype=object (string ids)
    print(split_name, F.shape, len(P))
    return F, P

def aggregate_by_post(F, P, agg="mean"):
    """Pool per-image features into one vector per post.

    Parameters
    ----------
    F : array-like, [N_img, D]
        One feature row per image.
    P : array-like, [N_img]
        Post id of each image.
    agg : {"mean", "max"}
        Element-wise reduction applied across the images of a post.

    Returns
    -------
    pandas.DataFrame
        Columns ``["post_id", "feat"]``, ONE record per post.

    Raises
    ------
    ValueError
        If ``agg`` is neither "mean" nor "max".
    """
    reducers = {"mean": np.mean, "max": np.max}

    # Every row is one image; each "feat" cell is a 1-D array of length D.
    per_image = pd.DataFrame({"post_id": P, "feat": list(F)})

    if agg not in reducers:
        raise ValueError("agg deve essere 'mean' o 'max'")
    reduce_fn = reducers[agg]

    def _pool(vectors):
        # Stack the vectors of one post and reduce across them.
        return reduce_fn(np.stack(vectors), axis=0)

    pooled = per_image.groupby("post_id")["feat"].apply(_pool)
    return pooled.reset_index()

In [None]:
# Only the train split is loaded in this cell; the validation load is disabled.
X_train, ids_train = load_effnet_split(split_name="train")
# X_val, ids_val = load_effnet_split("val")

In [4]:
# Number of distinct posts in the train split (the validation count is disabled).
unique_posts_tr = np.unique(ids_train).size
print("post_id unici in ids_train:", unique_posts_tr)
# unique_posts_va = len(np.unique(ids_val))
# print("post_id unici in ids_val:", unique_posts_va)

post_id unici in ids_train: 773497
post_id unici in ids_val: 412325


In [5]:
# Drop the bookkeeping counter. unique_posts_va and X_val are not created by
# the preceding cells of this section (their lines are commented out), so
# listing them in `del` raised NameError on a fresh kernel.
del unique_posts_tr
gc.collect()

38

In [6]:
# Mean-pool image embeddings into a single vector per post.
df_train_img = aggregate_by_post(X_train, ids_train, agg="mean")
# The cells below read df_val_img, X_val and ids_val, but the validation lines
# were commented out, so a fresh run failed with NameError. Load and aggregate
# the validation split here so the section runs top to bottom.
X_val, ids_val = load_effnet_split("val")
df_val_img = aggregate_by_post(X_val, ids_val, agg="mean")

In [7]:
# Expect one row per unique post and two columns (post_id, feat).
train_img_shape, val_img_shape = df_train_img.shape, df_val_img.shape
print(train_img_shape, val_img_shape)

(773497, 2) (412325, 2)


In [8]:
# The per-image matrices are no longer needed once the posts are aggregated.
del X_train
del X_val
gc.collect()

In [10]:
# Open the metadata database that stores the labels.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Best-effort tuning: continue with DuckDB's default thread count, but
    # report the failure instead of hiding it.
    print(f"Could not set DuckDB threads, using the default: {exc}")

print("Set up ready")

Set up ready


In [11]:
# Labels (er_bins3) keyed by post_id for the train and validation splits.
train_label_sql = """SELECT post_id, er_bins3 FROM md1718 WHERE split = 'train'"""
val_label_sql = """SELECT post_id, er_bins3 FROM md1718 WHERE split = 'validation'"""
y_tr_ids = con.sql(train_label_sql).df()
y_val_ids = con.sql(val_label_sql).df()

In [12]:
# Attach the label to every pooled post vector. An inner join silently drops
# posts without a label and multiplies posts whose id appears more than once in
# the label table, so both situations are checked explicitly below.
df_train = df_train_img.merge(
    y_tr_ids[["post_id", "er_bins3"]],
    on="post_id", how="inner"
)
assert df_train["post_id"].is_unique, "train: duplicate post_id in labels multiplied feature rows"
if len(df_train) < len(df_train_img):
    print(f"train: {len(df_train_img) - len(df_train)} posts dropped (no label found)")

df_val = df_val_img.merge(
    y_val_ids[["post_id", "er_bins3"]],
    on="post_id", how="inner"
)
assert df_val["post_id"].is_unique, "val: duplicate post_id in labels multiplied feature rows"
if len(df_val) < len(df_val_img):
    print(f"val: {len(df_val_img) - len(df_val)} posts dropped (no label found)")

In [13]:
# The unlabelled per-post frames are superseded by the merged ones.
del df_train_img
del df_val_img
gc.collect()

0

In [14]:
# Dense matrices and label arrays per split ...
feat_col, label_col = "feat", "er_bins3"

X_tr, y_tr = np.stack(df_train[feat_col].values), df_train[label_col].values
X_va, y_va = np.stack(df_val[feat_col].values), df_val[label_col].values

# ... then train and validation stacked row-wise for the final refit.
X_trva = np.concatenate([X_tr, X_va])
y_trva = np.concatenate([y_tr, y_va])

In [16]:
# Row count should equal train posts + validation posts.
trainval_shapes = (X_trva.shape, y_trva.shape)
print(*trainval_shapes)

(1185822, 1280) (1185822,)


In [17]:
# Release the merged frames, raw ids and label tables.
del df_train, df_val, ids_train, ids_val, y_tr_ids, y_val_ids
gc.collect()

477

In [18]:
# Cache the combined train+validation matrix and labels.
trainval_out_path = "D:/dataset/efficientnetb0_emb/trainval_data_3.npz"
np.savez_compressed(trainval_out_path, X=X_trva, y=y_trva)

In [19]:
# Everything has been written to disk; free the in-memory copies.
del X_tr, X_va, X_trva
del y_tr, y_va, y_trva
gc.collect()

0

In [6]:
# Build and cache the per-post test matrix: load, aggregate, label, save.
X_test, ids_test = load_effnet_split("test")
unique_posts_te = len(np.unique(ids_test))
print("post_id unici in ids_te:", unique_posts_te)
del unique_posts_te
gc.collect()
df_test_img = aggregate_by_post(X_test, ids_test, agg="mean")
print(df_test_img.shape)
del X_test
gc.collect()
y_test_ids = con.sql("""SELECT post_id, er_bins3 FROM md1718 WHERE split = 'test'""").df()
df_test = df_test_img.merge(
    y_test_ids[["post_id", "er_bins3"]],
    on="post_id", how="inner"
)
# An inner join silently multiplies posts whose id is repeated in the label
# table and drops posts without a label; make both cases visible.
assert df_test["post_id"].is_unique, "test: duplicate post_id in labels multiplied feature rows"
if len(df_test) < len(df_test_img):
    print(f"test: {len(df_test_img) - len(df_test)} posts dropped (no label found)")

X_te = np.stack(df_test["feat"].values)
y_te = df_test["er_bins3"].values

np.savez_compressed(
    "D:/dataset/efficientnetb0_emb/test_data_3.npz",
    X=X_te,
    y=y_te
)

test (588557, 1280) 588557
post_id unici in ids_te: 423604
(423604, 2)


In [7]:
# The test arrays are on disk; release every intermediate.
del ids_test, y_test_ids
del df_test_img, df_test
del X_te, y_te
gc.collect()

0

In [2]:
# Load the train+validation matrix used to refit the selected models.
# The context manager releases the archive's file handle once both arrays are
# in memory (the original kept `train_data` open for the rest of the session).
# NOTE(review): the recorded output of the shape cell below shows 773497 rows,
# which matches the train-only split rather than the 1185822 rows written to
# trainval_data_3.npz earlier - confirm which file produced the reported scores.
with np.load("D:/dataset/efficientnetb0_emb/trainval_data_3.npz", allow_pickle = True) as train_data:
    X_tr = train_data["X"]
    y_tr = train_data["y"]

In [3]:
# Load the cached per-post test matrix and labels, then drop the archive handle.
test_data = np.load("D:/dataset/efficientnetb0_emb/test_data_3.npz", allow_pickle = True)
X_te, y_te = test_data["X"], test_data["y"]

del test_data
gc.collect()

82

In [4]:
# Sanity check: features and labels must agree on their first dimension.
feature_shapes = (X_tr.shape, X_te.shape)
label_shapes = (y_tr.shape, y_te.shape)
print(*feature_shapes, *label_shapes)

(773497, 1280) (423604, 1280) (773497,) (423604,)


In [10]:
# Integer-encode the string labels; the mapping is learned from the training labels only.
le = LabelEncoder().fit(y_tr)
y_tr_enc = le.transform(y_tr)
y_te_enc = le.transform(y_te)

In [12]:
# Selected configurations, refitted and then scored once on the test set.
cfgs = [
    GaussianNB(var_smoothing=1e-09),
    RandomForestClassifier(
        n_estimators=80,
        max_depth=12,
        min_samples_leaf=5,
        max_features=0.05,
        n_jobs=-1,
        random_state=42,
    ),
    XGBClassifier(
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.5,
        gamma=0,
        reg_lambda=1,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    ),
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the other estimators take the raw labels.
    if isinstance(cfg, XGBClassifier):
        y_fit, y_true = y_tr_enc, y_te_enc
    else:
        y_fit, y_true = y_tr, y_te

    cfg.fit(X_tr, y_fit)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(y_true, y_te_pred, average="macro")
    acc = accuracy_score(y_true, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB()
macro-F1 (test): 0.3747 | accuracy (test): 0.4116

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=5,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (test): 0.4063 | accuracy (test): 0.4048

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=Non

In [5]:
# Selected SGD configuration (alpha=1e-05, no class weighting), scored on the test set.
sgd_best_kwargs = dict(
    loss="hinge",
    penalty="l2",
    alpha=1e-05,
    average=True,
    class_weight=None,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)
cfg = SGDClassifier(**sgd_best_kwargs)

cfg.fit(X_tr, y_tr)
y_te_pred = cfg.predict(X_te)

macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.4072 | accuracy (test): 0.4141
