In [1]:
import os, time, duckdb, torch, timm, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from PIL import Image
from sklearn.model_selection import ParameterGrid

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

import torchvision.transforms as T
from pathlib import Path

from torch.utils.data import Dataset, DataLoader
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import duckdb, torch
from transformers import CLIPModel, CLIPProcessor

In [2]:
# Open the DuckDB database that holds the post metadata used throughout the notebook.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # The thread setting is best-effort tuning: keep going with DuckDB's default,
    # but report the failure instead of hiding it.
    print(f"Could not set PRAGMA threads=8, keeping the default: {exc}")

print("Set up ready")

Set up ready


# SET UP

In [14]:
# Build a category-balanced training sample in table train_clip_sample:
# rows of md1718 with split = 'train' are ranked in random order inside each
# category (rn) and only rows with rn <= 25000 / num_categories are kept.
# num_categories is counted over every row of md1718, not only the train split.
# The helper column rn is part of the created table.
# NOTE(review): ORDER BY RANDOM() is not seeded in this cell, so a re-run
# presumably draws a different sample — confirm before relying on saved arrays.
con.execute("""
CREATE OR REPLACE TABLE train_clip_sample AS
WITH categories AS (
    SELECT DISTINCT category FROM md1718
),
params AS (
    SELECT COUNT(*) AS num_categories FROM categories
),
sampled AS (
    SELECT *
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (
                PARTITION BY category
                ORDER BY RANDOM()
            ) AS rn
        FROM md1718
        WHERE split = 'train'
    )
    WHERE rn <= (25000 / (SELECT num_categories FROM params))
)
SELECT *
FROM sampled;
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x1a2476f5130>

In [15]:
# Draw 5000 rows at random from the 'validation' split of md1718 into
# val_clip_sample (no per-category balancing, unlike the training sample).
con.execute("""
CREATE OR REPLACE TABLE val_clip_sample AS
SELECT *
FROM md1718
WHERE split = 'validation'
ORDER BY RANDOM()
LIMIT 5000;
""")

<_duckdb.DuckDBPyConnection at 0x1a2476f5130>

In [16]:
# Draw 5000 rows at random from the 'test' split of md1718 into test_clip_sample.
con.execute("""CREATE OR REPLACE TABLE test_clip_sample AS
SELECT *
FROM md1718
WHERE split = 'test'
ORDER BY RANDOM()
LIMIT 5000;

""")

<_duckdb.DuckDBPyConnection at 0x1a2476f5130>

In [18]:
# Remove the ranking helper column so train_clip_sample has the same columns as
# the validation/test samples. IF EXISTS keeps the cell re-runnable: a second
# execution is a no-op instead of an error on the already-missing column.
con.execute("""ALTER TABLE train_clip_sample DROP COLUMN IF EXISTS rn""")

<_duckdb.DuckDBPyConnection at 0x1a2476f5130>

In [19]:
# Stack the three per-split samples into one table, clip_full_sample.
# UNION ALL with SELECT * matches columns by position, so the three tables
# must expose the same columns in the same order when this runs.
con.execute("""CREATE OR REPLACE TABLE clip_full_sample AS
SELECT * FROM train_clip_sample
UNION ALL
SELECT * FROM val_clip_sample
UNION ALL
SELECT * FROM test_clip_sample;
""")

<_duckdb.DuckDBPyConnection at 0x1a2476f5130>

In [20]:
# Sanity check: total number of rows, and how many rows repeat an already-seen
# post_id (COUNT(*) minus COUNT(DISTINCT post_id)).
con.sql("""SELECT COUNT(*) AS tot, COUNT(*) - COUNT(DISTINCT post_id) AS duplicates FROM clip_full_sample""").df()

Unnamed: 0,tot,duplicates
0,34993,0


In [21]:
# Number of sampled rows for every (category, split) pair.
con.sql("""SELECT category, split, COUNT(*) as n
FROM clip_full_sample
GROUP BY category, split""").df()

Unnamed: 0,category,split,n
0,travel,validation,498
1,beauty,validation,185
2,family,test,651
3,fitness,train,2777
4,pet,train,2777
5,interior,validation,208
6,fashion,test,1832
7,other,test,859
8,interior,train,2777
9,fitness,validation,139


In [22]:
# Distribution of the er_bins label (used below as the 5-class target) per split.
con.sql("""SELECT er_bins, split, COUNT(*) as n
FROM clip_full_sample
GROUP BY er_bins, split
ORDER BY split""").df()

Unnamed: 0,er_bins,split,n
0,low,test,1009
1,high,test,963
2,medium,test,1021
3,very_low,test,1039
4,very_high,test,968
5,low,train,5318
6,very_low,train,4529
7,very_high,train,4374
8,high,train,5321
9,medium,train,5451


In [23]:
# Distribution of the er_bins3 label (used below as the 3-class target) per split.
con.sql("""SELECT er_bins3, split, COUNT(*) as n
FROM clip_full_sample
GROUP BY er_bins3, split
ORDER BY split""").df()

Unnamed: 0,er_bins3,split,n
0,medium,test,1629
1,high,test,1639
2,low,test,1732
3,high,train,8102
4,medium,train,8868
5,low,train,8023
6,low,validation,1682
7,high,validation,1751
8,medium,validation,1567


In [24]:
# Distribution of the er_bins2 label (saved below as the y_*_2 targets) per split.
con.sql("""SELECT er_bins2, split, COUNT(*) as n
FROM clip_full_sample
GROUP BY er_bins2, split
ORDER BY split""").df()

Unnamed: 0,er_bins2,split,n
0,low,test,2555
1,high,test,2445
2,high,train,12504
3,low,train,12489
4,high,validation,2516
5,low,validation,2484


In [25]:
# List the columns and types of clip_full_sample.
con.sql("""PRAGMA table_info('clip_full_sample')""").df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,filename,VARCHAR,False,,False
1,1,username,VARCHAR,False,,False
2,2,like_count,INTEGER,False,,False
3,3,comment_count,INTEGER,False,,False
4,4,width,INTEGER,False,,False
5,5,height,INTEGER,False,,False
6,6,time_utc,TIMESTAMP,False,,False
7,7,caption,VARCHAR,False,,False
8,8,aspect_ratio,DOUBLE,False,,False
9,9,area,INTEGER,False,,False


# TRAIN

In [26]:
# post_id values of the training sample; they drive every filter and join in the TRAIN section.
# NOTE(review): this reads table clip_full_sample25, while the validation and test
# sections read clip_full_sample. No cell in this notebook creates clip_full_sample25,
# so on a fresh database this query presumably fails — confirm which table is intended
# and keep it consistent with the query that builds the training targets.
sample_df = con.execute("SELECT post_id FROM clip_full_sample25 WHERE split = 'train'").df()
sample_ids = sample_df["post_id"].to_numpy()

print("Sample size:", len(sample_ids))

Sample size: 24993


In [27]:
# Retrieve TEXT embeddings for the training split and keep only the sampled posts.
text_npz_path = "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_train_ids_y.npz"
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
train_text = np.load(text_npz_path, allow_pickle=True)

X_all_text = train_text["embeddings"] 
ids_all_text = train_text["ids"]      

# Boolean mask over the archive rows whose id belongs to the sample
mask_text = np.isin(ids_all_text, sample_ids)
X_text_filt = X_all_text[mask_text]
ids_text_filt = ids_all_text[mask_text]

# DataFrame with one row per kept embedding; each "emb" cell holds one row of X_text_filt
df_text = pd.DataFrame({
    "post_id": ids_text_filt,
    "emb": list(X_text_filt)
})

In [28]:
# Retrieve IMAGE embeddings for the training split and keep only the sampled posts.

img_npz_path = "D:/dataset/clip_img_emb_ALL/clip_vit_b32_train_ALL.npz"
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
train_img = np.load(img_npz_path, allow_pickle=True)

X_all_img = train_img["feats"] 
ids_all_img = train_img["post_id"] 

mask_img = np.isin(ids_all_img, sample_ids)
X_img_filt = X_all_img[mask_img]
ids_img_filt = ids_all_img[mask_img]

df_img = pd.DataFrame({
    "post_id": ids_img_filt,
    "emb": list(X_img_filt)
})

# A post_id may appear on several rows of the archive: average all of its
# embeddings element-wise so that every post ends up with a single vector.
agg = df_img.groupby("post_id")["emb"].apply(lambda s: np.mean(np.vstack(s.values), axis=0))

df_img_agg = agg.reset_index() 

In [29]:
# Tabular metadata features for the training split.
meta = pd.read_csv("D:/dataset/meta_classification/meta_train_final.csv")

# Filter: keep only the rows of the sampled posts (copy to detach from the full frame)
meta = meta[meta["post_id"].isin(sample_ids)].copy()
print("Meta rows after filter:", len(meta))

Meta rows after filter: 24993


In [30]:
# Inspect the metadata column types; every column except post_id is used as a feature below.
meta.dtypes

post_id                   object
width                    float64
height                   float64
aspect_ratio             float64
area                     float64
dow                      float64
hour_utc                 float64
month                    float64
year                     float64
caption_len_char         float64
n_hashtags               float64
n_mentions               float64
n_urls                   float64
n_emojis                 float64
followees                float64
posts                    float64
has_caption                int64
orientation_landscape    float64
orientation_portrait     float64
orientation_square       float64
category_beauty          float64
category_family          float64
category_fashion         float64
category_fitness         float64
category_food            float64
category_interior        float64
category_other           float64
category_pet             float64
category_travel          float64
dtype: object

In [31]:
# Give the two embedding columns distinct names so they can live in one frame.
df_text = df_text.rename(columns={"emb": "emb_text"})
df_img_agg = df_img_agg.rename(columns={"emb": "emb_img"})

# Align: left-join everything onto the sampled ids so that row i of every
# feature block refers to the same post. validate="one_to_one" makes pandas
# raise if any side repeats a post_id, which would silently duplicate rows.
aligned = (
    sample_df[["post_id"]]
    .merge(df_text, on="post_id", how="left", validate="one_to_one")
    .merge(df_img_agg, on="post_id", how="left", validate="one_to_one")
    .merge(meta, on="post_id", how="left", validate="one_to_one")
)

assert len(aligned) == len(sample_df), "alignment changed the number of rows"
assert aligned["emb_text"].notna().all()
assert aligned["emb_img"].notna().all()

X_text = np.stack(aligned["emb_text"].to_numpy())
X_img  = np.stack(aligned["emb_img"].to_numpy())

meta_cols = meta.columns.drop("post_id")
X_meta = aligned[meta_cols].to_numpy(np.float32)
# A post without a metadata row would come out of the left join as NaN features.
assert not np.isnan(X_meta).any(), "missing metadata for some sampled posts"

# Final design matrix: [text embedding | image embedding | metadata]
X_train = np.hstack([X_text, X_img, X_meta])
ids_train = aligned["post_id"].to_numpy()

In [32]:
# The three feature blocks must have the same number of rows before being stacked side by side.
print(X_text.shape, X_img.shape, X_meta.shape)

(24993, 512) (24993, 512) (24993, 28)


In [33]:
# Persist the training matrix and the matching ids; np.save appends ".npy"
# to the given path, which is why the files are later loaded as "X_tr.npy" / "ids_train.npy".
np.save("D:/dataset/restricted_clip/X_tr", X_train)
np.save("D:/dataset/restricted_clip/ids_train", ids_train)

# VALIDATION

In [34]:
# post_id values of the validation sample; sample_df / sample_ids are rebound here
# and now refer to the validation split.
sample_df = con.execute("SELECT post_id FROM clip_full_sample WHERE split = 'validation'").df()
sample_ids = sample_df["post_id"].to_numpy()

print("Sample size:", len(sample_ids))

Sample size: 5000


In [35]:
# Retrieve TEXT embeddings for the validation split and keep only the sampled posts.
text_npz_path = "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_val_ids_y.npz"
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
val_text = np.load(text_npz_path, allow_pickle=True)

X_all_text = val_text["embeddings"] 
ids_all_text = val_text["ids"]      
mask_text = np.isin(ids_all_text, sample_ids)
X_text_filt = X_all_text[mask_text]
ids_text_filt = ids_all_text[mask_text]

# One row per kept embedding; each "emb" cell holds one row of X_text_filt
df_text = pd.DataFrame({
    "post_id": ids_text_filt,
    "emb": list(X_text_filt)
})


In [36]:
# Retrieve IMAGE embeddings for the validation split and keep only the sampled posts.
img_npz_path = "D:/dataset/clip_img_emb_ALL/clip_vit_b32_validation_ALL.npz"
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
val_img = np.load(img_npz_path, allow_pickle=True)

X_all_img = val_img["feats"]     
ids_all_img = val_img["post_id"] 

mask_img = np.isin(ids_all_img, sample_ids)
X_img_filt = X_all_img[mask_img]
ids_img_filt = ids_all_img[mask_img]

df_img = pd.DataFrame({
    "post_id": ids_img_filt,
    "emb": list(X_img_filt)
})

# Average all embeddings that share a post_id so every post has a single vector.
agg = df_img.groupby("post_id")["emb"].apply(lambda s: np.mean(np.vstack(s.values), axis=0))

df_img_agg = agg.reset_index()

In [37]:
# Tabular metadata features for the validation split, restricted to the sampled posts.
meta = pd.read_csv("D:/dataset/meta_classification/meta_val_final.csv")

meta = meta[meta["post_id"].isin(sample_ids)].copy()
print("Meta rows after filter:", len(meta))

Meta rows after filter: 5000


In [38]:
# Give the two embedding columns distinct names so they can live in one frame.
df_text = df_text.rename(columns={"emb": "emb_text"})
df_img_agg = df_img_agg.rename(columns={"emb": "emb_img"})

# Alignment: left-join everything onto the sampled ids so that row i of every
# feature block refers to the same post. validate="one_to_one" makes pandas
# raise if any side repeats a post_id, which would silently duplicate rows.
aligned = (
    sample_df[["post_id"]]
    .merge(df_text, on="post_id", how="left", validate="one_to_one")
    .merge(df_img_agg, on="post_id", how="left", validate="one_to_one")
    .merge(meta, on="post_id", how="left", validate="one_to_one")
)

assert len(aligned) == len(sample_df), "alignment changed the number of rows"
assert aligned["emb_text"].notna().all()
assert aligned["emb_img"].notna().all()

X_text = np.stack(aligned["emb_text"].to_numpy())
X_img  = np.stack(aligned["emb_img"].to_numpy())

meta_cols = meta.columns.drop("post_id")
X_meta = aligned[meta_cols].to_numpy(np.float32)
# A post without a metadata row would come out of the left join as NaN features.
assert not np.isnan(X_meta).any(), "missing metadata for some sampled posts"

# Final design matrix: [text embedding | image embedding | metadata]
X_val = np.hstack([X_text, X_img, X_meta])
ids_val = aligned["post_id"].to_numpy()

In [39]:
# The three validation feature blocks must have the same number of rows.
print(X_text.shape, X_img.shape, X_meta.shape)

(5000, 512) (5000, 512) (5000, 28)


In [40]:
# Persist the validation matrix and the matching ids (np.save appends ".npy").
np.save("D:/dataset/restricted_clip/X_va", X_val)
np.save("D:/dataset/restricted_clip/ids_va", ids_val)

# TEST

In [41]:
# post_id values of the test sample; sample_df / sample_ids are rebound here
# and now refer to the test split.
sample_df = con.execute("SELECT post_id FROM clip_full_sample WHERE split = 'test'").df()
sample_ids = sample_df["post_id"].to_numpy()

print("Sample size:", len(sample_ids))

Sample size: 5000


In [42]:
# Retrieve TEXT embeddings for the test split and keep only the sampled posts.
text_npz_path = "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_test_ids_y.npz"
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
test_text = np.load(text_npz_path, allow_pickle=True)

X_all_text = test_text["embeddings"]  
ids_all_text = test_text["ids"]       

mask_text = np.isin(ids_all_text, sample_ids)
X_text_filt = X_all_text[mask_text]
ids_text_filt = ids_all_text[mask_text]

# Unlike the train/validation cells, the column is named "emb_text" right away.
df_text = pd.DataFrame({
    "post_id": ids_text_filt,
    "emb_text": list(X_text_filt)
})

In [43]:
# Retrieve IMAGE embeddings for the test split and keep only the sampled posts.
img_npz_path = "D:/dataset/clip_img_emb_ALL/clip_vit_b32_test_ALL.npz"
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
test_img = np.load(img_npz_path, allow_pickle=True)

X_all_img = test_img["feats"]         
ids_all_img = test_img["post_id"]     


mask_img = np.isin(ids_all_img, sample_ids)
X_img_filt = X_all_img[mask_img]
ids_img_filt = ids_all_img[mask_img]

# Unlike the train/validation cells, the column is named "emb_img" right away.
df_img = pd.DataFrame({
    "post_id": ids_img_filt,
    "emb_img": list(X_img_filt)
})

# Average all embeddings that share a post_id so every post has a single vector.
agg = df_img.groupby("post_id")["emb_img"].apply(lambda s: np.mean(np.vstack(s.values), axis=0))

df_img_agg = agg.reset_index()  

In [44]:
# Tabular metadata features for the test split, restricted to the sampled posts.
meta = pd.read_csv("D:/dataset/meta_classification/meta_test_final.csv")

meta = meta[meta["post_id"].isin(sample_ids)].copy()
print("Meta rows after filter:", len(meta))

Meta rows after filter: 5000


In [45]:
# Alignment: left-join everything onto the sampled ids so that row i of every
# feature block refers to the same post. validate="one_to_one" makes pandas
# raise if any side repeats a post_id, which would silently duplicate rows.
aligned = (
    sample_df[["post_id"]]
    .merge(df_text, on="post_id", how="left", validate="one_to_one")
    .merge(df_img_agg, on="post_id", how="left", validate="one_to_one")
    .merge(meta, on="post_id", how="left", validate="one_to_one")
)

assert len(aligned) == len(sample_df), "alignment changed the number of rows"
assert aligned["emb_text"].notna().all()
assert aligned["emb_img"].notna().all()

X_text = np.stack(aligned["emb_text"].to_numpy())
X_img  = np.stack(aligned["emb_img"].to_numpy())

meta_cols = meta.columns.drop("post_id")
X_meta = aligned[meta_cols].to_numpy(np.float32)
# A post without a metadata row would come out of the left join as NaN features.
assert not np.isnan(X_meta).any(), "missing metadata for some sampled posts"

# Final design matrix: [text embedding | image embedding | metadata]
X_test = np.hstack([X_text, X_img, X_meta])
ids_test = aligned["post_id"].to_numpy()

In [46]:
# The three test feature blocks must have the same number of rows.
print(X_text.shape, X_img.shape, X_meta.shape)

(5000, 512) (5000, 512) (5000, 28)


In [47]:
# Persist the test matrix and the matching ids (np.save appends ".npy").
np.save("D:/dataset/restricted_clip/X_te", X_test)
np.save("D:/dataset/restricted_clip/ids_te", ids_test)

# TARGET

In [48]:
# Row order of the saved training matrix; the labels must follow exactly this order.
ids_tr_common = np.load("D:/dataset/restricted_clip/ids_train.npy", allow_pickle = True)

# NOTE(review): the labels come from clip_full_sample25, a table no cell in this
# notebook creates, while validation/test labels come from clip_full_sample —
# confirm which table is intended.
y_df = con.execute("""
    SELECT post_id, er_bins, er_bins3, er_bins2
    FROM clip_full_sample25
    WHERE split = 'train'""").df()

# Index by post_id once and reorder the rows to match the feature matrix;
# .loc raises KeyError if an id has no label row.
y_tr_by_id = y_df.set_index("post_id").loc[ids_tr_common]

y_tr_aligned5 = y_tr_by_id["er_bins"].to_numpy()
y_tr_aligned3 = y_tr_by_id["er_bins3"].to_numpy()
y_tr_aligned2 = y_tr_by_id["er_bins2"].to_numpy()

In [49]:
# Save the training labels; the suffix is the label granularity
# (_5 = er_bins, _3 = er_bins3, _2 = er_bins2).
np.save("D:/dataset/restricted_clip/y_tr_5.npy", y_tr_aligned5)
np.save("D:/dataset/restricted_clip/y_tr_3.npy", y_tr_aligned3)
np.save("D:/dataset/restricted_clip/y_tr_2.npy", y_tr_aligned2)

In [50]:
# Row order of the saved validation matrix; the labels must follow exactly this order.
ids_va_common = np.load("D:/dataset/restricted_clip/ids_va.npy", allow_pickle = True)

y_df = con.execute("""
    SELECT post_id, er_bins, er_bins3, er_bins2
    FROM clip_full_sample
    WHERE split = 'validation'""").df()

# Index by post_id once and reorder the rows to match the feature matrix;
# .loc raises KeyError if an id has no label row.
y_va_by_id = y_df.set_index("post_id").loc[ids_va_common]

y_va_aligned5 = y_va_by_id["er_bins"].to_numpy()
y_va_aligned3 = y_va_by_id["er_bins3"].to_numpy()
y_va_aligned2 = y_va_by_id["er_bins2"].to_numpy()

In [51]:
# Save the validation labels (_5 = er_bins, _3 = er_bins3, _2 = er_bins2).
np.save("D:/dataset/restricted_clip/y_va_5.npy", y_va_aligned5)
np.save("D:/dataset/restricted_clip/y_va_3.npy", y_va_aligned3)
np.save("D:/dataset/restricted_clip/y_va_2.npy", y_va_aligned2)

In [52]:
# Row order of the saved test matrix; the labels must follow exactly this order.
ids_te_common = np.load("D:/dataset/restricted_clip/ids_te.npy", allow_pickle = True)

y_df = con.execute("""
    SELECT post_id, er_bins, er_bins3, er_bins2
    FROM clip_full_sample
    WHERE split = 'test'""").df()

# Index by post_id once and reorder the rows to match the feature matrix;
# .loc raises KeyError if an id has no label row.
y_te_by_id = y_df.set_index("post_id").loc[ids_te_common]

y_te_aligned5 = y_te_by_id["er_bins"].to_numpy()
y_te_aligned3 = y_te_by_id["er_bins3"].to_numpy()
y_te_aligned2 = y_te_by_id["er_bins2"].to_numpy()

In [53]:
# Save the test labels (_5 = er_bins, _3 = er_bins3, _2 = er_bins2).
np.save("D:/dataset/restricted_clip/y_te_5.npy", y_te_aligned5)
np.save("D:/dataset/restricted_clip/y_te_3.npy", y_te_aligned3)
np.save("D:/dataset/restricted_clip/y_te_2.npy", y_te_aligned2)

# CLASSIFICATION 5 CLASSES

In [2]:
# Reload the saved train/validation features (cast to float32) and the 5-class labels.
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
X_tr = np.load("D:/dataset/restricted_clip/X_tr.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/restricted_clip/X_va.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/restricted_clip/y_tr_5.npy", allow_pickle = True)
y_va = np.load("D:/dataset/restricted_clip/y_va_5.npy", allow_pickle = True)

In [3]:
# Features and labels of each split must agree on the number of rows.
print(X_tr.shape, X_va.shape, y_tr.shape, y_va.shape)

(24993, 1052) (5000, 1052) (24993,) (5000,)


In [4]:
# SGD: linear model with hinge loss and L2 penalty, tuned on the validation split
param_grid = dict(
    alpha=[1e-5, 1e-4, 1e-3, 1e-2],
    class_weight=[None, "balanced"],
)

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    # Fit on the training split only
    clf = SGDClassifier(
        loss="hinge",
        penalty="l2",
        average=True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
        **params,
    )
    clf.fit(X_tr, y_tr)

    # Score on the validation split
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append(dict(
        alpha=params["alpha"],
        class_weight=params["class_weight"],
        val_macro_f1=macro_f1,
        val_accuracy=acc,
    ))

    # Model selection is by macro-F1; the strict '>' keeps the first configuration on ties
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.3168640479518058 | accuracy (val): 0.3264

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.29921306397152864 | accuracy (val): 0.3296

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.31753262873448834 | accuracy (val): 0.334

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.3087302193220355 | accuracy (val): 0.337

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.32094997804774406 | accuracy (val): 0.3436

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.2944115215974648 | accuracy (val): 0.3378

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.3097367452213542 | accuracy (val): 0.333

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.2886819017410057 | accuracy (val): 0.3332

Best hyperparameter configuration:
{'alpha': 0.001, 'class_weight': Non

In [4]:
# NAIVE BAYES - GAUSSIAN: tune var_smoothing on the validation split
param_grid_nb = dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6])

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on TRAIN
    clf = GaussianNB(**params)
    clf.fit(X_tr, y_tr)

    # Evaluate on VALIDATION
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(
        var_smoothing=params["var_smoothing"],
        val_macro_f1=macro_f1,
        val_accuracy=acc,
    ))

    # Update the best configuration according to macro-F1 (first one wins on ties)
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the results in a DataFrame to inspect them more easily
results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.2642 | accuracy (val): 0.2938

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.2642 | accuracy (val): 0.2938

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.2642 | accuracy (val): 0.2938

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.2648 | accuracy (val): 0.2946

Best hyperparameter configuration:
{'var_smoothing': 1e-06}
Validation macro-F1: 0.26484311840960306

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
3   1.000000e-06      0.264843        0.2946
2   1.000000e-07      0.264247        0.2938
1   1.000000e-08      0.264233        0.2938
0   1.000000e-09      0.264233        0.2938


In [5]:
# RANDOM FOREST: small grid over forest size, depth, leaf size and feature sampling
param_grid_rf = dict(
    n_estimators=[50, 80],
    max_depth=[10, 12],
    min_samples_leaf=[2, 5],
    max_features=[0.05, "sqrt"],
)
# Column order of the results table (kept explicit because ParameterGrid sorts keys)
rf_param_names = ("n_estimators", "max_depth", "min_samples_leaf", "max_features")

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    # Fit on TRAIN
    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(X_tr, y_tr)

    # Evaluate on VALIDATION
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    record = {name: params[name] for name in rf_param_names}
    record["val_macro_f1"] = macro_f1
    record["val_accuracy"] = acc
    results.append(record)

    # Update the best configuration according to macro-F1 (first one wins on ties)
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the results in a DataFrame to inspect them more easily
results_df_rf = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2946 | accuracy (val): 0.2900

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2956 | accuracy (val): 0.2910

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2941 | accuracy (val): 0.2902

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.2945 | accuracy (val): 0.2894

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2847 | accuracy (val): 0.2820

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2821 | accuracy (val): 0.2786

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2861 | ac

In [6]:
# XGBOOST

# Convert the string labels into integer codes: fit the encoder on train,
# reuse the same mapping for validation.
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)

param_grid_xgb = dict(
    n_estimators=[100, 150],
    max_depth=[4, 6],
    learning_rate=[0.1],
    subsample=[0.8],
    colsample_bytree=[0.5],
    gamma=[0, 1],
    reg_lambda=[1],
)

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
        **params,
    )

    # Fit on the training split
    clf.fit(X_tr, y_tr_enc)

    # Validation: predictions are integer codes, so compare against the encoded labels
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    record = dict(params)
    record["val_macro_f1"] = macro_f1
    record["val_accuracy"] = acc
    results.append(record)

    # Model selection is by macro-F1; the strict '>' keeps the first configuration on ties
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3230 | accuracy (val): 0.3292

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3257 | accuracy (val): 0.3322

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3251 | accuracy (val): 0.3284

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3208 | accuracy (val): 0.3224

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3254 | accuracy (val): 0.3330

Combination: {'colsample_bytr

In [17]:
# Drop the tuning arrays and force a garbage-collection pass to release their memory.
del X_tr, X_va, y_tr, y_va
gc.collect()

283

In [None]:
# PERFORMANCE ON TEST

In [2]:
# Load all three splits of the 5-class problem for the final evaluation.
# Unlike the tuning cell, the feature matrices are not cast to float32 here.
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
X_tr = np.load("D:/dataset/restricted_clip/X_tr.npy", allow_pickle = True)
X_va = np.load("D:/dataset/restricted_clip/X_va.npy", allow_pickle = True)
X_te = np.load("D:/dataset/restricted_clip/X_te.npy", allow_pickle = True)


y_tr = np.load("D:/dataset/restricted_clip/y_tr_5.npy", allow_pickle = True)
y_va = np.load("D:/dataset/restricted_clip/y_va_5.npy", allow_pickle = True)
y_te = np.load("D:/dataset/restricted_clip/y_te_5.npy", allow_pickle = True)

# Final models are refit on train + validation, stacked row-wise in that order.
X_trva = np.concatenate((X_tr, X_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

# The separate train/validation arrays are no longer needed once stacked.
del X_tr, X_va, y_tr, y_va
gc.collect()

198

In [3]:
# Final evaluation of the tuned configurations: refit each model on
# train + validation and score it once on the held-out test split.

# XGBoost needs integer class codes; fit the encoder on the training labels
# and reuse the same mapping for the test labels.
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing = 1e-06),
    RandomForestClassifier(
        max_depth=10, max_features=0.05, min_samples_leaf=2, n_estimators=80, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 4, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the scikit-learn models take the string labels directly
    if isinstance(cfg, XGBClassifier):
        y_fit, y_true = y_trva_enc, y_te_enc
    else:
        y_fit, y_true = y_trva, y_te

    cfg.fit(X_trva, y_fit)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(y_true, y_te_pred, average="macro")
    acc = accuracy_score(y_true, y_te_pred)

    # These are test-set scores (the original label wrongly said "train").
    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

# X_trva, X_te, y_trva and y_te are intentionally NOT deleted here: the SGD
# evaluation cell that follows still fits and predicts on them.


Configuration: GaussianNB(var_smoothing=1e-06)
macro-F1 (train): 0.2671 | accuracy (train): 0.2964

Configuration: RandomForestClassifier(max_depth=10, max_features=0.05, min_samples_leaf=2,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (train): 0.3016 | accuracy (train): 0.2996

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
         

1188

In [3]:
# Test-set evaluation of the SGD configuration (alpha=0.001, class_weight=None).
# This cell reads X_trva, y_trva, X_te and y_te from the kernel namespace, so
# they must still be defined when it runs.
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 0.001,
        average = True,
        class_weight = None,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


# Refit on train + validation, then score once on the held-out test split.
cfg.fit(X_trva, y_trva)
y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2989 | accuracy (test): 0.3396


# CLASSIFICATION 3 CLASSES

In [5]:
# Same feature matrices as the 5-class experiment (cast to float32), now paired with the 3-class labels.
# allow_pickle=True lets NumPy unpickle object arrays: only load files you trust.
X_tr = np.load("D:/dataset/restricted_clip/X_tr.npy", allow_pickle = True).astype(np.float32)
X_va = np.load("D:/dataset/restricted_clip/X_va.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/restricted_clip/y_tr_3.npy", allow_pickle = True)
y_va = np.load("D:/dataset/restricted_clip/y_va_3.npy", allow_pickle = True)

In [6]:
# SGD: linear model with hinge loss and L2 penalty, tuned on the validation split
param_grid = dict(
    alpha=[1e-5, 1e-4, 1e-3, 1e-2],
    class_weight=[None, "balanced"],
)

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    # Fit on the training split only
    clf = SGDClassifier(
        loss="hinge",
        penalty="l2",
        average=True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
        **params,
    )
    clf.fit(X_tr, y_tr)

    # Score on the validation split
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append(dict(
        alpha=params["alpha"],
        class_weight=params["class_weight"],
        val_macro_f1=macro_f1,
        val_accuracy=acc,
    ))

    # Model selection is by macro-F1; the strict '>' keeps the first configuration on ties
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.4933558139185065 | accuracy (val): 0.508

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.4839664913056942 | accuracy (val): 0.5046

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.4866906282058006 | accuracy (val): 0.5158

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.47427631658995945 | accuracy (val): 0.5158

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.4357226241556084 | accuracy (val): 0.513

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.41993172267960593 | accuracy (val): 0.5104

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.4107064490657348 | accuracy (val): 0.5062

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.4029048638703829 | accuracy (val): 0.5048

Best hyperparameter configuration:
{'alpha': 1e-05, 'class_weight': Non

In [9]:
# NAIVE BAYES - GAUSSIAN: tune var_smoothing on the validation split
param_grid_nb = dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6])

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on TRAIN
    clf = GaussianNB(**params)
    clf.fit(X_tr, y_tr)

    # Evaluate on VALIDATION
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(
        var_smoothing=params["var_smoothing"],
        val_macro_f1=macro_f1,
        val_accuracy=acc,
    ))

    # Update the best configuration according to macro-F1 (first one wins on ties)
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the results in a DataFrame to inspect them more easily
results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.4373 | accuracy (val): 0.4592

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.4373 | accuracy (val): 0.4592

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.4373 | accuracy (val): 0.4592

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.4369 | accuracy (val): 0.4588

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.43731666035865174

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.437317        0.4592
1   1.000000e-08      0.437317        0.4592
2   1.000000e-07      0.437317        0.4592
3   1.000000e-06      0.436897        0.4588


In [10]:
# RANDOM FOREST
# Validation sweep over a small Random Forest grid (2 x 2 x 2 x 2 = 16 fits).
# Reads X_tr / y_tr / X_va / y_va, which must already exist in the kernel.
param_grid_rf = dict(
    n_estimators=[50, 80],
    max_depth=[10, 12],
    min_samples_leaf=[2, 5],
    max_features=[0.05, "sqrt"],
)

results, best_params, best_score = [], None, float("-inf")

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    # Fit on TRAIN (fixed seed, all cores), then predict the VALIDATION split.
    clf = RandomForestClassifier(random_state=42, n_jobs=-1, **params).fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    # One row per configuration: the four hyperparameters in a fixed column
    # order, followed by the two validation metrics.
    results.append(
        {
            **{
                key: params[key]
                for key in ("n_estimators", "max_depth", "min_samples_leaf", "max_features")
            },
            "val_macro_f1": macro_f1,
            "val_accuracy": acc,
        }
    )

    # Track the best configuration by macro-F1; the strict comparison keeps
    # the earliest configuration when several tie.
    if best_score < macro_f1:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):", best_params, sep="\n")
print("Validation macro-F1:", best_score)

# Collect the sweep in a DataFrame, best macro-F1 first, for easier inspection.
results_df_rf = pd.DataFrame(results)
results_df_rf = results_df_rf.sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.4626 | accuracy (val): 0.4604

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.4724 | accuracy (val): 0.4702

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.4719 | accuracy (val): 0.4698

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.4739 | accuracy (val): 0.4716

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.4697 | accuracy (val): 0.4668

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.4726 | accuracy (val): 0.4692

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.4586 | ac

In [11]:
# XGBOOST
# Validation sweep over a small XGBoost grid.
# Reads X_tr / y_tr / X_va / y_va, which must already exist in the kernel.

# XGBoost is trained on integer class ids, so encode the labels first.
# The encoder is fitted on the training labels only and reused for validation.
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)

# Only n_estimators, max_depth and gamma vary; the single-value lists pin the
# remaining settings while keeping them visible in the results table.
param_grid_xgb = dict(
    n_estimators=[100, 150],
    max_depth=[4, 6],
    learning_rate=[0.1],
    subsample=[0.8],
    colsample_bytree=[0.5],
    gamma=[0, 1],
    reg_lambda=[1],
)

results, best_params, best_score = [], None, float("-inf")

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        objective="multi:softmax",
        # One output per class seen by the encoder (same count as the number
        # of distinct encoded training labels).
        num_class=len(le.classes_),
        tree_method="hist",
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1,
        verbosity=0,
        **params,
    )

    # Fit on the encoded training labels, predict the validation split.
    clf.fit(X_tr, y_tr_enc)
    y_val_pred = clf.predict(X_va)

    # Predictions are class ids, so score them against the encoded targets.
    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Strict comparison: the earliest configuration wins on ties.
    if best_score < macro_f1:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):", best_params, sep="\n")
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results)
results_df_xgb = results_df_xgb.sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5029 | accuracy (val): 0.5074

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5063 | accuracy (val): 0.5096

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4964 | accuracy (val): 0.5000

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.4970 | accuracy (val): 0.4996

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5074 | accuracy (val): 0.5116

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON TEST

In [4]:
# Load the saved feature matrices and the "_3" label files for train,
# validation and test.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load .npy files from a trusted source.
X_tr, X_va, X_te = (
    np.load(path, allow_pickle=True)
    for path in (
        "D:/dataset/restricted_clip/X_tr.npy",
        "D:/dataset/restricted_clip/X_va.npy",
        "D:/dataset/restricted_clip/X_te.npy",
    )
)

y_tr, y_va, y_te = (
    np.load(path, allow_pickle=True)
    for path in (
        "D:/dataset/restricted_clip/y_tr_3.npy",
        "D:/dataset/restricted_clip/y_va_3.npy",
        "D:/dataset/restricted_clip/y_te_3.npy",
    )
)

# Stack train and validation along the first axis into one fitting set.
X_trva = np.concatenate([X_tr, X_va])
y_trva = np.concatenate([y_tr, y_va])

# The separate splits are no longer needed once merged; drop them to free memory.
del X_tr, X_va, y_tr, y_va
gc.collect()

66

In [6]:
# Final evaluation on the held-out TEST split (labels loaded from the "_3" files).
# Each model is refit on train+validation (X_trva / y_trva) and scored once on
# X_te / y_te. The hyperparameters are hard-coded here; presumably they are the
# best configurations from the validation sweeps above - verify when changing
# those grids.

# XGBoost needs integer class ids: fit the encoder on the train+validation
# labels and reuse it for the test labels.
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=10, max_features=0.05, min_samples_leaf=5, n_estimators=80, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 1, learning_rate = 0.1, max_depth= 4, n_estimators= 100, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the scikit-learn models are fitted and
    # scored on the raw labels.
    if isinstance(cfg, XGBClassifier):
        y_fit, y_true = y_trva_enc, y_te_enc
    else:
        y_fit, y_true = y_trva, y_te

    cfg.fit(X_trva, y_fit)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(y_true, y_te_pred, average="macro")
    acc = accuracy_score(y_true, y_te_pred)

    # These scores are computed on X_te / y_te, so they are labelled as TEST
    # metrics (the message used to say "train").
    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

# X_trva, X_te, y_trva and y_te are deliberately kept alive: the SGD test cell
# right after this one still fits and scores on them. Deleting them here made
# the notebook fail with a NameError on a top-to-bottom run.


Configuration: GaussianNB()
macro-F1 (train): 0.4382 | accuracy (train): 0.4580

Configuration: RandomForestClassifier(max_depth=10, max_features=0.05, min_samples_leaf=5,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (train): 0.4850 | accuracy (train): 0.4860

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy

1581

In [5]:
# Linear SVM trained with averaged SGD (hinge loss, L2 penalty): refit on
# train+validation and score on the held-out test split.
# Requires X_trva / y_trva / X_te / y_te to be present in the kernel.
cfg = SGDClassifier(
    loss="hinge", penalty="l2",
    alpha=1e-5, class_weight=None,
    average=True,
    max_iter=1000, tol=1e-3,
    random_state=42,
).fit(X_trva, y_trva)

y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.4782 | accuracy (test): 0.5028


# CLASSIFICATION 2 CLASSES

In [7]:
# Reload the train/validation features, cast to float32, together with the
# "_2" label files.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load .npy files from a trusted source.
X_tr, X_va = (
    np.load(path, allow_pickle=True).astype(np.float32)
    for path in (
        "D:/dataset/restricted_clip/X_tr.npy",
        "D:/dataset/restricted_clip/X_va.npy",
    )
)

y_tr, y_va = (
    np.load(path, allow_pickle=True)
    for path in (
        "D:/dataset/restricted_clip/y_tr_2.npy",
        "D:/dataset/restricted_clip/y_va_2.npy",
    )
)

In [8]:
# SGD
# Validation sweep for a linear SVM trained with averaged SGD: regularisation
# strength (alpha) crossed with class weighting.
# Reads X_tr / y_tr / X_va / y_va, which must already exist in the kernel.
param_grid = dict(
    alpha=[1e-5, 1e-4, 1e-3, 1e-2],
    class_weight=[None, "balanced"],
)

results, best_params, best_score = [], None, float("-inf")

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    # Hinge loss + L2 penalty; only the grid entries change between fits.
    clf = SGDClassifier(
        loss="hinge", penalty="l2",
        average=True,
        max_iter=1000, tol=1e-3,
        random_state=42,
        **params,
    ).fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    # Metrics are printed at full precision in this cell.
    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append(
        dict(
            alpha=params["alpha"],
            class_weight=params["class_weight"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Strict comparison: the earliest configuration wins on ties.
    if best_score < macro_f1:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:", best_params, sep="\n")
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.6724385728162641 | accuracy (val): 0.673

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.6750198902644489 | accuracy (val): 0.6754

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.6750097416832987 | accuracy (val): 0.676

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.6759236976785228 | accuracy (val): 0.6768

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.674530230596537 | accuracy (val): 0.676

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.6741876522540373 | accuracy (val): 0.6754

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.6588612838041854 | accuracy (val): 0.6616

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.6594766911895806 | accuracy (val): 0.6618

Best hyperparameter configuration:
{'alpha': 0.0001, 'class_weight': 'balan

In [14]:
# NAIVE BAYES - GAUSSIAN
# Validation sweep over GaussianNB's variance-smoothing term, using whichever
# X_tr / y_tr / X_va / y_va are currently loaded in the kernel.
param_grid_nb = {"var_smoothing": [10.0 ** exponent for exponent in (-9, -8, -7, -6)]}

results = []
best_params = None
best_score = float("-inf")

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on TRAIN.
    clf = GaussianNB(**params).fit(X_tr, y_tr)

    # Evaluate on VALIDATION.
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    row = {"var_smoothing": params["var_smoothing"]}
    row.update(val_macro_f1=macro_f1, val_accuracy=acc)
    results.append(row)

    # Keep the configuration with the highest macro-F1 (first one on ties).
    if best_score < macro_f1:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:", best_params, sep="\n")
print("Validation macro-F1:", best_score)

# Collect the sweep in a DataFrame, best macro-F1 first, for easier inspection.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.6305 | accuracy (val): 0.6306

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.6305 | accuracy (val): 0.6306

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.6305 | accuracy (val): 0.6306

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.6305 | accuracy (val): 0.6306

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.6305485575811576

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.630549        0.6306
1   1.000000e-08      0.630549        0.6306
2   1.000000e-07      0.630549        0.6306
3   1.000000e-06      0.630549        0.6306


In [15]:
# RANDOM FOREST
# Validation sweep over a small Random Forest grid, using whichever
# X_tr / y_tr / X_va / y_va are currently loaded in the kernel.
param_grid_rf = dict(
    n_estimators=[50, 80],
    max_depth=[10, 12],
    min_samples_leaf=[2, 5],
    max_features=[0.05, "sqrt"],
)

results = []
best_params = None
best_score = float("-inf")

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    # Fit on TRAIN with a fixed seed, using every available core.
    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(X_tr, y_tr)

    # Evaluate on VALIDATION.
    y_val_pred = clf.predict(X_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    # Hyperparameters first (fixed column order), then the two metrics.
    results.append(
        dict(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            min_samples_leaf=params["min_samples_leaf"],
            max_features=params["max_features"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Keep the configuration with the highest macro-F1 (first one on ties).
    if best_score < macro_f1:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):", best_params, sep="\n")
print("Validation macro-F1:", best_score)

# Collect the sweep in a DataFrame, best macro-F1 first, for easier inspection.
results_df_rf = pd.DataFrame(results)
results_df_rf = results_df_rf.sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.6538 | accuracy (val): 0.6546

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.6562 | accuracy (val): 0.6568

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.6474 | accuracy (val): 0.6482

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.6542 | accuracy (val): 0.6550

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.6454 | accuracy (val): 0.6458

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.6450 | accuracy (val): 0.6454

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.6357 | ac

In [16]:
# XGBOOST
# Validation sweep over a small XGBoost grid, using whichever
# X_tr / y_tr / X_va / y_va are currently loaded in the kernel.

# Convert the labels into integer class ids (encoder fitted on train only).
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)

# n_estimators, max_depth and gamma are searched; the single-value entries
# are fixed settings that still show up as columns in the results table.
param_grid_xgb = dict(
    n_estimators=[100, 150],
    max_depth=[4, 6],
    learning_rate=[0.1],
    subsample=[0.8],
    colsample_bytree=[0.5],
    gamma=[0, 1],
    reg_lambda=[1],
)

results = []
best_params = None
best_score = float("-inf")

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        objective="multi:softmax",
        # Number of classes known to the encoder, i.e. the number of distinct
        # encoded training labels.
        num_class=len(le.classes_),
        tree_method="hist",
        eval_metric="mlogloss",
        verbosity=0,
        n_jobs=-1,
        random_state=42,
        **params,
    )

    # Fit on encoded TRAIN labels, predict VALIDATION.
    clf.fit(X_tr, y_tr_enc)
    y_val_pred = clf.predict(X_va)

    # Compare class ids with class ids.
    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Keep the configuration with the highest macro-F1 (first one on ties).
    if best_score < macro_f1:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):", best_params, sep="\n")
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results)
results_df_xgb = results_df_xgb.sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6771 | accuracy (val): 0.6774

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6798 | accuracy (val): 0.6800

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6770 | accuracy (val): 0.6774

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6758 | accuracy (val): 0.6762

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6775 | accuracy (val): 0.6778

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON TEST

In [6]:
# Load the saved feature matrices and the "_2" label files for train,
# validation and test.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load .npy files from a trusted source.
X_tr, X_va, X_te = (
    np.load(path, allow_pickle=True)
    for path in (
        "D:/dataset/restricted_clip/X_tr.npy",
        "D:/dataset/restricted_clip/X_va.npy",
        "D:/dataset/restricted_clip/X_te.npy",
    )
)

y_tr, y_va, y_te = (
    np.load(path, allow_pickle=True)
    for path in (
        "D:/dataset/restricted_clip/y_tr_2.npy",
        "D:/dataset/restricted_clip/y_va_2.npy",
        "D:/dataset/restricted_clip/y_te_2.npy",
    )
)

# Stack train and validation along the first axis into one fitting set.
X_trva = np.concatenate([X_tr, X_va])
y_trva = np.concatenate([y_tr, y_va])

# The separate splits are no longer needed once merged; drop them to free memory.
del X_tr, X_va, y_tr, y_va
gc.collect()

94

In [8]:
# Final evaluation on the held-out TEST split (labels loaded from the "_2" files).
# Each model is refit on train+validation (X_trva / y_trva) and scored once on
# X_te / y_te. The hyperparameters are hard-coded here; presumably they are the
# best configurations from the validation sweeps above - verify when changing
# those grids.

# XGBoost needs integer class ids: fit the encoder on the train+validation
# labels and reuse it for the test labels.
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=2, n_estimators=80, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 4, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the scikit-learn models are fitted and
    # scored on the raw labels.
    if isinstance(cfg, XGBClassifier):
        y_fit, y_true = y_trva_enc, y_te_enc
    else:
        y_fit, y_true = y_trva, y_te

    cfg.fit(X_trva, y_fit)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(y_true, y_te_pred, average="macro")
    acc = accuracy_score(y_true, y_te_pred)

    # These scores are computed on X_te / y_te, so they are labelled as TEST
    # metrics (the message used to say "train").
    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

# X_trva, X_te, y_trva and y_te are deliberately kept alive: the SGD test cell
# right after this one still fits and scores on them. Deleting them here made
# the notebook fail with a NameError on a top-to-bottom run.


Configuration: GaussianNB()
macro-F1 (train): 0.6273 | accuracy (train): 0.6274

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=2,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (train): 0.6531 | accuracy (train): 0.6538

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy

98

In [7]:
# Linear SVM trained with averaged SGD (hinge loss, L2 penalty, balanced class
# weights): refit on train+validation and score on the held-out test split.
# Requires X_trva / y_trva / X_te / y_te to be present in the kernel.
cfg = SGDClassifier(
    loss="hinge", penalty="l2",
    alpha=0.0001, class_weight='balanced',
    average=True,
    max_iter=1000, tol=1e-3,
    random_state=42,
).fit(X_trva, y_trva)

y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.6778 | accuracy (test): 0.6782
