In [1]:
import duckdb, torch, time, os, gc
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.sparse import load_npz, hstack, save_npz

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from transformers import CLIPTokenizer, CLIPModel
import torch
import numpy as np
import gc, os, time
from pathlib import Path

In [3]:
# Open the DuckDB metadata database; `con` is the connection used by the
# label-loading cells of this notebook.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    # The thread count is a best-effort tuning knob: a rejected setting must
    # not abort the notebook.
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Report the failure instead of hiding it, then continue with whatever
    # thread configuration DuckDB already has.
    print(f"Could not set DuckDB threads=8, keeping current setting: {exc}")

print("Set up ready")

Set up ready


# LOAD DATA FOR CLASSIFICATION

In [2]:
# Load the training-split archive and pull out the embedding matrix and the
# matching ids (presumably post ids; they are used as post_id lookups later).
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on archives produced by this project.
train = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_train_ids_y.npz",
    allow_pickle=True,
)

X_tr, ids_tr = train["embeddings"], train["ids"]

# Sanity check: the number of embedding rows should equal the number of ids.
print(X_tr.shape, len(ids_tr))

(773497, 512) 773497


In [3]:
# Load the validation-split archive and pull out the embedding matrix and the
# matching ids.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on archives produced by this project.
val = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_val_ids_y.npz",
    allow_pickle=True,
)

X_va, ids_va = val["embeddings"], val["ids"]

# Sanity check: the number of embedding rows should equal the number of ids.
print(X_va.shape, len(ids_va))

(412325, 512) 412325


In [4]:
# Load the test-split archive and pull out the embedding matrix and the
# matching ids.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on archives produced by this project.
test = np.load("D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_test_ids_y.npz", allow_pickle = True)

X_te = test["embeddings"]
ids_te = test["ids"]

# Sanity check on the arrays loaded in THIS cell (the original printed the
# validation arrays by mistake, so the test split was never actually checked).
print(X_te.shape, len(ids_te))

(412325, 512) 412325


In [6]:
# Fetch the er_bins2 label of every training-split row, indexed by post_id.
metadata_tr = con.sql("""
    SELECT post_id, er_bins2 FROM md1718
    WHERE split = 'train'
""").df().set_index('post_id')

# Align the labels directly to the post_id order of the rows in X_tr.
y_tr = metadata_tr['er_bins2'].loc[ids_tr].to_numpy()

# One label per embedding row (catches duplicated post_ids in the table).
assert y_tr.shape[0] == len(ids_tr)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
# Fetch the er_bins2 label of every validation-split row, indexed by post_id.
metadata_va = con.sql("""
    SELECT post_id, er_bins2 FROM md1718
    WHERE split = 'validation'
""").df().set_index('post_id')

# Align the labels directly to the post_id order of the rows in X_va.
y_va = metadata_va['er_bins2'].loc[ids_va].to_numpy()

# One label per embedding row (catches duplicated post_ids in the table).
assert y_va.shape[0] == len(ids_va)

In [8]:
# Fetch the er_bins2 label of every test-split row, indexed by post_id.
metadata_te = con.sql("""
    SELECT post_id, er_bins2 FROM md1718
    WHERE split = 'test'
""").df().set_index('post_id')

# Align the labels directly to the post_id order of the rows in X_te.
y_te = metadata_te['er_bins2'].loc[ids_te].to_numpy()

# One label per embedding row (catches duplicated post_ids in the table).
assert y_te.shape[0] == len(ids_te)

In [9]:
# Persist the aligned label vectors so later sessions can skip the DuckDB
# lookup; np.save appends the ".npy" extension to each path.
for out_path, label_array in (
    ("D:/dataset/clip_text_emb_ALL/y_tr_2", y_tr),
    ("D:/dataset/clip_text_emb_ALL/y_va_2", y_va),
    ("D:/dataset/clip_text_emb_ALL/y_te_2", y_te),
):
    np.save(out_path, label_array)

In [2]:
# Reload the train/validation embeddings together with the label vectors that
# were saved to .npy files, then drop the archive handles.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on files produced by this project.
train = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_train_ids_y.npz",
    allow_pickle=True,
)
val = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_val_ids_y.npz",
    allow_pickle=True,
)

X_tr, X_va = train["embeddings"], val["embeddings"]

y_tr = np.load("D:/dataset/clip_text_emb_ALL/y_tr_2.npy", allow_pickle=True)
y_va = np.load("D:/dataset/clip_text_emb_ALL/y_va_2.npy", allow_pickle=True)

# The archive objects are no longer needed once the arrays are extracted.
del train
del val
gc.collect()

84

In [3]:
# SGD
# Grid search, scored on the validation split, for a linear classifier with
# hinge loss and L2 penalty trained by averaged SGD.
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

# Settings shared by every candidate model.
sgd_fixed = dict(
    loss="hinge",
    penalty="l2",
    average=True,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    # Fit on train, predict on validation.
    clf = SGDClassifier(**sgd_fixed, **params)
    clf.fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append(
        dict(
            alpha=params["alpha"],
            class_weight=params["class_weight"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Keep the first configuration that reaches the highest macro-F1.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Tabulate every run, best macro-F1 first.
results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.58376487834214 | accuracy (val): 0.5837652337355241

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.583705364041589 | accuracy (val): 0.5837118777663252

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.5836143351833228 | accuracy (val): 0.5837191535803068

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.5834232305943959 | accuracy (val): 0.583479051718911

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.5769437978699319 | accuracy (val): 0.5779494330928273

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.5770549903029751 | accuracy (val): 0.5777820893712484

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.33756738476842324 | accuracy (val): 0.5046383314133268

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.4123171186151787 | accuracy (val)

In [3]:
# NAIVE BAYES - GAUSSIAN
# Grid search over var_smoothing, scored on the validation split.
param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on TRAIN, evaluate on VALIDATION.
    clf = GaussianNB(**params)
    clf.fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(
        dict(
            var_smoothing=params["var_smoothing"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Update the best configuration based on macro-F1 (first one wins ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the runs in a DataFrame for easier inspection, best macro-F1 first.
results_df = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.5517 | accuracy (val): 0.5539

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.5517 | accuracy (val): 0.5539

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.5517 | accuracy (val): 0.5539

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.5517 | accuracy (val): 0.5539

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.5516860599919027

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.551686      0.553908
1   1.000000e-08      0.551686      0.553908
2   1.000000e-07      0.551686      0.553908
3   1.000000e-06      0.551686      0.553908


In [4]:
# RANDOM FOREST
# Grid search over forest size and tree-shape parameters, scored on the
# validation split.
param_grid_rf = {
    "n_estimators": [50, 80],
    "max_depth": [10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

# Column order used for the hyperparameters in the results table.
rf_param_columns = ("n_estimators", "max_depth", "min_samples_leaf", "max_features")

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    # Fit on TRAIN, evaluate on VALIDATION.
    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    record = {column: params[column] for column in rf_param_columns}
    record["val_macro_f1"] = macro_f1
    record["val_accuracy"] = acc
    results.append(record)

    # Keep the first configuration that reaches the highest macro-F1.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

# Tabulate every run, best macro-F1 first.
results_df_rf = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.5722 | accuracy (val): 0.5722

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.5738 | accuracy (val): 0.5738

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.5723 | accuracy (val): 0.5723

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.5734 | accuracy (val): 0.5734

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.5726 | accuracy (val): 0.5726

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.5734 | accuracy (val): 0.5734

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.5717 | ac

In [5]:
# XGBOOST
# Grid search scored on the validation split.

# XGBoost is trained on integer class ids, so encode the labels first
# (the encoder is fitted on the training labels only).
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

# Settings shared by every candidate model.
xgb_fixed = dict(
    objective="multi:softmax",
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0,
)

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        num_class=len(np.unique(y_tr_enc)),
        **xgb_fixed,
        **params,
    )

    # Fit on train, predict on validation.
    clf.fit(X_tr, y_tr_enc)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    # One row per run: the hyperparameters followed by both metrics.
    record = dict(params)
    record["val_macro_f1"] = macro_f1
    record["val_accuracy"] = acc
    results.append(record)

    # Keep the first configuration that reaches the highest macro-F1.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

# Tabulate every run, best macro-F1 first.
results_df_xgb = pd.DataFrame(results).sort_values(by="val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5788 | accuracy (val): 0.5788

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5816 | accuracy (val): 0.5816

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5831 | accuracy (val): 0.5831

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5846 | accuracy (val): 0.5847

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5788 | accuracy (val): 0.5788

Combination: {'colsample_bytr

# PERFORMANCE ON THE TEST SET

In [4]:
# Load the training embeddings and look up their er_bins2 labels in DuckDB,
# aligned to the row order of the embedding matrix.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on archives produced by this project.
train = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_train_ids_y.npz",
    allow_pickle=True,
)

X_tr, ids_tr = train["embeddings"], train["ids"]

y_tr_ids = con.sql("""SELECT post_id, er_bins2 FROM md1718 WHERE split = 'train'""").df()

# Index by post_id so the labels can be selected in the same order as ids_tr.
labels_by_post_tr = y_tr_ids.set_index("post_id")
y_tr = labels_by_post_tr.loc[ids_tr, "er_bins2"].to_numpy()

# The archive handle and the id vector are not needed past this point.
del labels_by_post_tr
del train, ids_tr

In [3]:
# Load the validation embeddings and look up their er_bins2 labels in DuckDB,
# aligned to the row order of the embedding matrix.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on archives produced by this project.
val = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_val_ids_y.npz",
    allow_pickle=True,
)

X_va, ids_va = val["embeddings"], val["ids"]

y_va_ids = con.sql("""SELECT post_id, er_bins2 FROM md1718 WHERE split = 'validation'""").df()

# Index by post_id so the labels can be selected in the same order as ids_va.
labels_by_post_va = y_va_ids.set_index("post_id")
y_va = labels_by_post_va.loc[ids_va, "er_bins2"].to_numpy()

# The archive handle and the id vector are not needed past this point.
del labels_by_post_va
del val, ids_va

In [None]:
# Stack the train and validation splits into one combined feature matrix and
# label vector (train rows first, validation rows after).
X_trva = np.concatenate((X_tr, X_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

# Release only the validation arrays. X_tr / y_tr must stay alive: the
# test-set evaluation cells below still fit on them, and deleting them here
# made a top-to-bottom run of the notebook fail with NameError.
del X_va, y_va

In [5]:
# Load the test embeddings and look up their er_bins2 labels in DuckDB,
# aligned to the row order of the embedding matrix.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on archives produced by this project.
test = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_test_ids_y.npz",
    allow_pickle=True,
)

X_te, ids_te = test["embeddings"], test["ids"]

y_te_ids = con.sql("""SELECT post_id, er_bins2 FROM md1718 WHERE split = 'test'""").df()

# Index by post_id so the labels can be selected in the same order as ids_te.
labels_by_post_te = y_te_ids.set_index("post_id")
y_te = labels_by_post_te.loc[ids_te, "er_bins2"].to_numpy()
del labels_by_post_te

# Drop the archive handle, the id vector and the label lookup frames
# (y_tr_ids comes from the training-load cell), then force a collection.
del test, ids_te
del y_tr_ids, y_te_ids
gc.collect()

577

In [6]:
# Fit each selected configuration on the training split and report macro-F1
# and accuracy on the test split.

# Integer-encoded labels for XGBoost (encoder fitted on the training labels).
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)


cfgs = [
    GaussianNB(var_smoothing=1e-09),
    RandomForestClassifier(
        n_estimators=80,
        max_depth=12,
        max_features=0.05,
        min_samples_leaf=5,
        n_jobs=-1,
        random_state=42,
    ),
    XGBClassifier(
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.5,
        gamma=1,
        reg_lambda=1,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    ),
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the other models use the raw labels.
    if isinstance(cfg, XGBClassifier):
        fit_target, eval_target = y_tr_enc, y_te_enc
    else:
        fit_target, eval_target = y_tr, y_te

    cfg.fit(X_tr, fit_target)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(eval_target, y_te_pred, average="macro")
    acc = accuracy_score(eval_target, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB()
macro-F1 (test): 0.5478 | accuracy (test): 0.5506

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=5,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (test): 0.5705 | accuracy (test): 0.5705

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=Non

In [6]:
# Fit the selected SGD configuration (hinge loss, L2 penalty, averaged
# weights) on the training split and score it on the test split.
sgd_settings = dict(
    loss="hinge",
    penalty="l2",
    alpha=1e-05,
    average=True,
    class_weight=None,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)
cfg = SGDClassifier(**sgd_settings)


cfg.fit(X_tr, y_tr)
y_te_pred = cfg.predict(X_te)

macro_f1, acc = (
    f1_score(y_te, y_te_pred, average="macro"),
    accuracy_score(y_te, y_te_pred),
)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.5801 | accuracy (test): 0.5801
