In [1]:
from sklearn.model_selection import train_test_split
import duckdb, torch, time, os, gc
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from pathlib import Path
from scipy.sparse import load_npz, hstack, save_npz

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

# LOAD EMBEDDINGS

In [2]:
# Load the precomputed embeddings and the matching post ids for the train and
# validation splits. Each archive is opened in a `with` block so the .npz file
# handle is closed as soon as the arrays have been read (np.load on an .npz
# returns a lazy NpzFile that otherwise stays open).
# NOTE(review): allow_pickle=True unpickles arbitrary objects stored in the
# archive; keep it only for files produced by a trusted pipeline.
with np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_train_ids_y.npz", allow_pickle=True) as train:
    E_tr = train["embeddings"]
    ids_tr = train["ids"]

with np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_val_ids_y.npz", allow_pickle=True) as val:
    E_va = val["embeddings"]
    ids_va = val["ids"]

In [3]:
# Display how many training ids were loaded (shape of the ids array)
ids_tr.shape

(773497,)

In [4]:
# Open the DuckDB database file queried by the following cells.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)

# The thread count is only a tuning hint: if DuckDB rejects the setting we keep
# going with its defaults, but report the problem instead of hiding it.
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    print(f"Could not apply PRAGMA threads=8, keeping DuckDB defaults: {exc}")

print("Set up ready")

Set up ready


In [5]:
# Fetch the (post_id, er_bins2) pairs of the training and validation splits
# from the md1718 table as pandas DataFrames.
train_sql = """SELECT post_id, er_bins2 FROM md1718 WHERE split = 'train'"""
val_sql = """SELECT post_id, er_bins2 FROM md1718 WHERE split = 'validation'"""

target_tr = con.sql(train_sql).df()
target_va = con.sql(val_sql).df()

In [6]:
# Index the SQL labels by post_id, then read them back in the order of the
# embedding ids so that row i of E_* and entry i of y_* refer to the same post.
m_tr = target_tr.set_index("post_id")["er_bins2"]
m_va = target_va.set_index("post_id")["er_bins2"]
y_tr, y_va = m_tr.reindex(ids_tr).to_numpy(), m_va.reindex(ids_va).to_numpy()

# Checks: reindex() leaves NaN for any id it cannot find in the SQL targets
missing_tr, missing_va = pd.isna(y_tr).sum(), pd.isna(y_va).sum()
print("Missing labels - train:", missing_tr, "val:", missing_va)

assert not (missing_tr or missing_va), "Some ids have no label in SQL targets"
assert len(E_tr) == len(y_tr)
assert len(E_va) == len(y_va)

Missing labels - train: 0 val: 0


In [7]:
# Shapes of the train/validation feature matrices and label vectors
print(*(arr.shape for arr in (E_tr, E_va, y_tr, y_va)))

(773497, 384) (412325, 384) (773497,) (412325,)


In [8]:
# SGD
# Validation grid search: every combination is fitted on the training
# embeddings and scored (macro-F1, accuracy) on the validation embeddings.
# NOTE(review): the saved output of this cell lists an 'average' entry in each
# combination, so it appears to come from an earlier grid that also varied
# `average`; re-run the cell to refresh it.
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

# Settings shared by every candidate (hinge loss + L2 penalty, averaged SGD).
sgd_fixed_kwargs = dict(
    loss="hinge",
    penalty="l2",
    average=True,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(**sgd_fixed_kwargs, **params)
    clf.fit(E_tr, y_tr)
    y_val_pred = clf.predict(E_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    # params holds exactly the grid keys (alpha, class_weight), in that order
    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Strict '>' keeps the first combination in case of a tie
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'average': False, 'class_weight': None}
macro-F1 (val): 0.5583686769859715 | accuracy (val): 0.5658085248287152

Combination: {'alpha': 1e-05, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.5600399910765788 | accuracy (val): 0.566218395683017

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': None}
macro-F1 (val): 0.5665952477713433 | accuracy (val): 0.566788334444916

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': 'balanced'}
macro-F1 (val): 0.566166200247675 | accuracy (val): 0.5664342448311405

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': None}
macro-F1 (val): 0.5627451664784999 | accuracy (val): 0.563894985751531

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.562481535563615 | accuracy (val): 0.5638270781543685

Combination: {'alpha': 0.0001, 'average': True, 'class_weight': None}
macro-F1 (val): 0.5655414286996382 | accuracy (val): 0.565

In [21]:
# NAIVE BAYES - GAUSSIAN
# Validation grid search over the variance-smoothing term.

param_grid_nb = {"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]}

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on TRAIN
    clf = GaussianNB(**params).fit(E_tr, y_tr)

    # Evaluate on VALIDATION
    y_val_pred = clf.predict(E_va)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Update the best configuration according to macro-F1 (first one wins ties)
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.5522 | accuracy (val): 0.5531

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.5522 | accuracy (val): 0.5531

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.5522 | accuracy (val): 0.5531

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.5522 | accuracy (val): 0.5531

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.552194862706416

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.552195      0.553086
1   1.000000e-08      0.552195      0.553086
2   1.000000e-07      0.552195      0.553086
3   1.000000e-06      0.552195      0.553086


In [23]:
# RANDOM FOREST
# Validation grid search: fit on the training embeddings, score on validation.
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

# Column order used for the hyperparameters in the results table
rf_result_columns = ("n_estimators", "max_depth", "min_samples_leaf", "max_features")

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(E_tr, y_tr)
    y_val_pred = clf.predict(E_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    row = {name: params[name] for name in rf_result_columns}
    row["val_macro_f1"] = macro_f1
    row["val_accuracy"] = acc
    results.append(row)

    # Strict '>' keeps the first combination in case of a tie
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.5552 | accuracy (val): 0.5559

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.5564 | accuracy (val): 0.5570

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.5570 | accuracy (val): 0.5575

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.5551 | accuracy (val): 0.5559

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.5568 | accuracy (val): 0.5574

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.5569 | accuracy (val): 0.5574

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.5552 | accuracy (val

In [24]:
# XGBOOST
# Validation grid search: fit on the training embeddings, score on validation.

# Convert the labels into integer codes (encoder fitted on the training labels)
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)

param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

# Settings shared by every candidate; the class count does not change between
# combinations, so it is computed once here.
xgb_fixed_kwargs = dict(
    objective="multi:softmax",
    num_class=len(np.unique(y_tr_enc)),
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0,
)

results, best_score, best_params = [], -np.inf, None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(**params, **xgb_fixed_kwargs)
    clf.fit(E_tr, y_tr_enc)
    y_val_pred = clf.predict(E_va)

    # Predictions are integer codes, so compare against the encoded labels
    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)
    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Strict '>' keeps the first combination in case of a tie
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5628 | accuracy (val): 0.5632

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5658 | accuracy (val): 0.5661

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5688 | accuracy (val): 0.5690

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5715 | accuracy (val): 0.5726

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5628 | accuracy (val): 0.5632

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON THE TEST SET

In [9]:
# Load the precomputed embeddings and the matching post ids for the train,
# validation and test splits. Each archive is opened in a `with` block so the
# .npz file handle is closed as soon as the arrays have been read (np.load on
# an .npz returns a lazy NpzFile that otherwise stays open).
# NOTE(review): allow_pickle=True unpickles arbitrary objects stored in the
# archive; keep it only for files produced by a trusted pipeline.
with np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_train_ids_y.npz", allow_pickle=True) as train:
    E_tr = train["embeddings"]
    ids_tr = train["ids"]

with np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_val_ids_y.npz", allow_pickle=True) as val:
    E_va = val["embeddings"]
    ids_va = val["ids"]

with np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_test_ids_y.npz", allow_pickle=True) as test:
    E_te = test["embeddings"]
    ids_te = test["ids"]

In [10]:
# Open the DuckDB database file queried by the following cells.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)

# The thread count is only a tuning hint: if DuckDB rejects the setting we keep
# going with its defaults, but report the problem instead of hiding it.
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    print(f"Could not apply PRAGMA threads=8, keeping DuckDB defaults: {exc}")

print("Set up ready")

Set up ready


In [11]:
# Stack train and validation (rows first from train, then from validation) so
# the final models can be refit on both splits.
E_trva = np.concatenate([E_tr, E_va])
ids_trva = np.concatenate([ids_tr, ids_va])

# Labels for the merged train+validation rows and for the test rows
trainval_sql = """SELECT post_id, er_bins2 FROM md1718 WHERE split = 'train' OR split = 'validation'"""
test_sql = """SELECT post_id, er_bins2 FROM md1718 WHERE split = 'test'"""

target_trva = con.sql(trainval_sql).df()
target_te = con.sql(test_sql).df()

In [12]:
# Shapes of embeddings, ids and SQL targets for train+validation and test
print(*(obj.shape for obj in (E_trva, ids_trva, target_trva, E_te, ids_te, target_te)))

(1185822, 384) (1185822,) (1185822, 2) (423604, 384) (423604,) (423604, 2)


In [13]:
# Build post_id -> label lookups and align the labels with the embedding order,
# so that row i of E_* and entry i of y_* refer to the same post.
m_trva = target_trva.set_index("post_id")["er_bins2"]
m_te = target_te.set_index("post_id")["er_bins2"]

y_trva = m_trva.reindex(ids_trva).to_numpy()
y_te = m_te.reindex(ids_te).to_numpy()

# Checks: reindex() leaves NaN for any id it cannot find in the SQL targets.
# The message names the splits actually checked here (train+validation and
# test), not the train/val pair used during model selection.
missing_trva = pd.isna(y_trva).sum()
missing_te = pd.isna(y_te).sum()
print("Missing labels - train+val:", missing_trva, "test:", missing_te)

assert missing_trva == 0 and missing_te == 0, "Some ids have no label in SQL targets"
assert len(y_trva) == len(E_trva)
assert len(y_te) == len(E_te)

Missing labels - train: 0 val: 0


In [14]:
# Shapes of the train+validation / test feature matrices and label vectors
print(*(arr.shape for arr in (E_trva, E_te, y_trva, y_te)))

(1185822, 384) (423604, 384) (1185822,) (423604,)


In [8]:
# Encode the labels as integer codes: the encoder is fitted on the
# train+validation labels and the same mapping is applied to the test labels.
le = LabelEncoder().fit(y_trva)
y_trva_enc, y_te_enc = le.transform(y_trva), le.transform(y_te)

In [None]:
# Models to evaluate on the test split, each with fixed hyperparameters.
cfgs = [
    GaussianNB(var_smoothing=1e-09),
    RandomForestClassifier(
        n_estimators=80,
        max_depth=12,
        min_samples_leaf=2,
        max_features=0.05,
        n_jobs=-1,
        random_state=42,
    ),
    XGBClassifier(
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.5,
        gamma=0,
        reg_lambda=1,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    ),
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target, so it is trained and scored on the
    # encoded labels; the other models use the original labels.
    if isinstance(cfg, XGBClassifier):
        y_fit, y_ref = y_trva_enc, y_te_enc
    else:
        y_fit, y_ref = y_trva, y_te

    # Refit on train+validation, then score once on the test split
    cfg.fit(E_trva, y_fit)
    y_te_pred = cfg.predict(E_te)
    macro_f1 = f1_score(y_ref, y_te_pred, average="macro")
    acc = accuracy_score(y_ref, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB()
macro-F1 (test): 0.5499 | accuracy (test): 0.5506

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=2,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (test): 0.5682 | accuracy (test): 0.5687

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=Non

In [15]:
# SGD classifier (hinge loss, L2 penalty, averaged weights) refit on
# train+validation and scored once on the test split.
sgd_test_kwargs = dict(
    loss="hinge",
    penalty="l2",
    alpha=1e-05,
    average=True,
    class_weight=None,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)
cfg = SGDClassifier(**sgd_test_kwargs)

cfg.fit(E_trva, y_trva)
y_te_pred = cfg.predict(E_te)

macro_f1, acc = (
    f1_score(y_te, y_te_pred, average="macro"),
    accuracy_score(y_te, y_te_pred),
)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.5675 | accuracy (test): 0.5676
