In [1]:
from sklearn.model_selection import train_test_split
import duckdb, torch, time, os, gc
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from pathlib import Path
from scipy.sparse import load_npz, hstack, save_npz

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

# Check which model to use

In [4]:
# Set up

TEXT_COLUMN = "caption_bert_clip"
TARGET_COLUMN = "er_bins"
TABLE_NAME = "md1718"
DB_PATH = "D:/db/meta.duckdb"

SAMPLE_SIZE = 5000 
TEST_SIZE = 0.2
RANDOM_STATE = 42

con = duckdb.connect(DB_PATH)

query = f"""
    SELECT
        {TEXT_COLUMN} AS {TEXT_COLUMN},
        {TARGET_COLUMN} AS {TARGET_COLUMN}
    FROM {TABLE_NAME}
    WHERE {TEXT_COLUMN} IS NOT NULL
      AND {TARGET_COLUMN} IS NOT NULL
    ORDER BY random()
    LIMIT {SAMPLE_SIZE}
"""

df = con.execute(query).df()

print(f"Campione estratto da DuckDB: {len(df)} righe")

texts = df[TEXT_COLUMN].astype(str).tolist()

# Numeric target for XGBoost
le = LabelEncoder()
y = le.fit_transform(df[TARGET_COLUMN])

# Fixed split on indexes to ensure the same are used
indices = np.arange(len(df))
train_idx, test_idx = train_test_split(
    indices,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Campione estratto da DuckDB: 5000 righe


In [6]:
# List of encoders to test
ENCODERS = {
    "mpnet": "sentence-transformers/all-mpnet-base-v2",
    "minilm": "sentence-transformers/paraphrase-MiniLM-L6-v2",
    "e5-base": "intfloat/e5-base",
}

# Function to extract embeddings
def extract_embeddings(model_name, model_path, texts):
    print(f"\n Upload encoder: {model_name}")
    model = SentenceTransformer(model_path)

    start_time = time.time()
    embeddings = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True
    )
    end_time = time.time()

    extraction_time = end_time - start_time
    print(f" Time embedding extraction: {extraction_time:.2f} s")

    return np.array(embeddings), extraction_time

# Function classify ER based on the extracted embeddings
def benchmark_classifiers(X, y, train_idx, test_idx):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    results = []

    # 1) Linear SVM
    svm = LinearSVC(random_state=RANDOM_STATE)
    start_train = time.time()
    svm.fit(X_train, y_train)
    end_train = time.time()
    svm_train_time = end_train - start_train

    start_pred = time.time()
    y_pred_svm = svm.predict(X_test)
    end_pred = time.time()
    svm_infer_time = end_pred - start_pred

    svm_acc = accuracy_score(y_test, y_pred_svm)
    svm_f1 = f1_score(y_test, y_pred_svm, average="macro")

    results.append({
        "Classifier": "LinearSVM",
        "Train Time (s)": svm_train_time,
        "Inference Time (s)": svm_infer_time,
        "Accuracy": svm_acc,
        "F1-macro": svm_f1
    })

    # 2) Naive Bayes (GaussianNB)
    nb = GaussianNB()
    start_train = time.time()
    nb.fit(X_train, y_train)
    end_train = time.time()
    nb_train_time = end_train - start_train

    start_pred = time.time()
    y_pred_nb = nb.predict(X_test)
    end_pred = time.time()
    nb_infer_time = end_pred - start_pred

    nb_acc = accuracy_score(y_test, y_pred_nb)
    nb_f1 = f1_score(y_test, y_pred_nb, average="macro")

    results.append({
        "Classifier": "NaiveBayes",
        "Train Time (s)": nb_train_time,
        "Inference Time (s)": nb_infer_time,
        "Accuracy": nb_acc,
        "F1-macro": nb_f1
    })

    # 3) XGBoost
    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softmax" if len(np.unique(y)) > 2 else "binary:logistic",
        eval_metric="mlogloss",
        tree_method="auto",
        n_jobs=-1,
        random_state=RANDOM_STATE
    )

    start_train = time.time()
    xgb.fit(X_train, y_train)
    end_train = time.time()
    xgb_train_time = end_train - start_train

    start_pred = time.time()
    y_pred_xgb = xgb.predict(X_test)
    end_pred = time.time()
    xgb_infer_time = end_pred - start_pred

    xgb_acc = accuracy_score(y_test, y_pred_xgb)
    xgb_f1 = f1_score(y_test, y_pred_xgb, average="macro")

    results.append({
        "Classifier": "XGBoost",
        "Train Time (s)": xgb_train_time,
        "Inference Time (s)": xgb_infer_time,
        "Accuracy": xgb_acc,
        "F1-macro": xgb_f1
    })

    return results


all_results = []

for enc_name, enc_path in ENCODERS.items():
    # 1) embedding + time
    X, emb_time = extract_embeddings(enc_name, enc_path, texts)

    # 2) classification + time
    clf_results = benchmark_classifiers(X, y, train_idx, test_idx)

    # 3) encoder info
    for r in clf_results:
        r_with_enc = {
            "Encoder": enc_name,
            "Embedding Time (s)": emb_time,
            **r
        }
        all_results.append(r_with_enc)

results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values(["F1-macro", "Accuracy"], ascending=False)

print("\n FINAL RESULTS")
print(results_df.to_string(index=False))
results_df.to_csv("classification_embedding_benchmark_results.csv", index=False)

con.close()


 Upload encoder: mpnet


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

 Time embedding extraction: 455.04 s

 Upload encoder: minilm


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

 Time embedding extraction: 72.18 s

 Upload encoder: e5-base


tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

 Time embedding extraction: 725.68 s

 FINAL RESULTS
Encoder  Embedding Time (s) Classifier  Train Time (s)  Inference Time (s)  Accuracy  F1-macro
e5-base          725.681344    XGBoost      169.532226            0.035311     0.247  0.245108
e5-base          725.681344  LinearSVM        4.000240            0.006941     0.246  0.244317
e5-base          725.681344 NaiveBayes        0.030148            0.047309     0.252  0.242722
  mpnet          455.042816    XGBoost      143.430291            0.023739     0.232  0.233500
  mpnet          455.042816  LinearSVM        5.008187            0.027876     0.233  0.232215
 minilm           72.177455 NaiveBayes        0.023734            0.044004     0.231  0.225793
 minilm           72.177455  LinearSVM        5.487381            0.000000     0.226  0.225073
 minilm           72.177455    XGBoost       77.640962            0.025781     0.219  0.219217
  mpnet          455.042816 NaiveBayes        0.031173            0.042938     0.206  0.1995

# EMBEDDINGS EXTRACTION 

In [5]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [6]:
def load_split(split_name):
    print(f"Loading {split_name}...")
    df = con.sql(f"""
        SELECT post_id, caption_bert_clip, er_bins
        FROM md1718
        WHERE split = '{split_name}'
    """).df()
    ids = df["post_id"].to_numpy()
    texts = df["caption_bert_clip"].tolist()
    y = df["er_bins"]
    del df; gc.collect()
    print(f"{split_name} done.")
    return ids, texts, y

In [None]:
train_ids, Xtr_text, y_tr = load_split("train")
val_ids, Xva_text, y_val  = load_split("validation")
test_ids, Xte_text, y_te  = load_split("test")

con.close()

In [6]:
emb_dir = Path("D:/dataset/sbert_emb")

np.save('D:/dataset/sbert_emb/y_tr.npy', y_tr)
np.save('D:/dataset/sbert_emb/y_val.npy', y_val)
np.save('D:/dataset/sbert_emb/y_te.npy',  y_te)

In [7]:
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
model.max_seq_length = 256 
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [8]:
import torch
torch.cuda.is_available()

False

In [9]:
def embed(texts, bs=64):
    return model.encode(
        texts,
        batch_size=bs,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True # ogni vettore avrà lunghezza 1
    )

In [10]:
torch.set_grad_enabled(False)
torch.set_num_threads(8)
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"

emb_dir = Path("D:/dataset/sbert_emb")
emb_dir.mkdir(parents=True, exist_ok=True)

# Save embeddings, ids, y aligned in shards
def embed_sharded_cached(texts, ids, y, split_name, shard=5000):
    prefix = f"{model_name.split('/')[-1]}_{split_name}"
    out_files = []

    # Sanity check
    n = len(texts)
    assert len(ids) == n, f"ids len {len(ids)} != texts len {n}"
    assert len(y) == n,   f"y len {len(y)} != texts len {n}"

    for i in range(0, n, shard):
        f = emb_dir / f"{prefix}_{i:07d}.npy"
        if f.exists():
            print(f"[skip] {f.name}")
            out_files.append(f)
            continue

        part = texts[i:i+shard]
        t0 = time.time()
        E = embed(part, bs=64).astype("float32")
        np.save(f, E)
        dt = time.time() - t0
        print(f"[saved] {f.name}  [{i+len(part)}/{n}]  ({len(part)} samples in {dt:.1f}s)")

        out_files.append(f)

    # Concatenate all shards
    arrays = [np.load(f) for f in out_files]
    E_all = np.vstack(arrays).astype("float32")

    # Safety check
    assert E_all.shape[0] == n, \
        f"Emb rows {E_all.shape[0]} != texts len {n}"

    # Save all files in a uniquel ALL file
    np.save(emb_dir / f"{prefix}_ALL.npy", E_all)

    # Save in .npz aligning embeddings, with ids and y
    npz_path = emb_dir / f"{prefix}_ids_y.npz"
    np.savez(
        npz_path,
        ids=np.asarray(ids),
        embeddings=E_all,
        y=np.asarray(y)
    )

    print(f"[done] Saved aligned ids + embeddings + y → {npz_path.name}")

    return E_all

In [13]:
print("Train…")
E_tr = embed_sharded_cached(Xtr_text, train_ids, y_tr, "train", shard=20000)

Train…
[saved] paraphrase-MiniLM-L6-v2_train_0000000.npy  [20000/773497]  (20000 samples in 207.1s)
[saved] paraphrase-MiniLM-L6-v2_train_0020000.npy  [40000/773497]  (20000 samples in 181.0s)
[saved] paraphrase-MiniLM-L6-v2_train_0040000.npy  [60000/773497]  (20000 samples in 230.3s)
[saved] paraphrase-MiniLM-L6-v2_train_0060000.npy  [80000/773497]  (20000 samples in 230.1s)
[saved] paraphrase-MiniLM-L6-v2_train_0080000.npy  [100000/773497]  (20000 samples in 283.3s)
[saved] paraphrase-MiniLM-L6-v2_train_0100000.npy  [120000/773497]  (20000 samples in 276.2s)
[saved] paraphrase-MiniLM-L6-v2_train_0120000.npy  [140000/773497]  (20000 samples in 268.6s)
[saved] paraphrase-MiniLM-L6-v2_train_0140000.npy  [160000/773497]  (20000 samples in 281.7s)
[saved] paraphrase-MiniLM-L6-v2_train_0160000.npy  [180000/773497]  (20000 samples in 300.0s)
[saved] paraphrase-MiniLM-L6-v2_train_0180000.npy  [200000/773497]  (20000 samples in 276.8s)
[saved] paraphrase-MiniLM-L6-v2_train_0200000.npy  [22000

In [14]:
print("Val…")
E_va = embed_sharded_cached(Xva_text, val_ids, y_val, "val",   shard=20000)

Val…
[saved] paraphrase-MiniLM-L6-v2_val_0000000.npy  [20000/412325]  (20000 samples in 193.7s)
[saved] paraphrase-MiniLM-L6-v2_val_0020000.npy  [40000/412325]  (20000 samples in 235.3s)
[saved] paraphrase-MiniLM-L6-v2_val_0040000.npy  [60000/412325]  (20000 samples in 258.2s)
[saved] paraphrase-MiniLM-L6-v2_val_0060000.npy  [80000/412325]  (20000 samples in 261.8s)
[saved] paraphrase-MiniLM-L6-v2_val_0080000.npy  [100000/412325]  (20000 samples in 258.2s)
[saved] paraphrase-MiniLM-L6-v2_val_0100000.npy  [120000/412325]  (20000 samples in 259.4s)
[saved] paraphrase-MiniLM-L6-v2_val_0120000.npy  [140000/412325]  (20000 samples in 276.2s)
[saved] paraphrase-MiniLM-L6-v2_val_0140000.npy  [160000/412325]  (20000 samples in 244.9s)
[saved] paraphrase-MiniLM-L6-v2_val_0160000.npy  [180000/412325]  (20000 samples in 239.4s)
[saved] paraphrase-MiniLM-L6-v2_val_0180000.npy  [200000/412325]  (20000 samples in 248.3s)
[saved] paraphrase-MiniLM-L6-v2_val_0200000.npy  [220000/412325]  (20000 sample

In [15]:
print("Test…")
E_te = embed_sharded_cached(Xte_text, test_ids, y_te, "test",  shard=20000)

Test…
[saved] paraphrase-MiniLM-L6-v2_test_0000000.npy  [20000/423604]  (20000 samples in 223.7s)
[saved] paraphrase-MiniLM-L6-v2_test_0020000.npy  [40000/423604]  (20000 samples in 250.1s)
[saved] paraphrase-MiniLM-L6-v2_test_0040000.npy  [60000/423604]  (20000 samples in 258.4s)
[saved] paraphrase-MiniLM-L6-v2_test_0060000.npy  [80000/423604]  (20000 samples in 260.7s)
[saved] paraphrase-MiniLM-L6-v2_test_0080000.npy  [100000/423604]  (20000 samples in 260.5s)
[saved] paraphrase-MiniLM-L6-v2_test_0100000.npy  [120000/423604]  (20000 samples in 251.5s)
[saved] paraphrase-MiniLM-L6-v2_test_0120000.npy  [140000/423604]  (20000 samples in 229.6s)
[saved] paraphrase-MiniLM-L6-v2_test_0140000.npy  [160000/423604]  (20000 samples in 249.6s)
[saved] paraphrase-MiniLM-L6-v2_test_0160000.npy  [180000/423604]  (20000 samples in 282.0s)
[saved] paraphrase-MiniLM-L6-v2_test_0180000.npy  [200000/423604]  (20000 samples in 272.2s)
[saved] paraphrase-MiniLM-L6-v2_test_0200000.npy  [220000/423604]  (

# LOAD EMBEDDINGS

In [2]:
train = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_train_ids_y.npz", allow_pickle=True)
E_tr = train["embeddings"]
y_tr = train["y"]

val = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_val_ids_y.npz", allow_pickle=True)
E_va = val["embeddings"]
y_va = val["y"]

In [3]:
print(E_tr.shape, E_va.shape, y_tr.shape, y_va.shape)

(773497, 384) (412325, 384) (773497,) (412325,)


In [4]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"]
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(E_tr, y_tr)

    y_val_pred = clf.predict(E_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'average': False, 'class_weight': None}
macro-F1 (val): 0.12581869535837908 | accuracy (val): 0.20248105256775603

Combination: {'alpha': 1e-05, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.1544386658411066 | accuracy (val): 0.2149420966470624

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': None}
macro-F1 (val): 0.23603247877747063 | accuracy (val): 0.24126114109015945

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': 'balanced'}
macro-F1 (val): 0.2390775257199076 | accuracy (val): 0.24866064390953738

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': None}
macro-F1 (val): 0.22067993556451188 | accuracy (val): 0.22984053841023463

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.22141086314808947 | accuracy (val): 0.22974110228581823

Combination: {'alpha': 0.0001, 'average': True, 'class_weight': None}
macro-F1 (val): 0.23735477518888679 | accura

In [3]:
# NAIVE BAYES - GAUSSIAN

param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(E_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(E_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.2224 | accuracy (val): 0.2428

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.2224 | accuracy (val): 0.2428

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.2224 | accuracy (val): 0.2428

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.2224 | accuracy (val): 0.2428

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.2223940710152486

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.222394      0.242753
1   1.000000e-08      0.222394      0.242753
2   1.000000e-07      0.222394      0.242753
3   1.000000e-06      0.222394      0.242753


In [3]:
# RANDOM FOREST
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    clf.fit(E_tr, y_tr)

    y_val_pred = clf.predict(E_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1735 | accuracy (val): 0.2169

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.1716 | accuracy (val): 0.2169

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.1708 | accuracy (val): 0.2175

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.1746 | accuracy (val): 0.2174

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.1723 | accuracy (val): 0.2178

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.1713 | accuracy (val): 0.2183

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1735 | accuracy (val

In [11]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150], 
    "max_depth": [4, 6], 
    "learning_rate": [0.1], 
    "subsample": [0.8], 
    "colsample_bytree": [0.5], 
    "gamma": [0, 1], 
    "reg_lambda": [1], 
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    clf.fit(E_tr, y_tr_enc)

    y_val_pred = clf.predict(E_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2241 | accuracy (val): 0.2349

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2310 | accuracy (val): 0.2388

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2348 | accuracy (val): 0.2412

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2399 | accuracy (val): 0.2447

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2241 | accuracy (val): 0.2349

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE SUL TEST SET

In [None]:
train = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_train_ids_y.npz", allow_pickle=True)
E_tr = train["embeddings"]
y_tr = train["y"]

val = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_val_ids_y.npz", allow_pickle=True)
E_va = val["embeddings"]
y_va = val["y"]

In [6]:
test = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_test_ids_y.npz", allow_pickle=True)
E_te = test["embeddings"]
y_te = test["y"]

In [7]:
E_trva = np.concatenate((E_tr, E_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

In [4]:
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

In [5]:
print(E_trva.shape, y_trva.shape, E_te.shape, y_te.shape)

(1185822, 384) (1185822,) (423604, 384) (423604,)


In [6]:
cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=2, n_estimators=30, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(E_trva, y_trva_enc)
        y_te_pred = cfg.predict(E_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(E_trva, y_trva)
        y_te_pred = cfg.predict(E_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB()
macro-F1 (test): 0.2201 | accuracy (test): 0.2447

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=2,
                       n_estimators=30, n_jobs=-1, random_state=42)
macro-F1 (test): 0.2282 | accuracy (test): 0.2352

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=Non

In [8]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 1e-05,
        average = True,
        class_weight = 'balanced',
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(E_trva, y_trva)
y_te_pred = cfg.predict(E_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2378 | accuracy (test): 0.2589
