In [7]:
import os, re, duckdb, joblib, gc, random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz, vstack, csr_matrix, hstack
import time
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

In [3]:
# Setup: storage locations, source table name, and the DuckDB connection
# used by the label-query cell.

DB_PATH   = r"D:/db/meta.duckdb"
OUT_DIR   = r"D:/dataset/text_features/tfidf_v2"
os.makedirs(OUT_DIR, exist_ok=True)

table = "md1718"

# Connection
con = duckdb.connect(DB_PATH)
try:
    # Best-effort tuning of the DuckDB thread count.
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Keep going with whatever thread count DuckDB already has, but report
    # the failure instead of silently discarding it.
    print(f"Could not set DuckDB thread count, continuing with the default: {exc}")

print("Set up ready")

Set up ready


In [4]:
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

# Saved post-id arrays, one per split, listed in train / val / test order.
# NOTE(review): allow_pickle=True unpickles object arrays; only load files
# you produced yourself.
id_files = (
    "tfidf_train_post_ids.npy",
    "tfidf_val_post_ids.npy",
    "tfidf_test_post_ids.npy",
)
tr_ids, va_ids, te_ids = [
    np.load(os.path.join(OUT_DIR, fname), allow_pickle=True)
    for fname in id_files
]

In [6]:
# Fetch (post_id, er_bins3) for every row of md1718 that has a label.
query = """
SELECT post_id, er_bins3
FROM md1718
WHERE er_bins3 IS NOT NULL
"""

df_labels = con.execute(query).df()

# Lookup table: post_id -> er_bins3 label.
id2y = dict(zip(df_labels["post_id"], df_labels["er_bins3"]))

# Label arrays in the same order as the saved post ids of each split.
# A post id without a label raises KeyError here, before anything is saved.
y_tr, y_va, y_te = [
    np.array([id2y[pid] for pid in split_ids], dtype='object')
    for split_ids in (tr_ids, va_ids, te_ids)
]

# Persist the three label arrays.
for path, labels in (
    ("D:/dataset/text_features/tfidf_v3/tfidf_y_train_3.npy", y_tr),
    ("D:/dataset/text_features/tfidf_v3/tfidf_y_val_3.npy", y_va),
    ("D:/dataset/text_features/tfidf_v3/tfidf_y_test_3.npy", y_te),
):
    np.save(path, labels)

In [2]:
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

# Sparse TF-IDF matrices for train and validation, cast to float32.
# The test split is intentionally not loaded in this cell.
Xtr_full, Xva_full = (
    load_npz(path).astype(np.float32)
    for path in (
        f"{OUT_DIR}/tfidf_topwords_train.npz",
        f"{OUT_DIR}/tfidf_topwords_val.npz",
    )
)

# Post ids saved alongside the train / validation matrices.
tr_ids, va_ids = (
    np.load(os.path.join(OUT_DIR, fname), allow_pickle=True)
    for fname in ("tfidf_train_post_ids.npy", "tfidf_val_post_ids.npy")
)

# Label arrays (object dtype, hence allow_pickle).
y_tr, y_va = (
    np.load(path, allow_pickle = True)
    for path in (
        "D:/dataset/text_features/tfidf_v3/tfidf_y_train_3.npy",
        "D:/dataset/text_features/tfidf_v3/tfidf_y_val_3.npy",
    )
)

In [8]:
# SGD: grid search over a hinge-loss (linear SVM style) SGDClassifier,
# selecting the configuration with the highest validation macro-F1.
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    # Weight averaging is switched on for every combination; only the
    # grid entries (alpha, class_weight) vary.
    # Fix: the original `average = True` line lacked its trailing comma,
    # which made the whole cell a SyntaxError.
    clf = SGDClassifier(
        loss="hinge",
        penalty="l2",
        **params,
        average=True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(Xtr_full, y_tr)

    y_val_pred = clf.predict(Xva_full)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Strict ">" keeps the first configuration seen when scores tie.
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'average': False, 'class_weight': None}
macro-F1 (val): 0.40679655232785955 | accuracy (val): 0.41656702843630633

Combination: {'alpha': 1e-05, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.4038220676065618 | accuracy (val): 0.418938943794337

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': None}
macro-F1 (val): 0.41163660706615973 | accuracy (val): 0.4206851391499424

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': 'balanced'}
macro-F1 (val): 0.40633572483151803 | accuracy (val): 0.421483053416601

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': None}
macro-F1 (val): 0.4106078321603677 | accuracy (val): 0.41618868610925847

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.40523333062239364 | accuracy (val): 0.4190044261201722

Combination: {'alpha': 0.0001, 'average': True, 'class_weight': None}
macro-F1 (val): 0.4078070386532328 | accuracy (va

In [8]:
# NAIVE BAYES: GaussianNB grid search, fitted and scored in dense mini-batches

param_grid = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

batch_size = 256
classes = np.unique(y_tr)

n_train = Xtr_full.shape[0]
n_val = Xva_full.shape[0]

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Incremental fit: densify one slice of the sparse matrix at a time.
    for start in range(0, n_train, batch_size):
        rows = slice(start, min(start + batch_size, n_train))

        Xb = Xtr_full[rows].toarray()
        yb = y_tr[rows]

        # The class list is passed only on the first partial_fit call.
        first_call_kwargs = {"classes": classes} if start == 0 else {}
        clf.partial_fit(Xb, yb, **first_call_kwargs)

        del Xb, yb
        gc.collect()

    # Batched prediction on the validation split, stitched together afterwards.
    val_chunks = []

    for start in range(0, n_val, batch_size):
        rows = slice(start, min(start + batch_size, n_val))

        Xb = Xva_full[rows].toarray()
        val_chunks.append(clf.predict(Xb))

        del Xb
        gc.collect()

    y_val_pred = np.concatenate(val_chunks)

    acc = accuracy_score(y_va, y_val_pred)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    row = {"var_smoothing": params["var_smoothing"]}
    row["val_macro_f1"] = macro_f1
    row["val_accuracy"] = acc
    results.append(row)

    # Only a strictly better score replaces the current best.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.3169 | accuracy (val): 0.3788

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.3245 | accuracy (val): 0.3802

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.3342 | accuracy (val): 0.3825

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.3467 | accuracy (val): 0.3862

Best hyperparameter configuration:
{'var_smoothing': 1e-06}
Validation macro-F1: 0.34671625646927323

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
3   1.000000e-06      0.346716      0.386166
2   1.000000e-07      0.334155      0.382536
1   1.000000e-08      0.324452      0.380217
0   1.000000e-09      0.316885      0.378796


In [10]:
# RANDOM FOREST: grid search scored by validation macro-F1
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

# Hyperparameter columns of the results table, in display order.
rf_param_names = ("n_estimators", "max_depth", "min_samples_leaf", "max_features")

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(Xtr_full, y_tr)

    y_val_pred = clf.predict(Xva_full)

    acc = accuracy_score(y_va, y_val_pred)
    macro_f1 = f1_score(y_va, y_val_pred, average="macro")

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    row = {name: params[name] for name in rf_param_names}
    row["val_macro_f1"] = macro_f1
    row["val_accuracy"] = acc
    results.append(row)

    # Only a strictly better score replaces the current best.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results)
results_df_rf = results_df_rf.sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.2302 | accuracy (val): 0.3408

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.2316 | accuracy (val): 0.3412

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.2320 | accuracy (val): 0.3413

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.2304 | accuracy (val): 0.3409

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.2319 | accuracy (val): 0.3415

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.2323 | accuracy (val): 0.3413

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1841 | accuracy (val

In [11]:
# XGBOOST: grid search scored by validation macro-F1

# XGBoost is trained on integer-encoded targets, so encode the labels first.
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    # Settings shared by every candidate; the grid entries come from `params`.
    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    # Train on the training split, score on the validation split.
    clf.fit(Xtr_full, y_tr_enc)
    y_val_pred = clf.predict(Xva_full)

    acc = accuracy_score(y_val_enc, y_val_pred)
    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    # One results row: all grid parameters followed by the two metrics.
    results.append(dict(params, val_macro_f1=macro_f1, val_accuracy=acc))

    # Only a strictly better score replaces the current best.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results)
results_df_xgb = results_df_xgb.sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2838 | accuracy (val): 0.3536

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2959 | accuracy (val): 0.3582

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2964 | accuracy (val): 0.3586

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3095 | accuracy (val): 0.3636

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2786 | accuracy (val): 0.3537

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON TEST SET

In [9]:
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

# Sparse TF-IDF matrices for all three splits, cast to float32.
Xtr_full, Xva_full, Xte_full = (
    load_npz(path).astype(np.float32)
    for path in (
        f"{OUT_DIR}/tfidf_topwords_train.npz",
        f"{OUT_DIR}/tfidf_topwords_val.npz",
        f"{OUT_DIR}/tfidf_topwords_test.npz",
    )
)

# Post ids saved alongside each matrix.
tr_ids, va_ids, te_ids = (
    np.load(os.path.join(OUT_DIR, fname), allow_pickle=True)
    for fname in (
        "tfidf_train_post_ids.npy",
        "tfidf_val_post_ids.npy",
        "tfidf_test_post_ids.npy",
    )
)

# Label arrays (object dtype, hence allow_pickle).
y_tr, y_va, y_te = (
    np.load(path, allow_pickle = True)
    for path in (
        "D:/dataset/text_features/tfidf_v3/tfidf_y_train_3.npy",
        "D:/dataset/text_features/tfidf_v3/tfidf_y_val_3.npy",
        "D:/dataset/text_features/tfidf_v3/tfidf_y_test_3.npy",
    )
)

# Final training set = train rows followed by validation rows.
y_full = np.concatenate([y_tr, y_va])
X_full = vstack([Xtr_full, Xva_full])

# Integer-encoded targets: encoder fitted on train+val labels, then applied to test.
le = LabelEncoder()
y_full_enc = le.fit_transform(y_full)
y_te_enc = le.transform(y_te)

In [10]:
# The per-split objects are no longer needed once X_full / y_full exist;
# drop the names and run a garbage-collection pass.
del Xtr_full, Xva_full
del tr_ids, va_ids
del y_tr, y_va
gc.collect()

476

In [11]:
# Final SGD model: fixed hyperparameters, fitted on train+val, scored on test.
sgd_settings = dict(
    loss="hinge",
    penalty="l2",
    alpha=1e-05,
    average=True,
    class_weight=None,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)
cfg = SGDClassifier(**sgd_settings)

cfg.fit(X_full, y_full)
y_te_pred = cfg.predict(Xte_full)

acc = accuracy_score(y_te, y_te_pred)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.4348 | accuracy (test): 0.4510


In [5]:
# Final Random Forest and XGBoost models with fixed hyperparameters,
# fitted on train+val and scored on the test split.
rf_final = RandomForestClassifier(
    max_depth=12,
    max_features=0.05,
    min_samples_leaf=5,
    n_estimators=80,
    n_jobs=-1,
    random_state=42,
)
xgb_final = XGBClassifier(
    colsample_bytree=0.5,
    gamma=0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=150,
    reg_lambda=1,
    subsample=0.8,
    objective="multi:softmax",
    num_class=len(np.unique(y_full_enc)),
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0,
)
cfgs = [rf_final, xgb_final]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target, so it trains and is scored on the
    # label-encoded arrays; every other model uses the raw labels.
    use_encoded = isinstance(cfg, XGBClassifier)
    y_fit = y_full_enc if use_encoded else y_full
    y_ref = y_te_enc if use_encoded else y_te

    cfg.fit(X_full, y_fit)
    y_te_pred = cfg.predict(Xte_full)
    macro_f1 = f1_score(y_ref, y_te_pred, average="macro")
    acc = accuracy_score(y_ref, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: LinearSVC(C=0.01, max_iter=2000, random_state=42)




macro-F1 (test): 0.4406 | accuracy (test): 0.4468

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=5,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (test): 0.2906 | accuracy (test): 0.3547

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=150, n_jobs=-

In [6]:
# TEST ON GAUSSIAN NAIVE BAYES

batch_size = 256
classes = np.unique(y_full)

clf = GaussianNB(var_smoothing = 1e-06)

n_fit = X_full.shape[0]
n_test = Xte_full.shape[0]

# Incremental fit on train+val: densify one slice of the sparse matrix at a time.
for start in range(0, n_fit, batch_size):
    rows = slice(start, min(start + batch_size, n_fit))

    Xb = X_full[rows].toarray()
    yb = y_full[rows]

    # The class list is passed only on the first partial_fit call.
    first_call_kwargs = {"classes": classes} if start == 0 else {}
    clf.partial_fit(Xb, yb, **first_call_kwargs)

    del Xb, yb
    gc.collect()

# Batched prediction on the test split, stitched together afterwards.
test_chunks = []

for start in range(0, n_test, batch_size):
    rows = slice(start, min(start + batch_size, n_test))

    Xb = Xte_full[rows].toarray()
    test_chunks.append(clf.predict(Xb))

    del Xb
    gc.collect()

y_te_pred = np.concatenate(test_chunks)

acc = accuracy_score(y_te, y_te_pred)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.3596 | accuracy (test): 0.4104
