In [2]:
import os, duckdb, gc
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

In [3]:
DB_PATH   = r"D:/db/meta.duckdb"

# Connection
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [4]:
con.execute("""ALTER TABLE features.img_handcrafted ADD COLUMN er_bins TEXT""")

In [5]:
con.execute("""
UPDATE features.img_handcrafted AS i
SET er_bins = m.er_bins
FROM md1718 AS m 
WHERE i.post_id = m.post_id""")

In [6]:
column_names = con.sql("""PRAGMA table_info('features.img_handcrafted');""").fetchdf()
print(column_names['name'].to_list())

['post_id', 'h_mean', 'h_std', 's_mean', 's_std', 'v_mean', 'v_std', 'h_hist_00', 'h_hist_01', 'h_hist_02', 'h_hist_03', 'h_hist_04', 'h_hist_05', 'h_hist_06', 'h_hist_07', 'h_hist_08', 'h_hist_09', 'h_hist_10', 'h_hist_11', 'h_hist_12', 'h_hist_13', 'h_hist_14', 'h_hist_15', 's_hist_00', 's_hist_01', 's_hist_02', 's_hist_03', 's_hist_04', 's_hist_05', 's_hist_06', 's_hist_07', 's_hist_08', 's_hist_09', 's_hist_10', 's_hist_11', 's_hist_12', 's_hist_13', 's_hist_14', 's_hist_15', 'v_hist_00', 'v_hist_01', 'v_hist_02', 'v_hist_03', 'v_hist_04', 'v_hist_05', 'v_hist_06', 'v_hist_07', 'v_hist_08', 'v_hist_09', 'v_hist_10', 'v_hist_11', 'v_hist_12', 'v_hist_13', 'v_hist_14', 'v_hist_15', 'gray_hist_00', 'gray_hist_01', 'gray_hist_02', 'gray_hist_03', 'gray_hist_04', 'gray_hist_05', 'gray_hist_06', 'gray_hist_07', 'gray_hist_08', 'gray_hist_09', 'gray_hist_10', 'gray_hist_11', 'gray_hist_12', 'gray_hist_13', 'gray_hist_14', 'gray_hist_15', 'laplacian_var', 'edge_density', 'entropy_gray', 'c

In [7]:
# Exctract the df
df_images = con.sql("""SELECT * FROM features.img_handcrafted""").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [8]:
# Split

train = df_images[df_images["split"] == "train"].copy()
val   = df_images[df_images["split"] == "validation"].copy()
test  = df_images[df_images["split"] == "test"].copy()

cols_to_drop = ['post_id', 'filename', 'split', 'er_bins', 'rn', 'er_bins3', 'er_bins2']
feature_cols = [col for col in train.columns if col not in cols_to_drop]

# Liste di caption pulite per il modello
X_train = train[feature_cols]
X_val   = val[feature_cols]
X_test  = test[feature_cols]


y_tr = train["er_bins"].to_numpy()
y_va = val["er_bins"].to_numpy()
y_te = test["er_bins"].to_numpy()

# IDs
train_ids = train["post_id"].to_numpy()
val_ids   = val["post_id"].to_numpy()
test_ids  = test["post_id"].to_numpy()

In [9]:
print(feature_cols)

['h_mean', 'h_std', 's_mean', 's_std', 'v_mean', 'v_std', 'h_hist_00', 'h_hist_01', 'h_hist_02', 'h_hist_03', 'h_hist_04', 'h_hist_05', 'h_hist_06', 'h_hist_07', 'h_hist_08', 'h_hist_09', 'h_hist_10', 'h_hist_11', 'h_hist_12', 'h_hist_13', 'h_hist_14', 'h_hist_15', 's_hist_00', 's_hist_01', 's_hist_02', 's_hist_03', 's_hist_04', 's_hist_05', 's_hist_06', 's_hist_07', 's_hist_08', 's_hist_09', 's_hist_10', 's_hist_11', 's_hist_12', 's_hist_13', 's_hist_14', 's_hist_15', 'v_hist_00', 'v_hist_01', 'v_hist_02', 'v_hist_03', 'v_hist_04', 'v_hist_05', 'v_hist_06', 'v_hist_07', 'v_hist_08', 'v_hist_09', 'v_hist_10', 'v_hist_11', 'v_hist_12', 'v_hist_13', 'v_hist_14', 'v_hist_15', 'gray_hist_00', 'gray_hist_01', 'gray_hist_02', 'gray_hist_03', 'gray_hist_04', 'gray_hist_05', 'gray_hist_06', 'gray_hist_07', 'gray_hist_08', 'gray_hist_09', 'gray_hist_10', 'gray_hist_11', 'gray_hist_12', 'gray_hist_13', 'gray_hist_14', 'gray_hist_15', 'laplacian_var', 'edge_density', 'entropy_gray', 'colorfulness

In [10]:
X_train_np = train[feature_cols].values
X_val_np = val[feature_cols].values
X_test_np = test[feature_cols].values

In [11]:
print("All finite values (train)?", np.isfinite(X_train_np).all())
print("All finite values (val)?", np.isfinite(X_val_np).all())
print("All finite values (test)?", np.isfinite(X_test_np).all())

All finite values (train)? True
All finite values (val)? False
All finite values (test)? False


In [12]:
mask = ~np.isfinite(X_val_np)
rows, cols = np.where(mask)
print(f"Found {len(rows)} infinite values")
print("Columns:", X_val.columns[np.unique(cols)].tolist())

Found 1 infinite values
Columns: ['laplacian_var']


In [13]:
mask_inf = ~np.isfinite(X_val["laplacian_var"])
X_val.loc[mask_inf, "laplacian_var"] = 0
X_val_np = X_val.values
print("All finite values (val)?", np.isfinite(X_val_np).all())

All finite values (val)? True


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val.loc[mask_inf, "laplacian_var"] = 0


In [14]:
mask = ~np.isfinite(X_test_np)
rows, cols = np.where(mask)
print(f"Found {len(rows)} infinite values")
print("Columns:", X_test.columns[np.unique(cols)].tolist())

Found 1 infinite values
Columns: ['laplacian_var']


In [15]:
mask_inf = ~np.isfinite(X_test["laplacian_var"])
X_test.loc[mask_inf, "laplacian_var"] = 0
X_test_np = X_test.values
print("All finite values (test)?", np.isfinite(X_test_np).all())

All finite values (test)? True


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[mask_inf, "laplacian_var"] = 0


In [16]:
mask_inf = ~np.isfinite(X_test["laplacian_var"])
print("Index with inf:", X_test.index[mask_inf].tolist())

Index with inf: []


In [17]:
# Normalize with StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_np)
X_val_scaled   = scaler.transform(X_val_np)
X_test_scaled  = scaler.transform(X_test_np)

In [18]:
import gc
del df_images, train, val, test
del X_train, X_val, X_test, X_train_np, X_val_np, X_test_np

In [19]:
gc.collect()

0

In [20]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"]
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(X_train_scaled, y_tr)

    y_val_pred = clf.predict(X_val_scaled)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'average': False, 'class_weight': None}
macro-F1 (val): 0.21222144514461813 | accuracy (val): 0.21500272843024312

Combination: {'alpha': 1e-05, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.2047727687250231 | accuracy (val): 0.2080907051476384

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': None}
macro-F1 (val): 0.21004026263512982 | accuracy (val): 0.23043472988540592

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': 'balanced'}
macro-F1 (val): 0.23233569308926305 | accuracy (val): 0.24055296186260838

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': None}
macro-F1 (val): 0.20224208943459274 | accuracy (val): 0.20351421815315587

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.20441244396799513 | accuracy (val): 0.2066331170799733

Combination: {'alpha': 0.0001, 'average': True, 'class_weight': None}
macro-F1 (val): 0.21518903683374652 | accura

In [17]:
# NAIVE BAYES
param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(X_train_scaled, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_val_scaled)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Metto i risultati in un DataFrame per ispezionarli meglio
results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.1769 | accuracy (val): 0.2308

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.1769 | accuracy (val): 0.2308

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.1769 | accuracy (val): 0.2308

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.1769 | accuracy (val): 0.2308

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.17691308122958

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.176913      0.230837
1   1.000000e-08      0.176913      0.230837
2   1.000000e-07      0.176913      0.230837
3   1.000000e-06      0.176913      0.230837


In [18]:
# RANDOM FOREST

param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    # Fit
    clf.fit(X_train_scaled, y_tr)

    # Validation
    y_val_pred = clf.predict(X_val_scaled)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_rf)


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1893 | accuracy (val): 0.2221

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.1882 | accuracy (val): 0.2220

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.1872 | accuracy (val): 0.2219

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.1886 | accuracy (val): 0.2217

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.1880 | accuracy (val): 0.2221

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.1872 | accuracy (val): 0.2219

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1926 | accuracy (val

In [23]:
# XGBOOST

# Convert the labels into numbers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [150, 300],
    "max_depth": [4, 6],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "gamma": [0],
    "reg_lambda": [1, 3],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    # Fit
    clf.fit(X_train_scaled, y_tr_enc)

    # Validation
    y_val_pred = clf.predict(X_val_scaled)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2184 | accuracy (val): 0.2305

Combination: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 3, 'subsample': 0.8}
macro-F1 (val): 0.2191 | accuracy (val): 0.2312

Combination: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 300, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2283 | accuracy (val): 0.2343

Combination: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 300, 'reg_lambda': 3, 'subsample': 0.8}
macro-F1 (val): 0.2288 | accuracy (val): 0.2347

Combination: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2281 | accuracy (val): 0.2344

Combination: {'colsample

In [None]:
# PERFORMANCE ON TEST SET

In [21]:
# Exctract the df
df_images = con.sql("""SELECT * FROM features.img_handcrafted""").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [22]:
# Split

train = df_images[df_images["split"] == "train"].copy()
val   = df_images[df_images["split"] == "validation"].copy()
test  = df_images[df_images["split"] == "test"].copy()

cols_to_drop = ['post_id', 'filename', 'split', 'er_bins', 'er_bins3', 'er_bins2', 'rn']
feature_cols = [col for col in train.columns if col not in cols_to_drop]

# Liste di caption pulite per il modello
X_train = train[feature_cols]
X_val   = val[feature_cols]
X_test  = test[feature_cols]


y_tr = train["er_bins"].to_numpy()
y_va = val["er_bins"].to_numpy()
y_te = test["er_bins"].to_numpy()

# IDs
train_ids = train["post_id"].to_numpy()
val_ids   = val["post_id"].to_numpy()
test_ids  = test["post_id"].to_numpy()

In [23]:
X_train_np = train[feature_cols].values
X_val_np = val[feature_cols].values
X_test_np = test[feature_cols].values

In [24]:
print("All finite values (train)?", np.isfinite(X_train_np).all())
print("All finite values (val)?", np.isfinite(X_val_np).all())
print("All finite values (test)?", np.isfinite(X_test_np).all())

All finite values (train)? True
All finite values (val)? False
All finite values (test)? False


In [25]:
mask = ~np.isfinite(X_val_np)
rows, cols = np.where(mask)
print(f"Found {len(rows)} infinite values")
print("Columns:", X_val.columns[np.unique(cols)].tolist())

Found 1 infinite values
Columns: ['laplacian_var']


In [26]:
mask_inf = ~np.isfinite(X_val["laplacian_var"])
X_val.loc[mask_inf, "laplacian_var"] = 0
X_val_np = X_val.values
print("All finite values (val)?", np.isfinite(X_val_np).all())

All finite values (val)? True


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val.loc[mask_inf, "laplacian_var"] = 0


In [27]:
mask = ~np.isfinite(X_test_np)
rows, cols = np.where(mask)
print(f"Found {len(rows)} infinite values")
print("Columns:", X_test.columns[np.unique(cols)].tolist())

Found 1 infinite values
Columns: ['laplacian_var']


In [28]:
mask_inf = ~np.isfinite(X_test["laplacian_var"])
X_test.loc[mask_inf, "laplacian_var"] = 0
X_test_np = X_test.values
print("All finite values (test)?", np.isfinite(X_test_np).all())

All finite values (test)? True


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[mask_inf, "laplacian_var"] = 0


In [29]:
mask_inf = ~np.isfinite(X_test["laplacian_var"])
print("Index with inf:", X_test.index[mask_inf].tolist())

Index with inf: []


In [30]:
X_full = np.concatenate([X_train_np, X_val_np], axis = 0)
y_full = np.concatenate([y_tr, y_va], axis = 0)

le = LabelEncoder()
y_full_enc = le.fit_transform(y_full)
y_te_enc = le.transform(y_te)

scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)
X_test_scaled = scaler.transform(X_test_np)

In [31]:
del train, test, val, X_train, X_test, X_val, X_train_np, X_val_np, X_test_np, X_full
gc.collect()

4057

In [17]:
cfgs = [
    GaussianNB(var_smoothing=1e-09),
    RandomForestClassifier(
        max_depth=12, max_features='sqrt', min_samples_leaf=2, n_estimators=30, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.8, gamma = 0, learning_rate = 0.1, max_depth= 6, n_estimators= 300, reg_lambda= 3, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_full_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_full_scaled, y_full_enc)
        y_te_pred = cfg.predict(X_test_scaled)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_full_scaled, y_full)
        y_te_pred = cfg.predict(X_test_scaled)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: LinearSVC(C=10, class_weight='balanced', max_iter=5000, random_state=42)
macro-F1 (test): 0.2180 | accuracy (test): 0.2491

Configuration: GaussianNB()
macro-F1 (test): 0.1668 | accuracy (test): 0.2331

Configuration: RandomForestClassifier(max_depth=12, min_samples_leaf=2, n_estimators=30,
                       n_jobs=-1, random_state=42)
macro-F1 (test): 0.2301 | accuracy (test): 0.2329

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_le

In [32]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 1e-05,
        average = True,
        class_weight = 'balanced',
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_full_scaled, y_full)
y_te_pred = cfg.predict(X_test_scaled)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2299 | accuracy (test): 0.2448


In [18]:
con.close()