In [1]:
import os, time, duckdb, torch, timm, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from PIL import Image
from sklearn.model_selection import ParameterGrid

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

import torchvision.transforms as T
from pathlib import Path

from torch.utils.data import Dataset, DataLoader
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import duckdb, torch
from transformers import CLIPModel, CLIPProcessor

In [2]:
# Open the DuckDB metadata database used later to fetch the target labels.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    # Best-effort tuning: ask DuckDB to use 8 threads.
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Keep going with DuckDB's default thread count, but say why the
    # setting was not applied instead of hiding the failure.
    print(f"Could not set DuckDB threads=8, using the default: {exc}")

print("Set up ready")

Set up ready


# TRAIN

In [4]:
# TEXT: load the cached text embeddings of the training split.
# NOTE(review): allow_pickle=True can run code embedded in the archive;
# only load files you produced yourself.
train = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_train_ids_y.npz",
    allow_pickle=True,
)

# Unpack the embedding matrix and the ids stored next to it.
X_tr_text, ids_tr_text = (train[key] for key in ("embeddings", "ids"))

# Quick check: matrix shape and number of ids.
print(X_tr_text.shape, len(ids_tr_text))

(773497, 512) 773497


In [5]:
# IMAGE: load the cached image embeddings of the training split.
# NOTE(review): allow_pickle=True can run code embedded in the archive;
# only load files you produced yourself.
train = np.load("D:/dataset/clip_img_emb_ALL/clip_vit_b32_train_ALL.npz", allow_pickle = True)

X_tr_img = train["feats"]
ids_tr_img = train["post_id"]

print(X_tr_img.shape, len(ids_tr_img))

# One row per stored embedding; a post_id may appear on several rows
# (presumably one row per image of the post -- TODO confirm).
df = pd.DataFrame(X_tr_img)

# Average the rows that share a post_id. The vectorized groupby replaces a
# per-group Python lambda (list conversion + np.mean for every post) and
# yields the same means up to floating-point rounding. Groups come out
# sorted by post_id, as before.
agg = df.groupby(pd.Series(ids_tr_img, name="post_id"), sort=True).mean()

# Keep the dtype of the loaded features (the right-hand side is evaluated
# before X_tr_img is rebound).
X_tr_img = agg.to_numpy(dtype=X_tr_img.dtype)
post_ids_unique_tr_img = agg.index.values

(960048, 512) 960048


In [6]:
# Metadata features of the training split; the post_id column is the key
# used to align this table with the embeddings.
meta_train_path = "D:/dataset/meta_classification/meta_train_final.csv"
meta_train_final = pd.read_csv(meta_train_path)

In [7]:
# Train alignment: keep only the posts present in all three sources
# (text embeddings, aggregated image embeddings, metadata).

ids_text, ids_img, ids_meta = (
    set(ids_tr_text),
    set(post_ids_unique_tr_img),
    set(meta_train_final.post_id),
)

# Sorting gives a deterministic row order shared by every aligned block.
ids_tr_common = sorted(ids_text.intersection(ids_img, ids_meta))

In [8]:
# Text: index the embeddings by post id, then read the rows in common order.
df_text = pd.DataFrame(data=X_tr_text, index=ids_tr_text)
text_tr_rows = df_text.loc[ids_tr_common]
X_tr_text_aligned = text_tr_rows.to_numpy()

In [9]:
# Image: index the per-post mean embeddings by post id, then read the rows
# in common order.
df_img_agg = pd.DataFrame(data=X_tr_img, index=post_ids_unique_tr_img)
img_tr_rows = df_img_agg.loc[ids_tr_common]
X_tr_img_aligned = img_tr_rows.to_numpy()

In [10]:
# Metadata: reorder the table to the common post order and turn post_id
# back into a regular column.
meta_train_by_post = meta_train_final.set_index("post_id")
meta_train_aligned = meta_train_by_post.loc[ids_tr_common].reset_index()

In [12]:
# Target: fetch the er_bins label of every training post from DuckDB.
train_target_query = """
    SELECT post_id, er_bins
    FROM md1718
    WHERE split = 'train'"""
y_df = con.execute(train_target_query).df()

# Index the labels by post id and read them out in the common order.
er_bins_train = y_df.set_index("post_id")["er_bins"]
y_tr_aligned = er_bins_train.loc[ids_tr_common].to_numpy()

In [13]:
# Every aligned block must hold exactly one row per common post id.
# Comparing against len(ids_tr_common) also catches duplicated ids (which
# make .loc return extra rows), and the metadata table is checked too since
# it is concatenated with the embeddings afterwards.
n_tr_expected = len(ids_tr_common)
aligned_tr_blocks = {
    "text": X_tr_text_aligned,
    "image": X_tr_img_aligned,
    "metadata": meta_train_aligned,
    "target": y_tr_aligned,
}
for block_name, block in aligned_tr_blocks.items():
    assert len(block) == n_tr_expected, (
        f"{block_name}: {len(block)} rows, expected {n_tr_expected}"
    )
print("Tutto allineato correttamente!")

Tutto allineato correttamente!


In [14]:
# Fuse the modalities column-wise: text | image | metadata (the post_id key
# is dropped so only features remain).
meta_tr_features = meta_train_aligned.drop(columns=["post_id"])
X_tr = np.hstack((X_tr_text_aligned, X_tr_img_aligned, meta_tr_features))

In [None]:
# Cache the fused training features and the aligned labels on disk.
for out_path, array in (
    ("D:/dataset/multimodal3/X_tr.npy", X_tr),
    ("D:/dataset/multimodal3/y_tr_5.npy", y_tr_aligned),
):
    np.save(out_path, array)

In [17]:
# Save the row order (sorted common post ids) so rows of the cached
# matrices can be mapped back to posts.
ids_tr_order_path = "D:/dataset/multimodal3/ids_tr_order.npy"
np.save(ids_tr_order_path, ids_tr_common)

# VALIDATION 

In [3]:
# TEXT: load the cached text embeddings of the validation split.
# NOTE(review): allow_pickle=True can run code embedded in the archive;
# only load files you produced yourself.
val = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_val_ids_y.npz",
    allow_pickle=True,
)

# Unpack the embedding matrix and the ids stored next to it.
X_va_text, ids_va_text = (val[key] for key in ("embeddings", "ids"))

# Quick check: matrix shape and number of ids.
print(X_va_text.shape, len(ids_va_text))

(412325, 512) 412325


In [5]:
# IMAGE: load the cached image embeddings of the validation split.
# NOTE(review): allow_pickle=True can run code embedded in the archive;
# only load files you produced yourself.
val = np.load("D:/dataset/clip_img_emb_ALL/clip_vit_b32_validation_ALL.npz", allow_pickle = True)

X_va_img = val["feats"]
ids_va_img = val["post_id"]

print(X_va_img.shape, len(ids_va_img))

# One row per stored embedding; a post_id may appear on several rows
# (presumably one row per image of the post -- TODO confirm).
df = pd.DataFrame(X_va_img)

# Average the rows that share a post_id. The vectorized groupby replaces a
# per-group Python lambda (list conversion + np.mean for every post) and
# yields the same means up to floating-point rounding. Groups come out
# sorted by post_id, as before.
agg = df.groupby(pd.Series(ids_va_img, name="post_id"), sort=True).mean()

# Keep the dtype of the loaded features (the right-hand side is evaluated
# before X_va_img is rebound).
X_va_img = agg.to_numpy(dtype=X_va_img.dtype)
post_ids_unique_va_img = agg.index.values

(556982, 512) 556982


In [6]:
# Metadata features of the validation split; the post_id column is the key
# used to align this table with the embeddings.
meta_val_path = "D:/dataset/meta_classification/meta_val_final.csv"
meta_val_final = pd.read_csv(meta_val_path)

In [7]:
# Validation alignment: keep only the posts present in all three sources
# (text embeddings, aggregated image embeddings, metadata).

ids_text, ids_img, ids_meta = (
    set(ids_va_text),
    set(post_ids_unique_va_img),
    set(meta_val_final.post_id),
)

# Sorting gives a deterministic row order shared by every aligned block.
ids_va_common = sorted(ids_text.intersection(ids_img, ids_meta))

In [8]:
# Text: index the embeddings by post id, then read the rows in common order.
df_text = pd.DataFrame(data=X_va_text, index=ids_va_text)
text_va_rows = df_text.loc[ids_va_common]
X_va_text_aligned = text_va_rows.to_numpy()

In [9]:
# Image: index the per-post mean embeddings by post id, then read the rows
# in common order.
df_img_agg = pd.DataFrame(data=X_va_img, index=post_ids_unique_va_img)
img_va_rows = df_img_agg.loc[ids_va_common]
X_va_img_aligned = img_va_rows.to_numpy()

In [10]:
# Metadata: reorder the table to the common post order and turn post_id
# back into a regular column.
meta_val_by_post = meta_val_final.set_index("post_id")
meta_val_aligned = meta_val_by_post.loc[ids_va_common].reset_index()

In [11]:
# Target: fetch the er_bins label of every validation post from DuckDB.
val_target_query = """
    SELECT post_id, er_bins
    FROM md1718
    WHERE split = 'validation'"""
y_df = con.execute(val_target_query).df()

# Index the labels by post id and read them out in the common order.
er_bins_val = y_df.set_index("post_id")["er_bins"]
y_va_aligned = er_bins_val.loc[ids_va_common].to_numpy()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [12]:
# Every aligned block must hold exactly one row per common post id.
# Comparing against len(ids_va_common) also catches duplicated ids (which
# make .loc return extra rows), and the metadata table is checked too since
# it is concatenated with the embeddings afterwards.
n_va_expected = len(ids_va_common)
aligned_va_blocks = {
    "text": X_va_text_aligned,
    "image": X_va_img_aligned,
    "metadata": meta_val_aligned,
    "target": y_va_aligned,
}
for block_name, block in aligned_va_blocks.items():
    assert len(block) == n_va_expected, (
        f"{block_name}: {len(block)} rows, expected {n_va_expected}"
    )
print("Tutto allineato correttamente!")

Tutto allineato correttamente!


In [13]:
# Fuse the modalities column-wise: text | image | metadata (the post_id key
# is dropped so only features remain).
meta_va_features = meta_val_aligned.drop(columns=["post_id"])
X_va = np.hstack((X_va_text_aligned, X_va_img_aligned, meta_va_features))

In [14]:
# Cache the fused validation features, the aligned labels and the row order.
for out_path, array in (
    ("D:/dataset/multimodal3/X_va.npy", X_va),
    ("D:/dataset/multimodal3/y_va_5.npy", y_va_aligned),
    ("D:/dataset/multimodal3/ids_va_order.npy", ids_va_common),
):
    np.save(out_path, array)

# CLASSIFICATION

In [2]:
# Reload the cached matrices so this section can run without rebuilding them.
# Features are cast to float32; labels are kept as stored.
# NOTE(review): allow_pickle=True can run code embedded in the file; only
# load files you produced yourself.
X_tr, X_va = (
    np.load(feature_path, allow_pickle=True).astype(np.float32)
    for feature_path in (
        "D:/dataset/multimodal3/X_tr.npy",
        "D:/dataset/multimodal3/X_va.npy",
    )
)

y_tr, y_va = (
    np.load(label_path, allow_pickle=True)
    for label_path in (
        "D:/dataset/multimodal3/y_tr_5.npy",
        "D:/dataset/multimodal3/y_va_5.npy",
    )
)

In [3]:
# SGD: grid search over alpha and class_weight for a hinge-loss, L2-penalised
# linear classifier. Each candidate is fit on train and scored on validation;
# the best one is chosen by validation macro-F1.
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

# Settings shared by every candidate.
sgd_fixed_kwargs = dict(
    loss="hinge",
    penalty="l2",
    average=True,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(**sgd_fixed_kwargs, **params).fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append(
        dict(
            alpha=params["alpha"],
            class_weight=params["class_weight"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Strict '>' keeps the first candidate on ties.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Tabulate every candidate, best macro-F1 first.
results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.30567533295750626 | accuracy (val): 0.3439665312556842

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.28707184825034193 | accuracy (val): 0.3439058994725035

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.3011291696763808 | accuracy (val): 0.3444685624204208

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.2815948284113871 | accuracy (val): 0.34449038986236585

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.29265261111623764 | accuracy (val): 0.3436148669132359

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.2747436191445606 | accuracy (val): 0.3426665858242891

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.287368476260512 | accuracy (val): 0.3421863821014976

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.27279159830595273 | accurac

In [4]:
# NAIVE BAYES - GAUSSIAN: grid search over var_smoothing. Each candidate is
# fit on train and scored on validation; the best one is chosen by macro-F1.
param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    # Fit on TRAIN, then predict on VALIDATION.
    clf = GaussianNB(**params).fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append(
        dict(
            var_smoothing=params["var_smoothing"],
            val_macro_f1=macro_f1,
            val_accuracy=acc,
        )
    )

    # Track the best candidate by macro-F1 (strict '>' keeps the first on ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the results in a DataFrame for easier inspection, best first.
results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.2571 | accuracy (val): 0.2991

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.2571 | accuracy (val): 0.2991

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.2571 | accuracy (val): 0.2992

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.2571 | accuracy (val): 0.2991

Best hyperparameter configuration:
{'var_smoothing': 1e-07}
Validation macro-F1: 0.2571072970651239

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
2   1.000000e-07      0.257107      0.299177
0   1.000000e-09      0.257081      0.299150
1   1.000000e-08      0.257076      0.299148
3   1.000000e-06      0.257070      0.299145


In [5]:
# RANDOM FOREST: grid search over forest size, depth, leaf size and feature
# sampling. Each candidate is fit on train and scored on validation; the
# best one is chosen by macro-F1.
param_grid_rf = {
    "n_estimators": [50, 80],
    "max_depth": [10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

# Column order of the hyperparameters in the results table.
rf_report_keys = ("n_estimators", "max_depth", "min_samples_leaf", "max_features")

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    # Fit on TRAIN, then predict on VALIDATION.
    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    clf.fit(X_tr, y_tr)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    row = {key: params[key] for key in rf_report_keys}
    row["val_macro_f1"] = macro_f1
    row["val_accuracy"] = acc
    results.append(row)

    # Track the best candidate by macro-F1 (strict '>' keeps the first on ties).
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

# Collect the results in a DataFrame for easier inspection, best first.
results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.3151 | accuracy (val): 0.3185

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.3140 | accuracy (val): 0.3177

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.3181 | accuracy (val): 0.3217

Combination: {'max_depth': 10, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.3170 | accuracy (val): 0.3204

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.3053 | accuracy (val): 0.3079

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.3038 | accuracy (val): 0.3077

Combination: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.3045 | ac

In [6]:
# XGBOOST: grid search scored on the validation split by macro-F1.

# Labels are encoded as integers before being passed to XGBoost; the encoder
# is fit on the training labels and reused for validation.
le = LabelEncoder().fit(y_tr)
y_tr_enc = le.transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5], 
    "gamma": [0, 1], 
    "reg_lambda": [1], 
}

# Settings shared by every candidate.
xgb_fixed_kwargs = dict(
    objective="multi:softmax",
    num_class=len(np.unique(y_tr_enc)),
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0,
)

results = []
best_score, best_params = -np.inf, None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    # Fit on train, predict on validation (both with encoded labels).
    clf = XGBClassifier(**params, **xgb_fixed_kwargs)
    clf.fit(X_tr, y_tr_enc)
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    row = dict(params)
    row.update(val_macro_f1=macro_f1, val_accuracy=acc)
    results.append(row)

    # Strict '>' keeps the first candidate on ties.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

# Tabulate every candidate, best macro-F1 first.
results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3250 | accuracy (val): 0.3389

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3294 | accuracy (val): 0.3427

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3336 | accuracy (val): 0.3456

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3385 | accuracy (val): 0.3492

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3250 | accuracy (val): 0.3389

Combination: {'colsample_bytr

# PERFORMANCE ON THE TEST SET

In [3]:
# TEXT: load the cached text embeddings of the test split.
# NOTE(review): allow_pickle=True can run code embedded in the archive;
# only load files you produced yourself.
test = np.load(
    "D:/dataset/clip_text_emb_ALL/clip-vit-base-patch32_test_ids_y.npz",
    allow_pickle=True,
)

# Unpack the embedding matrix and the ids stored next to it.
X_te_text, ids_te_text = (test[key] for key in ("embeddings", "ids"))

# Quick check: matrix shape and number of ids.
print(X_te_text.shape, len(ids_te_text))

(423604, 512) 423604


In [4]:
# IMAGE: load the cached image embeddings of the test split.
# NOTE(review): allow_pickle=True can run code embedded in the archive;
# only load files you produced yourself.
test = np.load("D:/dataset/clip_img_emb_ALL/clip_vit_b32_test_ALL.npz", allow_pickle = True)

X_te_img = test["feats"]
ids_te_img = test["post_id"]

print(X_te_img.shape, len(ids_te_img))

# One row per stored embedding; a post_id may appear on several rows
# (presumably one row per image of the post -- TODO confirm).
df = pd.DataFrame(X_te_img)

# Average the rows that share a post_id. The vectorized groupby replaces a
# per-group Python lambda (list conversion + np.mean for every post) and
# yields the same means up to floating-point rounding. Groups come out
# sorted by post_id, as before.
agg = df.groupby(pd.Series(ids_te_img, name="post_id"), sort=True).mean()

# Keep the dtype of the loaded features (the right-hand side is evaluated
# before X_te_img is rebound).
X_te_img = agg.to_numpy(dtype=X_te_img.dtype)
post_ids_unique_te_img = agg.index.values

(588557, 512) 588557


In [6]:
# Metadata features of the test split; the post_id column is the key used
# to align this table with the embeddings.
meta_te_path = "D:/dataset/meta_classification/meta_test_final.csv"
meta_te_final = pd.read_csv(meta_te_path)

In [7]:
# Test alignment: keep only the posts present in all three sources
# (text embeddings, aggregated image embeddings, metadata).

ids_text, ids_img, ids_meta = (
    set(ids_te_text),
    set(post_ids_unique_te_img),
    set(meta_te_final.post_id),
)

# Sorting gives a deterministic row order shared by every aligned block.
ids_te_common = sorted(ids_text.intersection(ids_img, ids_meta))

In [8]:
# Text: index the embeddings by post id, then read the rows in common order.
df_text = pd.DataFrame(data=X_te_text, index=ids_te_text)
text_te_rows = df_text.loc[ids_te_common]
X_te_text_aligned = text_te_rows.to_numpy()

In [9]:
# Image: index the per-post mean embeddings by post id, then read the rows
# in common order.
df_img_agg = pd.DataFrame(data=X_te_img, index=post_ids_unique_te_img)
img_te_rows = df_img_agg.loc[ids_te_common]
X_te_img_aligned = img_te_rows.to_numpy()

In [10]:
# Metadata: reorder the table to the common post order and turn post_id
# back into a regular column.
meta_te_by_post = meta_te_final.set_index("post_id")
meta_te_aligned = meta_te_by_post.loc[ids_te_common].reset_index()

In [11]:
# Target: fetch the er_bins label of every test post from DuckDB.
test_target_query = """
    SELECT post_id, er_bins
    FROM md1718
    WHERE split = 'test'"""
y_df = con.execute(test_target_query).df()

# Index the labels by post id and read them out in the common order.
er_bins_test = y_df.set_index("post_id")["er_bins"]
y_te_aligned = er_bins_test.loc[ids_te_common].to_numpy()

In [12]:
# Every aligned block must hold exactly one row per common post id.
# Comparing against len(ids_te_common) also catches duplicated ids (which
# make .loc return extra rows), and the metadata table is checked too since
# it is concatenated with the embeddings afterwards.
n_te_expected = len(ids_te_common)
aligned_te_blocks = {
    "text": X_te_text_aligned,
    "image": X_te_img_aligned,
    "metadata": meta_te_aligned,
    "target": y_te_aligned,
}
for block_name, block in aligned_te_blocks.items():
    assert len(block) == n_te_expected, (
        f"{block_name}: {len(block)} rows, expected {n_te_expected}"
    )
print("Tutto allineato correttamente!")

Tutto allineato correttamente!


In [13]:
# Fuse the modalities column-wise: text | image | metadata (the post_id key
# is dropped so only features remain).
meta_te_features = meta_te_aligned.drop(columns=["post_id"])
X_te = np.hstack((X_te_text_aligned, X_te_img_aligned, meta_te_features))

In [14]:
# Cache the fused test features, the aligned labels and the row order.
for out_path, array in (
    ("D:/dataset/multimodal3/X_te.npy", X_te),
    ("D:/dataset/multimodal3/y_te_5.npy", y_te_aligned),
    ("D:/dataset/multimodal3/ids_te_order.npy", ids_te_common),
):
    np.save(out_path, array)

In [2]:
# Build the train+validation set used to refit the selected models before
# the final evaluation on the test split.
# NOTE(review): allow_pickle=True can run code embedded in the file; only
# load files you produced yourself.
X_tr = np.load("D:/dataset/multimodal3/X_tr.npy", allow_pickle = True).astype(np.float32)
y_tr = np.load("D:/dataset/multimodal3/y_tr_5.npy", allow_pickle = True)

X_va = np.load("D:/dataset/multimodal3/X_va.npy", allow_pickle = True).astype(np.float32)
y_va = np.load("D:/dataset/multimodal3/y_va_5.npy", allow_pickle = True)

# Stack validation rows under the training rows, features and labels alike.
X_trva = np.concatenate((X_tr, X_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)
assert X_trva.shape[0] == y_trva.shape[0], "features and labels are out of sync"

np.save("D:/dataset/multimodal3/X_trva.npy", X_trva)
# Fix: the labels were never written (X_trva was saved a second time under
# X_trva_5.npy instead), although y_trva_5.npy is what gets loaded afterwards.
np.save("D:/dataset/multimodal3/y_trva_5.npy", y_trva)

In [None]:
# Reload the cached matrices for the final evaluation. From here on
# X_tr / y_tr hold the train+validation set, X_te / y_te the test set.
# Features are cast to float32; labels are kept as stored.
# NOTE(review): allow_pickle=True can run code embedded in the file; only
# load files you produced yourself.
X_tr, X_te = (
    np.load(feature_path, allow_pickle=True).astype(np.float32)
    for feature_path in (
        "D:/dataset/multimodal3/X_trva.npy",
        "D:/dataset/multimodal3/X_te.npy",
    )
)

y_tr, y_te = (
    np.load(label_path, allow_pickle=True)
    for label_path in (
        "D:/dataset/multimodal3/y_trva_5.npy",
        "D:/dataset/multimodal3/y_te_5.npy",
    )
)

In [3]:
# Final SGD model: fixed hyperparameters, fit on X_tr / y_tr and scored once
# on the test split.
final_sgd_kwargs = dict(
    loss="hinge",
    penalty="l2",
    alpha=1e-05,
    average=True,
    class_weight=None,
    random_state=42,
    max_iter=1000,
    tol=1e-3,
)
cfg = SGDClassifier(**final_sgd_kwargs)

cfg.fit(X_tr, y_tr)
y_te_pred = cfg.predict(X_te)

# Same metrics as during model selection, now computed on the test labels.
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.3041 | accuracy (test): 0.3467


In [5]:
# Final Random Forest and XGBoost models: fit on X_tr / y_tr and scored once
# on the test split.

# Integer-encoded labels for XGBoost; the encoder is fit on the training
# labels and reused for the test labels.
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)

cfgs = [
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=2, n_estimators=80, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the other model uses the raw labels.
    if isinstance(cfg, XGBClassifier):
        y_fit, y_true = y_tr_enc, y_te_enc
    else:
        y_fit, y_true = y_tr, y_te

    cfg.fit(X_tr, y_fit)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(y_true, y_te_pred, average="macro")
    acc = accuracy_score(y_true, y_te_pred)

    # Fix: these are test-set metrics; they were previously labelled "(train)".
    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=2,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (train): 0.3194 | accuracy (train): 0.3232

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=150, n_jobs=-1, num_class=5, ...)
macro-F1 (train): 0.3381 | 

In [6]:
# TEST ON GAUSSIAN NAIVE BAYES
# The model is fit and applied in mini-batches to limit peak memory.

batch_size = 256
classes = np.unique(y_tr)

clf = GaussianNB(var_smoothing = 1e-07)


# Incremental fit. Slices of X_tr / y_tr are views, so there is nothing to
# free between batches: the per-batch `del` + gc.collect() only added
# overhead and has been removed. `classes` is passed on every call, which
# removes the special case for the first batch.
for start in range(0, X_tr.shape[0], batch_size):
    stop = start + batch_size  # slicing clamps at the end of the array
    clf.partial_fit(X_tr[start:stop], y_tr[start:stop], classes=classes)

# Predict batch by batch and stitch the pieces back together in order.
y_te_pred = np.concatenate([
    clf.predict(X_te[start:start + batch_size])
    for start in range(0, X_te.shape[0], batch_size)
])

macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2589 | accuracy (test): 0.3027
