In [1]:
# Import
from sklearn.model_selection import train_test_split
import duckdb, torch, time, os, gc
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from pathlib import Path
from scipy.sparse import load_npz, hstack, save_npz

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Open the DuckDB database that stores the post metadata table queried below.
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    # Best-effort tuning: ask DuckDB to use 8 worker threads.
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Keep going with DuckDB's default thread count, but do not hide the failure.
    print(f"Could not set DuckDB threads to 8, keeping the default: {exc}")

print("Set up ready")

Set up ready


# TRAIN

In [4]:
# METADATA
# Metadata table for the training split; it carries a post_id column that is
# used later as the alignment key.
meta_train_final = pd.read_csv("D:/dataset/meta_classification/meta_train_final.csv")

# SBERT
# Embedding archive (paraphrase-MiniLM-L6-v2); the keys read here are
# "embeddings" and "ids".
# NOTE(review): allow_pickle=True can execute code stored in the file - only
# load archives you produced yourself.
train = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_train_ids_y.npz", allow_pickle=True)
SB_tr = train["embeddings"]
SB_ids_tr = train["ids"]

# EFFICIENT NET
# Embedding archive (EfficientNet-B0); the keys read here are "X" and "ids".
EN_tr_data = np.load("D:/dataset/efficientnetb0_emb/train_data.npz", allow_pickle=True)
EN_tr = EN_tr_data["X"]
EN_tr_ids = EN_tr_data["ids"]

In [5]:
# ALIGN THE THREE SETS
# Canonical row order: the metadata sorted by post_id. Labels, SBERT and
# EfficientNet rows are all re-ordered to follow tr_ids_ord.
meta_tr = meta_train_final.sort_values("post_id").reset_index(drop=True)
tr_ids_ord = meta_tr["post_id"].values

# Normalise every id array to str so that lookups compare like with like.
SB_ids_tr = SB_ids_tr.astype(str)
EN_tr_ids = EN_tr_ids.astype(str)
tr_ids_ord = tr_ids_ord.astype(str)

# Labels (er_bins2) for the training split, keyed by post_id.
metadata_tr = con.sql("""
    SELECT post_id, er_bins2 FROM md1718
    WHERE split = 'train'
""").df().set_index('post_id')
# Cast the index to str BEFORE the lookup: tr_ids_ord already holds str ids,
# so the lookup must not depend on the dtype DuckDB returned for post_id.
metadata_tr.index = metadata_tr.index.astype(str)

# .loc raises KeyError for unknown ids; the length check catches duplicated ids
# in the label table (they would return more rows than requested).
y_tr = metadata_tr.loc[tr_ids_ord, 'er_bins2'].values
assert len(y_tr) == len(tr_ids_ord)

# SBERT -> DataFrame indexed by post_id
df_sbert = pd.DataFrame(SB_tr, index=SB_ids_tr)
df_sbert.index.name = "post_id"

# EfficientNet -> DataFrame indexed by post_id
df_eff = pd.DataFrame(EN_tr, index=EN_tr_ids)
df_eff.index.name = "post_id"

df_sbert.index = df_sbert.index.astype(str)
df_eff.index   = df_eff.index.astype(str)

# Exact alignment via reindex: rows come back in tr_ids_ord order, and ids
# without an embedding become all-NaN rows.
SBERT_tr_aligned = df_sbert.reindex(tr_ids_ord).values
EN_tr_aligned = df_eff.reindex(tr_ids_ord).values

# Sorted metadata as a NumPy array (still contains the post_id column)
meta_tr_np = meta_tr.to_numpy()

In [6]:
# Spot check: for 10 randomly drawn rows, print the label stored in the SBERT
# archive next to the er_bins2 label aligned to the same post_id.
# NOTE(review): the draw is unseeded, so the sampled ids differ on every run.
# NOTE(review): the archive's "y" and er_bins2 look like different binnings
# (the recorded run printed five-level values next to low/high) - TODO confirm.
for idx in np.random.choice(len(tr_ids_ord), 10, replace=False):
    pid = tr_ids_ord[idx]  # the ID used for alignment
    
    # Extract SBERT original label for that post_id
    original_label_sbert = train["y"][np.where(SB_ids_tr == pid)[0][0]]
    
    # Our final label
    aligned_label = y_tr[idx]

    print(f"ID {pid} -> SBERT label: {original_label_sbert}, aligned label: {aligned_label}")

ID layersofchic_-1796685831977753780 -> SBERT label: low, aligned label: low
ID sinmamaftm-1876283278257119194 -> SBERT label: medium, aligned label: high
ID thewhitefarmhouseblog-1879936584292551409 -> SBERT label: low, aligned label: low
ID msvaughntv-1698099046373233136 -> SBERT label: high, aligned label: high
ID erniespotted-1757359311917693949 -> SBERT label: high, aligned label: high
ID thebeautyspyglass-1887646225039025490 -> SBERT label: very_high, aligned label: high
ID sahmsg_jaimi-1814843296225530647 -> SBERT label: very_low, aligned label: low
ID lindaaos60-1891620384625870632 -> SBERT label: low, aligned label: low
ID skysins-1696547504756451666 -> SBERT label: very_high, aligned label: high
ID andreadipietro10-1865225852783253059 -> SBERT label: medium, aligned label: high


In [7]:
# Spot check: for 10 randomly drawn rows, print the label stored in the
# EfficientNet archive next to the er_bins2 label aligned to the same post_id.
# NOTE(review): the draw is unseeded, so the sampled ids differ on every run.
for idx in np.random.choice(len(tr_ids_ord), 10, replace=False):
    pid = tr_ids_ord[idx]
    
    # EN original label for that post_id (first position where the id matches)
    original_label_en = EN_tr_data["y"][np.where(EN_tr_ids == pid)[0][0]]
    
    # Label aligned to the sorted metadata order
    aligned_label = y_tr[idx]

    print(f"ID {pid} -> EN label: {original_label_en}, aligned label: {aligned_label}")

ID codycuddlebug-1749765126175874855 -> EN label: high, aligned label: high
ID cjh_natural-1821408945573383554 -> EN label: very_high, aligned label: high
ID caitpoli-1826325523577863143 -> EN label: low, aligned label: low
ID eglegestautaite-1500076546752642621 -> EN label: medium, aligned label: low
ID gaynycdad-1885917973309247987 -> EN label: medium, aligned label: low
ID vanessamujica-1514779476335251457 -> EN label: very_low, aligned label: low
ID thebeautyspyglass-1751390709313142656 -> EN label: very_high, aligned label: high
ID mjheubach-1598223180013656578 -> EN label: medium, aligned label: high
ID theluxetwo-1719283556621901715 -> EN label: medium, aligned label: high
ID thirdstopright-1628010618597326388 -> EN label: low, aligned label: low


In [8]:
def check_alignment(idx):
    """Check that row ``idx`` of the aligned arrays belongs to its post_id.

    The post_id at position ``idx`` of ``tr_ids_ord`` is looked up in
    ``df_sbert``, ``df_eff`` and ``metadata_tr``; the values found there are
    compared with row ``idx`` of ``SBERT_tr_aligned``, ``EN_tr_aligned`` and
    ``y_tr``.

    Returns:
        A 3-tuple ``(sbert_ok, en_ok, label_ok)``.
    """
    post_id = tr_ids_ord[idx]

    # Reference values fetched by id, independent of row position.
    expected_sbert = df_sbert.loc[post_id].values
    expected_en = df_eff.loc[post_id].values
    expected_label = metadata_tr.loc[post_id, 'er_bins2']

    return (
        np.allclose(SBERT_tr_aligned[idx], expected_sbert),
        np.allclose(EN_tr_aligned[idx], expected_en),
        y_tr[idx] == expected_label,
    )

# Run the check on a fixed prefix of the data; each row of `results` is
# (sbert_ok, en_ok, label_ok) for one position.
results = np.array([check_alignment(i) for i in range(200)])  # check first 200 rows

# One column per check: all() is True only when every checked row passed.
print("SBERT alignment OK:", results[:,0].all())
print("EN alignment OK:", results[:,1].all())
print("Label alignment OK:", results[:,2].all())


SBERT alignment OK: True
EN alignment OK: True
Label alignment OK: True


In [9]:
# Checks
# The aligned embedding matrices must have one row per metadata row.
assert SBERT_tr_aligned.shape[0] == len(meta_tr)
assert EN_tr_aligned.shape[0] == len(meta_tr)

# Print the first ids of each source, then all row counts side by side.
# NOTE(review): reindex(tr_ids_ord) returns an index equal to tr_ids_ord by
# construction, so the two reindex prints cannot reveal a misalignment; ids
# without an embedding would show up as NaN rows instead.
print(tr_ids_ord[:5])
print(df_sbert.reindex(tr_ids_ord).index[:5])
print(df_eff.reindex(tr_ids_ord).index[:5])
print(len(tr_ids_ord), SBERT_tr_aligned.shape[0], EN_tr_aligned.shape[0], meta_tr_np.shape[0], len(y_tr))

['100pintas-1769662389073991144' '100pintas-1782702664733979876'
 '100pintas-1797067212467389817' '100pintas-1807955339238986900'
 '100pintas-1808039696742034708']
Index(['100pintas-1769662389073991144', '100pintas-1782702664733979876',
       '100pintas-1797067212467389817', '100pintas-1807955339238986900',
       '100pintas-1808039696742034708'],
      dtype='object', name='post_id')
Index(['100pintas-1769662389073991144', '100pintas-1782702664733979876',
       '100pintas-1797067212467389817', '100pintas-1807955339238986900',
       '100pintas-1808039696742034708'],
      dtype='object', name='post_id')
773497 773497 773497 773497 773497


In [10]:
# Holds by construction: tr_ids_ord was taken from meta_tr["post_id"].
assert (meta_tr["post_id"].values == tr_ids_ord).all()

# For SBERT
# .loc raises KeyError if any id in tr_ids_ord is absent from the index.
assert (df_sbert.loc[tr_ids_ord].index.values == tr_ids_ord).all()

# For EfficientNet
assert (df_eff.loc[tr_ids_ord].index.values == tr_ids_ord).all()

# Count rows containing at least one NaN after reindexing.
print("SBERT missing:", df_sbert.reindex(tr_ids_ord).isna().any(axis=1).sum())
print("EN missing:", df_eff.reindex(tr_ids_ord).isna().any(axis=1).sum())

SBERT missing: 0
EN missing: 0


In [11]:
# Build the fused training matrix: [metadata | SBERT | EfficientNet].
#
# Ids and metadata features are taken from meta_tr (sorted by post_id), not
# from meta_train_final: y_tr and both embedding matrices follow tr_ids_ord,
# i.e. the sorted order. Reading the unsorted frame would pair each metadata
# row with another post's embeddings and label whenever the CSV is not already
# sorted by post_id.
post_id_tr = meta_tr['post_id'].values
meta_tr_num = meta_tr.drop(columns=['post_id']).values

# Guard the row-wise correspondence of every block that gets concatenated.
assert (post_id_tr.astype(str) == tr_ids_ord).all()
assert len(meta_tr_num) == len(SBERT_tr_aligned) == len(EN_tr_aligned) == len(y_tr)

# Single precision for every block to keep the fused matrix small.
meta_tr_num = meta_tr_num.astype('float32')
SBERT_tr_aligned = SBERT_tr_aligned.astype('float32')
EN_tr_aligned = EN_tr_aligned.astype('float32')

X_tr = np.concatenate((meta_tr_num, SBERT_tr_aligned, EN_tr_aligned), axis = 1)

In [12]:
# Shapes of the fused matrix and of each block it was built from.
print(X_tr.shape, meta_tr_num.shape, SBERT_tr_aligned.shape, EN_tr_aligned.shape)

(773497, 1692) (773497, 28) (773497, 384) (773497, 1280)


In [13]:
# Persist the fused training features, the labels and the post ids as .npy files.
np.save("D:/dataset/multimodal2/X_train_2.npy", X_tr)
np.save("D:/dataset/multimodal2/y_tr_2.npy", y_tr)
np.save("D:/dataset/multimodal2/post_id_tr_2.npy", post_id_tr)

# VALIDATION

In [3]:
# METADATA
# Metadata table for the validation split; post_id is the alignment key.
meta_val_final = pd.read_csv("D:/dataset/meta_classification/meta_val_final.csv")

# SBERT
# Embedding archive; the keys read here are "embeddings" and "ids".
# NOTE(review): allow_pickle=True can execute code stored in the file - only
# load archives you produced yourself.
val = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_val_ids_y.npz", allow_pickle=True)
SB_va = val["embeddings"]
SB_ids_va = val["ids"]

# EFFICIENT NET
# Embedding archive; the keys read here are "X" and "ids".
EN_va_data = np.load("D:/dataset/efficientnetb0_emb/val_data.npz", allow_pickle=True)
EN_va = EN_va_data["X"]
EN_va_ids = EN_va_data["ids"]

# ALIGN THE THREE SETS
# Canonical row order: the metadata sorted by post_id.
meta_va = meta_val_final.sort_values("post_id").reset_index(drop=True)
val_ids_ord = meta_va["post_id"].values

# Normalise every id array to str so that lookups compare like with like.
SB_ids_va = SB_ids_va.astype(str)
EN_va_ids = EN_va_ids.astype(str)
val_ids_ord = val_ids_ord.astype(str)

In [4]:
# Labels (er_bins2) for the validation split, keyed by post_id.
metadata_va = con.sql("""
    SELECT post_id, er_bins2 FROM md1718
    WHERE split = 'validation'
""").df().set_index('post_id')
# Cast the index to str BEFORE the lookup: val_ids_ord already holds str ids,
# so the lookup must not depend on the dtype DuckDB returned for post_id.
metadata_va.index = metadata_va.index.astype(str)

# Direct alignment to the post ids used for X. .loc raises KeyError for unknown
# ids; the length check catches duplicated ids in the label table.
y_va = metadata_va.loc[val_ids_ord, 'er_bins2'].values
assert len(y_va) == len(val_ids_ord)

# SBERT -> DataFrame indexed by post_id
df_sbert = pd.DataFrame(SB_va, index=SB_ids_va)
df_sbert.index.name = "post_id"

# EfficientNet -> DataFrame indexed by post_id
df_eff = pd.DataFrame(EN_va, index=EN_va_ids)
df_eff.index.name = "post_id"

df_sbert.index = df_sbert.index.astype(str)
df_eff.index   = df_eff.index.astype(str)

# Exact alignment via reindex: rows come back in val_ids_ord order, and ids
# without an embedding become all-NaN rows.
SBERT_va_aligned = df_sbert.reindex(val_ids_ord).values
EN_va_aligned = df_eff.reindex(val_ids_ord).values

# Sorted metadata as a NumPy array (still contains the post_id column)
meta_va_np = meta_va.to_numpy()

In [5]:
# Checks
# The aligned embedding matrices must have one row per metadata row.
assert SBERT_va_aligned.shape[0] == len(meta_va)
assert EN_va_aligned.shape[0] == len(meta_va)

# Print the first ids of each source, then all row counts side by side.
# NOTE(review): reindex(val_ids_ord) returns an index equal to val_ids_ord by
# construction, so the two reindex prints cannot reveal a misalignment.
print(val_ids_ord[:5])
print(df_sbert.reindex(val_ids_ord).index[:5])
print(df_eff.reindex(val_ids_ord).index[:5])
print(len(val_ids_ord), SBERT_va_aligned.shape[0], EN_va_aligned.shape[0], meta_va_np.shape[0], len(y_va))

['00_rocketgirl-1905372469183881641' '00_rocketgirl-1905378100137554269'
 '00_rocketgirl-1906985289541418664' '00_rocketgirl-1912887291135668511'
 '00_rocketgirl-1913179760499084122']
Index(['00_rocketgirl-1905372469183881641',
       '00_rocketgirl-1905378100137554269',
       '00_rocketgirl-1906985289541418664',
       '00_rocketgirl-1912887291135668511',
       '00_rocketgirl-1913179760499084122'],
      dtype='object', name='post_id')
Index(['00_rocketgirl-1905372469183881641',
       '00_rocketgirl-1905378100137554269',
       '00_rocketgirl-1906985289541418664',
       '00_rocketgirl-1912887291135668511',
       '00_rocketgirl-1913179760499084122'],
      dtype='object', name='post_id')
412325 412325 412325 412325 412325


In [6]:
# Holds by construction: val_ids_ord was taken from meta_va["post_id"].
assert (meta_va["post_id"].values == val_ids_ord).all()

# For SBERT
# .loc raises KeyError if any id in val_ids_ord is absent from the index.
assert (df_sbert.loc[val_ids_ord].index.values == val_ids_ord).all()

# For EfficientNet
assert (df_eff.loc[val_ids_ord].index.values == val_ids_ord).all()

# Count rows containing at least one NaN after reindexing.
print("SBERT missing:", df_sbert.reindex(val_ids_ord).isna().any(axis=1).sum())
print("EN missing:", df_eff.reindex(val_ids_ord).isna().any(axis=1).sum())

SBERT missing: 0
EN missing: 0


In [7]:
# Build the fused validation matrix: [metadata | SBERT | EfficientNet].
#
# Ids and metadata features are taken from meta_va (sorted by post_id), not
# from meta_val_final: y_va and both embedding matrices follow val_ids_ord,
# i.e. the sorted order. Reading the unsorted frame would pair each metadata
# row with another post's embeddings and label whenever the CSV is not already
# sorted by post_id.
post_id_va = meta_va['post_id'].values
meta_va_num = meta_va.drop(columns=['post_id']).values

# Guard the row-wise correspondence of every block that gets concatenated.
assert (post_id_va.astype(str) == val_ids_ord).all()
assert len(meta_va_num) == len(SBERT_va_aligned) == len(EN_va_aligned) == len(y_va)

# Single precision for every block to keep the fused matrix small.
meta_va_num = meta_va_num.astype('float32')
SBERT_va_aligned = SBERT_va_aligned.astype('float32')
EN_va_aligned = EN_va_aligned.astype('float32')

X_va = np.concatenate((meta_va_num, SBERT_va_aligned, EN_va_aligned), axis = 1)

In [8]:
# Shapes of the fused matrix and of each block it was built from.
print(X_va.shape, meta_va_num.shape, SBERT_va_aligned.shape, EN_va_aligned.shape)

(412325, 1692) (412325, 28) (412325, 384) (412325, 1280)


In [9]:
# Persist the fused validation features, the labels and the post ids as .npy files.
np.save("D:/dataset/multimodal2/X_val_2.npy", X_va)
np.save("D:/dataset/multimodal2/y_va_2.npy", y_va)
np.save("D:/dataset/multimodal2/post_id_va_2.npy", post_id_va)

# TEST

In [3]:
# METADATA
# Metadata table for the test split; post_id is the alignment key.
meta_test_final = pd.read_csv("D:/dataset/meta_classification/meta_test_final.csv")

# SBERT
# Embedding archive; the keys read here are "embeddings" and "ids".
# NOTE(review): allow_pickle=True can execute code stored in the file - only
# load archives you produced yourself.
test = np.load("D:/dataset/sbert_emb/paraphrase-MiniLM-L6-v2_test_ids_y.npz", allow_pickle=True)
SB_te = test["embeddings"]
SB_ids_te = test["ids"]

# EFFICIENT NET
# Embedding archive; the keys read here are "X" and "ids".
EN_te_data = np.load("D:/dataset/efficientnetb0_emb/test_data.npz", allow_pickle=True)
EN_te = EN_te_data["X"]
EN_te_ids = EN_te_data["ids"]

# ALIGN THE THREE SETS
# Canonical row order: the metadata sorted by post_id.
meta_te = meta_test_final.sort_values("post_id").reset_index(drop=True)
te_ids_ord = meta_te["post_id"].values

# Normalise every id array to str so that lookups compare like with like.
SB_ids_te = SB_ids_te.astype(str)
EN_te_ids = EN_te_ids.astype(str)
te_ids_ord = te_ids_ord.astype(str)

# Retrieve the y labels (er_bins2) for the test split, keyed by post_id.
metadata_te = con.sql("""
    SELECT post_id, er_bins2 FROM md1718
    WHERE split = 'test'
""").df().set_index('post_id')
# Cast the index to str BEFORE the lookup: te_ids_ord already holds str ids,
# so the lookup must not depend on the dtype DuckDB returned for post_id.
metadata_te.index = metadata_te.index.astype(str)

# .loc raises KeyError for unknown ids; the length check catches duplicated ids
# in the label table.
y_te = metadata_te.loc[te_ids_ord, 'er_bins2'].values
assert len(y_te) == len(te_ids_ord)

# SBERT -> DataFrame indexed by post_id
df_sbert = pd.DataFrame(SB_te, index=SB_ids_te)
df_sbert.index.name = "post_id"

# EfficientNet -> DataFrame indexed by post_id
df_eff = pd.DataFrame(EN_te, index=EN_te_ids)
df_eff.index.name = "post_id"

df_sbert.index = df_sbert.index.astype(str)
df_eff.index   = df_eff.index.astype(str)

# Exact alignment via reindex: rows come back in te_ids_ord order, and ids
# without an embedding become all-NaN rows.
SBERT_te_aligned = df_sbert.reindex(te_ids_ord).values
EN_te_aligned = df_eff.reindex(te_ids_ord).values

# Sorted metadata as a NumPy array (still contains the post_id column)
meta_te_np = meta_te.to_numpy()


In [4]:
# Checks
# The aligned embedding matrices must have one row per metadata row.
assert SBERT_te_aligned.shape[0] == len(meta_te)
assert EN_te_aligned.shape[0] == len(meta_te)

# Print the first ids of each source, then all row counts side by side.
# NOTE(review): reindex(te_ids_ord) returns an index equal to te_ids_ord by
# construction, so the two reindex prints cannot reveal a misalignment.
print(te_ids_ord[:5])
print(df_sbert.reindex(te_ids_ord).index[:5])
print(df_eff.reindex(te_ids_ord).index[:5])
print(len(te_ids_ord), SBERT_te_aligned.shape[0], EN_te_aligned.shape[0], meta_te_np.shape[0], len(y_te))

['00_rocketgirl-1924887425826593106' '00_rocketgirl-1926960433592504542'
 '00_rocketgirl-1931680985582309140' '00_rocketgirl-1936081881737003967'
 '00_rocketgirl-1942428720468033285']
Index(['00_rocketgirl-1924887425826593106',
       '00_rocketgirl-1926960433592504542',
       '00_rocketgirl-1931680985582309140',
       '00_rocketgirl-1936081881737003967',
       '00_rocketgirl-1942428720468033285'],
      dtype='object', name='post_id')
Index(['00_rocketgirl-1924887425826593106',
       '00_rocketgirl-1926960433592504542',
       '00_rocketgirl-1931680985582309140',
       '00_rocketgirl-1936081881737003967',
       '00_rocketgirl-1942428720468033285'],
      dtype='object', name='post_id')
423604 423604 423604 423604 423604


In [5]:
# Holds by construction: te_ids_ord was taken from meta_te["post_id"].
assert (meta_te["post_id"].values == te_ids_ord).all()

# For SBERT
# .loc raises KeyError if any id in te_ids_ord is absent from the index.
assert (df_sbert.loc[te_ids_ord].index.values == te_ids_ord).all()

# For EfficientNet
assert (df_eff.loc[te_ids_ord].index.values == te_ids_ord).all()

# Count rows containing at least one NaN after reindexing.
print("SBERT missing:", df_sbert.reindex(te_ids_ord).isna().any(axis=1).sum())
print("EN missing:", df_eff.reindex(te_ids_ord).isna().any(axis=1).sum())

SBERT missing: 0
EN missing: 0


In [6]:
# Prepare the three feature blocks of the test split for concatenation.
#
# Ids and metadata features are taken from meta_te (sorted by post_id), not
# from meta_test_final: y_te and both embedding matrices follow te_ids_ord,
# i.e. the sorted order. Reading the unsorted frame would pair each metadata
# row with another post's embeddings and label whenever the CSV is not already
# sorted by post_id.
post_id_te = meta_te['post_id'].values
meta_te_num = meta_te.drop(columns=['post_id']).values

# Guard the row-wise correspondence of every block that gets concatenated.
assert (post_id_te.astype(str) == te_ids_ord).all()
assert len(meta_te_num) == len(SBERT_te_aligned) == len(EN_te_aligned) == len(y_te)

# Single precision for every block to keep the fused matrix small.
meta_te_num = meta_te_num.astype('float32')
SBERT_te_aligned = SBERT_te_aligned.astype('float32')
EN_te_aligned = EN_te_aligned.astype('float32')

In [7]:
# Fused test matrix: [metadata | SBERT | EfficientNet], joined column-wise.
X_te = np.concatenate((meta_te_num, SBERT_te_aligned, EN_te_aligned), axis = 1)

In [8]:
# Shapes of the fused matrix and of each block it was built from.
print(X_te.shape, meta_te_num.shape, SBERT_te_aligned.shape, EN_te_aligned.shape)

(423604, 1692) (423604, 28) (423604, 384) (423604, 1280)


In [9]:
# Persist the fused test features, the labels and the post ids as .npy files.
np.save("D:/dataset/multimodal2/X_test_2.npy", X_te)
np.save("D:/dataset/multimodal2/y_te_2.npy", y_te)
np.save("D:/dataset/multimodal2/post_id_te_2.npy", post_id_te)

# LOAD FUSED DATA

In [9]:
# Reload the fused train and validation sets written by the cells above, so the
# modelling part can start from a fresh kernel.
# NOTE(review): allow_pickle=True can execute code stored in the file - only
# load files you produced yourself.
X_tr = np.load("D:/dataset/multimodal2/X_train_2.npy", allow_pickle = True)
y_tr = np.load("D:/dataset/multimodal2/y_tr_2.npy", allow_pickle = True)

X_va = np.load("D:/dataset/multimodal2/X_val_2.npy", allow_pickle = True)
y_va = np.load("D:/dataset/multimodal2/y_va_2.npy", allow_pickle = True)

In [8]:
# SGD
# Manual grid search for a linear model trained with SGD (hinge loss, L2
# penalty): every (alpha, class_weight) pair is fitted on the training split
# and scored on the validation split; the highest macro-F1 wins.
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",
        penalty="l2",
        average=True,
        early_stopping=True,
        max_iter=200,
        tol=1e-3,
        random_state=42,
        **params,
    )

    # fit() returns the estimator itself, so training and prediction can be chained.
    y_val_pred = clf.fit(X_tr, y_tr).predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    # ParameterGrid yields keys in sorted order (alpha, class_weight), so
    # unpacking params keeps the result columns in that order.
    results.append({**params, "val_macro_f1": macro_f1, "val_accuracy": acc})

    # Strict '>' keeps the first configuration in case of ties.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'class_weight': None}
macro-F1 (val): 0.5949765514119276 | accuracy (val): 0.5949821136239617

Combination: {'alpha': 1e-05, 'class_weight': 'balanced'}
macro-F1 (val): 0.5954705167063417 | accuracy (val): 0.5954914206026799

Combination: {'alpha': 0.0001, 'class_weight': None}
macro-F1 (val): 0.5967565783176477 | accuracy (val): 0.596779239677439

Combination: {'alpha': 0.0001, 'class_weight': 'balanced'}
macro-F1 (val): 0.5977846858028595 | accuracy (val): 0.5977930030922209

Combination: {'alpha': 0.001, 'class_weight': None}
macro-F1 (val): 0.5968852542753329 | accuracy (val): 0.5970557206087431

Combination: {'alpha': 0.001, 'class_weight': 'balanced'}
macro-F1 (val): 0.5965450636108014 | accuracy (val): 0.5966773782816953

Combination: {'alpha': 0.01, 'class_weight': None}
macro-F1 (val): 0.5907977061289537 | accuracy (val): 0.5918680652397987

Combination: {'alpha': 0.01, 'class_weight': 'balanced'}
macro-F1 (val): 0.5911305280081083 | accuracy (va

In [10]:
# NAIVE BAYES - GAUSSIAN
# Manual grid search over var_smoothing: fit on the training split, score on
# the validation split, keep the configuration with the highest macro-F1.

param_grid_nb = {"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit on TRAIN, then evaluate on VALIDATION (fit() returns the estimator).
    y_val_pred = clf.fit(X_tr, y_tr).predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    # params holds the single key var_smoothing, so unpacking it gives the
    # same record as spelling the key out.
    results.append({**params, "val_macro_f1": macro_f1, "val_accuracy": acc})

    # Update the best model based on macro-F1; strict '>' keeps the first on ties.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.5781 | accuracy (val): 0.5810

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.5781 | accuracy (val): 0.5810

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.5781 | accuracy (val): 0.5810

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.5781 | accuracy (val): 0.5810

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.5780619644267415

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.578062      0.581047
1   1.000000e-08      0.578062      0.581047
2   1.000000e-07      0.578059      0.581044
3   1.000000e-06      0.578056      0.581042


In [11]:
# RANDOM FOREST
# Manual grid search: every combination is fitted on the training split and
# scored on the validation split; the highest macro-F1 wins.
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(n_jobs=-1, random_state=42, **params)

    # fit() returns the estimator itself, so training and prediction can be chained.
    y_val_pred = clf.fit(X_tr, y_tr).predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    # Hyperparameters are copied in a fixed order so the result columns do not
    # depend on the key order ParameterGrid yields.
    record = {
        name: params[name]
        for name in ("n_estimators", "max_depth", "min_samples_leaf", "max_features")
    }
    record["val_macro_f1"] = macro_f1
    record["val_accuracy"] = acc
    results.append(record)

    # Strict '>' keeps the first configuration in case of ties.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.5843 | accuracy (val): 0.5862

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.5854 | accuracy (val): 0.5875

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.5858 | accuracy (val): 0.5879

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.5842 | accuracy (val): 0.5862

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.5849 | accuracy (val): 0.5870

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.5858 | accuracy (val): 0.5878

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.5829 | accuracy (val

In [12]:
# XGBOOST
# Manual grid search: every combination is fitted on the training split and
# scored on the validation split; the highest macro-F1 wins.

# Convert the labels into numbers (the encoder is fitted on the training labels only)
le = LabelEncoder().fit(y_tr)
y_tr_enc = le.transform(y_tr)
y_val_enc = le.transform(y_va)

param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

# Number of distinct encoded classes; identical for every candidate.
n_classes = len(np.unique(y_tr_enc))

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        objective="multi:softmax",
        num_class=n_classes,
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
        **params,
    )

    # fit() returns the estimator itself, so training and prediction can be chained.
    y_val_pred = clf.fit(X_tr, y_tr_enc).predict(X_va)

    # Predictions are encoded class ids, so they are scored against the encoded labels.
    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({**params, "val_macro_f1": macro_f1, "val_accuracy": acc})

    # Strict '>' keeps the first configuration in case of ties.
    if macro_f1 > best_score:
        best_score, best_params = macro_f1, params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5951 | accuracy (val): 0.5951

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5977 | accuracy (val): 0.5977

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5995 | accuracy (val): 0.5995

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.6018 | accuracy (val): 0.6018

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.5951 | accuracy (val): 0.5951

Combination: {'colsample_bytr

# PERFORMANCE ON TEST SET

In [None]:
# Build and persist the train+validation set used for the final refit.
# The fused matrices were cast to float32 before being saved, so
# astype(..., copy=False) hands back the loaded array instead of allocating a
# second multi-GB copy; it still converts if a file ever holds another dtype.
# NOTE(review): allow_pickle=True can execute code stored in the file - only
# load files you produced yourself.
X_tr = np.load("D:/dataset/multimodal2/X_train_2.npy", allow_pickle = True).astype(np.float32, copy=False)
y_tr = np.load("D:/dataset/multimodal2/y_tr_2.npy", allow_pickle = True)

X_va = np.load("D:/dataset/multimodal2/X_val_2.npy", allow_pickle = True).astype(np.float32, copy=False)
y_va = np.load("D:/dataset/multimodal2/y_va_2.npy", allow_pickle = True)

# Stack validation rows under the training rows (features and labels alike).
X_trva = np.concatenate((X_tr, X_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

np.save("D:/dataset/multimodal2/X_trainval_2.npy", X_trva)
np.save("D:/dataset/multimodal2/y_trva_2.npy", y_trva)

In [2]:
# Load the train+validation set (bound to X_tr / y_tr from here on) and the
# held-out test set for the final evaluation.
# astype(..., copy=False) avoids duplicating arrays that are already float32;
# it still converts if a file ever holds another dtype.
# NOTE(review): allow_pickle=True can execute code stored in the file - only
# load files you produced yourself.
X_tr = np.load("D:/dataset/multimodal2/X_trainval_2.npy", allow_pickle = True).astype(np.float32, copy=False)
y_tr = np.load("D:/dataset/multimodal2/y_trva_2.npy", allow_pickle = True)

X_te   = np.load("D:/dataset/multimodal2/X_test_2.npy", allow_pickle=True).astype(np.float32, copy=False)
y_te = np.load("D:/dataset/multimodal2/y_te_2.npy", allow_pickle = True)

In [3]:
# Final SGD model: fit on X_tr / y_tr and report macro-F1 and accuracy on the
# test set.
cfg = SGDClassifier(
    alpha=0.0001,
    class_weight='balanced',
    loss="hinge",
    penalty="l2",
    average=True,
    early_stopping=True,
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can be chained.
y_te_pred = cfg.fit(X_tr, y_tr).predict(X_te)

macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.5953 | accuracy (test): 0.5954


In [18]:
# Final evaluation of the remaining models: each one is fitted on X_tr and
# scored on the test set with macro-F1 and accuracy.

# Convert the labels into numbers (the encoder is fitted on the training labels only)
le = LabelEncoder().fit(y_tr)
y_tr_enc = le.transform(y_tr)
y_te_enc = le.transform(y_te)

cfgs = [
    GaussianNB(var_smoothing=1e-09),
    RandomForestClassifier(
        n_estimators=80,
        max_depth=12,
        max_features=0.05,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=42,
    ),
    XGBClassifier(
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.5,
        gamma=1,
        reg_lambda=1,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    ),
]

for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target; the other models use the original labels.
    if isinstance(cfg, XGBClassifier):
        fit_target, eval_target = y_tr_enc, y_te_enc
    else:
        fit_target, eval_target = y_tr, y_te

    cfg.fit(X_tr, fit_target)
    y_te_pred = cfg.predict(X_te)
    macro_f1 = f1_score(eval_target, y_te_pred, average="macro")
    acc = accuracy_score(eval_target, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB()
macro-F1 (test): 0.5775 | accuracy (test): 0.5789

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=2,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (test): 0.5875 | accuracy (test): 0.5877

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=Non