In [1]:
import duckdb, os, gc
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, hstack, save_npz, vstack


from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

In [2]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


# SET UP

In [3]:
# RETRIVE METADATA - CHECK FEATURES
columns = con.sql("""PRAGMA table_info(md1718)""").fetchdf()
print(columns['name'].to_list())

['filename', 'username', 'like_count', 'comment_count', 'width', 'height', 'time_utc', 'caption', 'aspect_ratio', 'area', 'orientation', 'date_day', 'dow', 'hour_utc', 'has_caption', 'caption_len_char', 'month', 'year', 'n_hashtags', 'n_mentions', 'n_urls', 'n_emojis', 'category', 'followers', 'followees', 'posts', 'engagement_rate', 'er_log', 'er_bins', 'post_id', 'caption_language', 'caption_clean', 'caption_lang', 'caption_tfidf', 'caption_bert_clip', 'split', 'in_train_balanced', 'er_bins3', 'er_bins2']


In [4]:
metadata_tr = con.sql("""SELECT * FROM md1718 WHERE split = 'train'""").df()
metadata_val = con.sql("""SELECT * FROM md1718 WHERE split = 'validation'""").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [5]:
col_to_exclude = ['filename', 'username', 'like_count', 'comment_count', 'caption', 'followers', 'engagement_rate', 'er_log', 'caption_language', 'caption_tfidf', 'caption_bert_clip',
                  'er_bins', 'split', 'in_train_balanced', 'time_utc', 'date_day', 'caption_lang', 'caption_clean']

feature_columns = [col for col in metadata_tr.columns if col not in col_to_exclude]

y_tr = metadata_tr['er_bins']
y_val = metadata_val['er_bins']

meta_tr = metadata_tr[feature_columns]
meta_val = metadata_val[feature_columns] 

In [6]:
meta_tr.dtypes

width                 int32
height                int32
aspect_ratio        float64
area                  int32
orientation          object
dow                   int64
hour_utc              int64
has_caption            bool
caption_len_char      Int64
month                 int64
year                  int64
n_hashtags            int64
n_mentions            int64
n_urls                int64
n_emojis              int64
category             object
followees             int32
posts                 int32
post_id              object
er_bins3             object
er_bins2             object
dtype: object

In [7]:
# One hot encoding for orientation and category
cat_cols = ['orientation', 'category']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
meta_train_encoded = encoder.fit_transform(meta_tr[cat_cols])
meta_val_encoded = encoder.transform(meta_val[cat_cols])

In [8]:
encoded_cols = encoder.get_feature_names_out(cat_cols)
meta_train_encoded = pd.DataFrame(meta_train_encoded, columns=encoded_cols, index=meta_tr.index)
meta_val_encoded = pd.DataFrame(meta_val_encoded, columns=encoded_cols, index=meta_val.index)

In [9]:
# Scaling numerical variables
scaler = StandardScaler()
num_cols = [
    'width', 'height', 'aspect_ratio', 'area',
    'dow', 'hour_utc', 'month', 'year', 'caption_len_char',
    'n_hashtags', 'n_mentions', 'n_urls', 'n_emojis',
    'followees', 'posts'
]

bin_cols = ['has_caption'] 

meta_train_scaled = pd.DataFrame(
    scaler.fit_transform(meta_tr[num_cols]),
    columns=num_cols,
    index=meta_tr.index
)
meta_val_scaled = pd.DataFrame(
    scaler.transform(meta_val[num_cols]),
    columns=num_cols,
    index=meta_val.index
)

In [10]:
meta_train_bin = meta_tr[bin_cols].astype(int)
meta_val_bin = meta_val[bin_cols].astype(int)

# Merge all: scaled numeric + boolean + one-hot
meta_train_final = pd.concat([meta_train_scaled, meta_train_bin, meta_train_encoded], axis=1)
meta_val_final = pd.concat([meta_val_scaled, meta_val_bin, meta_val_encoded], axis=1)
# Add post_id
meta_train_final.insert(0, 'post_id', metadata_tr['post_id'].values)
meta_val_final.insert(0, 'post_id', metadata_val['post_id'].values)

In [11]:
print("NaN metadata (train):", np.isnan(meta_train_final.drop(columns=['post_id'])).any().any())
print("NaN metadata (val):", np.isnan(meta_val_final.drop(columns=['post_id'])).any().any())

NaN metadata (train): True
NaN metadata (val): True


In [12]:
meta_train_final.isna().sum().sort_values(ascending=False).head(10)

caption_len_char     6634
post_id                 0
posts                   0
category_pet            0
category_other          0
category_interior       0
category_food           0
category_fitness        0
category_fashion        0
category_family         0
dtype: int64

In [13]:
meta_train_final['caption_len_char'] = meta_train_final['caption_len_char'].fillna(0)
meta_val_final['caption_len_char'] = meta_val_final['caption_len_char'].fillna(0)

In [14]:
meta_train_final.isna().sum().sort_values(ascending=False).head(10)

post_id              0
posts                0
category_pet         0
category_other       0
category_interior    0
category_food        0
category_fitness     0
category_fashion     0
category_family      0
category_beauty      0
dtype: int64

In [15]:
# RETRIEVE CAPTIONS TF-IDF
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

x_train_tfidf = load_npz(f"{OUT_DIR}/tfidf_topwords_train.npz")
x_val_tfidf = load_npz(f"{OUT_DIR}/tfidf_topwords_val.npz")

x_train_postids = np.load(os.path.join(OUT_DIR, "tfidf_train_post_ids.npy"), allow_pickle=True)
x_val_postids = np.load(os.path.join(OUT_DIR, "tfidf_val_post_ids.npy"), allow_pickle=True)

In [16]:
df_tfidf_train = pd.DataFrame({'post_id': x_train_postids})
df_tfidf_val   = pd.DataFrame({'post_id': x_val_postids})

In [17]:
column_names = con.sql("""PRAGMA table_info('features.img_handcrafted');""").fetchdf()
print(column_names['name'].to_list())

['post_id', 'h_mean', 'h_std', 's_mean', 's_std', 'v_mean', 'v_std', 'h_hist_00', 'h_hist_01', 'h_hist_02', 'h_hist_03', 'h_hist_04', 'h_hist_05', 'h_hist_06', 'h_hist_07', 'h_hist_08', 'h_hist_09', 'h_hist_10', 'h_hist_11', 'h_hist_12', 'h_hist_13', 'h_hist_14', 'h_hist_15', 's_hist_00', 's_hist_01', 's_hist_02', 's_hist_03', 's_hist_04', 's_hist_05', 's_hist_06', 's_hist_07', 's_hist_08', 's_hist_09', 's_hist_10', 's_hist_11', 's_hist_12', 's_hist_13', 's_hist_14', 's_hist_15', 'v_hist_00', 'v_hist_01', 'v_hist_02', 'v_hist_03', 'v_hist_04', 'v_hist_05', 'v_hist_06', 'v_hist_07', 'v_hist_08', 'v_hist_09', 'v_hist_10', 'v_hist_11', 'v_hist_12', 'v_hist_13', 'v_hist_14', 'v_hist_15', 'gray_hist_00', 'gray_hist_01', 'gray_hist_02', 'gray_hist_03', 'gray_hist_04', 'gray_hist_05', 'gray_hist_06', 'gray_hist_07', 'gray_hist_08', 'gray_hist_09', 'gray_hist_10', 'gray_hist_11', 'gray_hist_12', 'gray_hist_13', 'gray_hist_14', 'gray_hist_15', 'laplacian_var', 'edge_density', 'entropy_gray', 'c

In [18]:
# RETRIEVE IMAGES
# Exctract the df
df_images = con.sql("""SELECT * FROM features.img_handcrafted""").df()
print("Images dataset extracted")
# Split

train = df_images[df_images["split"] == "train"].copy()
val   = df_images[df_images["split"] == "validation"].copy()
print("Train, validation and test split")

cols_to_drop = ['rn', 'split', 'er_bins']
feature_cols = [col for col in train.columns if col not in cols_to_drop]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Images dataset extracted
Train, validation and test split


In [20]:
X_train_img = train[feature_cols]
X_val_img1 = val[feature_cols] ì

col = 'laplacian_var'

# found infinite values and substitute
mask = ~np.isfinite(X_val_img1[col])

print(f"Found {mask.sum()} infinite values in {col}")
X_val_img1.loc[mask, col] = 0
X_val_np = X_val_img1.values

X_val_img = pd.DataFrame(
    X_val_np,
    index=X_val_img1.index,
    columns=X_val_img1.columns
)

Found 1 infinite values in laplacian_var


In [21]:
mask = ~np.isfinite(X_val_img1[col])

print(f"Found {mask.sum()} infinite values in {col}")

Found 0 infinite values in laplacian_var


In [23]:
print([col for col in X_train_img.columns])

['post_id', 'h_mean', 'h_std', 's_mean', 's_std', 'v_mean', 'v_std', 'h_hist_00', 'h_hist_01', 'h_hist_02', 'h_hist_03', 'h_hist_04', 'h_hist_05', 'h_hist_06', 'h_hist_07', 'h_hist_08', 'h_hist_09', 'h_hist_10', 'h_hist_11', 'h_hist_12', 'h_hist_13', 'h_hist_14', 'h_hist_15', 's_hist_00', 's_hist_01', 's_hist_02', 's_hist_03', 's_hist_04', 's_hist_05', 's_hist_06', 's_hist_07', 's_hist_08', 's_hist_09', 's_hist_10', 's_hist_11', 's_hist_12', 's_hist_13', 's_hist_14', 's_hist_15', 'v_hist_00', 'v_hist_01', 'v_hist_02', 'v_hist_03', 'v_hist_04', 'v_hist_05', 'v_hist_06', 'v_hist_07', 'v_hist_08', 'v_hist_09', 'v_hist_10', 'v_hist_11', 'v_hist_12', 'v_hist_13', 'v_hist_14', 'v_hist_15', 'gray_hist_00', 'gray_hist_01', 'gray_hist_02', 'gray_hist_03', 'gray_hist_04', 'gray_hist_05', 'gray_hist_06', 'gray_hist_07', 'gray_hist_08', 'gray_hist_09', 'gray_hist_10', 'gray_hist_11', 'gray_hist_12', 'gray_hist_13', 'gray_hist_14', 'gray_hist_15', 'laplacian_var', 'edge_density', 'entropy_gray', 'c

In [24]:
# StandardScaler
scaler = StandardScaler()
num_cols = [col for col in X_train_img.columns if col not in ('post_id', 'er_bins3', 'er_bins2')]


img_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_img[num_cols]),
    columns=num_cols,
    index=X_train_img.index
)
img_val_scaled = pd.DataFrame(
    scaler.transform(X_val_img[num_cols]),
    columns=num_cols,
    index=X_val_img.index
)

print("StandardScaler applied")

img_train_final = pd.concat([img_train_scaled, X_train_img['post_id']], axis=1)
img_val_final = pd.concat([img_val_scaled, X_val_img['post_id']], axis=1)
# img_test_final = pd.concat([img_test_scaled, X_test_img['post_id']], axis=1)
print("Post Id added")

StandardScaler applied
Post Id added


In [25]:
# Common post_ids
common_train_ids = (
    set(meta_train_final['post_id'])
    & set(df_tfidf_train['post_id'])
    & set(img_train_final['post_id'])
)

In [26]:
# Filter to ensure they all have the same ids and order
meta_train_final = meta_train_final[meta_train_final['post_id'].isin(common_train_ids)]
img_train_final = img_train_final[img_train_final['post_id'].isin(common_train_ids)]
df_tfidf_train   = df_tfidf_train[df_tfidf_train['post_id'].isin(common_train_ids)]

In [27]:
common_val_ids = (
    set(meta_val_final['post_id'])
    & set(img_val_final['post_id'])
    & set(df_tfidf_val['post_id'])
)

In [28]:
meta_val_final = meta_val_final[meta_val_final['post_id'].isin(common_val_ids)]
img_val_final = img_val_final[img_val_final['post_id'].isin(common_val_ids)]
df_tfidf_val   = df_tfidf_val[df_tfidf_val['post_id'].isin(common_val_ids)]

In [29]:
# Filter TF-IDF matrix
# Define a mask of booleans to filter the matrix

mask_train = np.array([pid in common_train_ids for pid in x_train_postids])
x_train_tfidf_aligned = x_train_tfidf[mask_train]

mask_val = np.array([pid in common_val_ids for pid in x_val_postids])
x_val_tfidf_aligned = x_val_tfidf[mask_val]

In [30]:
# Merge metadata and immagini

train_meta_img = meta_train_final.merge(img_train_final, on='post_id')
val_meta_img   = meta_val_final.merge(img_val_final, on='post_id')

In [31]:
# Free memory
del metadata_tr, metadata_val
del meta_tr, meta_val
del meta_train_scaled, meta_val_scaled
del meta_train_encoded, meta_val_encoded
del img_train_scaled, img_val_scaled
del df_images, train, val
del X_train_img, X_val_img

gc.collect()

3667

In [32]:
# Convert the dataframe into a matrix to merge it with the TF-IDF matrix
X_train_meta_img = train_meta_img.drop(columns=['post_id']).values

X_train_full = hstack([x_train_tfidf_aligned, X_train_meta_img])
X_val_full   = hstack([x_val_tfidf_aligned, val_meta_img.drop(columns=['post_id']).values])

In [33]:
print("TF-IDF train shape:", x_train_tfidf_aligned.shape)
print("Meta+Img train shape:", X_train_meta_img.shape)
print("Full train shape:", X_train_full.shape)
print()
print("TF-IDF val shape:", x_val_tfidf_aligned.shape)
print("Meta+Img val shape:", val_meta_img.drop(columns=['post_id']).shape)
print("Full val shape:", X_val_full.shape)

TF-IDF train shape: (773497, 50079)
Meta+Img train shape: (773497, 177)
Full train shape: (773497, 50256)

TF-IDF val shape: (412325, 50079)
Meta+Img val shape: (412325, 177)
Full val shape: (412325, 50256)


In [34]:
# Controllo ordine dei post_id sia coerente per non mescolare i target
print("Controllo ordine post_id train:", 
      (df_tfidf_train['post_id'].values == train_meta_img['post_id'].values).all())

print("Controllo ordine post_id val:", 
      (df_tfidf_val['post_id'].values == val_meta_img['post_id'].values).all())

Controllo ordine post_id train: True
Controllo ordine post_id val: True


In [35]:
# Controllo dimensioni di y
print("y_train:", len(y_tr))
print("X_train_full:", X_train_full.shape[0])
print("y_val:", len(y_val))
print("X_val_full:", X_val_full.shape[0])

y_train: 773497
X_train_full: 773497
y_val: 412325
X_val_full: 412325


In [36]:
print(type(X_train_full))
print(X_train_full.dtype)

<class 'scipy.sparse._coo.coo_matrix'>
float64


In [37]:
print(f"Train set: {X_train_full.shape[0]} samples, {X_train_full.shape[1]} features")
print(f"Validation set: {X_val_full.shape[0]} samples, {X_val_full.shape[1]} features")

Train set: 773497 samples, 50256 features
Validation set: 412325 samples, 50256 features


In [38]:
save_npz("D:/dataset/multimodal/X_train_full_TR.npz", X_train_full)
save_npz("D:/dataset/multimodal/X_val_full_TR.npz", X_val_full)

np.save("D:/dataset/multimodal/y_train.npy", y_tr.values)
np.save("D:/dataset/multimodal/y_val.npy", y_val.values)

# LOAD AND CLASSIFICATION

In [2]:
DIR = "D:/dataset/multimodal"
Xtr_full = load_npz(f"{DIR}/X_train_full_TR.npz").tocsr().astype(np.float32)
Xva_full = load_npz(f"{DIR}/X_val_full_TR.npz").tocsr().astype(np.float32)

y_tr = np.load(os.path.join(DIR, "y_train.npy"), allow_pickle = True)
y_val = np.load(os.path.join(DIR, "y_val.npy"), allow_pickle = True)

In [3]:
print(type(Xtr_full))

<class 'scipy.sparse._csr.csr_matrix'>


In [4]:
print(Xtr_full.shape, Xva_full.shape, y_tr.shape, y_val.shape)

(773497, 50256) (412325, 50256) (773497,) (412325,)


In [5]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"]
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(Xtr_full, y_tr)

    y_val_pred = clf.predict(Xva_full)

    macro_f1 = f1_score(y_val, y_val_pred, average="macro")
    acc = accuracy_score(y_val, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "average": params["average"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'average': False, 'class_weight': None}
macro-F1 (val): 0.2954541537732168 | accuracy (val): 0.3139683502091796

Combination: {'alpha': 1e-05, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.2865529469701135 | accuracy (val): 0.3153847086642818

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': None}
macro-F1 (val): 0.30391785896759793 | accuracy (val): 0.33945795185836414

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': 'balanced'}
macro-F1 (val): 0.29232628597564436 | accuracy (val): 0.3409688958952283

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': None}
macro-F1 (val): 0.280857715201816 | accuracy (val): 0.3079367004183593

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.280356124983388 | accuracy (val): 0.31666040138240464

Combination: {'alpha': 0.0001, 'average': True, 'class_weight': None}
macro-F1 (val): 0.2964958035483933 | accuracy (val)

In [7]:
# NAIVE BAYES

param_grid = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

batch_size = 256
classes = np.unique(y_tr)

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit the model using minibatch for memory
    for start in range(0, Xtr_full.shape[0], batch_size):
        # print(f"Batch {start} fit")
        end = min(start + batch_size, Xtr_full.shape[0])

        Xb = Xtr_full[start:end].toarray()
        yb = y_tr[start:end]

        if start == 0:
            clf.partial_fit(Xb, yb, classes=classes)
        else:
            clf.partial_fit(Xb, yb)

        del Xb, yb
        gc.collect()

    # Predict using minibatches
    y_val_pred = []

    for start in range(0, Xva_full.shape[0], batch_size):
        # print(f"Batch {start} predict")
        end = min(start + batch_size, Xva_full.shape[0])

        Xb = Xva_full[start:end].toarray()
        y_val_pred.append(clf.predict(Xb))

        del Xb
        gc.collect()

    y_val_pred = np.concatenate(y_val_pred)

    macro_f1 = f1_score(y_val, y_val_pred, average="macro")
    acc = accuracy_score(y_val, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values(
    "val_macro_f1", ascending=False
)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.1862 | accuracy (val): 0.2011

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.1892 | accuracy (val): 0.2013

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.1944 | accuracy (val): 0.2018

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.1915 | accuracy (val): 0.1989

Best hyperparameter configuration:
{'var_smoothing': 1e-07}
Validation macro-F1: 0.19444250202226981

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
2   1.000000e-07      0.194443      0.201792
3   1.000000e-06      0.191519      0.198884
1   1.000000e-08      0.189157      0.201293
0   1.000000e-09      0.186222      0.201103


In [3]:
# RANDOM FOREST

param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],   
    "min_samples_leaf": [2, 5], 
    "max_features": [0.05, "sqrt"],
}
results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    # Fit
    clf.fit(X_train_full, y_tr)

    # Validation
    y_val_pred = clf.predict(X_val_full)

    macro_f1 = f1_score(y_val, y_val_pred, average="macro")
    acc = accuracy_score(y_val, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_rf)


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.3005 | accuracy (val): 0.3021

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.3031 | accuracy (val): 0.3044

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.3022 | accuracy (val): 0.3043

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.2986 | accuracy (val): 0.3013

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.3001 | accuracy (val): 0.3022

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.3033 | accuracy (val): 0.3046

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1921 | accuracy (val

In [3]:
# XGBOOST

# Convert the labels into numbers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_val)

param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6], 
    "learning_rate": [0.1],
    "subsample": [0.8], 
    "colsample_bytree": [0.5],
    "gamma": [0, 1], 
    "reg_lambda": [1], 
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    # Fit
    clf.fit(X_train_full, y_tr_enc)

    # Validation
    y_val_pred = clf.predict(X_val_full)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3152 | accuracy (val): 0.3315

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3174 | accuracy (val): 0.3337

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3241 | accuracy (val): 0.3375

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3258 | accuracy (val): 0.3391

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.3153 | accuracy (val): 0.3314

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON TEST SET

In [4]:
# METADATA
metadata_tr = con.sql("""SELECT * FROM md1718 WHERE split = 'train' OR split = 'validation'""").df()
metadata_te = con.sql("""SELECT * FROM md1718 WHERE split = 'test'""").df()
col_to_exclude = ['filename', 'username', 'like_count', 'comment_count', 'caption', 'followers', 'engagement_rate', 'er_log', 'caption_language', 'caption_tfidf', 'caption_bert_clip',
                  'er_bins', 'split', 'time_utc', 'date_day', 'caption_lang', 'caption_clean']
feature_columns = [col for col in metadata_tr.columns if col not in col_to_exclude]

meta_tr = metadata_tr[feature_columns].copy()
meta_te = metadata_te[feature_columns].copy()

# One hot encoding
cat_cols = ['orientation', 'category']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
meta_train_encoded = encoder.fit_transform(meta_tr[cat_cols])
meta_test_encoded = encoder.transform(meta_te[cat_cols])
encoded_cols = encoder.get_feature_names_out(cat_cols)
meta_train_encoded = pd.DataFrame(meta_train_encoded, columns=encoded_cols, index=meta_tr.index)
meta_test_encoded = pd.DataFrame(meta_test_encoded, columns=encoded_cols, index=meta_te.index)

# Scaling
scaler = StandardScaler()
num_cols = [
    'width', 'height', 'aspect_ratio', 'area',
    'dow', 'hour_utc', 'month', 'year', 'caption_len_char',
    'n_hashtags', 'n_mentions', 'n_urls', 'n_emojis',
    'followees', 'posts'
]

bin_cols = ['has_caption'] 
meta_train_scaled = pd.DataFrame(
    scaler.fit_transform(meta_tr[num_cols]),
    columns=num_cols,
    index=meta_tr.index
)

meta_test_scaled = pd.DataFrame(
    scaler.transform(meta_te[num_cols]),
    columns=num_cols,
    index=meta_te.index
)

meta_train_bin = meta_tr[bin_cols].astype(int)
meta_test_bin = meta_te[bin_cols].astype(int)

# Merge all: scaled numeric + boolean + one-hot
meta_train_final = pd.concat([meta_train_scaled, meta_train_bin, meta_train_encoded], axis=1)
meta_test_final = pd.concat([meta_test_scaled, meta_test_bin, meta_test_encoded], axis=1)
# Add back post_id
meta_train_final.insert(0, 'post_id', metadata_tr['post_id'].values)
meta_test_final.insert(0, 'post_id', metadata_te['post_id'].values)

print("NaN metadata (train):", np.isnan(meta_train_final.drop(columns=['post_id'])).any().any())
print("NaN metadata (test):", np.isnan(meta_test_final.drop(columns=['post_id'])).any().any())
meta_train_final['caption_len_char'] = meta_train_final['caption_len_char'].fillna(0)
meta_test_final['caption_len_char'] = meta_test_final['caption_len_char'].fillna(0)

# TEXTUAL DATA
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

x_train_tfidf = load_npz(f"{OUT_DIR}/tfidf_topwords_train.npz")
x_val_tfidf = load_npz(f"{OUT_DIR}/tfidf_topwords_val.npz")
x_test_tfidf = load_npz(f"{OUT_DIR}/tfidf_topwords_test.npz")

x_trainval_tfidf = vstack([x_train_tfidf, x_val_tfidf])

train_postids = np.load(os.path.join(OUT_DIR, "tfidf_train_post_ids.npy"), allow_pickle=True)
val_postids = np.load(os.path.join(OUT_DIR, "tfidf_val_post_ids.npy"), allow_pickle=True)
test_postids = np.load(os.path.join(OUT_DIR, "tfidf_test_post_ids.npy"), allow_pickle=True)

trainval_postids = np.concatenate([train_postids, val_postids])

df_tfidf_test   = pd.DataFrame({'post_id': test_postids})
df_tfidf_trainval = pd.DataFrame({'post_id': trainval_postids})

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

NaN metadata (train): True
NaN metadata (test): True
Mancanti in metadata_tr: 0
Mancanti in metadata_te: 0


In [4]:
print(meta_train_final.shape)
print(meta_test_final.shape)
print(x_trainval_tfidf.shape)
print(x_test_tfidf.shape)
print(trainval_postids.shape)
print(test_postids.shape)

(1185822, 29)
(423604, 29)
(1185822, 50079)
(423604, 50079)
(1185822,)
(423604,)


In [None]:
# Free memory
del metadata_tr, metadata_te
del meta_tr, meta_te
del meta_train_scaled, meta_test_scaled
del meta_train_encoded, meta_test_encoded
del x_train_tfidf, x_val_tfidf
del train_postids, val_postids
del meta_train_bin, meta_test_bin
gc.collect()

In [6]:
# VISUAL DATA
# RETRIEVE IMAGES
# Exctract the df
df_images = con.sql("""SELECT * FROM features.img_handcrafted""").df()
print("Images dataset extracted")

# Split

train = df_images[df_images["split"].isin(["train", "validation"])].copy()
test   = df_images[df_images["split"] == "test"].copy()
print("Train and test split")

cols_to_drop = ['rn', 'split', 'er_bins']
feature_cols = [col for col in train.columns if col not in cols_to_drop]

X_train_img1 = train[feature_cols]
X_test_img1 = test[feature_cols] 

col = 'laplacian_var'

mask = ~np.isfinite(X_train_img1[col])
print(f"Train: Found {mask.sum()} infinite values in {col}")
X_train_img1.loc[mask, col] = 0

X_train_np = X_train_img1.values

X_train_img = pd.DataFrame(
    X_train_np,
    index=X_train_img1.index,
    columns=X_train_img1.columns
)

mask = ~np.isfinite(X_train_img1[col])
print(f"Check train: Found {mask.sum()} infinite values in {col}")


# Test
mask = ~np.isfinite(X_test_img1[col])
print(f"Test: Found {mask.sum()} infinite values in {col}")
X_test_img1.loc[mask, col] = 0
X_test_np = X_test_img1.values

X_test_img = pd.DataFrame(
    X_test_np,
    index=X_test_img1.index,
    columns=X_test_img1.columns
)

mask = ~np.isfinite(X_test_img1[col])
print(f"Check test: Found {mask.sum()} infinite values in {col}")

del df_images, train, test
del X_train_img1, X_test_img1
del X_train_np, X_test_np
gc.collect()

num_cols = [col for col in X_train_img.columns if col != 'post_id']
scaler = StandardScaler()

# Incremental fit on train
chunk_size = 200_000
n_train = len(X_train_img)

print("Fitting scaler incrementally...")
for start in range(0, n_train, chunk_size):
    end = start + chunk_size
    chunk = X_train_img.iloc[start:end][num_cols]
    scaler.partial_fit(chunk)
    print(f"  Fitted rows {start} → {min(end, n_train)}")

print("Incremental fit complete.\n")

# Incremental transform on train
print("Transforming train incrementally...")
train_chunks = []

for start in range(0, n_train, chunk_size):
    end = start + chunk_size
    chunk = X_train_img.iloc[start:end][num_cols]
    chunk_scaled = scaler.transform(chunk).astype('float32')
    train_chunks.append(chunk_scaled)
    print(f"  Transformed rows {start} → {min(end, n_train)}")

img_train_scaled = np.vstack(train_chunks)
del train_chunks
gc.collect()

print("Train transform complete.\n")

# Incremental transform on train
print("Transforming test...")
img_test_scaled = scaler.transform(X_test_img[num_cols]).astype('float32')
print("Test transform complete.\n")

img_train_final = pd.DataFrame(
    img_train_scaled,
    index=X_train_img.index,
    columns=num_cols
)
img_train_final['post_id'] = X_train_img['post_id'].values

img_test_final = pd.DataFrame(
    img_test_scaled,
    index=X_test_img.index,
    columns=num_cols
)
img_test_final['post_id'] = X_test_img['post_id'].values

print("Final DataFrames created!")
print(img_train_final.shape)
print(img_test_final.shape)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Images dataset extracted
Train and test split
Train: Found 1 infinite values in laplacian_var
Check train: Found 0 infinite values in laplacian_var
Test: Found 1 infinite values in laplacian_var
Check test: Found 0 infinite values in laplacian_var
Fitting scaler incrementally...
  Fitted rows 0 → 200000
  Fitted rows 200000 → 400000
  Fitted rows 400000 → 600000
  Fitted rows 600000 → 800000
  Fitted rows 800000 → 1000000
  Fitted rows 1000000 → 1185822
Incremental fit complete.

Transforming train incrementally...
  Transformed rows 0 → 200000
  Transformed rows 200000 → 400000
  Transformed rows 400000 → 600000
  Transformed rows 600000 → 800000
  Transformed rows 800000 → 1000000
  Transformed rows 1000000 → 1185822
Train transform complete.

Transforming test...
Test transform complete.

Final DataFrames created!
(1185822, 150)
(423604, 150)


In [7]:
del X_train_img, X_test_img
del img_train_scaled, img_test_scaled

gc.collect()

0

In [8]:
# Common post_ids
common_train_ids = (
    set(meta_train_final['post_id'])
    & set(df_tfidf_trainval['post_id'])
    & set(img_train_final['post_id'])
)

In [9]:
len(common_train_ids)

1185822

In [10]:
# Filter to ensure they all have the same ids and order
meta_train_final = meta_train_final[meta_train_final['post_id'].isin(common_train_ids)]
img_train_final = img_train_final[img_train_final['post_id'].isin(common_train_ids)]
df_tfidf_trainval   = df_tfidf_trainval[df_tfidf_trainval['post_id'].isin(common_train_ids)]

In [11]:
common_test_ids = (
    set(meta_test_final['post_id'])
    & set(img_test_final['post_id'])
    & set(df_tfidf_test['post_id'])
)

In [12]:
meta_test_final = meta_test_final[meta_test_final['post_id'].isin(common_test_ids)]
img_test_final = img_test_final[img_test_final['post_id'].isin(common_test_ids)]
df_tfidf_test = df_tfidf_test[df_tfidf_test['post_id'].isin(common_test_ids)]

In [13]:
len(common_test_ids)

423604

In [14]:
# Filter TF-IDF matrix
# Define a mask of booleans to filter the matrix

mask_train = np.array([pid in common_train_ids for pid in trainval_postids])
x_trainval_tfidf_aligned = x_trainval_tfidf[mask_train]

mask_test = np.array([pid in common_test_ids for pid in test_postids])
x_test_tfidf_aligned = x_test_tfidf[mask_test]

In [15]:
# Merge metadata and immagini

train_meta_img = meta_train_final.merge(img_train_final, on='post_id')
test_meta_img   = meta_test_final.merge(img_test_final, on='post_id')

In [16]:
# Check post_id order, it must be coherent between the two to not mix the y
print("Controllo ordine post_id train:", 
      (df_tfidf_trainval['post_id'].values == train_meta_img['post_id'].values).all())

print("Controllo ordine post_id test:", 
      (df_tfidf_test['post_id'].values == test_meta_img['post_id'].values).all())

Controllo ordine post_id train: False
Controllo ordine post_id test: True


In [17]:
# Reorder meta according to tfidf order
train_meta_img_ordered = train_meta_img.set_index('post_id').loc[trainval_postids].reset_index()
test_meta_img_ordered = test_meta_img.set_index('post_id').loc[test_postids].reset_index()

In [18]:
# Chech again
print("Controllo ordine post_id train:", 
      (df_tfidf_trainval['post_id'].values == train_meta_img_ordered['post_id'].values).all())

print("Controllo ordine post_id test:", 
      (df_tfidf_test['post_id'].values == test_meta_img['post_id'].values).all())

Controllo ordine post_id train: True
Controllo ordine post_id test: True


In [19]:
del train_meta_img, x_trainval_tfidf, x_test_tfidf
gc.collect()

0

In [20]:
X_train_meta_img = train_meta_img_ordered.drop(columns=[]).copy()
train_postids = train_meta_img_ordered['post_id'].values

In [21]:
test_postids = test_meta_img_ordered['post_id'].values

In [22]:
X_train_full = hstack([x_trainval_tfidf_aligned, 
                       train_meta_img_ordered.drop(columns=['post_id']).values])
X_test_full  = hstack([x_test_tfidf_aligned, 
                       test_meta_img_ordered.drop(columns=['post_id']).values])

In [23]:
print("TF-IDF train shape:", x_trainval_tfidf_aligned.shape)
print("Meta+Img train shape:", X_train_meta_img.shape)
print("Full train shape:", X_train_full.shape)
print()
print("TF-IDF test shape:", x_test_tfidf_aligned.shape)
print("Meta+Img test shape:", test_meta_img.drop(columns=['post_id']).shape)
print("Full test shape:", X_test_full.shape)

TF-IDF train shape: (1185822, 50079)
Meta+Img train shape: (1185822, 178)
Full train shape: (1185822, 50256)

TF-IDF test shape: (423604, 50079)
Meta+Img test shape: (423604, 177)
Full test shape: (423604, 50256)


In [23]:
# Controllo dimensioni di y
print("y_train:", len(y_trainval))
print("X_train_full:", X_train_full.shape[0])
print("y_test:", len(y_te))
print("X_test_full:", X_test_full.shape[0])

y_train: 1185822
X_train_full: 1185822
y_test: 423604
X_test_full: 423604


In [24]:
print(type(X_train_full))
print(X_train_full.dtype)

<class 'scipy.sparse._coo.coo_matrix'>
float64


In [25]:
print(f"Train set: {X_train_full.shape[0]} samples, {X_train_full.shape[1]} features")
print(f"Test set: {X_test_full.shape[0]} samples, {X_test_full.shape[1]} features")

Train set: 1185822 samples, 50256 features
Test set: 423604 samples, 50256 features


In [26]:
# da DuckDB
metadata_tr = con.sql("""
    SELECT post_id, er_bins FROM md1718
    WHERE split = 'train' OR split = 'validation'
""").df().set_index('post_id')

metadata_te = con.sql("""
    SELECT post_id, er_bins FROM md1718
    WHERE split = 'test'
""").df().set_index('post_id')

# allineamento diretto ai post_id in X
y_train_full = metadata_tr.loc[train_postids, 'er_bins'].values
y_test       = metadata_te.loc[test_postids, 'er_bins'].values

In [27]:
save_npz("D:/dataset/multimodal/X_trval_full_TR.npz", X_train_full)
save_npz("D:/dataset/multimodal/X_test_full_TR.npz", X_test_full)

y_trainval_np = np.asarray(y_trainval)
y_test_np     = np.asarray(y_test)

np.save("D:/dataset/multimodal/y_trval_full_TR.npy", y_trainval_np)
np.save("D:/dataset/multimodal/y_test_TR.npy", y_test_np)

train_postids_np = np.asarray(train_postids)
test_postids_np  = np.asarray(test_postids)

np.save("D:/dataset/multimodal/postid_trval_full_TR.npy", train_postids_np)
np.save("D:/dataset/multimodal/postid_test_TR.npy", test_postids_np)

In [2]:
DIR = r"D:/dataset/multimodal"

X_train_full = load_npz(f"{DIR}/X_trval_full_TR.npz")
X_test_full = load_npz(f"{DIR}/X_test_full_TR.npz")

y_tr = np.load(os.path.join(DIR, "y_trval_full_TR.npy"), allow_pickle = True)
y_te = np.load(os.path.join(DIR, "y_test_TR.npy"), allow_pickle = True)

In [3]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)

In [20]:
cfgs = [
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=5, n_estimators=80, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 1, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_train_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_train_full, y_train_enc)
        y_te_pred = cfg.predict(X_test_full)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_train_full, y_tr)
        y_te_pred = cfg.predict(X_test_full)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

# Bernoulli results are wrong as they refer to another alpha, check cell below


Configuration: BernoulliNB(alpha=10)
macro-F1 (test): 0.2643 | accuracy (test): 0.3094

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=5,
                       n_estimators=80, n_jobs=-1, random_state=42)
macro-F1 (test): 0.3185 | accuracy (test): 0.3368

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=1,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_str

In [6]:
DIR = r"D:/dataset/multimodal"

X_full = load_npz(f"{DIR}/X_trval_full_TR.npz").tocsr().astype(np.float32)
Xte_full = load_npz(f"{DIR}/X_test_full_TR.npz").tocsr().astype(np.float32)

y_tr = np.load(os.path.join(DIR, "y_trval_full_TR.npy"), allow_pickle = True)
y_te = np.load(os.path.join(DIR, "y_test_TR.npy"), allow_pickle = True)

In [8]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 1e-05,
        average = True,
        class_weight = None,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_full, y_tr)
y_te_pred = cfg.predict(Xte_full)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.3104 | accuracy (test): 0.3619


In [4]:
# TEST SU GAUSSIAN NAIVE BAYES

batch_size = 256
classes = np.unique(y_tr)

clf = GaussianNB(var_smoothing = 1e-07)

# Fit the model using minibatch for memory
for start in range(0, X_full.shape[0], batch_size):
    # print(f"Batch {start} fit")
    end = min(start + batch_size, X_full.shape[0])

    Xb = X_full[start:end].toarray()
    yb = y_tr[start:end]

    if start == 0:
        clf.partial_fit(Xb, yb, classes=classes)
    else:
        clf.partial_fit(Xb, yb)

    del Xb, yb
    gc.collect()

# Predict using minibatches
y_te_pred = []

for start in range(0, Xte_full.shape[0], batch_size):
    # print(f"Batch {start} predict")
    end = min(start + batch_size, Xte_full.shape[0])

    Xb = Xte_full[start:end].toarray()
    y_te_pred.append(clf.predict(Xb))

    del Xb
    gc.collect()

y_te_pred = np.concatenate(y_te_pred)

macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2299 | accuracy (test): 0.2790
