In [1]:
import os, time, duckdb, torch, timm, gc
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import torchvision.transforms as T
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

# Set up

In [10]:
IMAGE_FOLDER = r"D:\dataset\images_224_rgb"

DB_PATH = "D:/db/meta.duckdb"

SAMPLE_SIZE = 5000
TEST_SIZE = 0.2
RANDOM_STATE = 42

con = duckdb.connect(DB_PATH)

query = f"""
    WITH imgs AS (
        SELECT
            post_id,
            full_image_file,
            ROW_NUMBER() OVER (PARTITION BY post_id ORDER BY full_image_file) AS rn
        FROM images_manifest1718_clean
        WHERE full_image_file IS NOT NULL
    )
    SELECT
        imgs.full_image_file AS image_filename,
        m.er_bins
    FROM md1718 m
    JOIN imgs
        ON m.post_id = imgs.post_id
    WHERE imgs.rn = 1
    ORDER BY random()
    LIMIT {SAMPLE_SIZE}
"""

df = con.execute(query).df()
print(f"Campione estratto da DuckDB: {len(df)} righe")

# Numeric labels
le = LabelEncoder()
y = le.fit_transform(df['er_bins'])

# Fixed split
indices = np.arange(len(df))
train_idx, test_idx = train_test_split(
    indices,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Campione estratto da DuckDB: 5000 righe


In [11]:
EFF_MODELS = {
    "b0": "efficientnet_b0",
    "b3": "efficientnet_b3",
}

transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


In [12]:
def extract_image_embeddings(model_name, model_id, df):
    print(f"\nLoading EfficientNet {model_name}...")

    model = timm.create_model(model_id, pretrained=True, num_classes=0)  
    model.eval()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    embeddings = []
    start_time = time.time()

    for filename in df["image_filename"]:
        path = os.path.join(IMAGE_FOLDER, filename)

        try:
            img = Image.open(path).convert("RGB")
        except:
            # fallback: random noise if image missing
            img = Image.fromarray(np.zeros((224,224,3), dtype=np.uint8))

        tensor = transform(img).unsqueeze(0).to(device)

        with torch.no_grad():
            emb = model(tensor).cpu().numpy().flatten()

        embeddings.append(emb)

    embeddings = np.vstack(embeddings)

    end_time = time.time()
    elapsed = end_time - start_time

    print(f"Extraction time for {model_name}: {elapsed:.2f} s")
    print(f"Embedding dimension: {embeddings.shape[1]}")

    return embeddings, elapsed


In [13]:
def benchmark_classifiers(X, y, train_idx, test_idx):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    results = []

    # Linear SVM
    svm = LinearSVC(random_state=RANDOM_STATE)
    t0 = time.time()
    svm.fit(X_train, y_train)
    t1 = time.time()

    t2 = time.time()
    pred = svm.predict(X_test)
    t3 = time.time()

    results.append({
        "Classifier": "LinearSVM",
        "Train Time (s)": t1 - t0,
        "Inference Time (s)": t3 - t2,
        "Accuracy": accuracy_score(y_test, pred),
        "F1-macro": f1_score(y_test, pred, average="macro")
    })

    # Gaussian NB
    nb = GaussianNB()
    t0 = time.time()
    nb.fit(X_train, y_train)
    t1 = time.time()

    t2 = time.time()
    pred = nb.predict(X_test)
    t3 = time.time()

    results.append({
        "Classifier": "NaiveBayes",
        "Train Time (s)": t1 - t0,
        "Inference Time (s)": t3 - t2,
        "Accuracy": accuracy_score(y_test, pred),
        "F1-macro": f1_score(y_test, pred, average="macro")
    })

    # XGBoost
    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softmax",
        eval_metric="mlogloss",
        tree_method="auto",
        n_jobs=-1,
        random_state=RANDOM_STATE
    )

    t0 = time.time()
    xgb.fit(X_train, y_train)
    t1 = time.time()

    t2 = time.time()
    pred = xgb.predict(X_test)
    t3 = time.time()

    results.append({
        "Classifier": "XGBoost",
        "Train Time (s)": t1 - t0,
        "Inference Time (s)": t3 - t2,
        "Accuracy": accuracy_score(y_test, pred),
        "F1-macro": f1_score(y_test, pred, average="macro")
    })

    return results


In [14]:
all_results = []

for name, model_id in EFF_MODELS.items():
    X, emb_time = extract_image_embeddings(name, model_id, df)
    clf_results = benchmark_classifiers(X, y, train_idx, test_idx)

    for r in clf_results:
        all_results.append({
            "Encoder": name,
            "Embedding Time (s)": emb_time,
            **r
        })

results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values(["F1-macro", "Accuracy"], ascending=False)

print("\n FINAL RESULTS")
print(results_df.to_string(index=False))
results_df.to_csv("classification_image_benchmark_results.csv", index=False)


Loading EfficientNet b0...
Extraction time for b0: 731.12 s
Embedding dimension: 1280

Loading EfficientNet b3...
Extraction time for b3: 665.18 s
Embedding dimension: 1536

 FINAL RESULTS
Encoder  Embedding Time (s) Classifier  Train Time (s)  Inference Time (s)  Accuracy  F1-macro
     b0          731.115472  LinearSVM       19.204432            0.032499     0.233  0.232503
     b0          731.115472    XGBoost      231.400897            0.034284     0.230  0.227878
     b3          665.178898  LinearSVM       34.021003            0.015665     0.222  0.221060
     b3          665.178898    XGBoost      279.483445            0.031693     0.218  0.215283
     b3          665.178898 NaiveBayes        0.055974            0.103390     0.222  0.212353
     b0          731.115472 NaiveBayes        0.045640            0.064164     0.216  0.199012


# EMBEDDINGS EXTRACTION

In [5]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [4]:
# con.execute("""CREATE OR REPLACE TABLE img_splits AS
# SELECT m.post_id, m.split, i.full_image_file
# FROM md1718 m
# JOIN images_manifest1718_clean i ON m.post_id = i.post_id
# """)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x2277cbe2730>

In [5]:
con.sql("""SELECT split, COUNT(*) AS n_images
FROM img_splits
GROUP BY split""").df()

Unnamed: 0,split,n_images
0,test,588557
1,validation,556982
2,train,960048


In [6]:
# 1) Configuration and set up

# Define the model used
MODEL_NAME = "efficientnet_b0"

# Define the pre-trained backbone used to extract image features:
# - num_classes = 0 allows to return a feature vector
# - global_pool='avg' applies a global average on the last spatial layer to get a single vector 1Ã—1280
# - pretrained = True uploads ImageNet pre-trained weights
backbone = timm.create_model(MODEL_NAME, pretrained=True, num_classes=0, global_pool='avg')

# Retrieve the model's data configuration, and store it as a dictionary, as the model expects images with a certain aspect, as it has been trained on ImageNet
data_cfg = resolve_data_config({}, model=backbone)

# Define the transformations that needs to be made for the images to align with the model requests
eval_tfms  = create_transform(**data_cfg, is_training=False)

# Freeze weights (requires_grad=False) to just extract the embeddings and not train it on the images
for p in backbone.parameters():
    p.requires_grad = False
backbone.eval()

# Set the device that will be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
backbone = backbone.to(device)
feat_dim = backbone.num_features
print(f"Backbone {MODEL_NAME} | feat_dim={feat_dim} | device={device}")

print(f"Backbone {MODEL_NAME} | feat_dim={feat_dim} | device={device}")
print("Data config:", data_cfg)


Data config: {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}


In [7]:
# 2.1) Load data, extend the full_image_file with the entire image path

df = con.sql("""
    SELECT *
    FROM img_splits
""").df()
print("List of files and splits extracted")
con.close()

IMG_DIR = 'D:/dataset/images_224_rgb'
df["full_image_file"] = df["full_image_file"].apply(lambda x: os.path.join(IMG_DIR, x))

# Check
print(df["full_image_file"].head(3))

List of files and splits extracted
0    D:/dataset/images_224_rgb\breemarieblog-190483...
1    D:/dataset/images_224_rgb\breemarieblog-190535...
2    D:/dataset/images_224_rgb\breemarieblog-190684...
Name: full_image_file, dtype: object


In [8]:
# 2.2) Split and check
train_df = df[df["split"]=="train"].reset_index(drop=True)
val_df   = df[df["split"]=="validation"].reset_index(drop=True)
test_df  = df[df["split"]=="test"].reset_index(drop=True)
print(f"train={len(train_df)} | val={len(val_df)} | test={len(test_df)}")

train=960048 | val=556982 | test=588557


In [9]:
del df
gc.collect()

0

In [11]:
# 3.1 Test the model and observe if it works correctly

BATCH_SIZE = 32

# Set of training samples, retrive the image path 
train_samples = train_df["full_image_file"].tolist()
print("Samples ready")

def make_loader(paths, transform, batch_size=32, shuffle=False):
    def _collate(batch):
        imgs = []
        for path in batch:
            img = Image.open(path).convert("RGB")
            img = transform(img)
            imgs.append(img)
        return torch.stack(imgs)
    
    return DataLoader(paths, batch_size=batch_size, shuffle=shuffle,
                      num_workers=0, pin_memory=True, collate_fn=_collate)

train_loader = make_loader(train_samples, eval_tfms, batch_size=BATCH_SIZE, shuffle=False)
print("DataLoader ready")

# Test a batch from the train_loader and check if it the backbone passes without errors, and with correct form
imgs = next(iter(train_loader))
imgs = imgs.to(device, non_blocking=True)

with torch.no_grad():
    t0 = time.time()
    feats = backbone(imgs)
    dt = time.time() - t0

print(f"Batch features shape: {feats.shape}")
print(f"Time forward batch: {dt:.3f}s  |  ~{imgs.size(0)/dt:.1f} img/s")

Backbone: efficientnet_b0 | feature dim = 1280 | device = cpu
Batch features shape: torch.Size([32, 1280]) (es. 32 x 1280)
Tempo forward batch: 1.374s  |  ~23.3 img/s


In [12]:
# 3.2 Define functions to extract images, embeddings and save them

# Build samples: (path, post_id)
def build_samples(df, path_col="full_image_file"):
    return list(zip(df[path_col].tolist(), df["post_id"].tolist()))

# Loader con post_id + resume
def make_loader_with_ids(samples, transform, batch_size=64, shuffle=False, pin_memory=True, start_index=0):
    if start_index > 0:
        samples = samples[start_index:]

    def _collate(batch):
        imgs = []
        pids = []
        for path, pid in batch:
            img = Image.open(path).convert("RGB")
            img = transform(img)
            imgs.append(img)
            pids.append(pid)
        return torch.stack(imgs), pids

    return DataLoader(samples,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      num_workers=0,
                      pin_memory=pin_memory,
                      collate_fn=_collate)

def extract_and_save(
    split_name,
    samples,
    transform,
    out_dir="emb_cache",
    shard_size=20000,
    batch_size=64,
):
    
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    prefix = f"{MODEL_NAME}_{split_name}"

    n = len(samples)
    print(f"[{split_name}] Totale esempi: {n}")

    shard_files = []

    # loop a shard di samples
    for start in range(0, n, shard_size):
        shard_samples = samples[start:start + shard_size]
        shard_path = out_dir / f"{prefix}_{start:07d}.npz"

        if shard_path.exists():
            print(f"[skip] {shard_path.name}")
            shard_files.append(shard_path)
            continue

        loader = make_loader_with_ids(
            shard_samples,
            transform,
            batch_size=batch_size,
            shuffle=False,
        )

        feats_buf = []
        id_buf = []

        t0 = time.time()
        with torch.no_grad():
            for imgs, pids in loader:
                imgs = imgs.to(device, non_blocking=True)
                feats = backbone(imgs).float().cpu().numpy()

                feats_buf.append(feats)
                id_buf.extend(pids)

        if torch.cuda.is_available():
            torch.cuda.synchronize()

        F = np.concatenate(feats_buf, axis=0)
        I = np.array(id_buf, dtype=object)

        dt = time.time() - t0
        rate = F.shape[0] / max(dt, 1e-9)

        np.savez_compressed(
            shard_path,
            feats=F,
            post_id=I,
            model=MODEL_NAME,
            feat_dim=feat_dim,
        )

        print(f"[save] {shard_path.name} | {F.shape[0]} esempi | {dt:.1f}s | {rate:.1f} img/s")
        shard_files.append(shard_path)

    print(f"[{split_name}] Concateno {len(shard_files)} shard...")

    feats_all = []
    ids_all = []

    for f in shard_files:
        data = np.load(f, allow_pickle=True)
        feats_all.append(data["feats"])
        ids_all.append(data["post_id"])

    F_all = np.concatenate(feats_all, axis=0)
    I_all = np.concatenate(ids_all, axis=0)

    all_path = out_dir / f"{prefix}_ALL.npz"
    np.savez_compressed(
        all_path,
        feats=F_all,
        post_id=I_all,
        model=MODEL_NAME,
        feat_dim=feat_dim,
    )

    print(f"[{split_name}] File unico: {all_path.name} | {F_all.shape[0]} esempi totali")

    return F_all, I_all

Backbone efficientnet_b0 | feat_dim=1280 | device=cpu


In [14]:
train_samples = build_samples(train_df)
extract_and_save("train", train_samples, eval_tfms, out_dir="D:/dataset/efficientnetb0_emb", shard_size=20000, batch_size=64)

[train] Totale esempi: 960048
[save] efficientnet_b0_train_0000000.npz | 20000 esempi | 1213.8s | 16.5 img/s
[save] efficientnet_b0_train_0020000.npz | 20000 esempi | 1219.2s | 16.4 img/s
[save] efficientnet_b0_train_0040000.npz | 20000 esempi | 1217.8s | 16.4 img/s
[save] efficientnet_b0_train_0060000.npz | 20000 esempi | 1210.4s | 16.5 img/s
[save] efficientnet_b0_train_0080000.npz | 20000 esempi | 1232.5s | 16.2 img/s
[save] efficientnet_b0_train_0100000.npz | 20000 esempi | 1229.9s | 16.3 img/s
[save] efficientnet_b0_train_0120000.npz | 20000 esempi | 1226.4s | 16.3 img/s
[save] efficientnet_b0_train_0140000.npz | 20000 esempi | 1235.6s | 16.2 img/s
[save] efficientnet_b0_train_0160000.npz | 20000 esempi | 1209.2s | 16.5 img/s
[save] efficientnet_b0_train_0180000.npz | 20000 esempi | 1208.5s | 16.5 img/s
[save] efficientnet_b0_train_0200000.npz | 20000 esempi | 1212.2s | 16.5 img/s
[save] efficientnet_b0_train_0220000.npz | 20000 esempi | 1218.3s | 16.4 img/s
[save] efficientnet_b0

(array([[-0.08969685,  0.10721488, -0.05978236, ..., -0.10028088,
          0.00112186,  0.52130145],
        [ 0.05077013,  0.0330933 , -0.01632524, ..., -0.08097073,
         -0.06800617, -0.09015599],
        [ 0.08415666,  0.41065317,  0.0378902 , ...,  0.27694616,
          0.974736  , -0.17756397],
        ...,
        [-0.12931013,  0.34465432, -0.13878323, ..., -0.14236386,
          0.63322747, -0.14479609],
        [-0.04166615,  0.8030047 ,  0.00396485, ..., -0.09260282,
          0.21869075, -0.12982321],
        [-0.14429834, -0.08170366, -0.18868421, ..., -0.0985931 ,
         -0.1829363 , -0.15703775]], dtype=float32),
 array(['brentrivera-1543333336599822270',
        'brentrivera-1543799326014076320',
        'brentrivera-1544838263016975056', ...,
        'willljns-1900980824614909347',
        'viaggio_animamente-1732100330503797620',
        'viaggio_animamente-1794516667193399367'], dtype=object))

In [23]:
val_samples   = build_samples(val_df)
extract_and_save("val", val_samples, eval_tfms, out_dir="D:/dataset/efficientnetb0_emb", shard_size=20000, batch_size=64)

[val] Totale esempi: 556982
[save] efficientnet_b0_val_0000000.npz | 20000 esempi | 1308.1s | 15.3 img/s
[save] efficientnet_b0_val_0020000.npz | 20000 esempi | 1285.2s | 15.6 img/s
[save] efficientnet_b0_val_0040000.npz | 20000 esempi | 1287.2s | 15.5 img/s
[save] efficientnet_b0_val_0060000.npz | 20000 esempi | 1283.0s | 15.6 img/s
[save] efficientnet_b0_val_0080000.npz | 20000 esempi | 1279.3s | 15.6 img/s
[save] efficientnet_b0_val_0100000.npz | 20000 esempi | 1282.3s | 15.6 img/s
[save] efficientnet_b0_val_0120000.npz | 20000 esempi | 1276.4s | 15.7 img/s
[save] efficientnet_b0_val_0140000.npz | 20000 esempi | 1287.2s | 15.5 img/s
[save] efficientnet_b0_val_0160000.npz | 20000 esempi | 1300.4s | 15.4 img/s
[save] efficientnet_b0_val_0180000.npz | 20000 esempi | 1273.0s | 15.7 img/s
[save] efficientnet_b0_val_0200000.npz | 20000 esempi | 1273.5s | 15.7 img/s
[save] efficientnet_b0_val_0220000.npz | 20000 esempi | 1273.4s | 15.7 img/s
[save] efficientnet_b0_val_0240000.npz | 20000 e

(array([[-0.00955195,  0.50942224, -0.02472188, ..., -0.08392821,
          0.2056811 , -0.10758663],
        [-0.06945565,  0.01800106, -0.16118643, ..., -0.08920514,
         -0.0598452 ,  0.5940528 ],
        [-0.11799315, -0.12160274,  0.21269923, ..., -0.08459441,
         -0.08547642,  1.6258874 ],
        ...,
        [ 0.90571356, -0.03089917, -0.06676721, ...,  0.17215267,
          0.37960213,  0.96493304],
        [-0.10017259, -0.15703921,  0.09351613, ..., -0.10389128,
         -0.01287235,  0.0635795 ],
        [-0.12107382, -0.13659331,  0.5893331 , ..., -0.10891076,
          0.91433775,  0.46491456]], dtype=float32),
 array(['coffeethentravel-1903359253075455932',
        'coffeethentravel-1907017958346063810',
        'coffeethentravel-1907811296334629541', ...,
        'victoirefouquets-1923190337942107249',
        'victorborsuk-1916192906608627066',
        'victorborsuk-1916192906608627066'], dtype=object))

In [24]:
test_samples  = build_samples(test_df)
extract_and_save("test", test_samples, eval_tfms, out_dir= "D:/dataset/efficientnetb0_emb", shard_size=20000, batch_size=64)

[test] Totale esempi: 588557
[save] efficientnet_b0_test_0000000.npz | 20000 esempi | 1276.2s | 15.7 img/s
[save] efficientnet_b0_test_0020000.npz | 20000 esempi | 1272.1s | 15.7 img/s
[save] efficientnet_b0_test_0040000.npz | 20000 esempi | 1261.5s | 15.9 img/s
[save] efficientnet_b0_test_0060000.npz | 20000 esempi | 1259.3s | 15.9 img/s
[save] efficientnet_b0_test_0080000.npz | 20000 esempi | 1254.6s | 15.9 img/s
[save] efficientnet_b0_test_0100000.npz | 20000 esempi | 1249.2s | 16.0 img/s
[save] efficientnet_b0_test_0120000.npz | 20000 esempi | 1266.2s | 15.8 img/s
[save] efficientnet_b0_test_0140000.npz | 20000 esempi | 1255.5s | 15.9 img/s
[save] efficientnet_b0_test_0160000.npz | 20000 esempi | 1274.1s | 15.7 img/s
[save] efficientnet_b0_test_0180000.npz | 20000 esempi | 1255.6s | 15.9 img/s
[save] efficientnet_b0_test_0200000.npz | 20000 esempi | 1247.6s | 16.0 img/s
[save] efficientnet_b0_test_0220000.npz | 20000 esempi | 1265.8s | 15.8 img/s
[save] efficientnet_b0_test_0240000

(array([[-0.11214414,  0.21377252, -0.02436068, ...,  0.00886863,
         -0.11792702, -0.14089029],
        [ 0.910225  ,  0.2848291 , -0.05918846, ...,  0.4655865 ,
          0.18165158,  0.0702652 ],
        [-0.08773034, -0.12038821, -0.12598914, ..., -0.13597332,
         -0.12463147,  0.00323523],
        ...,
        [ 0.43508133, -0.03962286,  0.03036041, ...,  0.25463566,
         -0.07364696, -0.05859775],
        [-0.02892335, -0.14100108,  0.52792597, ...,  0.01876375,
         -0.03665268, -0.10178239],
        [-0.0559464 , -0.03932384,  0.28081572, ...,  0.4798931 ,
          0.47459742, -0.14791866]], dtype=float32),
 array(['coffeethentravel-1924875490145956005',
        'coffeethentravel-1927668952411117227',
        'coffeethentravel-1928708312795253682', ...,
        'victoirefouquets-1927519468976779024',
        'victoirefouquets-1937798696130142630',
        'victoirefouquets-1946535289107301734'], dtype=object))

# LOAD EMBEDDINGS

In [2]:
emb_dir = Path("D:/dataset/efficientnetb0_emb")
MODEL_NAME = "efficientnet_b0" 

def load_effnet_split(split_name, emb_dir=emb_dir, model_name=MODEL_NAME):
    all_path = emb_dir / f"{model_name}_{split_name}_ALL.npz"
    data = np.load(all_path, allow_pickle=True)
    F = data["feats"]
    P = data["post_id"]
    print(split_name, F.shape, len(P))
    return F, P

In [3]:
X_train, ids_train = load_effnet_split("train")
X_val, ids_val = load_effnet_split("val")

train (960048, 1280) 960048
val (556982, 1280) 556982


In [4]:
unique_posts_tr = len(np.unique(ids_train))
unique_posts_va = len(np.unique(ids_val))
print("post_id unici in ids_train:", unique_posts_tr)
print("post_id unici in ids_val:", unique_posts_va)

post_id unici in ids_train: 773497
post_id unici in ids_val: 412325


In [3]:
def aggregate_by_post(F, P, agg="mean"):
    df = pd.DataFrame({
        "post_id": P,
        "feat": list(F)
    })

    if agg == "mean":
        agg_func = lambda arrs: np.mean(np.stack(arrs), axis=0)
    elif agg == "max":
        agg_func = lambda arrs: np.max(np.stack(arrs), axis=0)
    else:
        raise ValueError("agg deve essere 'mean' o 'max'")

    df_post = (
        df.groupby("post_id")["feat"]
          .apply(agg_func)
          .reset_index()
    )
    # return one record per post
    return df_post

In [6]:
df_train_img = aggregate_by_post(X_train, ids_train, agg="mean")
df_val_img = aggregate_by_post(X_val, ids_val, agg="mean")

In [7]:
print(df_train_img.shape, df_val_img.shape)

(773497, 2) (412325, 2)


In [9]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [10]:
y_tr_ids = con.sql("""SELECT post_id, er_bins FROM md1718 WHERE split = 'train'""").df()
y_val_ids = con.sql("""SELECT post_id, er_bins FROM md1718 WHERE split = 'validation'""").df()

In [11]:
df_train = df_train_img.merge(
    y_tr_ids[["post_id", "er_bins"]],
    on="post_id", how="inner"
)

df_val = df_val_img.merge(
    y_val_ids[["post_id", "er_bins"]],
    on="post_id", how="inner"
)

In [12]:
# feat is an array, where the array represent the embedding. It needs to be transformed into a matrix
X_tr = np.stack(df_train["feat"].values)
y_tr = df_train["er_bins"].values

X_va = np.stack(df_val["feat"].values)
y_va = df_val["er_bins"].values

In [13]:
print(X_tr.shape, X_va.shape, y_tr.shape, y_va.shape)

(773497, 1280) (412325, 1280) (773497,) (412325,)


In [14]:
df_train['post_id'].head()

0    100pintas-1769662389073991144
1    100pintas-1782702664733979876
2    100pintas-1797067212467389817
3    100pintas-1807955339238986900
4    100pintas-1808039696742034708
Name: post_id, dtype: object

In [16]:
y_tr[:5]

array(['medium', 'medium', 'medium', 'medium', 'very_high'], dtype=object)

In [17]:
ids_tr = df_train['post_id'].values
ids_va = df_val['post_id'].values

In [21]:
del X_train, X_val, df_train, df_train_img, df_val, df_val_img, ids_train, ids_val, y_tr_ids, y_val_ids
gc.collect()

1531

In [18]:
np.savez_compressed(
    "D:/dataset/efficientnetb0_emb/train_data.npz",
    X=X_tr,
    y=y_tr,
    ids = ids_tr
)

np.savez_compressed(
    "D:/dataset/efficientnetb0_emb/val_data.npz",
    X=X_va,
    y=y_va,
    ids = ids_va
)

In [6]:
X_test, ids_test = load_effnet_split("test")
unique_posts_te = len(np.unique(ids_test))
print("post_id unici in ids_test:", unique_posts_te)
df_test_img = aggregate_by_post(X_test, ids_test, agg="mean")
print(df_test_img.shape)
y_te_ids = con.sql("""SELECT post_id, er_bins FROM md1718 WHERE split = 'test'""").df()
df_te = df_test_img.merge(
    y_te_ids[["post_id", "er_bins"]],
    on="post_id", how="inner"
)

X_te = np.stack(df_te["feat"].values)
y_te = df_te["er_bins"].values
ids_te = df_te['post_id'].values

np.savez_compressed(
    "D:/dataset/efficientnetb0_emb/test_data.npz",
    X=X_te,
    y=y_te,
    ids = ids_te
)

test (588557, 1280) 588557
post_id unici in ids_test: 423604
(423604, 2)


In [2]:
train_data = np.load("D:/dataset/efficientnetb0_emb/train_data.npz", allow_pickle = True)
X_tr = train_data["X"]
y_tr = train_data["y"]

val_data = np.load("D:/dataset/efficientnetb0_emb/val_data.npz", allow_pickle = True)
X_va = val_data["X"]
y_va = val_data["y"]

del train_data, val_data
gc.collect()

521

In [None]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"]
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(X_tr, y_tr)

    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'average': False, 'class_weight': None}
macro-F1 (val): 0.21087314492196318 | accuracy (val): 0.23118898926817438

Combination: {'alpha': 1e-05, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.17910031035258323 | accuracy (val): 0.2193221366640393

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': None}
macro-F1 (val): 0.25418532318082965 | accuracy (val): 0.2557666888983205

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': 'balanced'}
macro-F1 (val): 0.25074167495132277 | accuracy (val): 0.26270539016552474

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': None}
macro-F1 (val): 0.20017188705326125 | accuracy (val): 0.22225671496998728

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.1948621138196483 | accuracy (val): 0.21830109743527557

Combination: {'alpha': 0.0001, 'average': True, 'class_weight': None}
macro-F1 (val): 0.2528260323979179 | accurac

In [6]:
# NAIVE BAYES - GAUSSIAN
param_grid_nb = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_nb):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

# Metto i risultati in un DataFrame per ispezionarli meglio
results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.2096 | accuracy (val): 0.2616

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.2096 | accuracy (val): 0.2616

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.2096 | accuracy (val): 0.2616

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.2096 | accuracy (val): 0.2616

Best hyperparameter configuration:
{'var_smoothing': 1e-09}
Validation macro-F1: 0.2096407068589068

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
0   1.000000e-09      0.209641      0.261595
1   1.000000e-08      0.209641      0.261595
2   1.000000e-07      0.209641      0.261595
3   1.000000e-06      0.209639      0.261592


In [9]:
# RANDOM FOREST
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    # Fit su TRAIN
    clf.fit(X_tr, y_tr)

    # Valutazione su VALIDATION
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    # Aggiorno il best model in base alla macro-F1
    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

# Metto i risultati in un DataFrame per ispezionarli meglio
results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)



Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.2014 | accuracy (val): 0.2366

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.1975 | accuracy (val): 0.2363

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.1957 | accuracy (val): 0.2367

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.2024 | accuracy (val): 0.2363

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.1981 | accuracy (val): 0.2364

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.1965 | accuracy (val): 0.2367

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1978 | accuracy (val

In [10]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    # Fit
    clf.fit(X_tr, y_tr_enc)

    # Validation
    y_val_pred = clf.predict(X_va)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2517 | accuracy (val): 0.2549

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2546 | accuracy (val): 0.2574

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2552 | accuracy (val): 0.2575

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2565 | accuracy (val): 0.2587

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.2517 | accuracy (val): 0.2549

Combination: {'colsample_bytr

# PERFORMANCE SUL TEST SET

In [4]:
emb_dir = Path("D:/dataset/efficientnetb0_emb")
MODEL_NAME = "efficientnet_b0"  # stesso nome di prima

In [5]:
X_train, ids_train = load_effnet_split("train")
X_val, ids_val = load_effnet_split("val")

train (960048, 1280) 960048
val (556982, 1280) 556982


In [6]:
X_test, ids_test = load_effnet_split("test")

test (588557, 1280) 588557


In [7]:
unique_posts_tr = len(np.unique(ids_train))
unique_posts_va = len(np.unique(ids_val))
unique_posts_te = len(np.unique(ids_test))
print("post_id unici in ids_train:", unique_posts_tr)
print("post_id unici in ids_val:", unique_posts_va)
print("post_id unici in ids_test:", unique_posts_te)

post_id unici in ids_train: 773497
post_id unici in ids_val: 412325
post_id unici in ids_test: 423604


In [8]:
del unique_posts_tr, unique_posts_va, unique_posts_te
gc.collect()

66

In [9]:
df_train_img = aggregate_by_post(X_train, ids_train, agg="mean")
df_val_img = aggregate_by_post(X_val,   ids_val,   agg="mean")
df_test_img = aggregate_by_post(X_test,   ids_test,   agg="mean")

In [10]:
print(df_train_img.shape, df_val_img.shape, df_test_img.shape)

(773497, 2) (412325, 2) (423604, 2)


In [11]:
DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [12]:
y_tr_ids = con.sql("""SELECT post_id, er_bins FROM md1718 WHERE split = 'train'""").df()
y_val_ids = con.sql("""SELECT post_id, er_bins FROM md1718 WHERE split = 'validation'""").df()
y_test_ids = con.sql("""SELECT post_id, er_bins FROM md1718 WHERE split = 'test'""").df()

In [13]:
df_train = df_train_img.merge(
    y_tr_ids[["post_id", "er_bins"]],
    on="post_id", how="inner"
)

df_val = df_val_img.merge(
    y_val_ids[["post_id", "er_bins"]],
    on="post_id", how="inner"
)

df_test = df_test_img.merge(
    y_test_ids[["post_id", "er_bins"]],
    on="post_id", how="inner"
)

In [14]:
del X_train, X_val, X_test, df_train_img, df_val_img, df_test_img
gc.collect()

0

In [15]:
X_tr = np.stack(df_train["feat"].values)
y_tr = df_train["er_bins"].values

X_va = np.stack(df_val["feat"].values)
y_va = df_val["er_bins"].values

X_te = np.stack(df_test["feat"].values)
y_te = df_test["er_bins"].values

X_trva = np.concatenate((X_tr, X_va), axis = 0)
y_trva = np.concatenate((y_tr, y_va), axis = 0)

In [16]:
print(X_trva.shape, X_te.shape, y_trva.shape, y_te.shape)

(1185822, 1280) (423604, 1280) (1185822,) (423604,)


In [17]:
del df_train, df_val, df_test
del ids_train, ids_val, ids_test, y_tr_ids, y_val_ids, y_test_ids
gc.collect()

0

In [18]:
np.savez_compressed(
    "D:/dataset/efficientnetb0_emb/trainval_data.npz",
    X=X_trva,
    y=y_trva
)

np.savez_compressed(
    "D:/dataset/efficientnetb0_emb/test_data.npz",
    X=X_te,
    y=y_te
)

In [None]:
train = np.load("D:/dataset/efficientnetb0_emb/train_data.npz", allow_pickle = True)
X_tr = train["X"]
y_tr = train["y"]

test_data = np.load("D:/dataset/efficientnetb0_emb/test_data.npz", allow_pickle = True)
X_te = test_data["X"]
y_te = test_data["y"]

In [6]:
# del train, test_data
# gc.collect()
print(X_tr.shape, X_te.shape, y_tr.shape, y_te.shape)

(773497, 1280) (423604, 1280) (773497,) (423604,)


In [4]:
le = LabelEncoder()
y_trva_enc = le.fit_transform(y_trva)
y_te_enc = le.transform(y_te)

In [None]:
cfgs = [
    GaussianNB(var_smoothing = 1e-09),
    # RandomForestClassifier(
    #     max_depth=12, max_features=0.05, min_samples_leaf=5, n_estimators=30, n_jobs=-1, random_state=42
    # ),
    # XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
    #     objective="multi:softmax",
    #     num_class=len(np.unique(y_trva_enc)),
    #     tree_method="hist", eval_metric="mlogloss",
    #     n_jobs=-1, random_state=42, verbosity=0
    # ),
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_trva, y_trva_enc)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_trva, y_trva)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: GaussianNB()
macro-F1 (test): 0.2101 | accuracy (test): 0.2641

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=5,
                       n_estimators=30, n_jobs=-1, random_state=42)


In [5]:
cfgs = [
    # GaussianNB(var_smoothing = 1e-09),
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=5, n_estimators=30, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_trva_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_trva, y_trva_enc)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_trva, y_trva)
        y_te_pred = cfg.predict(X_te)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=5,
                       n_estimators=30, n_jobs=-1, random_state=42)
macro-F1 (test): 0.2525 | accuracy (test): 0.2544

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=150, n_jobs=-1, num_class=5, ...)
macro-F1 (test): 0.2554 | acc

In [7]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 1e-05,
        average = True,
        class_weight = None,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_tr, y_tr)
y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2535 | accuracy (test): 0.2558
