## Caricamento e operazioni sul dataset

In [2]:
from utils import data_manipulation

df = data_manipulation.load_dataframe()

Metodi disponibili:
method
stylegan2                  1000000
lsun                        539163
coco                        163846
taming_transformer          105000
stylegan3                    97494
imagenet                     96788
ffhq                         70000
mat                          60000
pro_gan                      40000
afhq                         31933
celebahq                     30000
lama                         24705
generative_inpainting        22000
stable_diffusion             21444
glide                        20903
latent_diffusion             20000
diffusion_gan                15507
cycle_gan                    15210
projected_gan                12000
cips                         11200
vq_diffusion                 10000
gansformer                   10000
sfhq                         10000
big_gan                      10000
denoising_diffusion_gan      10000
stylegan1                    10000
face_synthetics              10000
star_gan                    

Calcolando varianze: 100%|██████████| 20/20 [10:14<00:00, 30.74s/it]



--- Statistiche Filtro Varianza: Latent Diffusion ---
Immagini totali:      20000
Immagini valide:      20000
Immagini rimosse:         0
Percentuale rimossa:   0.00%
Varianza min:          3.46e+00
Varianza max:          1.45e+04
Varianza media:        3.45e+03
Varianza mediana:      3.18e+03
Inizio filtro varianza zero su 21444 immagini...
Soglia varianza: 1e-06


Calcolando varianze: 100%|██████████| 22/22 [06:47<00:00, 18.52s/it]



--- Statistiche Filtro Varianza: Stable Diffusion ---
Immagini totali:      21444
Immagini valide:      21334
Immagini rimosse:       110
Percentuale rimossa:   0.51%
Varianza min:          0.00e+00
Varianza max:          1.19e+04
Varianza media:        4.22e+03
Varianza mediana:      4.12e+03
Inizio filtro varianza zero su 10000 immagini...
Soglia varianza: 1e-06


Calcolando varianze: 100%|██████████| 10/10 [03:02<00:00, 18.22s/it]



--- Statistiche Filtro Varianza: COCO Sample ---
Immagini totali:      10000
Immagini valide:      10000
Immagini rimosse:         0
Percentuale rimossa:   0.00%
Varianza min:          3.81e+01
Varianza max:          1.37e+04
Varianza media:        3.46e+03
Varianza mediana:      3.28e+03

REPORT FINALE
Selezionate dopo filtro varianza:
  - Latent Diffusion:   20000 /  20000 originali
  - Stable Diffusion:   21334 /  21444 originali
  - COCO (train2017):   10000 /  10000 originali
  - TOTALE:             51334

Immagini rimosse per varianza zero: 110 (0.21%)

Forma del DataFrame finale: (51334, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51334 entries, 0 to 51333
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   filename         51334 non-null  object
 1   image_path       51334 non-null  object
 2   target           51334 non-null  int64 
 3   category         32444 non-null  object
 4   metadata_di

In [3]:
sub_folders_paths = data_manipulation.organize_images_with_symlinks(df)

for folder in sub_folders_paths:
    print(folder)

Metodi trovati: ['latent_diffusion' 'stable_diffusion' 'coco']

Distribuzione immagini per metodo:
method
stable_diffusion    21334
latent_diffusion    20000
coco                10000
Name: count, dtype: int64

--- Processando latent_diffusion (20000 immagini) ---


Linkando latent_diffusion: 100%|██████████| 20000/20000 [00:08<00:00, 2282.12it/s]



--- Processando stable_diffusion (21334 immagini) ---


Linkando stable_diffusion: 100%|██████████| 21334/21334 [00:08<00:00, 2664.89it/s]



--- Processando coco (10000 immagini) ---


Linkando coco: 100%|██████████| 10000/10000 [00:03<00:00, 2741.31it/s]



REPORT FINALE:
latent_diffusion    :  20000 /  20000 linkate
stable_diffusion    :  21334 /  21334 linkate
coco                :  10000 /  10000 linkate
TOTALE LINKATE:       51334 /  51334

Cartelle create in: /home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets
  - latent_diffusion/: 20000 link simbolici
  - stable_diffusion/: 21334 link simbolici
  - coco/: 10000 link simbolici
/home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/latent_diffusion
/home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/stable_diffusion
/home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/coco


## Definizione delle trasformazioni e del dataset

In [4]:
from PIL import Image
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

seed = 42

base_transform = T.Compose([
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225])
])

class ArtifactDataset(Dataset):
    def __init__(self, df, transform=base_transform):
        self.samples = list(zip(
            df['image_path_full'].tolist(),
            df['label'].tolist()
        ))
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert('RGB')
        return self.transform(img), label
    
# Split stratificato e DataLoader
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=seed
)
df_train = df_train.reset_index(drop=True)
df_test  = df_test.reset_index(drop=True)

# Helper per creare DataLoader
def make_loader(df, batch_size=64, shuffle=False, workers=8):
    ds = ArtifactDataset(df)
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=workers,
        pin_memory=True,
        persistent_workers=True,
        prefetch_factor=2
    )

train_loader = make_loader(df_train, shuffle=True)
test_loader  = make_loader(df_test,  shuffle=False)
    

## Preparazione backbone ViT in FP16

In [5]:
# Configurazione modello ViT e Backbone
import torch
import torch.nn as nn
from torchvision.models import vit_b_16, ViT_B_16_Weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Using device:", device)

weights = ViT_B_16_Weights.IMAGENET1K_V1
vit = vit_b_16(weights=weights).to(device).eval()
if device.type == 'cuda':
    vit.half()

class ViTBackbone(nn.Module):
    def __init__(self, vit_model):
        super().__init__()
        self._process_input = vit_model._process_input
        self.class_token    = vit_model.class_token
        self.encoder        = vit_model.encoder
        self.norm           = (
            vit_model.encoder.ln if hasattr(vit_model.encoder, 'ln')
            else vit_model.encoder.norm
        )

    def forward(self, x):
        x = self._process_input(x)
        B = x.size(0)
        cls = self.class_token.expand(B, -1, -1)
        x = torch.cat([cls, x], dim=1)
        x = self.encoder(x)
        x = self.norm(x)
        return x[:,0]

backbone = ViTBackbone(vit).to(device).eval()


Using device: cuda


## Estrazione e salvataggio degli embeddings

In [8]:
import os
import joblib
import numpy as np
from tqdm.auto import tqdm
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Funzione di estrazione embedding e caching
@torch.inference_mode()
def extract_embeddings(backbone, loader, device, cache_path=None):
    if cache_path and os.path.exists(cache_path):
        return joblib.load(cache_path)
    backbone.eval()
    embs, labs = [], []
    for imgs, labels in tqdm(loader, desc="Extracting", leave=False):
        imgs = imgs.to(device, non_blocking=True)
        if device.type == 'cuda':
            imgs = imgs.half()
        feats = backbone(imgs).cpu()
        embs.append(feats)
        labs.extend(labels)
    X = torch.vstack(embs).numpy()
    y = np.array(labs)
    if cache_path:
        joblib.dump((X, y), cache_path)
    return X, y


# Estrazione e salvataggio degli embeddings

# definisci il folder per gli embeddings
project_root   = Path.cwd()
emb_cache_dir  = project_root / "cache" / "embeddings"
emb_cache_dir.mkdir(parents=True, exist_ok=True)

# file di cache
train_cache = emb_cache_dir / "train_embeddings.joblib"
test_cache  = emb_cache_dir / "test_embeddings.joblib"


X_train, y_train = extract_embeddings(
    backbone,
    train_loader,
    device,
    str(train_cache)
)
X_test, y_test = extract_embeddings(
    backbone,
    test_loader,
    device,
    str(test_cache)
)

print("Train embeddings:", X_train.shape)
print("Test embeddings:",  X_test.shape)


Extracting:   0%|          | 0/642 [00:00<?, ?it/s]

                                                               

Train embeddings: (41067, 768)
Test embeddings: (10267, 768)




## Allenamento e valutazione dei classificatori

In [9]:
# Caricamento degli embedding da disco

# Import librerie per classificatori e metriche
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb

project_root      = Path.cwd()
emb_cache_dir     = project_root / "cache" / "embeddings"
train_cache_path  = emb_cache_dir / "train_embeddings.joblib"
test_cache_path   = emb_cache_dir / "test_embeddings.joblib"

if not train_cache_path.exists():
    raise FileNotFoundError(f"Non trovo {train_cache_path}")
if not test_cache_path.exists():
    raise FileNotFoundError(f"Non trovo {test_cache_path}")

X_train, y_train = joblib.load(train_cache_path)
X_test,  y_test  = joblib.load(test_cache_path)

print("Shape X_train:", X_train.shape)
print("Shape y_train:", y_train.shape)
print("Shape X_test: ", X_test.shape)
print("Shape y_test: ", y_test.shape)


def train_and_evaluate(clf, X_train, y_train, X_test, y_test, model_name="model"):
    """
    Addestra clf su X_train, y_train, valuta su X_test, y_test.
    Stampa accuracy, classification report, AUC (se binario).
    Salva il modello su disco con nome '{model_name}.joblib'.
    """
    # 1. Addestramento
    print(f"\n---> Addestramento di {model_name} ...")
    clf.fit(X_train, y_train)

    # 2. Predizioni
    y_pred = clf.predict(X_test)

    # 3. Metriche base
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy su test: {acc:.4f}")

    # 4. Report di classificazione
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # 5. Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # 6. Se problema binario, calcola anche AUC (usando decision_function o predict_proba)
    #    Verifica che y_train abbia esattamente 2 classi uniche
    if len(set(y_train)) == 2:
        try:
            if hasattr(clf, "predict_proba"):
                y_scores = clf.predict_proba(X_test)[:, 1]
            else:
                # es. LogisticRegression ha decision_function
                y_scores = clf.decision_function(X_test)
            auc = roc_auc_score(y_test, y_scores)
            print(f"AUC-ROC su test: {auc:.4f}")
        except Exception as e:
            print(f"Impossibile calcolare l'AUC: {e}")

    # 7. Salvataggio modello
    project_root   = Path.cwd()
    models_dir  = project_root / "cache" / "models"
    models_dir.mkdir(parents=True, exist_ok=True)

    model_path = models_dir / f"{model_name}.joblib"
    joblib.dump(clf, model_path)
    print(f"Modello salvato in: {model_path}")

    return clf


Shape X_train: (41067, 768)
Shape y_train: (41067,)
Shape X_test:  (10267, 768)
Shape y_test:  (10267,)


### Logistic Regression

In [10]:
# È possibile regolare i parametri a piacere; qui usiamo solver e regularizzazione di base
lr_clf = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)

trained_lr = train_and_evaluate(
    clf=lr_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="logistic_regression"
)



---> Addestramento di logistic_regression ...
Accuracy su test: 0.8922

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.62      0.69      2000
           1       0.91      0.96      0.93      8267

    accuracy                           0.89     10267
   macro avg       0.85      0.79      0.81     10267
weighted avg       0.89      0.89      0.89     10267

Confusion Matrix:
[[1250  750]
 [ 357 7910]]
AUC-ROC su test: 0.9192
Modello salvato in: /home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/models/logistic_regression.joblib


### LightGBM

In [11]:
lgb_clf = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',        # classificazione binaria
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

trained_lgb = train_and_evaluate(
    clf=lgb_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="lightgbm_classifier"
)



---> Addestramento di lightgbm_classifier ...
[LightGBM] [Info] Number of positive: 33067, number of negative: 8000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.918791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195456
[LightGBM] [Info] Number of data points in the train set: 41067, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.805196 -> initscore=1.419094
[LightGBM] [Info] Start training from score 1.419094




Accuracy su test: 0.8592

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.36      0.50      2000
           1       0.86      0.98      0.92      8267

    accuracy                           0.86     10267
   macro avg       0.84      0.67      0.71     10267
weighted avg       0.85      0.86      0.84     10267

Confusion Matrix:
[[ 713 1287]
 [ 159 8108]]
AUC-ROC su test: 0.8911
Modello salvato in: /home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/models/lightgbm_classifier.joblib




### MLPClassifier

In [12]:
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(512, 256),  # esempio di due hidden layer
    activation='relu',              # funzione di attivazione
    solver='adam',                  # ottimizzatore
    alpha=1e-4,                     # weight decay (L2)
    batch_size=64,
    learning_rate='adaptive',
    max_iter=200,                   # numero di epoche per l'ottimizzazione
    random_state=42,
    verbose=True
)

trained_mlp = train_and_evaluate(
    clf=mlp_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="mlp_classifier"
)



---> Addestramento di mlp_classifier ...
Iteration 1, loss = 0.27919013
Iteration 2, loss = 0.16831079
Iteration 3, loss = 0.10793421
Iteration 4, loss = 0.06894816
Iteration 5, loss = 0.04946948
Iteration 6, loss = 0.03613069
Iteration 7, loss = 0.03366466
Iteration 8, loss = 0.03041962
Iteration 9, loss = 0.02737288
Iteration 10, loss = 0.02444518
Iteration 11, loss = 0.02376785
Iteration 12, loss = 0.02311260
Iteration 13, loss = 0.01714639
Iteration 14, loss = 0.02013545
Iteration 15, loss = 0.02019859
Iteration 16, loss = 0.01542918
Iteration 17, loss = 0.02294822
Iteration 18, loss = 0.01769559
Iteration 19, loss = 0.01796268
Iteration 20, loss = 0.01428690
Iteration 21, loss = 0.01505338
Iteration 22, loss = 0.01754476
Iteration 23, loss = 0.01707358
Iteration 24, loss = 0.01598318
Iteration 25, loss = 0.01673614
Iteration 26, loss = 0.01523434
Iteration 27, loss = 0.01618171
Iteration 28, loss = 0.01211900
Iteration 29, loss = 0.01767356
Iteration 30, loss = 0.01304683
Iterati

## Angular Spectrum

In [13]:
import os
import subprocess
from typing import List

def run_generate_spectra(
    files_paths: List[str]
) -> None:
    """
    Esegue il comando `python generate_spectra.py` per ogni path nella lista.

    Args:
        files_paths (List[str]): Lista di percorsi ai file di input.
    """
    # Percorso allo script da eseguire
    script_path = "SyntheticImagesAnalysis/generate_spectra.py"
    # Directory di output comune
    out_dir = "cache/spectra_output"
    os.makedirs(out_dir, exist_ok=True)

    for path in files_paths:
        # Estrae il nome del file senza estensione
        file_name = os.path.splitext(os.path.basename(path))[0]
        # Directory di output, basata sul nome del file
        out_name = file_name

        # Costruisce ed esegue il comando
        cmd = [
            "python",
            script_path,
            "--files_path", path,
             "--out_dir", out_dir,
            "--out_name", out_name,
        ]
        print(f"Eseguo: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)

run_generate_spectra(sub_folders_paths)


Eseguo: python SyntheticImagesAnalysis/generate_spectra.py --files_path /home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/latent_diffusion --out_dir cache/spectra_output --out_name latent_diffusion
Starting generation of spectra
/home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/latent_diffusion
20000
Starting to generate fingerprints


100%|██████████| 1000/1000 [00:19<00:00, 52.38it/s]
100%|██████████| 1000/1000 [00:16<00:00, 60.26it/s]
100%|██████████| 1000/1000 [00:08<00:00, 119.70it/s]


Eseguo: python SyntheticImagesAnalysis/generate_spectra.py --files_path /home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/stable_diffusion --out_dir cache/spectra_output --out_name stable_diffusion
Starting generation of spectra
/home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/stable_diffusion
21334
Starting to generate fingerprints


100%|██████████| 1000/1000 [00:18<00:00, 55.39it/s]
100%|██████████| 1000/1000 [00:18<00:00, 54.51it/s]
100%|██████████| 1000/1000 [00:08<00:00, 112.37it/s]


Eseguo: python SyntheticImagesAnalysis/generate_spectra.py --files_path /home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/coco --out_dir cache/spectra_output --out_name coco
Starting generation of spectra
/home/alessandro/Desktop/BIOMETRIA/PROGETTO/ProjectDetectivev2/cache/data_subsets/coco
10000
Starting to generate fingerprints


100%|██████████| 1000/1000 [00:17<00:00, 57.26it/s]
100%|██████████| 1000/1000 [00:16<00:00, 61.32it/s]
100%|██████████| 1000/1000 [00:08<00:00, 119.90it/s]
