In [1]:
# ==============================================================================
# CELLA 0: SETUP TOTALE (MINIMAL & STABILE)
# ==============================================================================
import sys
import os
from IPython.display import clear_output

# 1. BLOCK PROBLEMATIC MODULES
# A None entry in sys.modules makes any later `import vllm` (or of the listed
# submodule) raise ImportError instead of loading the installed package.
sys.modules["vllm"] = None
sys.modules["vllm.sampling_params"] = None

print("‚è≥ Setup Ambiente in corso... (Attendere, output nascosto)")

# 2. SILENT INSTALL & UPGRADE
# Installs Unsloth and Unsloth Zoo from their Git repositories and upgrades
# PyTorch, Transformers and the other listed packages. No versions are pinned,
# so the resulting environment depends on the day the cell is run.
!pip install --upgrade --no-cache-dir --quiet \
    "torch" "torchvision" "torchaudio" \
    "transformers" "trl" "peft" "accelerate" "bitsandbytes" \
    "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" \
    "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo.git" \
    "pillow" "scikit-learn" "pandas"

# 3. VERIFY AND CLEAN UP
# Clear the cell output produced so far before printing the summary.
clear_output()

# Imported after the install step so the freshly installed versions are the ones reported.
import torch
import unsloth
import transformers
from PIL import Image

# Environment summary; get_device_name(0) queries CUDA device index 0.
print(f"‚úÖ Ambiente Pronto e Pulito.")
print(f"   ‚Ä¢ GPU: {torch.cuda.get_device_name(0)}")
print(f"   ‚Ä¢ PyTorch: {torch.__version__}")
print(f"   ‚Ä¢ Unsloth: {unsloth.__version__}")
print(f"   ‚Ä¢ Transformers: {transformers.__version__}")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from pandas.core.computation.check import NUMEXPR_INSTALLED


Unsloth: Using MoE backend 'grouped_mm'
ü¶• Unsloth Zoo will now patch everything to make training faster!
‚úÖ Ambiente Pronto e Pulito.
   ‚Ä¢ GPU: Tesla V100S-PCIE-32GB
   ‚Ä¢ PyTorch: 2.10.0+cu128
   ‚Ä¢ Unsloth: 2026.2.1
   ‚Ä¢ Transformers: 4.57.6


In [10]:
# ==============================================================================
# CELLA 1: SETUP, IMPORT & PATHS
# ==============================================================================
import sys
import os
import gc
import json
import time
import re
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from datasets import load_from_disk
from unsloth import FastVisionModel

# Display setup: show every DataFrame column, wide rows, 4-decimal floats.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.4f}'.format)

# Dependency check: try the optional imports and install them on failure.
# NOTE(review): `from unsloth import FastVisionModel` already ran in the import
# section of this cell, so a missing unsloth fails there, before this fallback.
print("‚è≥ Verifica librerie...")
try:
    import unsloth
    from qwen_vl_utils import process_vision_info
    import seaborn as sns
except ImportError:
    # Install the missing packages, then retry the two imports used later on.
    !pip install --upgrade --no-cache-dir --quiet "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" "scikit-learn" "pandas" "unsloth_zoo" "qwen-vl-utils" "seaborn"
    from qwen_vl_utils import process_vision_info
    import seaborn as sns

# Final output folder: TEST_EXPERIMENTS/<TEST_ID>, created if it does not exist.
TEST_ID = "TEST_N3_PIPELINE_FULL"
BASE_ROOT = "TEST_EXPERIMENTS"
RESULTS_DIR = os.path.join(BASE_ROOT, TEST_ID)
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"‚úÖ Setup completato. GPU: {torch.cuda.get_device_name(0)}")

‚è≥ Verifica librerie...
‚úÖ Setup completato. GPU: Tesla V100S-PCIE-32GB


In [11]:
# ==============================================================================
# CELLA 2: CONFIGURAZIONE PIPELINE & SORGENTI DATI
# ==============================================================================
# Training seeds whose fine-tuned adapters are evaluated, one pipeline run per seed.
SEEDS_TO_TEST = [101, 285, 3692, 92]

# --- 1. DATA SOURCES ---
# HF dataset (for the INPUTS: images and text)
DATASET_HF_PATH = os.path.join("DATASET_ITA", "PROCESSED_DATA", "HF_DATASETS", "M1_detection")

# Original CSV (for the GROUND TRUTH: real labels 0-4)
CSV_GT_PATH = os.path.join("DATASET_ITA", "PROCESSED_DATA", "splits", "master_test.csv")

# --- 2. M1 INPUT (cached predictions) ---
M1_RESULTS_DIR = os.path.join(BASE_ROOT, "TEST_N1_M1_BINARY", "Qwen2.5-VL-M1-Detection")

# --- 3. M2 MODEL (adapter to load) ---
TRAIN_OUTPUTS_ROOT = "outputs"
M2_MODEL_SHORTNAME = "Qwen2.5-VL-M2-Classification"

# --- 4. PARAMETERS ---
# Label set of the full pipeline: "0" plus the four classes listed in the M2 prompt below.
TARGET_CLASSES_PIPELINE = ["0", "1", "2", "3", "4"]
MODEL_ID_BASE = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"
MAX_NEW_TOKENS = 8
TEMPERATURE = 0.0

# --- 5. M2 SYSTEM PROMPT ---
# Italian instruction text sent to M2 as the system message; it asks for a bare
# class number (1-4) as the only output.
SYSTEM_INSTRUCTION_M2 = """Sei un classificatore esperto specializzato nella tipologia di contenuti offensivi online.
Il contenuto che analizzerai (testo del commento e frame del video) √® GIA' stato identificato come offensivo.
Il tuo compito √® classificare ESATTAMENTE il tipo di offesa in una delle seguenti 4 categorie:

1. FLAMING: Insulti diretti, linguaggio ostile, aggressivit√† verbale, minacce, uso di parolacce contro una persona.
2. DENIGRATION: Attacchi alla reputazione, ridicolizzazione, svalutazione, diffamazione o umiliazione pubblica.
3. SEXUAL: Molestie sessuali, commenti lascivi, oggettivazione sessuale, riferimenti espliciti non consensuali.
4. RACISM: Discriminazione, stereotipi o insulti basati su razza, etnia, nazionalit√†, religione o colore della pelle.

Analizza CONGIUNTAMENTE il testo del commento e i frame del video.
Scegli la categoria che meglio descrive l'offesa predominante.
Se pi√π categorie sono presenti, scegli quella DOMINANTE.

Formato di Output OBBLIGATORIO:
Rispondi SOLAMENTE con il numero della classe (1, 2, 3 o 4).
Non aggiungere spiegazioni, punteggiatura o testo extra."""

print(f"‚öôÔ∏è Configurazione Pipeline Pronta.")

‚öôÔ∏è Configurazione Pipeline Pronta.


In [12]:
# ==============================================================================
# CELLA 3: DATASET & SANITY CHECK
# ==============================================================================
import re

print("üìÇ Caricamento Risorse...")

# 1. Load the HF dataset
try:
    dataset_raw = load_from_disk(DATASET_HF_PATH)
    # Use the "test" split when present, otherwise fall back to "val".
    test_dataset = dataset_raw["test"] if "test" in dataset_raw else dataset_raw["val"]
    print(f"   ‚úÖ Dataset HF caricato: {len(test_dataset)} campioni.")
except Exception as e:
    raise RuntimeError(f"Errore caricamento HF: {e}")

# 2. Load the ground-truth CSV (semicolon-separated; video_id kept as string)
try:
    df_gt = pd.read_csv(CSV_GT_PATH, sep=';', dtype={'video_id': str})
    print(f"   ‚úÖ CSV Master caricato: {len(df_gt)} righe.")
except Exception as e:
    raise RuntimeError(f"Errore caricamento CSV: {e}")

# 3. Length alignment: on mismatch, truncate both sources to the shorter length.
# Truncation keeps the first rows of each; it does not verify that they correspond.
if len(test_dataset) != len(df_gt):
    print(f"‚ö†Ô∏è DISALLINEAMENTO LUNGHEZZE: HF={len(test_dataset)} vs CSV={len(df_gt)}")
    min_len = min(len(test_dataset), len(df_gt))
    test_dataset = test_dataset.select(range(min_len))
    df_gt = df_gt.iloc[:min_len]
    print(f"   ‚úÇÔ∏è Troncato dataset a {min_len} righe.")

# --- ROBUST SANITY CHECK ---
print("\nüîç Eseguo SANITY CHECK sull'allineamento (HF vs CSV)...")

def normalize_simple(text):
    """
    Reduce a comment to a bare lowercase alphanumeric key for comparison.

    Non-string input yields "". Otherwise the "Commento:" prefix and both quote
    characters are removed, the text is stripped and lowercased, and every run
    of non-word characters or underscores is deleted.
    """
    if isinstance(text, str):
        stripped = text
        # Drop formatting artefacts one after the other, in a fixed order.
        for token in ("Commento:", '"', "'"):
            stripped = stripped.replace(token, "")
        lowered = stripped.strip().lower()
        # Keep word characters only, glued together.
        return re.sub(r'[\W_]+', '', lowered)
    return ""

# Compare, row by row, the user text stored in the HF sample with the CSV
# 'Comment' column, after normalising both with normalize_simple.
matches = 0
check_limit = len(test_dataset)

for i in range(check_limit):
    # --- ROBUST EXTRACTION OF THE HF TEXT ---
    hf_text_raw = ""
    try:
        for msg in test_dataset[i]["messages"]:
            if msg["role"] == "user":
                for item in msg["content"]:
                    if item["type"] == "text":
                        hf_text_raw = item["text"]
                        # Leaves only the inner loop: a later user message with
                        # a text item would overwrite this value.
                        break
    except (KeyError, TypeError, IndexError):
        hf_text_raw = ""

    # --- CSV TEXT EXTRACTION ---
    csv_text_raw = str(df_gt.iloc[i]['Comment']) if i < len(df_gt) else ""

    # --- COMPARISON ---
    if normalize_simple(hf_text_raw) == normalize_simple(csv_text_raw):
        matches += 1
    else:
        # Print only the first mismatch: matches == i holds only while every
        # earlier row matched.
        if matches == i:
            print(f"   ‚ùå Mismatch all'indice {i}:")
            print(f"      HF (raw):  {repr(hf_text_raw[:100])}")
            print(f"      CSV (raw): {repr(csv_text_raw[:100])}")

# NOTE(review): divides by the dataset length; an empty dataset raises ZeroDivisionError.
match_rate = matches / check_limit
print(f"   üìä Match Rate: {match_rate:.2%} ({matches}/{check_limit})")

if match_rate < 0.95: # Some tolerance for encoding/emoji differences
    print("‚õî ERRORE: L'ordine √® sballato o il testo √® troppo diverso.")
    # To force a hard stop, uncomment the line below:
    # raise ValueError("Allineamento fallito.")
else:
    print("   ‚úÖ ALLINEAMENTO ACCETTABILE. Procedo per indice.")

# --- HELPER FUNCTIONS ---
def get_gt_by_index(idx):
    """
    Return the 5-class ground-truth label ("0".."4") for the row at position `idx`.

    The 'Type' cell of `df_gt` is stringified, stripped and lowercased, then
    looked up in a name/number mapping. Numeric labels that pandas parsed as
    floats (for example "1.0") are also accepted, so they are no longer
    silently turned into "0". Anything else, including NaN, falls back to "0".
    """
    raw_type = str(df_gt.iloc[idx]['Type']).strip().lower()
    mapping = {
        "none": "0", "0": "0", "neutral": "0",
        "flaming": "1", "1": "1",
        "denigration": "2", "2": "2",
        "sexual": "3", "3": "3",
        "racism": "4", "4": "4"
    }
    if raw_type in mapping:
        return mapping[raw_type]

    # A numeric column containing missing values is read as float, so "1"
    # becomes "1.0"; recover the integer form before giving up.
    try:
        as_number = float(raw_type)
    except ValueError:
        return "0"
    if as_number.is_integer():
        return mapping.get(str(int(as_number)), "0")
    return "0"

def get_m1_predictions_map(seed_or_baseline):
    """
    Load the cached M1 predictions for a seed (or for "baseline").

    Reads `predictions_baseline.csv` or `predictions_seed_<seed>.csv` from
    M1_RESULTS_DIR and returns a dict mapping `sample_idx` to `pred_label`
    cast to str. Returns an empty dict when the file does not exist.
    """
    filename = (
        "predictions_baseline.csv"
        if seed_or_baseline == "baseline"
        else f"predictions_seed_{seed_or_baseline}.csv"
    )
    path = os.path.join(M1_RESULTS_DIR, filename)

    if os.path.exists(path):
        preds_df = pd.read_csv(path)
        sample_ids = preds_df["sample_idx"]
        labels = preds_df["pred_label"].astype(str)
        return dict(zip(sample_ids, labels))
    return {}

# Final status line of the data-setup cell.
print("‚úÖ Setup Dati Completato.")

üìÇ Caricamento Risorse...
   ‚úÖ Dataset HF caricato: 855 campioni.
   ‚úÖ CSV Master caricato: 855 righe.

üîç Eseguo SANITY CHECK sull'allineamento (HF vs CSV)...
   üìä Match Rate: 100.00% (855/855)
   ‚úÖ ALLINEAMENTO ACCETTABILE. Procedo per indice.
‚úÖ Setup Dati Completato.


In [13]:
# ==============================================================================
# CELLA 4: HELPER PER COSTRUIRE INPUT M2 (CON CHECK IMMAGINI)
# ==============================================================================
def build_input_for_m2(sample):
    """
    Build the chat messages for M2 from a dataset sample.

    Takes the content of the first message whose role is "user". Image items
    are kept only when the file exists on disk (path normalised to an absolute
    `file://` URI); text items are re-wrapped as `Commento: "<text>"`. The
    system message carries SYSTEM_INSTRUCTION_M2.

    Returns:
        (prompt_msgs, has_missing_images): the message list and a flag that is
        True when at least one referenced image was not found. When there is
        no user content, returns (None, False).
    """
    user_items = next(
        (message["content"] for message in sample["messages"] if message["role"] == "user"),
        None,
    )
    if not user_items:
        return None, False

    rebuilt_content = []
    missing_flag = False

    for part in user_items:
        kind = part["type"]
        if kind == "image":
            # Strip the URI scheme and force exactly one leading slash.
            local_path = "/" + part["image"].replace("file://", "").lstrip("/")
            if not os.path.exists(local_path):
                # An expected image is not on disk: record it and drop the item.
                missing_flag = True
                continue
            rebuilt_content.append({"type": "image", "image": f"file://{local_path}"})
        elif kind == "text":
            bare_text = part["text"].replace("Commento:", "").strip().strip('"').strip("'")
            rebuilt_content.append({"type": "text", "text": f"Commento: \"{bare_text}\""})

    system_msg = {"role": "system", "content": [{"type": "text", "text": SYSTEM_INSTRUCTION_M2}]}
    user_msg = {"role": "user", "content": rebuilt_content}
    return [system_msg, user_msg], missing_flag

def parse_m2_prediction(pred_text):
    """
    Extract the M2 class ("1".."4") from the generated text.

    An exact answer is returned as is. Otherwise the first standalone digit
    1-4 found in the text is used, and "INVALID" is returned when none is found.
    """
    answer = pred_text.strip()
    if answer in ("1", "2", "3", "4"):
        return answer

    first_hit = re.search(r'\b(1|2|3|4)\b', answer)
    if first_hit is None:
        return "INVALID"
    return first_hit.group(1)

In [14]:
# ==============================================================================
# CELLA 5: PIPELINE ENGINE (AUDIT LOGGING & RELIABILITY CHECK)
# ==============================================================================
def run_pipeline_test(m2_model, m2_tokenizer, m1_preds_map, dataset, desc="Pipeline"):
    """
    Run the two-stage pipeline over `dataset` and score it on the 5-class task.

    For every sample (addressed by its position in `dataset`):
      * the ground truth comes from get_gt_by_index(idx);
      * the cached M1 prediction is read from `m1_preds_map`;
      * a missing M1 prediction or an M1 output of "0" yields the final label "0";
      * any other M1 output triggers an M2 generation, parsed by
        parse_m2_prediction. An unusable M2 input or output falls back to "2".

    Generation settings are taken from MAX_NEW_TOKENS and TEMPERATURE: greedy
    decoding when TEMPERATURE is 0, sampling with that temperature otherwise.

    Args:
        m2_model: model used for the M2 stage; switched to inference mode when truthy.
        m2_tokenizer: processor used for the chat template, input encoding and decoding.
        m1_preds_map: dict mapping sample index to the M1 label.
        dataset: iterable of samples with a "messages" field.
        desc: label shown in the progress bar and in the start message.

    Returns:
        (results, final_predictions): `results` holds accuracy, macro P/R/F1,
        per-class P/R/F1, the confusion matrix as nested lists and an "audit"
        dict; `final_predictions` is the list of predicted labels as strings.
    """
    if m2_model: FastVisionModel.for_inference(m2_model)

    final_predictions = []
    ground_truth_5class = []

    # --- AUDIT COUNTERS (kept so that fallbacks are reported, not hidden) ---
    stats = {
        "m2_calls": 0,              # samples for which M1 did not answer "0"
        "m1_missing": 0,            # M1 had no cached output for the sample
        "m2_input_empty": 0,        # no user content could be built for M2
        "m2_invalid_output": 0,     # M2 answered something other than 1-4
        "m2_missing_images": 0,     # at least one image was missing for the M2 call
        "fallback_events": 0        # M2 calls whose answer was replaced by the fallback
    }

    # Build the generation arguments once from the configuration constants
    # instead of hard-coding them in the loop. Temperature is only passed when
    # sampling, since it has no effect under greedy decoding.
    do_sample = TEMPERATURE > 0
    gen_kwargs = {"max_new_tokens": MAX_NEW_TOKENS, "use_cache": True, "do_sample": do_sample}
    if do_sample:
        gen_kwargs["temperature"] = TEMPERATURE

    print(f"üöÄ Avvio Pipeline: {desc}")

    for idx, sample in enumerate(tqdm(dataset, desc=desc)):
        # 1. Ground truth
        gt_5 = get_gt_by_index(idx)
        ground_truth_5class.append(gt_5)

        # 2. M1 output
        m1_out = str(m1_preds_map.get(idx, "MISSING"))

        if m1_out == "MISSING":
            stats["m1_missing"] += 1
            final_predictions.append("0") # Conservative fallback (Safe)
            continue

        # 3. Pipeline logic
        if m1_out == "0":
            final_predictions.append("0")
        else:
            stats["m2_calls"] += 1

            # M2 input
            prompt_msgs, missing_imgs = build_input_for_m2(sample)
            if missing_imgs: stats["m2_missing_images"] += 1

            if prompt_msgs is None:
                stats["m2_input_empty"] += 1
                stats["fallback_events"] += 1
                final_predictions.append("2") # Technical fallback
                continue

            # M2 inference
            image_inputs, video_inputs = process_vision_info(prompt_msgs)
            text = m2_tokenizer.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
            inputs = m2_tokenizer(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to("cuda")

            with torch.no_grad():
                outputs = m2_model.generate(**inputs, **gen_kwargs)

            # Decode only the newly generated tokens (everything after the prompt).
            pred_text = m2_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            m2_out = parse_m2_prediction(pred_text)

            if m2_out == "INVALID":
                stats["m2_invalid_output"] += 1
                stats["fallback_events"] += 1
                final_predictions.append("2") # Technical fallback
            else:
                final_predictions.append(m2_out)

    # --- METRICS ---
    acc = accuracy_score(ground_truth_5class, final_predictions)
    p_macro, r_macro, f1_macro, _ = precision_recall_fscore_support(ground_truth_5class, final_predictions, average='macro', zero_division=0)
    cm = confusion_matrix(ground_truth_5class, final_predictions, labels=TARGET_CLASSES_PIPELINE)

    # Audit ratios; denominators are clamped to 1 so an empty count cannot divide by zero.
    total = len(dataset)
    audit_report = {
        "m2_activation_rate": stats["m2_calls"] / max(1, total),
        "reliability_risk": stats["fallback_events"] / max(1, stats["m2_calls"]), # share of M2 calls that ended in fallback
        "missing_images_rate": stats["m2_missing_images"] / max(1, stats["m2_calls"]),
        "raw_stats": stats
    }

    results = {
        "accuracy": acc, "f1_macro": f1_macro, "precision_macro": p_macro, "recall_macro": r_macro,
        "audit": audit_report,
        "conf_matrix": cm.tolist()
    }

    # Per-class metrics, in TARGET_CLASSES_PIPELINE order
    p, r, f1, _ = precision_recall_fscore_support(ground_truth_5class, final_predictions, average=None, labels=TARGET_CLASSES_PIPELINE, zero_division=0)
    for i, c in enumerate(TARGET_CLASSES_PIPELINE):
        results[f"class_{c}_p"] = p[i]
        results[f"class_{c}_r"] = r[i]
        results[f"class_{c}_f1"] = f1[i]

    return results, final_predictions

In [15]:
# ==============================================================================
# CELLA 6: ESECUZIONE TEST CON RECUPERO METADATI TEMPI
# ==============================================================================
import glob

def get_train_time_from_file(model_shortname, seed):
    """
    Read the training duration in minutes from a seed's training report.

    The report is expected at
    `<TRAIN_OUTPUTS_ROOT>/<model_shortname>_Seed_<seed>/training_report_Seed_<seed>.json`
    under the keys ["4_TRAINING_PERFORMANCE"]["total_duration_minutes"].

    Returns 0.0 when the file is missing, unreadable, not valid JSON, or lacks
    the expected keys. Other errors (for example KeyboardInterrupt) propagate
    instead of being swallowed.
    """
    path = os.path.join(TRAIN_OUTPUTS_ROOT, f"{model_shortname}_Seed_{seed}", f"training_report_Seed_{seed}.json")
    try:
        with open(path, 'r') as f:
            return json.load(f)["4_TRAINING_PERFORMANCE"]["total_duration_minutes"]
    except (OSError, ValueError, KeyError, TypeError):
        # OSError: missing/unreadable file; ValueError: malformed JSON;
        # KeyError/TypeError: report without the expected structure.
        return 0.0

def get_inference_time_from_test(test_dir, seed):
    """
    Read the average inference time in seconds from a seed's test results.

    The file is expected at `<test_dir>/results_seed_<seed>.json` with the key
    "avg_inference_time_sec".

    Returns 0.0 when the file is missing, unreadable, not valid JSON, or lacks
    the key. Other errors (for example KeyboardInterrupt) propagate instead of
    being swallowed.
    """
    path = os.path.join(test_dir, f"results_seed_{seed}.json")
    try:
        with open(path, 'r') as f:
            return json.load(f)["avg_inference_time_sec"]
    except (OSError, ValueError, KeyError, TypeError):
        # OSError: missing/unreadable file; ValueError: malformed JSON;
        # KeyError/TypeError: results without the expected structure.
        return 0.0

# Collected result dicts of the fine-tuned runs (one per evaluated seed).
pipeline_results = []
# Folders holding the per-seed test results of the two stages; M1_TEST_DIR is
# built from the same components as M1_RESULTS_DIR in the configuration cell.
M1_TEST_DIR = os.path.join(BASE_ROOT, "TEST_N1_M1_BINARY", "Qwen2.5-VL-M1-Detection")
M2_TEST_DIR = os.path.join(BASE_ROOT, "TEST_N2_M2_MULTICLASS", "Qwen2.5-VL-M2-Classification")

# --- BASELINE ---
# Runs only when cached baseline M1 predictions exist; uses the base model without adapter.
m1_base_map = get_m1_predictions_map("baseline")
if m1_base_map:
    print("\nüîµ TEST BASELINE PIPELINE...")
    model, tokenizer = FastVisionModel.from_pretrained(MODEL_ID_BASE, load_in_4bit=True)
    res_base, _ = run_pipeline_test(model, tokenizer, m1_base_map, test_dataset, desc="Pipeline Baseline")
    res_base.update({"model": "Baseline", "seed": "N/A"})

    # Save the baseline results
    with open(os.path.join(RESULTS_DIR, "results_baseline.json"), 'w') as f:
        json.dump(res_base, f, indent=4)
    # Release the model before the next one is loaded.
    del model, tokenizer; gc.collect(); torch.cuda.empty_cache()
else:
    res_base = {}

# --- FINE-TUNED SEEDS ---
for seed in SEEDS_TO_TEST:
    print(f"\nüü£ TEST PIPELINE SEED {seed}...")

    # A seed without cached M1 predictions is skipped without any message.
    m1_seed_map = get_m1_predictions_map(seed)
    if not m1_seed_map: continue

    # A seed without its adapter directory is skipped without any message.
    adapter_path = os.path.join(TRAIN_OUTPUTS_ROOT, f"{M2_MODEL_SHORTNAME}_Seed_{seed}", "final_adapter_latest")
    if not os.path.exists(adapter_path): continue

    # Reload the base model for every seed, then attach that seed's adapter.
    model, tokenizer = FastVisionModel.from_pretrained(model_name=MODEL_ID_BASE, load_in_4bit=True)
    model.load_adapter(adapter_path)

    res, _ = run_pipeline_test(model, tokenizer, m1_seed_map, test_dataset, desc=f"Pipe Seed {seed}")

    # --- ENRICH WITH TIMING DATA ---
    # Both helpers return 0.0 when the source file cannot be read, so a 0.0
    # here may mean "not available" rather than a measured value.
    t_train_m1 = get_train_time_from_file("Qwen2.5-VL-M1-Detection", seed)
    t_train_m2 = get_train_time_from_file("Qwen2.5-VL-M2-Classification", seed)
    t_inf_m1 = get_inference_time_from_test(M1_TEST_DIR, seed)
    t_inf_m2 = get_inference_time_from_test(M2_TEST_DIR, seed)

    res.update({
        "model": "Fine-Tuned", "seed": seed,
        "times": {
            "train_m1_min": t_train_m1, "train_m2_min": t_train_m2,
            "inf_m1_sec": t_inf_m1, "inf_m2_sec": t_inf_m2,
            "total_train_min": t_train_m1 + t_train_m2
        }
    })

    pipeline_results.append(res)
    with open(os.path.join(RESULTS_DIR, f"results_seed_{seed}.json"), 'w') as f:
        json.dump(res, f, indent=4)

    # Release the model before the next seed is loaded.
    del model, tokenizer; gc.collect(); torch.cuda.empty_cache()

print("\n‚úÖ Pipeline Testing Completato.")


üîµ TEST BASELINE PIPELINE...
==((====))==  Unsloth 2026.2.1: Fast Qwen2_5_Vl patching. Transformers: 4.57.6. vLLM: 0.6.3.
   \\   /|    Tesla V100S-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
üöÄ Avvio Pipeline: Pipeline Baseline


Pipeline Baseline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 855/855 [00:19<00:00, 43.09it/s] 



üü£ TEST PIPELINE SEED 101...
==((====))==  Unsloth 2026.2.1: Fast Qwen2_5_Vl patching. Transformers: 4.57.6. vLLM: 0.6.3.
   \\   /|    Tesla V100S-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
üöÄ Avvio Pipeline: Pipe Seed 101


Pipe Seed 101: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 855/855 [01:37<00:00,  8.80it/s] 



üü£ TEST PIPELINE SEED 285...
==((====))==  Unsloth 2026.2.1: Fast Qwen2_5_Vl patching. Transformers: 4.57.6. vLLM: 0.6.3.
   \\   /|    Tesla V100S-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
üöÄ Avvio Pipeline: Pipe Seed 285


Pipe Seed 285: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 855/855 [02:27<00:00,  5.80it/s] 



üü£ TEST PIPELINE SEED 3692...
==((====))==  Unsloth 2026.2.1: Fast Qwen2_5_Vl patching. Transformers: 4.57.6. vLLM: 0.6.3.
   \\   /|    Tesla V100S-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
üöÄ Avvio Pipeline: Pipe Seed 3692


Pipe Seed 3692: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 855/855 [02:20<00:00,  6.10it/s] 



üü£ TEST PIPELINE SEED 92...
==((====))==  Unsloth 2026.2.1: Fast Qwen2_5_Vl patching. Transformers: 4.57.6. vLLM: 0.6.3.
   \\   /|    Tesla V100S-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
üöÄ Avvio Pipeline: Pipe Seed 92


Pipe Seed 92: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 855/855 [02:33<00:00,  5.56it/s] 



‚úÖ Pipeline Testing Completato.


In [21]:
# ==============================================================================
# CELLA 7: REPORT PDF PIPELINE (REV7 - 3 pagine, zero overlap, CM labels fix)
# ==============================================================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime
from textwrap import fill
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch

# -----------------------------
# Utilities
# -----------------------------
def safe_get(d, key, default=None):
    """Return `d.get(key, default)` when `d` is a dict, otherwise `default`."""
    if not isinstance(d, dict):
        return default
    return d.get(key, default)

def to_float(x):
    """
    Convert `x` to a Python float, or return None when that is not possible.

    None and blank strings give None; any value `float()` rejects also gives None.
    """
    try:
        is_blank = x is None or (isinstance(x, str) and x.strip() == "")
        return None if is_blank else float(x)
    except Exception:
        return None

def mean_std(values, ddof=1):
    """
    Mean and standard deviation of the values convertible by to_float.

    Returns (mean, std, n). With no usable value the result is (None, None, 0);
    with a single one the std is reported as 0.0.
    """
    converted = (to_float(v) for v in values)
    vals = [number for number in converted if number is not None]
    count = len(vals)

    if count == 0:
        return None, None, 0
    if count == 1:
        return float(vals[0]), 0.0, 1

    average = float(np.mean(vals))
    spread = float(np.std(vals, ddof=ddof))
    return average, spread, count

def fmt_mean_std(m, s, fmt="{:.4f}", unit=""):
    """
    Format a mean and optional standard deviation as text.

    Returns "N/D" when `m` is None and only the mean when `s` is None. The unit,
    when given, is appended after a space.
    """
    if m is None:
        return "N/D"

    suffix = f" {unit}" if unit else ""
    if s is None:
        return (fmt.format(m) + suffix).strip()
    return (fmt.format(m) + " ¬± " + fmt.format(s) + suffix).strip()

def fmt_val(v, fmt="{:.4f}", unit=""):
    """Format a single value with `fmt` and an optional unit; "N/D" when it is not numeric."""
    v = to_float(v)
    return "N/D" if v is None else (fmt.format(v) + (f" {unit}" if unit else "")).strip()

def pct(v, decimals=1):
    """Format a fraction as a percentage with `decimals` digits; "N/D" when it is not numeric."""
    v = to_float(v)
    return "N/D" if v is None else f"{v*100:.{decimals}f}%"

def pick_gpu_name():
    """Return the name of CUDA device 0, or "N/D" when CUDA is unavailable or torch fails."""
    name = "N/D"
    try:
        import torch
        if torch.cuda.is_available():
            name = torch.cuda.get_device_name(0)
    except Exception:
        # Best effort: any failure simply leaves the label at "N/D".
        name = "N/D"
    return name

def add_section_header(ax, title, color="#1f4e79"):
    """Turn `ax` into a section header: hidden axes, bold left-aligned title, grey rule below."""
    ax.axis("off")

    title_style = dict(fontsize=12, fontweight="bold", color=color, va="center", ha="left")
    ax.text(0.0, 0.65, title, **title_style)

    # Thin horizontal rule spanning the full width, near the bottom.
    rule_y = 0.08
    ax.plot([0, 1], [rule_y, rule_y], color="#d0d0d0", lw=1)

def make_table(ax, data, col_widths=None, header_rows=1, font_size=9, row_scale=1.35,
               header_face="#e8f1fb", zebra=True, align_left_cols=None,
               body_text_color="#000000", header_text_color="#000000"):
    """
    Render `data` as a styled table centred on `ax` and return the table object.

    The first `header_rows` rows get the header face colour and bold text. Body
    rows with an even index get a light zebra background when `zebra` is set.
    Cells in the columns listed in `align_left_cols` are left-aligned.
    """
    ax.axis("off")
    table = ax.table(cellText=data, loc="center", cellLoc="center", colWidths=col_widths)
    table.auto_set_font_size(False)
    table.set_fontsize(font_size)
    table.scale(1, row_scale)

    left_cols = align_left_cols or ()

    for (row_idx, col_idx), cell in table.get_celld().items():
        cell.set_edgecolor("#c7c7c7")
        cell.set_linewidth(0.6)

        if row_idx < header_rows:
            cell.set_facecolor(header_face)
            cell.set_text_props(fontweight="bold", color=header_text_color)
        else:
            if zebra and row_idx % 2 == 0:
                cell.set_facecolor("#fafafa")
            cell.set_text_props(color=body_text_color)

        if col_idx in left_cols:
            # Left alignment is set through the cell's private _loc attribute.
            cell._loc = "left"
            cell.PAD = 0.02

    return table

def extract_time_from_row(row, key):
    """Return `row[key]` when present and not None, else `row["times"][key]`, else None."""
    direct = safe_get(row, key, None)
    if direct is None:
        nested = safe_get(row, "times", None)
        return nested.get(key, None) if isinstance(nested, dict) else None
    return direct

def extract_audit_from_row(row, key):
    """Return `row[key]` when present and not None, else `row["audit"][key]`, else None."""
    direct = safe_get(row, key, None)
    if direct is None:
        nested = safe_get(row, "audit", None)
        return nested.get(key, None) if isinstance(nested, dict) else None
    return direct

# -----------------------------
# Preconditions
# -----------------------------
# The report cell relies on variables produced by earlier cells; fail early
# with an explicit message when any of them is missing from the kernel.
required = ["pipeline_results", "res_base", "RESULTS_DIR", "TARGET_CLASSES_PIPELINE"]
missing = [x for x in required if x not in globals()]
if missing:
    raise ValueError(f"Mancano variabili richieste per la reportistica: {missing}")

# Normalise class labels to strings and load the per-seed results into a DataFrame.
TARGET_CLASSES_PIPELINE = [str(x) for x in TARGET_CLASSES_PIPELINE]
df = pd.DataFrame(pipeline_results)
if df.empty:
    raise ValueError("pipeline_results √® vuoto: impossibile generare report.")

# Test-set size (None when test_dataset is not in the kernel) and seed list
# (falls back to the seeds found in pipeline_results).
N_TEST = len(test_dataset) if "test_dataset" in globals() else None
SEEDS_TO_TEST = globals().get("SEEDS_TO_TEST", [safe_get(r, "seed", None) for r in pipeline_results if safe_get(r, "seed", None) is not None])

# Best run for the visuals: highest macro F1, else highest accuracy, else the first row.
if "f1_macro" in df.columns:
    best_idx = df["f1_macro"].astype(float).idxmax()
elif "accuracy" in df.columns:
    best_idx = df["accuracy"].astype(float).idxmax()
else:
    best_idx = df.index[0]
best_run = df.loc[best_idx].to_dict()
best_seed = safe_get(best_run, "seed", "N/D")

# -----------------------------
# Stats
# -----------------------------
# Each entry is a (mean, std, n) tuple across seeds, using the sample std (ddof=1).
overall_keys = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
overall_stats = {k: (mean_std(df[k].tolist(), ddof=1) if k in df.columns else (None, None, 0)) for k in overall_keys}

# Per-class precision / recall / F1, keyed like the result columns (class_<c>_<suffix>).
per_class_stats = {}
for c in TARGET_CLASSES_PIPELINE:
    for suffix in ["p", "r", "f1"]:
        key = f"class_{c}_{suffix}"
        per_class_stats[key] = mean_std(df[key].tolist(), ddof=1) if key in df.columns else (None, None, 0)

# (result key, display label, unit, number format) for each timing figure.
time_keys = [
    ("train_m1_min", "Train M1", "min", "{:.1f}"),
    ("train_m2_min", "Train M2", "min", "{:.1f}"),
    ("inf_m1_sec",   "Inferenza M1", "sec/campione", "{:.3f}"),
    ("inf_m2_sec",   "Inferenza M2", "sec/campione", "{:.3f}"),
]
time_stats = []
for key, label, unit, fmt in time_keys:
    vals = [extract_time_from_row(r, key) for r in pipeline_results]
    m, s, n = mean_std(vals, ddof=1)
    time_stats.append((key, label, m, s, n, unit, fmt))

# (audit key, display label, explanatory note) for each audit ratio.
audit_keys = [
    ("m2_activation_rate", "Attivazione M2", "quota campioni inviati a M2 (m2_calls / N)"),
    ("reliability_risk", "Fallback/Invalid", "quota chiamate M2 finite in fallback (fallback_events / m2_calls)"),
    ("missing_images_rate", "Missing immagini (su M2)", "quota chiamate M2 con immagini mancanti (m2_missing_images / m2_calls)"),
]
audit_stats = []
for key, label, note in audit_keys:
    vals = [extract_audit_from_row(r, key) for r in pipeline_results]
    m, s, n = mean_std(vals, ddof=1)
    audit_stats.append((key, label, m, s, n, note))

# Rate of samples without an M1 prediction, per run (needs a known test-set size).
m1_missing_rates = []
for r in pipeline_results:
    a = safe_get(r, "audit", {})
    rs = safe_get(a, "raw_stats", {})
    miss = safe_get(rs, "m1_missing", None)
    if miss is not None and N_TEST:
        m1_missing_rates.append(miss / N_TEST)
m1_miss_m, m1_miss_s, _ = mean_std(m1_missing_rates, ddof=1) if m1_missing_rates else (None, None, 0)

# -----------------------------
# Labels
# -----------------------------
classes_map = {"0": "Safe", "1": "Flaming", "2": "Denigration", "3": "Sexual", "4": "Racism"}
labels_long = [classes_map.get(str(i), str(i)) for i in TARGET_CLASSES_PIPELINE]
labels_short = ["Safe", "Flaming", "Denigr.", "Sexual", "Racism"]  # short labels for the confusion matrix, so they do not run together

# Shared colours and font for the report pages.
TITLE_COLOR = "#1f4e79"
SOFT_BG = "#f3f6fb"
plt.rcParams["font.family"] = "DejaVu Sans"

# -----------------------------
# Pipeline diagram (stilizzato)
# -----------------------------
def draw_pipeline_diagram(ax):
    """
    Draw the stylised pipeline flowchart on `ax`.

    The axes are set to the unit square and hidden; rounded boxes stand for
    input, M1, the direct "0" output, M2 and the merge step, connected by arrows.
    """
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis("off")

    def _add_box(x, y, w, h, text, fc, ec="#c9d2dc", lw=1.2, fs=10, bold=False):
        """Add a rounded box with centred text; returns its (x, y, w, h)."""
        patch = FancyBboxPatch((x, y), w, h, boxstyle="round,pad=0.02,rounding_size=0.03",
                               linewidth=lw, edgecolor=ec, facecolor=fc)
        ax.add_patch(patch)
        weight = "bold" if bold else "normal"
        ax.text(x+w/2, y+h/2, text, ha="center", va="center",
                fontsize=fs, fontweight=weight, color="#000000")
        return (x, y, w, h)

    def _add_arrow(x1, y1, x2, y2):
        """Add a straight arrow from (x1, y1) to (x2, y2)."""
        connector = FancyArrowPatch((x1, y1), (x2, y2),
                                    arrowstyle="-|>", mutation_scale=14,
                                    linewidth=1.6, color="#2f3b46")
        ax.add_patch(connector)

    # (x, y, width, height, text, face colour, edge colour), drawn in this order.
    box_specs = [
        (0.05, 0.38, 0.22, 0.26, "INPUT\n(Frame + Commento)", "#ffffff", "#b9c6d3"),
        (0.33, 0.44, 0.20, 0.14, "M1\nDetection\n0 / 1*", "#e8f1fb", "#93b4d8"),
        (0.72, 0.62, 0.23, 0.16, "OUTPUT = 0\n(Safe)", "#f2f2f2", "#c9c9c9"),
        (0.63, 0.28, 0.20, 0.14, "M2\nClassificazione\n1..4", "#eaf7ef", "#8bc6a0"),
        (0.86, 0.28, 0.12, 0.14, "MERGE\n0..4", "#fff7e6", "#f0d9a8"),
    ]
    for x, y, w, h, text, face, edge in box_specs:
        _add_box(x, y, w, h, text, fc=face, ec=edge, bold=True)

    _add_arrow(0.27, 0.51, 0.33, 0.51)     # input -> M1
    _add_arrow(0.53, 0.54, 0.72, 0.70)     # M1 -> direct "0" output
    _add_arrow(0.53, 0.46, 0.63, 0.35)     # M1 -> M2
    ax.text(0.60, 0.72, "se M1=0", fontsize=9, color="#000000")
    ax.text(0.57, 0.35, "se M1=1*", fontsize=9, color="#000000")
    _add_arrow(0.83, 0.35, 0.86, 0.35)     # M2 -> merge
    _add_arrow(0.92, 0.28, 0.92, 0.12)     # merge -> final output
    ax.text(0.92, 0.07, "OUTPUT finale\n(0..4)", ha="center", va="center",
            fontsize=10, fontweight="bold", color="#000000")

# -----------------------------
# Confusion matrix easy (stretta + etichette corte)
# -----------------------------
def draw_easy_confusion_matrix(ax, cm, row_labels, col_labels, scale_x=0.70, scale_y=1.60, font_size=9):
    """
    Draw a confusion matrix as a dark-themed table on `ax`.

    `cm` is converted to an integer array. Diagonal cells carry a check mark and
    a highlight colour, label cells use the header colour, and a caption is
    placed above the top-left corner.
    """
    ax.axis("off")
    cm = np.array(cm, dtype=int)
    n = cm.shape[0]

    cell_text = [
        [(f"{cm[i,j]} ‚úì" if i == j else f"{cm[i,j]}") for j in range(n)]
        for i in range(n)
    ]

    table = ax.table(cellText=cell_text, rowLabels=row_labels, colLabels=col_labels,
                     cellLoc="center", loc="center")
    table.auto_set_font_size(False)
    table.set_fontsize(font_size)
    table.scale(scale_x, scale_y)

    palette = {
        "body_bg": "#111315", "header_bg": "#1b1f24", "text": "#f2f2f2",
        "diag_bg": "#163a2a", "diag_text": "#b8f2d3", "edge": "#2a2f36",
    }

    for (r, c), cell in table.get_celld().items():
        cell.set_edgecolor(palette["edge"])
        cell.set_linewidth(0.8)

        # Row 0 holds the column labels, column -1 the row labels.
        if r == 0 or c == -1:
            face, ink, bold = palette["header_bg"], palette["text"], True
        elif r - 1 == c:
            face, ink, bold = palette["diag_bg"], palette["diag_text"], True
        else:
            face, ink, bold = palette["body_bg"], palette["text"], False

        cell.set_facecolor(face)
        label = cell.get_text()
        label.set_color(ink)
        if bold:
            label.set_fontweight("bold")

    ax.text(0.0, 1.03, "Reale \\ Predetto", transform=ax.transAxes,
            ha="left", va="bottom", fontsize=11, fontweight="bold", color="#000000")

# -----------------------------
# Stabilit√† boxplot P/R/F1 (sintesi sotto, alta abbastanza)
# -----------------------------
def draw_stability(ax_box, ax_summary, df_metrics):
    """Plot the spread of the macro metrics plus a textual mean ± std summary.

    For each of ``precision_macro`` / ``recall_macro`` / ``f1_macro`` the column
    is read from ``df_metrics``, converted with ``to_float`` (entries converting
    to ``None`` are dropped), drawn as one box with the individual values
    overlaid as points, and summarised with ``mean_std(..., ddof=1)``.

    Args:
        ax_box: Axes receiving the boxplot and the scatter overlay.
        ax_summary: Axes (frame switched off) receiving the summary text box.
        df_metrics: Table-like object supporting ``.get(column, default)`` whose
            result has ``.tolist()``; a missing column is treated as empty.
            NOTE(review): the visible caller passes ``df`` — presumably one row
            per seed; confirm where ``df`` is built.
    """
    keys = [("precision_macro", "Precision (macro)"),
            ("recall_macro", "Recall (macro)"),
            ("f1_macro", "F1 (macro)")]

    # Convert every column exactly once; the boxplot, the summary lines and the
    # scatter overlay all reuse the same parsed lists.
    empty = pd.Series([], dtype=float)  # explicit dtype avoids the empty-Series warning on older pandas
    values = {}
    for k, _ in keys:
        converted = (to_float(x) for x in df_metrics.get(k, empty).tolist())
        values[k] = [v for v in converted if v is not None]

    data = []
    ticks = []
    lines = []
    for k, lab in keys:
        vals = values[k]
        # A lone NaN keeps the box slot (and its tick label) when a metric has no values.
        data.append(vals if vals else [np.nan])
        ticks.append(lab)
        m, s, n = mean_std(vals, ddof=1) if vals else (None, None, 0)
        summary = fmt_mean_std(m, s, "{:.4f}")
        lines.append(f"• {lab}: {summary}  (n={n})")

    ax_box.set_facecolor("white")
    # NOTE(review): `tick_labels` is the Matplotlib >= 3.9 spelling of `labels` — confirm the pinned version.
    bp = ax_box.boxplot(data, tick_labels=ticks, patch_artist=True, widths=0.55)

    box_colors = ["#d6eaf8", "#fdebd0", "#d5f5e3"]
    edge_colors = ["#5b7aa6", "#b9770e", "#1e8449"]
    for i, patch in enumerate(bp["boxes"]):
        patch.set(facecolor=box_colors[i], edgecolor=edge_colors[i], linewidth=1.2)
    for med in bp["medians"]:
        med.set(color="#1f2d3a", linewidth=1.4)

    # Overlay the raw values; box positions are 1-based.
    for i, (k, _) in enumerate(keys, start=1):
        vals = values[k]
        if vals:
            ax_box.scatter([i] * len(vals), vals, s=22, alpha=0.85)

    ax_box.set_ylabel("Valore metrica", color="#000000")
    ax_box.tick_params(colors="#000000")
    ax_box.grid(axis="y", linestyle="--", alpha=0.20)

    ax_summary.axis("off")
    ax_summary.text(
        0.0, 1.0,
        "Sintesi (media ± dev.std):\n" + "\n".join(lines),
        va="top", ha="left", fontsize=9, color="#000000",
        bbox=dict(facecolor="#fff7e6", edgecolor="#f0d9a8", boxstyle="round,pad=0.5")
    )

# -----------------------------
# PDF (3 pagine)
# -----------------------------
# Output location and run metadata shown in the page-1 header.
pdf_path = os.path.join(RESULTS_DIR, "Report_PIPELINE_FINAL.pdf")
gpu_name = pick_gpu_name()
now_str = datetime.now().strftime("%Y-%m-%d %H:%M")

# Every page is an 8.27 x 11.69 in figure laid out on a tall GridSpec; each
# section is assigned its own row range.
with PdfPages(pdf_path) as pdf:

    # =========================
    # PAGE 1: intro + pipeline diagram + overall metrics
    # =========================
    fig = plt.figure(figsize=(8.27, 11.69), facecolor="white")
    gs = fig.add_gridspec(nrows=24, ncols=6, left=0.06, right=0.94, top=0.97, bottom=0.06, hspace=1.10, wspace=0.8)

    # Title + run metadata
    ax = fig.add_subplot(gs[0:2, :]); ax.axis("off")
    ax.text(0.5, 0.78, "TEST FINALE — Pipeline M1 (Detection) → M2 (Classificazione)",
            ha="center", va="center", fontsize=16, fontweight="bold", color="#000000")
    meta1 = f"N test: {N_TEST if N_TEST is not None else 'N/D'} | Seed: {SEEDS_TO_TEST} | Best seed (visual): {best_seed}"
    meta2 = f"{now_str} | GPU: {gpu_name} | Output: {os.path.basename(RESULTS_DIR)}"
    ax.text(0.5, 0.35, meta1, ha="center", va="center", fontsize=9, color="#000000")
    ax.text(0.5, 0.10, meta2, ha="center", va="center", fontsize=9, color="#000000")

    ax_h = fig.add_subplot(gs[2:3, :]); add_section_header(ax_h, "1) Configurazione e logica della pipeline", TITLE_COLOR)

    ax_chk = fig.add_subplot(gs[3:6, :]); ax_chk.axis("off")
    checklist = [
        "• M1: Detection binaria (0=Safe, 1*=Offensive dove 1*={1..4})",
        "• M2: Classificazione multi-classe (training solo su campioni Offensive 1..4)",
        "• Inference: M1 filtra; M2 gira solo sui campioni con M1=1*; merge nello spazio {0..4}",
        "• Output finale: etichette {0,1,2,3,4} + metriche globali/per-classe + tempi + stabilità"
    ]
    ax_chk.text(0.0, 1.0, "\n".join(checklist), va="top", fontsize=9, color="#000000", linespacing=1.55)

    ax_flow = fig.add_subplot(gs[6:10, :])
    draw_pipeline_diagram(ax_flow)

    ax_h = fig.add_subplot(gs[10:11, :]); add_section_header(ax_h, "2) Performance complessiva (Baseline vs Fine-Tuned)", TITLE_COLOR)

    # Overall metrics: baseline value vs fine-tuned mean ± std across seeds.
    ax_t = fig.add_subplot(gs[11:16, :])
    t_data = [["Metrica", "Baseline", "Fine-Tuned (media ± dev.std)"]]
    for label, k in [("Accuracy", "accuracy"), ("Precision (macro)", "precision_macro"), ("Recall (macro)", "recall_macro"), ("F1 (macro)", "f1_macro")]:
        bm = fmt_val(safe_get(res_base, k, None), "{:.4f}")
        m, s, _ = overall_stats.get(k, (None, None, 0))
        ft = fmt_mean_std(m, s, "{:.4f}")
        t_data.append([label, bm, ft])
    make_table(ax_t, t_data, col_widths=[0.35, 0.25, 0.40], header_rows=1, font_size=10, row_scale=1.65)

    # Breathing room + small (optional) note
    ax_note = fig.add_subplot(gs[16:18, :]); ax_note.axis("off")
    ax_note.text(0.0, 0.6, "Nota: le metriche Fine-Tuned sono aggregate su seed (media ± dev.std).",
                 fontsize=9, color="#000000")

    pdf.savefig(fig); plt.close(fig)

    # =========================
    # PAGE 2: stability + timings + audit (no overlap)
    # =========================
    fig = plt.figure(figsize=(8.27, 11.69), facecolor="white")
    gs = fig.add_gridspec(nrows=26, ncols=6, left=0.06, right=0.94, top=0.97, bottom=0.06, hspace=1.15, wspace=0.8)

    ax = fig.add_subplot(gs[0:2, :]); ax.axis("off")
    ax.text(0.5, 0.65, "STABILITÀ, TEMPI E AUDIT OPERATIVO",
            ha="center", va="center", fontsize=15, fontweight="bold", color="#000000")

    ax_h = fig.add_subplot(gs[2:3, :]); add_section_header(ax_h, "3) Stabilità tra seed (Precision/Recall/F1 macro)", TITLE_COLOR)
    ax_box = fig.add_subplot(gs[3:10, :])
    ax_sum = fig.add_subplot(gs[10:13, :])   # taller slot so the summary never overlaps the next section
    draw_stability(ax_box, ax_sum, df)

    ax_h = fig.add_subplot(gs[13:14, :]); add_section_header(ax_h, "4) Tempi (Training e inferenza)", TITLE_COLOR)
    ax_tt = fig.add_subplot(gs[14:19, :])
    t2 = [["Voce", "Valore (media ± dev.std)", "Note"]]
    for key, label, m, s, n, unit, fmt in time_stats:
        # A missing mean means the timing was never logged.
        note = "N/D (non loggato)" if m is None else f"n={n}"
        t2.append([label, fmt_mean_std(m, s, fmt, unit), note])
    make_table(ax_tt, t2, col_widths=[0.30, 0.45, 0.25], header_rows=1, font_size=9, row_scale=1.55)

    ax_h = fig.add_subplot(gs[19:20, :]); add_section_header(ax_h, "5) Audit operativo (affidabilità pipeline)", TITLE_COLOR)
    ax_at = fig.add_subplot(gs[20:26, :])
    t3 = [["Indicatore", "Valore (media ± dev.std)", "Descrizione"]]
    for key, label, m, s, n, note in audit_stats:
        if m is None:
            val = "N/D"
        else:
            val = f"{pct(m, 1)} ± {pct(s, 1) if s is not None else '0.0%'}"
        t3.append([label, val, fill(note, width=62)])
    # Extra row only when the M1 missing-prediction rate is available.
    if m1_miss_m is not None:
        t3.append(["Missing pred M1", f"{pct(m1_miss_m,1)} ± {pct(m1_miss_s,1) if m1_miss_s is not None else '0.0%'}",
                   "quota campioni senza predizione M1 (m1_missing / N)"])
    make_table(ax_at, t3, col_widths=[0.28, 0.22, 0.50], header_rows=1, font_size=8.2, row_scale=1.30,
               align_left_cols=[2])

    pdf.savefig(fig); plt.close(fig)

    # =========================
    # PAGE 3: per-class metrics + confusion matrix + conclusions
    # =========================
    fig = plt.figure(figsize=(8.27, 11.69), facecolor="white")
    gs = fig.add_gridspec(nrows=26, ncols=6, left=0.06, right=0.94, top=0.97, bottom=0.06, hspace=1.15, wspace=0.8)

    ax = fig.add_subplot(gs[0:2, :]); ax.axis("off")
    ax.text(0.5, 0.65, "DETTAGLIO PER CLASSE E MATRICE DI CONFUSIONE",
            ha="center", va="center", fontsize=15, fontweight="bold", color="#000000")
    ax.text(0.5, 0.18, "Nota: tabella = media ± dev.std su seed; matrice = best seed (solo visual).",
            ha="center", va="center", fontsize=9, color="#000000")

    ax_h = fig.add_subplot(gs[2:3, :]); add_section_header(ax_h, "6) Metriche per classe (Baseline vs Fine-Tuned)", TITLE_COLOR)
    ax_pc = fig.add_subplot(gs[3:10, :])

    header = ["Classe", "Label",
              "P (base)", "P (FT)",
              "R (base)", "R (FT)",
              "F1 (base)", "F1 (FT)",
              "ΔF1"]
    data = [header]
    for c in TARGET_CLASSES_PIPELINE:
        base_p = safe_get(res_base, f"class_{c}_p", None)
        base_r = safe_get(res_base, f"class_{c}_r", None)
        base_f = safe_get(res_base, f"class_{c}_f1", None)

        ft_p_m, ft_p_s, _ = per_class_stats.get(f"class_{c}_p", (None, None, 0))
        ft_r_m, ft_r_s, _ = per_class_stats.get(f"class_{c}_r", (None, None, 0))
        ft_f_m, ft_f_s, _ = per_class_stats.get(f"class_{c}_f1", (None, None, 0))

        # Use the value to_float() actually parsed: the guard and the arithmetic
        # must agree, otherwise float(base_f) can raise on input to_float accepts.
        base_f_val = to_float(base_f)
        delta_f1 = None
        if base_f_val is not None and ft_f_m is not None:
            delta_f1 = ft_f_m - base_f_val

        data.append([
            str(c), classes_map.get(str(c), str(c)),
            fmt_val(base_p, "{:.3f}"), fmt_mean_std(ft_p_m, ft_p_s, "{:.3f}"),
            fmt_val(base_r, "{:.3f}"), fmt_mean_std(ft_r_m, ft_r_s, "{:.3f}"),
            fmt_val(base_f, "{:.3f}"), fmt_mean_std(ft_f_m, ft_f_s, "{:.3f}"),
            fmt_val(delta_f1, "{:+.3f}")
        ])

    make_table(ax_pc, data,
               col_widths=[0.06, 0.16, 0.10, 0.13, 0.10, 0.13, 0.10, 0.13, 0.09],
               header_rows=1, font_size=9, row_scale=1.35, align_left_cols=[1])

    ax_h = fig.add_subplot(gs[10:11, :]); add_section_header(ax_h, "7) Matrice di confusione (best seed) — conteggi reali (diagonale ✓)", TITLE_COLOR)
    ax_cm = fig.add_subplot(gs[11:20, 1:5])  # centred: uses the middle 4 of 6 columns
    cm = safe_get(best_run, "conf_matrix", None)
    if cm is None:
        ax_cm.axis("off")
        ax_cm.text(0.0, 0.5, "Nessuna confusion matrix disponibile.", fontsize=9, color="#000000")
    else:
        draw_easy_confusion_matrix(ax_cm, cm, row_labels=labels_short, col_labels=labels_short,
                                   scale_x=0.70, scale_y=1.60, font_size=9)

    ax_h = fig.add_subplot(gs[20:21, :]); add_section_header(ax_h, "8) Conclusioni (bullet secchi)", TITLE_COLOR)
    ax_c = fig.add_subplot(gs[21:26, :]); ax_c.axis("off")

    # Mean of the two routing indicators, or None when they were not computed.
    act_m = next((m for k, _, m, s, n, _ in audit_stats if k == "m2_activation_rate"), None)
    fb_m  = next((m for k, _, m, s, n, _ in audit_stats if k == "reliability_risk"), None)

    # Same tolerant lookup as the page-1 table: a missing metric prints as the
    # formatter's placeholder instead of raising KeyError after two pages are written.
    acc_m, acc_s, _ = overall_stats.get("accuracy", (None, None, 0))
    f1_m, f1_s, _ = overall_stats.get("f1_macro", (None, None, 0))
    acc_txt = fmt_mean_std(acc_m, acc_s, "{:.4f}")
    f1_txt = fmt_mean_std(f1_m, f1_s, "{:.4f}")

    bullets = [
        "Pipeline: M1 (0 vs 1*) → M2 (1..4) → merge (0..4).",
        f"Fine-Tuned (media ± dev.std): Accuracy = {acc_txt}, "
        f"F1 macro = {f1_txt}.",
        "Per-classe: confronto baseline vs fine-tuned (P/R/F1 accoppiati) + ΔF1.",
        f"Instradamento: attivazione M2 = {pct(act_m,1) if act_m is not None else 'N/D'}; fallback/invalid su M2 = {pct(fb_m,1) if fb_m is not None else 'N/D'}.",
        "Matrice: ✓ sulla diagonale = predizioni corrette; numeri = conteggi reali."
    ]
    ax_c.text(0.0, 0.98, "\n".join([f"• {b}" for b in bullets]),
              va="top", fontsize=9, color="#000000", linespacing=1.6)

    pdf.savefig(fig); plt.close(fig)

print(f"✅ Report PDF rigenerato (REV7): {pdf_path}")


✅ Report PDF rigenerato (REV7): TEST_EXPERIMENTS/TEST_N3_PIPELINE_FULL/Report_PIPELINE_FINAL.pdf
