In [1]:
# ==============================================================================
# CELLA 0: SETUP TOTALE (MINIMAL & STABILE)
# ==============================================================================
import sys
import os
from IPython.display import clear_output

# 1. BLOCCO MODULI PROBLEMATICI
sys.modules["vllm"] = None
sys.modules["vllm.sampling_params"] = None

print("‚è≥ Setup Ambiente in corso... (Attendere, output nascosto)")

# 2. INSTALLAZIONE & AGGIORNAMENTO SILENZIOSO
# Scarica l'ultima versione di Unsloth da Git e aggiorna automaticamente 
# PyTorch e Transformers alle versioni pi√π recenti e compatibili.
!pip install --upgrade --no-cache-dir --quiet \
    "torch" "torchvision" "torchaudio" \
    "transformers" "trl" "peft" "accelerate" "bitsandbytes" \
    "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" \
    "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo.git" \
    "pillow" "scikit-learn" "pandas"

# 3. VERIFICA E PULIZIA
clear_output()

import torch
import unsloth
import transformers
from PIL import Image

print(f"‚úÖ Ambiente Pronto e Pulito.")
print(f"   ‚Ä¢ GPU: {torch.cuda.get_device_name(0)}")
print(f"   ‚Ä¢ PyTorch: {torch.__version__}")
print(f"   ‚Ä¢ Unsloth: {unsloth.__version__}")
print(f"   ‚Ä¢ Transformers: {transformers.__version__}")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from pandas.core.computation.check import NUMEXPR_INSTALLED


Unsloth: Using MoE backend 'grouped_mm'
ü¶• Unsloth Zoo will now patch everything to make training faster!
‚úÖ Ambiente Pronto e Pulito.
   ‚Ä¢ GPU: Tesla V100S-PCIE-32GB
   ‚Ä¢ PyTorch: 2.10.0+cu128
   ‚Ä¢ Unsloth: 2026.2.1
   ‚Ä¢ Transformers: 4.57.6


In [2]:
import torch
import os
import gc
import json
import shutil
import time
import random
import numpy as np
import unsloth
from datetime import datetime
from datasets import load_from_disk
from trl import SFTTrainer, SFTConfig
from unsloth import FastVisionModel, UnslothVisionDataCollator, is_bfloat16_supported
from transformers import TrainerCallback, set_seed

# ==============================================================================
# 1. CONFIGURAZIONE GLOBALE (Fissa per tutte le run)
# ==============================================================================
SEEDS = [101, 285, 3692, 92]  # <--- LISTA DI SEED DA TESTARE
NUM_EPOCHS = 5                # <--- NUMERO DI EPOCHE PER OGNI RUN (con SEED X)
MODEL_ID = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"
MODEL_SHORTNAME = "Qwen2.5-VL-M1-Detection"
DATASET_PATH = os.path.join("DATASET_ITA", "PROCESSED_DATA", "HF_DATASETS", "M1_detection")

# SYSTEM PROMPT
SYSTEM_INSTRUCTION_M1 = """Sei un classificatore binario esperto per la moderazione di contenuti social.
Valuta CONGIUNTAMENTE il commento testuale e i frame del video associato.

Il tuo compito √® stabilire OGGETTIVAMENTE se √® presente contenuto offensivo riconducibile ad una di queste categorie:
- Flaming: insulti, linguaggio aggressivo, minacce, parolacce dirette a una persona o gruppo.
- Denigration: umiliazione, disprezzo, svalutazione o ridicolizzazione di una persona o gruppo.
- Sexual: molestie, allusioni o offese a sfondo sessuale, oggettivazione o inviti espliciti.
- Racism: stereotipi, offese o discriminazione verso etnia, razza, cultura o nazionalit√†.

Regola decisionale:
- Rispondi 1 se rilevi contenuto offensivo riconducibile alle categorie sopra indicate.
- Rispondi 0 se il contenuto √® neutro, positivo, o se si tratta di critica costruttiva/ironia non offensiva.

Linee guida:
- Basati solo sull'evidenza presente nei dati (testo + immagini).
- Non essere n√© troppo severo n√© troppo permissivo: attieniti alle definizioni.

Formato di output (OBBLIGATORIO):
Rispondi esclusivamente con un singolo carattere: 1 oppure 0.
Non aggiungere spiegazioni, punteggiatura o altro testo."""

# Callback per monitoraggio
class RealTimePrinterCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            print(f"üìù Step: {state.global_step:4d} | Epoch: {logs['epoch']:.2f} | Loss: {logs['loss']:.4f}")

# ==============================================================================
# 2. CARICAMENTO E FORMATTAZIONE DATASET (Una volta sola per efficienza)
# ==============================================================================
print("üìÇ Caricamento Dataset HF (Eseguito una volta sola)...")
dataset_raw = load_from_disk(DATASET_PATH)

def has_valid_images(sample):
    user_msg = sample["messages"][0]
    for item in user_msg["content"]:
        if item["type"] == "image":
            raw_path = item["image"]
            clean_path = raw_path.replace("file://", "")
            check_path = "/" + clean_path.lstrip("/") if clean_path else ""
            if not os.path.exists(check_path):
                return False
    return True

# Filtriamo eventuali immagini rotte
train_valid = dataset_raw["train"].filter(has_valid_images, desc="Filter Valid Imgs")
val_valid = dataset_raw["val"].filter(has_valid_images, desc="Filter Valid Imgs")

def format_multimodal_sample(sample):
    raw_user_msg = sample["messages"][0]
    raw_assistant_msg = sample["messages"][1] 
    user_content = []
    
    for item in raw_user_msg["content"]:
        if item["type"] == "image":
            raw_path = item["image"]
            clean_path = raw_path.replace("file://", "")
            clean_path = "/" + clean_path.lstrip("/") 
            final_path = f"file://{clean_path}"
            user_content.append({"type": "image", "image": final_path})
        elif item["type"] == "text":
            text_clean = item["text"].replace("Commento:", "").strip().strip('"').strip("'")
            text_final = f"Commento: \"{text_clean}\""
            user_content.append({"type": "text", "text": text_final})
            
    label_text = raw_assistant_msg["content"][0]["text"]

    new_messages = [
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_INSTRUCTION_M1}]},
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": [{"type": "text", "text": label_text}]}
    ]
    return {"messages": new_messages}

print("üîÑ Formattazione Dataset...")
train_dataset = train_valid.map(format_multimodal_sample, batched=False, desc="Formatting Train")
val_dataset = val_valid.map(format_multimodal_sample, batched=False, desc="Formatting Val")
print(f"‚úÖ Dataset Caricato e Formattato. Train: {len(train_dataset)} | Val: {len(val_dataset)}")


# ==============================================================================
# 3. MEGA-LOOP DI TRAINING (TUTTI I SEEDS)
# ==============================================================================
print(f"\nüöÄ AVVIO SESSIONE DI TRAINING SU {len(SEEDS)} SEED: {SEEDS}")

for seed_idx, TRAINING_SEED in enumerate(SEEDS):
    print("\n" + "#"*60)
    print(f"üé¨ RUN {seed_idx + 1}/{len(SEEDS)} | SEED CORRENTE: {TRAINING_SEED}")
    print("#"*60)

    # --- FIX DETERMINISMO GLOBALE ---
    print(f"üîí Fissaggio Seed Globali a {TRAINING_SEED}...")
    random.seed(TRAINING_SEED)
    np.random.seed(TRAINING_SEED)
    torch.manual_seed(TRAINING_SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(TRAINING_SEED)
    set_seed(TRAINING_SEED)
    # -------------------------------------------------------

    # Definizione Output Directory Dinamica
    OUTPUT_DIR = f"outputs/{MODEL_SHORTNAME}_Seed_{TRAINING_SEED}"
    print(f"üìÇ Cartella Output Run: {OUTPUT_DIR}")

    # --- A. CARICAMENTO MODELLO ---
    print(f"‚è≥ Inizializzazione Modello (Seed {TRAINING_SEED})...")
    model, tokenizer = FastVisionModel.from_pretrained(
        model_name = MODEL_ID,
        load_in_4bit = True,
        use_gradient_checkpointing = "unsloth",
    )

    model = FastVisionModel.get_peft_model(
        model,
        r = 16,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 16,
        lora_dropout = 0,
        bias = "none",
        random_state = TRAINING_SEED,
        use_rslora = False,
        loftq_config = None,
    )
    FastVisionModel.for_training(model)

    # --- B. CONFIGURAZIONE TRAINER ---
    training_args = SFTConfig(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        num_train_epochs = NUM_EPOCHS,
        learning_rate = 5e-5,
        lr_scheduler_type = "cosine",
        warmup_ratio = 0.1,
        weight_decay = 0.01,
        optim = "adamw_8bit",
        max_grad_norm = 0.3,
        
        # Salvataggio
        eval_strategy = "epoch",
        save_strategy = "epoch",
        save_total_limit = None,
        load_best_model_at_end = False,
        metric_for_best_model = "eval_loss",
        greater_is_better = False,
        
        # Hardware & Path
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        gradient_checkpointing = True,
        logging_steps = 10,
        output_dir = OUTPUT_DIR,
        report_to = "none",
        
        # Unsloth
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        seed = TRAINING_SEED,
    )

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        data_collator = UnslothVisionDataCollator(model, tokenizer),
        train_dataset = train_dataset,
        eval_dataset = val_dataset,
        args = training_args,
        callbacks = [RealTimePrinterCallback()],
    )

    # --- C. ESECUZIONE TRAINING ---
    print(f"üî• Avvio Training Seed {TRAINING_SEED}...")
    torch.cuda.empty_cache()
    
    start_time = time.time()
    trainer_stats = trainer.train()
    end_time = time.time()
    
    total_duration = (end_time - start_time) / 60
    final_train_loss = trainer_stats.training_loss
    global_steps_done = trainer_stats.global_step

    print(f"‚úÖ Training Finito. Durata: {total_duration:.2f} min | Loss: {final_train_loss:.4f}")

    # --- D. SALVATAGGIO ---
    ADAPTER_PATH = os.path.join(OUTPUT_DIR, "final_adapter_latest")
    REPORT_FILENAME = f"training_report_Seed_{TRAINING_SEED}.json"
    REPORT_PATH = os.path.join(OUTPUT_DIR, REPORT_FILENAME)
    ZIP_FILENAME = f"{MODEL_SHORTNAME}_Seed_{TRAINING_SEED}_FULL_CHECKPOINTS"
    
    # Cartella Padre per lo ZIP (Per evitare Inception bug)
    PARENT_DIR = os.path.dirname(OUTPUT_DIR)
    ZIP_FULL_PATH = os.path.join(PARENT_DIR, ZIP_FILENAME)

    os.makedirs(ADAPTER_PATH, exist_ok=True)
    
    print(f"üíæ Salvataggio Artifacts...")
    model.save_pretrained(ADAPTER_PATH)
    tokenizer.save_pretrained(ADAPTER_PATH)

    # Report JSON
    peft_config_data = "N/A"
    try:
        raw_config = getattr(model, "peft_config", None)
        if isinstance(raw_config, dict) and raw_config.get("default"):
            peft_config_data = str(raw_config["default"])
    except: pass

    full_report = {
        "1_META_INFO": {
            "timestamp_end": datetime.now().isoformat(),
            "model_shortname": MODEL_SHORTNAME,
            "seed": TRAINING_SEED,
            "task": "M1 Detection - Training Loop"
        },
        "4_TRAINING_PERFORMANCE": {
            "total_duration_minutes": total_duration,
            "final_training_loss": final_train_loss,
            "global_steps": global_steps_done,
            "epochs": training_args.num_train_epochs
        },
        "5_LORA_PARAMS": peft_config_data,
        "7_ARTIFACTS": {
            "checkpoints_location": "Inside ZIP archive",
            "zip_path": f"{ZIP_FULL_PATH}.zip"
        },
        "8_FULL_LOG_HISTORY": getattr(trainer.state, "log_history", [])
    }

    with open(REPORT_PATH, "w", encoding="utf-8") as f:
        json.dump(full_report, f, indent=4, ensure_ascii=False)

    print(f"üì¶ Compressione ZIP in corso (attendere)...")
    shutil.make_archive(
        base_name=ZIP_FULL_PATH, 
        format='zip', 
        root_dir=OUTPUT_DIR
    )
    print(f"   -> ZIP creato: {ZIP_FULL_PATH}.zip")

    # --- E. PULIZIA MEMORIA ---
    print(f"üßπ Pulizia VRAM per il prossimo seed...")
    try:
        del model
        del trainer
        del tokenizer
    except: pass
    
    gc.collect()
    torch.cuda.empty_cache()
    print("‚ú® Ambiente pulito.\n")

print("\nüéâ TUTTE LE RUN PER M1 SONO COMPLETATE CON SUCCESSO!")

üìÇ Caricamento Dataset HF (Eseguito una volta sola)...


Filter Valid Imgs:   0%|          | 0/2373 [00:00<?, ? examples/s]

Filter Valid Imgs:   0%|          | 0/710 [00:00<?, ? examples/s]

üîÑ Formattazione Dataset...


Formatting Train:   0%|          | 0/2373 [00:00<?, ? examples/s]

Formatting Val:   0%|          | 0/710 [00:00<?, ? examples/s]

‚úÖ Dataset Caricato e Formattato. Train: 2373 | Val: 710

üöÄ AVVIO SESSIONE DI TRAINING SU 5 SEED: [101, 285, 3692, 92, 7708]

############################################################
üé¨ RUN 1/5 | SEED CORRENTE: 101
############################################################
üîí Fissaggio Seed Globali a 101...
üìÇ Cartella Output Run: outputs/Qwen2.5-VL-M1-Detection_Seed_101
‚è≥ Inizializzazione Modello (Seed 101)...
==((====))==  Unsloth 2026.2.1: Fast Qwen2_5_Vl patching. Transformers: 4.57.6. vLLM: 0.6.3.
   \\   /|    Tesla V100S-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Making `model.base_model.model.model` require gradients
Unsloth: Model does no

The model is already on multiple devices. Skipping the move to device specified in `args`.


üî• Avvio Training Seed 101...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,373 | Num Epochs = 5 | Total steps = 745
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 47,589,376 of 8,339,756,032 (0.57% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.1434,0.112191
2,0.1259,0.108429
3,0.1125,0.107707
4,0.1103,0.108699
5,0.1119,0.108912


üìù Step:   10 | Epoch: 0.07 | Loss: 4.8023
üìù Step:   20 | Epoch: 0.13 | Loss: 4.1079
üìù Step:   30 | Epoch: 0.20 | Loss: 2.9097
üìù Step:   40 | Epoch: 0.27 | Loss: 2.2009
üìù Step:   50 | Epoch: 0.34 | Loss: 1.7176
üìù Step:   60 | Epoch: 0.40 | Loss: 1.1295
üìù Step:   70 | Epoch: 0.47 | Loss: 0.4293
üìù Step:   80 | Epoch: 0.54 | Loss: 0.1805
üìù Step:   90 | Epoch: 0.61 | Loss: 0.1538
üìù Step:  100 | Epoch: 0.67 | Loss: 0.1432
üìù Step:  110 | Epoch: 0.74 | Loss: 0.1389
üìù Step:  120 | Epoch: 0.81 | Loss: 0.1321
üìù Step:  130 | Epoch: 0.88 | Loss: 0.1353
üìù Step:  140 | Epoch: 0.94 | Loss: 0.1434


Unsloth: Not an error, but Qwen2_5_VLForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


üìù Step:  150 | Epoch: 1.01 | Loss: 0.1333
üìù Step:  160 | Epoch: 1.07 | Loss: 0.1339
üìù Step:  170 | Epoch: 1.14 | Loss: 0.1339
üìù Step:  180 | Epoch: 1.21 | Loss: 0.1474
üìù Step:  190 | Epoch: 1.28 | Loss: 0.1229
üìù Step:  200 | Epoch: 1.34 | Loss: 0.1357
üìù Step:  210 | Epoch: 1.41 | Loss: 0.1352
üìù Step:  220 | Epoch: 1.48 | Loss: 0.1269
üìù Step:  230 | Epoch: 1.55 | Loss: 0.1285
üìù Step:  240 | Epoch: 1.61 | Loss: 0.1212
üìù Step:  250 | Epoch: 1.68 | Loss: 0.1377
üìù Step:  260 | Epoch: 1.75 | Loss: 0.1235
üìù Step:  270 | Epoch: 1.81 | Loss: 0.1235
üìù Step:  280 | Epoch: 1.88 | Loss: 0.1293
üìù Step:  290 | Epoch: 1.95 | Loss: 0.1259
üìù Step:  300 | Epoch: 2.01 | Loss: 0.1278
üìù Step:  310 | Epoch: 2.08 | Loss: 0.1176
üìù Step:  320 | Epoch: 2.15 | Loss: 0.1217
üìù Step:  330 | Epoch: 2.22 | Loss: 0.1364
üìù Step:  340 | Epoch: 2.28 | Loss: 0.1232
üìù Step:  350 | Epoch: 2.35 | Loss: 0.1132
üìù Step:  360 | Epoch: 2.42 | Loss: 0.1214
üìù Step:

The model is already on multiple devices. Skipping the move to device specified in `args`.


üî• Avvio Training Seed 285...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,373 | Num Epochs = 5 | Total steps = 745
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 47,589,376 of 8,339,756,032 (0.57% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.1434,0.112202
2,0.126,0.108385
3,0.1125,0.107674
4,0.1103,0.108579
5,0.112,0.108598


üìù Step:   10 | Epoch: 0.07 | Loss: 4.8033
üìù Step:   20 | Epoch: 0.13 | Loss: 4.1093
üìù Step:   30 | Epoch: 0.20 | Loss: 2.9140
üìù Step:   40 | Epoch: 0.27 | Loss: 2.2052
üìù Step:   50 | Epoch: 0.34 | Loss: 1.7183
üìù Step:   60 | Epoch: 0.40 | Loss: 1.1232
üìù Step:   70 | Epoch: 0.47 | Loss: 0.4189
üìù Step:   80 | Epoch: 0.54 | Loss: 0.1796
üìù Step:   90 | Epoch: 0.61 | Loss: 0.1533
üìù Step:  100 | Epoch: 0.67 | Loss: 0.1430
üìù Step:  110 | Epoch: 0.74 | Loss: 0.1387
üìù Step:  120 | Epoch: 0.81 | Loss: 0.1321
üìù Step:  130 | Epoch: 0.88 | Loss: 0.1352
üìù Step:  140 | Epoch: 0.94 | Loss: 0.1434
üìù Step:  150 | Epoch: 1.01 | Loss: 0.1333
üìù Step:  160 | Epoch: 1.07 | Loss: 0.1338
üìù Step:  170 | Epoch: 1.14 | Loss: 0.1339
üìù Step:  180 | Epoch: 1.21 | Loss: 0.1473
üìù Step:  190 | Epoch: 1.28 | Loss: 0.1228
üìù Step:  200 | Epoch: 1.34 | Loss: 0.1357
üìù Step:  210 | Epoch: 1.41 | Loss: 0.1351
üìù Step:  220 | Epoch: 1.48 | Loss: 0.1267
üìù Step:

The model is already on multiple devices. Skipping the move to device specified in `args`.


üî• Avvio Training Seed 3692...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,373 | Num Epochs = 5 | Total steps = 745
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 47,589,376 of 8,339,756,032 (0.57% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.1433,0.112202
2,0.1259,0.10833
3,0.1124,0.107865
4,0.1103,0.108784
5,0.1119,0.108924


üìù Step:   10 | Epoch: 0.07 | Loss: 4.8040
üìù Step:   20 | Epoch: 0.13 | Loss: 4.1052
üìù Step:   30 | Epoch: 0.20 | Loss: 2.9080
üìù Step:   40 | Epoch: 0.27 | Loss: 2.2000
üìù Step:   50 | Epoch: 0.34 | Loss: 1.7154
üìù Step:   60 | Epoch: 0.40 | Loss: 1.1235
üìù Step:   70 | Epoch: 0.47 | Loss: 0.4194
üìù Step:   80 | Epoch: 0.54 | Loss: 0.1791
üìù Step:   90 | Epoch: 0.61 | Loss: 0.1526
üìù Step:  100 | Epoch: 0.67 | Loss: 0.1430
üìù Step:  110 | Epoch: 0.74 | Loss: 0.1387
üìù Step:  120 | Epoch: 0.81 | Loss: 0.1321
üìù Step:  130 | Epoch: 0.88 | Loss: 0.1352
üìù Step:  140 | Epoch: 0.94 | Loss: 0.1433
üìù Step:  150 | Epoch: 1.01 | Loss: 0.1332
üìù Step:  160 | Epoch: 1.07 | Loss: 0.1338
üìù Step:  170 | Epoch: 1.14 | Loss: 0.1339
üìù Step:  180 | Epoch: 1.21 | Loss: 0.1472
üìù Step:  190 | Epoch: 1.28 | Loss: 0.1228
üìù Step:  200 | Epoch: 1.34 | Loss: 0.1357
üìù Step:  210 | Epoch: 1.41 | Loss: 0.1352
üìù Step:  220 | Epoch: 1.48 | Loss: 0.1267
üìù Step:

The model is already on multiple devices. Skipping the move to device specified in `args`.


üî• Avvio Training Seed 92...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,373 | Num Epochs = 5 | Total steps = 745
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 47,589,376 of 8,339,756,032 (0.57% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.1433,0.112126
2,0.126,0.108541
3,0.1124,0.107768
4,0.1103,0.108632
5,0.1118,0.108652


üìù Step:   10 | Epoch: 0.07 | Loss: 4.8032
üìù Step:   20 | Epoch: 0.13 | Loss: 4.1107
üìù Step:   30 | Epoch: 0.20 | Loss: 2.9113
üìù Step:   40 | Epoch: 0.27 | Loss: 2.2013
üìù Step:   50 | Epoch: 0.34 | Loss: 1.7149
üìù Step:   60 | Epoch: 0.40 | Loss: 1.1215
üìù Step:   70 | Epoch: 0.47 | Loss: 0.4210
üìù Step:   80 | Epoch: 0.54 | Loss: 0.1797
üìù Step:   90 | Epoch: 0.61 | Loss: 0.1534
üìù Step:  100 | Epoch: 0.67 | Loss: 0.1431
üìù Step:  110 | Epoch: 0.74 | Loss: 0.1388
üìù Step:  120 | Epoch: 0.81 | Loss: 0.1320
üìù Step:  130 | Epoch: 0.88 | Loss: 0.1352
üìù Step:  140 | Epoch: 0.94 | Loss: 0.1433
üìù Step:  150 | Epoch: 1.01 | Loss: 0.1332
üìù Step:  160 | Epoch: 1.07 | Loss: 0.1339
üìù Step:  170 | Epoch: 1.14 | Loss: 0.1339
üìù Step:  180 | Epoch: 1.21 | Loss: 0.1473
üìù Step:  190 | Epoch: 1.28 | Loss: 0.1228
üìù Step:  200 | Epoch: 1.34 | Loss: 0.1356
üìù Step:  210 | Epoch: 1.41 | Loss: 0.1352
üìù Step:  220 | Epoch: 1.48 | Loss: 0.1267
üìù Step:

The model is already on multiple devices. Skipping the move to device specified in `args`.


üî• Avvio Training Seed 7708...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,373 | Num Epochs = 5 | Total steps = 745
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 47,589,376 of 8,339,756,032 (0.57% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss


üìù Step:   10 | Epoch: 0.07 | Loss: 4.8030
üìù Step:   20 | Epoch: 0.13 | Loss: 4.1080
üìù Step:   30 | Epoch: 0.20 | Loss: 2.9080
üìù Step:   40 | Epoch: 0.27 | Loss: 2.1999
üìù Step:   50 | Epoch: 0.34 | Loss: 1.7117
üìù Step:   60 | Epoch: 0.40 | Loss: 1.1156
üìù Step:   70 | Epoch: 0.47 | Loss: 0.4184
üìù Step:   80 | Epoch: 0.54 | Loss: 0.1795
üìù Step:   90 | Epoch: 0.61 | Loss: 0.1531
üìù Step:  100 | Epoch: 0.67 | Loss: 0.1431
üìù Step:  110 | Epoch: 0.74 | Loss: 0.1389


KeyboardInterrupt: 