<a href="https://colab.research.google.com/github/benedettoscala/ifttt-code-generator/blob/main/fine_tuning_mistral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture

!pip install transformers==4.36.2
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install datasets==2.16.0
!pip install sentencepiece
!pip install -U bitsandbytes
!pip install rouge_score

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch
import pandas as pd
from datasets import Dataset


In [3]:
from google.colab import userdata
secret_hf = userdata.get('HUGGINGFACE_TOKEN')
!huggingface-cli login --token $secret_hf

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `prova` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `prova`


# Preprocessing dei dati

In [4]:
!git clone https://github.com/benedettoscala/ifttt-code-generator
%cd ifttt-code-generator/
!git pull

fatal: destination path 'ifttt-code-generator' already exists and is not an empty directory.
/content/ifttt-code-generator
Already up to date.


In [5]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

In [6]:
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import BitsAndBytesConfig

#Configurazione modello base e quantizzazione
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

#Caricamento del modello e tokenizer
print("Caricamento del modello base...")
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

print("Caricamento del tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Prepara il modello per k-bit training (disabilita gradienti su pesi int4 ecc.)
model = prepare_model_for_kbit_training(model)

# 3. Configurazione LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
    # i moduli dei transformer li ho controllati su hugging face, sono questi (godo)
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()


Caricamento del modello base...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Caricamento del tokenizer...
trainable params: 23,068,672 || all params: 7,264,800,768 || trainable%: 0.3175


In [7]:
# Caricamento e pulizia del dataset
csv_path = "datasets/cleaned_and_combined.csv"
df = pd.read_csv(csv_path)

df.dropna(subset=["cleaned_description", "filter_code"], inplace=True)

#drop duplicates
df.drop_duplicates(subset=["cleaned_description", "filter_code"], inplace=True)

#Suddivisione train   e val
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
eval_dataset  = Dataset.from_pandas(eval_df)

In [8]:
train_dataset

Dataset({
    features: ['cleaned_description', 'filter_code', '__index_level_0__'],
    num_rows: 134
})

In [9]:
from transformers import DataCollatorWithPadding
# Funzione di tokenizzazione
def tokenize_function(examples):
    separator = "\n###\n"

    # Concateno desc + code
    full_text = [
        desc + separator + code
        for desc, code in zip(examples["cleaned_description"], examples["filter_code"])
    ]

    # Tokenizza con padding e truncation "coerenti"
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=512,
        padding="max_length"  # cosi ottengo shape costanti
    )

    # Calcolo la lunghezza del prompt con gli stessi identici parametri
    prompt_text = [
        desc + separator
        for desc in examples["cleaned_description"]
    ]
    tokenized_prompt = tokenizer(
        prompt_text,
        truncation=True,
        max_length=512,
        padding="max_length"  # stesse impostazioni
    )
    prompt_lengths = [
        sum(p_id != tokenizer.pad_token_id for p_id in p_ids)
        for p_ids in tokenized_prompt["input_ids"]
    ]

    # Costruisco le label: maschero la parte del prompt con -100
    labels = []
    for i, seq in enumerate(tokenized["input_ids"]):
        prompt_len = prompt_lengths[i]
        # Il prompt è su N token e la rimanente parte su (512 - N) token
        # ATTENZIONE: se usi "padding=max_length" la seq avrà sempre lunghezza 512
        # e i token in eccesso (se esiste) saranno solo pad.
        # Esempio: seq[prompt_len:] serve a prendere la parte del "code".
        masked_labels = [-100]*prompt_len + seq[prompt_len:]
        labels.append(masked_labels)

    tokenized["labels"] = labels
    return tokenized




train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset  = eval_dataset.map(tokenize_function,  batched=True)

# Data collator con padding a sinistra
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='longest'  # la dimensione massima del batch la decide a runtime
)

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
from transformers import Seq2SeqTrainingArguments, Trainer
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Impostazioni di training specifiche per Seq2Seq
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=1.0,
    save_steps=100,
    logging_steps=2,
    eval_strategy="epoch",  # Puoi anche usare "epoch"
    load_best_model_at_end=False,
    save_total_limit=3,
    fp16=False,
    bf16=True,
    report_to="wandb",
    #predict_with_generate=True  # Abilitare la generazione delle predizioni
)

# Funzione per il calcolo delle metriche
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Se predictions è un tuple, prendiamo la prima parte (logits)
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Se predictions è un tensor, trasformiamolo in numpy
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.detach().cpu().numpy()

    # Se i predictions contengono logits invece di id di token, facciamo argmax
    if predictions.dtype not in [int, 'int32', 'int64']:
        predictions = predictions.argmax(axis=-1)

    # *** ECCO LA CORREZIONE FONDAMENTALE ***
    # Sostituiamo tutti i -100 nelle label con il pad_token_id
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    # Decodifica
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Calcolo metriche BLEU, METEOR, ROUGE ecc.
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from nltk.translate.meteor_score import meteor_score
    from rouge_score import rouge_scorer

    # BLEU
    bleu_scores = [
        sentence_bleu(
            [ref.split()], pred.split(),
            smoothing_function=SmoothingFunction().method1
        )
        for pred, ref in zip(decoded_preds, decoded_labels)
    ]
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    # METEOR
    meteor_scores = [
        meteor_score([ref.split()], pred.split())
        for pred, ref in zip(decoded_preds, decoded_labels)
    ]
    avg_meteor = sum(meteor_scores) / len(meteor_scores)

    # ROUGE
    rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = [
        rouge.score(ref, pred)
        for pred, ref in zip(decoded_preds, decoded_labels)
    ]
    avg_rouge1 = sum(score["rouge1"].fmeasure for score in rouge_scores) / len(rouge_scores)
    avg_rouge2 = sum(score["rouge2"].fmeasure for score in rouge_scores) / len(rouge_scores)
    avg_rougeL = sum(score["rougeL"].fmeasure for score in rouge_scores) / len(rouge_scores)

    return {
        "bleu": avg_bleu,
        "meteor": avg_meteor,
        "rouge1": avg_rouge1,
        "rouge2": avg_rouge2,
        "rougeL": avg_rougeL,
    }

# Creazione Seq2SeqTrainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Avvia il training
trainer.train()

# Salvataggio finale LoRA + quantization
trainer.save_model("./results/best_model")


[34m[1mwandb[0m: Currently logged in as: [33mb-scala1[0m ([33mb-scala1-universit-degli-studi-di-salerno[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Bleu,Meteor,Rouge1,Rouge2,Rougel
0,0.4159,0.401365,0.115153,0.346912,0.476729,0.222758,0.426599
1,0.334,0.315954,0.171528,0.404337,0.528974,0.279273,0.488278
2,0.262,0.305761,0.176069,0.411823,0.541916,0.298516,0.504169



zip error: Nothing to do! (results.zip)


In [15]:
# prompt: zip the results folder

!zip -r results.zip results


  adding: results/ (stored 0%)
  adding: results/best_model/ (stored 0%)
  adding: results/best_model/adapter_model.safetensors (deflated 8%)
  adding: results/best_model/adapter_config.json (deflated 55%)
  adding: results/best_model/README.md (deflated 66%)
  adding: results/best_model/training_args.bin (deflated 51%)
  adding: results/best_model/special_tokens_map.json (deflated 73%)
  adding: results/best_model/tokenizer.model (deflated 55%)
  adding: results/best_model/tokenizer_config.json (deflated 68%)
  adding: results/best_model/tokenizer.json (deflated 85%)
  adding: results/checkpoint-48/ (stored 0%)
  adding: results/checkpoint-48/rng_state.pth (deflated 25%)
  adding: results/checkpoint-48/scheduler.pt (deflated 56%)
  adding: results/checkpoint-48/adapter_model.safetensors (deflated 8%)
  adding: results/checkpoint-48/trainer_state.json (deflated 73%)
  adding: results/checkpoint-48/adapter_config.json (deflated 55%)
  adding: results/checkpoint-48/README.md (deflated 66

In [18]:
# prompt: empty gpu memory

import torch

# Empty GPU cache
torch.cuda.empty_cache()

# Optionally, check GPU memory usage after emptying the cache
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 1         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   5055 MiB |  10025 MiB |  52108 GiB |  52103 GiB |
|       from large pool |   4710 MiB |   9548 MiB |  52043 GiB |  52039 GiB |
|       from small pool |    345 MiB |    521 MiB |     64 GiB |     64 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   5055 MiB |  10025 MiB |  52108 GiB |  52103 GiB |
|       from large pool |   4710 MiB |   9548 MiB |  52043 GiB |  52039 GiB |
|       from small pool |    345 MiB |    521 MiB |     64 GiB |     64 GiB |
|---------------------------------------------------------------

# Mi sono rotto i coglioni
Voglio solo giocare ad Elden Ring...

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
import torch
# Percorso del modello fine-tunato
finetuned_model_path = "./results/best_model"
basemodel_path = "mistralai/Mistral-7B-Instruct-v0.2"
# Caricamento del modello base


#create offload directory if it doesn't exist
if not os.path.exists("./offload"):
    os.makedirs("./offload")


# Caricamento del modello e del tokenizer fine-tunati
print("Caricamento del modello fine-tunato...")
from peft import PeftModel

bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

model = AutoModelForCausalLM.from_pretrained(
    basemodel_path,
    torch_dtype=torch.float16,         # or torch.bfloat16, depending on your setup
    quantization_config=bnb_config,           # 4-bit quantization
    device_map="auto",
    offload_folder="./offload"         # <= Provide a folder path
)

model = PeftModel.from_pretrained(
    model,
    finetuned_model_path,
)


tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)


Caricamento del modello fine-tunato...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:

# Funzione per generare il codice IFTTT
def generate_ifttt_code(prompt, max_length=512, num_return_sequences=1):
    # Tokenizzazione dell'input
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generazione del codice
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
    )

    # Decodifica del risultato
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded_outputs

# Prompt per generare il codice IFTTT
prompt = "if the current hour is 17, send a tweet"
generated_code = generate_ifttt_code(prompt)

# Stampa del codice generato
print("\nCodice IFTTT generato:")
for i, code in enumerate(generated_code, 1):
    print(f"{code}")
