<a href="https://colab.research.google.com/github/benedettoscala/ifttt-code-generator/blob/main/fine_tuning_mistral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Install the libraries used by this notebook. `%%capture` (which must stay on
# the first line of the cell) hides the pip output.
# NOTE(review): only transformers and datasets are pinned. The `-U` installs
# below are unpinned and may presumably pull in releases that replace the
# pinned transformers version — confirm, and pin them for reproducibility.

!pip install transformers==4.36.2
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install datasets==2.16.0
!pip install sentencepiece
!pip install -U bitsandbytes
!pip install rouge_score

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch
import pandas as pd
from datasets import Dataset


In [3]:
# Read the Hugging Face access token from the Colab secret named
# 'HUGGINGFACE_TOKEN' so that it is never hardcoded in the notebook.
from google.colab import userdata
secret_hf = userdata.get('HUGGINGFACE_TOKEN')
# IPython substitutes `$secret_hf` with the Python variable before running
# the shell command.
# NOTE(review): the token travels as a command-line argument — confirm this is
# acceptable for the runtime in use.
!huggingface-cli login --token $secret_hf

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `prova` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `prova`


# Preprocessing dei dati

In [4]:
!git clone https://github.com/benedettoscala/ifttt-code-generator
%cd ifttt-code-generator/
!git pull

Cloning into 'ifttt-code-generator'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 35 (delta 13), reused 13 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 43.26 KiB | 4.81 MiB/s, done.
Resolving deltas: 100% (13/13), done.
Filtering content: 100% (4/4), 171.23 MiB | 6.81 MiB/s, done.
/content/ifttt-code-generator
Already up to date.


In [5]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

In [6]:
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import BitsAndBytesConfig

# Base model and bitsandbytes quantization settings.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    # The bnb_4bit_* options only describe HOW 4-bit weights are stored and
    # computed; `load_in_4bit=True` is the flag that actually requests 4-bit
    # loading. It was missing, so quantization was never explicitly enabled.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

# Load the model and the tokenizer.
print("Caricamento del modello base...")
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

print("Caricamento del tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prepare the quantized model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA configuration: adapters on the attention projections and on gate_proj.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
    # Module names were checked by the author against the model on Hugging Face.
)

model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping with LoRA.
model.print_trainable_parameters()


Caricamento del modello base...


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Caricamento del tokenizer...


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

trainable params: 23,068,672 || all params: 7,264,800,768 || trainable%: 0.3175


In [7]:
# Load the CSV, then discard rows that lack a description or a filter code
# and rows that repeat an already-seen (description, code) pair.
csv_path = "datasets/cleaned_and_combined.csv"
pair_columns = ["cleaned_description", "filter_code"]

df = pd.read_csv(csv_path)
df = df.dropna(subset=pair_columns)
df = df.drop_duplicates(subset=pair_columns)

# 80/20 train/validation split with a fixed seed.
train_df, eval_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap both pandas splits as Hugging Face datasets.
train_dataset, eval_dataset = (
    Dataset.from_pandas(split) for split in (train_df, eval_df)
)

In [8]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of (description, filter code) pairs for causal-LM training.

    A decoder-only model is trained on a single sequence, so every example is
    encoded as ``description tokens + code tokens + EOS``. The previous
    implementation used ``text_target``, which yields an ``input_ids`` sequence
    (the description) and an unrelated ``labels`` sequence (the code): the two
    were not aligned position by position, and the padding inside ``labels``
    was not masked out of the loss.

    Args:
        examples: batch dict with the string lists ``"cleaned_description"``
            and ``"filter_code"``.
        max_length: maximum number of tokens kept per example; longer
            sequences are cut on the right.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, one
        list per example. Sequences are left unpadded so that the data collator
        can pad each batch dynamically. In ``labels`` the description positions
        are set to -100 so that only the code (and the closing EOS) contributes
        to the loss.

    Note:
        If a description alone reaches ``max_length``, the whole target is
        truncated away and that example carries no supervised token.
    """
    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    pairs = zip(examples["cleaned_description"], examples["filter_code"])

    for description, code in pairs:
        # Prompt part: encoded with the tokenizer's default special tokens.
        prompt_ids = tokenizer(description)["input_ids"]
        # Target part: no special tokens in the middle of the sequence; an
        # explicit EOS teaches the model where the generated code ends.
        target_ids = tokenizer(code, add_special_tokens=False)["input_ids"]
        target_ids = target_ids + [tokenizer.eos_token_id]

        input_ids = (prompt_ids + target_ids)[:max_length]
        # -100 is the index ignored by the loss: mask out the prompt.
        labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]

        batch["input_ids"].append(input_ids)
        batch["attention_mask"].append([1] * len(input_ids))
        batch["labels"].append(labels)

    return batch

# Tokenize both splits batch-wise (training split first, then validation).
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Collator used by the Trainer to assemble batches; sequence lengths are
# rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
)

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [11]:
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Training settings.
#
# save_steps was 100. The tokenization cell reports 128 training examples;
# with a per-device batch of 2 and 4 accumulation steps that is 16 optimizer
# steps per epoch on a single device, i.e. about 48 steps over 3 epochs. No
# checkpoint was therefore ever written, and `load_best_model_at_end=True`
# had nothing to reload. save_steps must also stay a round multiple of
# eval_steps for `load_best_model_at_end` to be accepted.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,   # effective batch of 8 per device
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=1.0,
    save_steps=10,                   # multiple of eval_steps, reachable within the run
    logging_steps=5,
    evaluation_strategy="steps",
    eval_steps=5,
    load_best_model_at_end=True,
    save_total_limit=3,
    fp16=False,
    bf16=False,
    # NOTE(review): wandb reporting presumably needs the wandb package and a
    # login in this runtime — confirm, since the install cell does not add it.
    report_to="wandb"
)

def _ensure_wordnet():
    """Download the WordNet corpus used by METEOR if it is not available yet."""
    import nltk

    try:
        nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet", quiet=True)


def compute_metrics(eval_pred):
    """Compute BLEU, METEOR and ROUGE between predicted and reference texts.

    Fixes with respect to the previous version:
      * the Trainer hands over logits of shape (batch, seq, vocab) unless they
        are reduced beforehand; they are now turned into token ids with an
        argmax instead of being passed to the tokenizer as they are;
      * for a causal LM the prediction at position ``i`` refers to the token
        at position ``i + 1``, so predictions and labels are shifted by one
        before being compared;
      * positions ignored by the loss (-100) and padding are replaced by the
        pad token in both arrays, so they can be decoded and do not pollute
        the scores;
      * METEOR receives token lists rather than raw strings;
      * an empty evaluation set no longer raises ZeroDivisionError.

    Args:
        eval_pred: pair ``(predictions, labels)`` as supplied by the Trainer.
            ``predictions`` may be logits or already token ids.

    Returns:
        dict with the averaged ``"bleu"``, ``"meteor"``, ``"rouge1"``,
        ``"rouge2"`` and ``"rougeL"`` scores (ROUGE values are F-measures).
    """
    metric_names = ("bleu", "meteor", "rouge1", "rouge2", "rougeL")

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits -> most likely token id per position.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Align next-token predictions with the tokens they are meant to predict.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    # Keep only supervised, non-padding positions; the rest becomes padding,
    # which `skip_special_tokens=True` removes while decoding.
    pad_id = tokenizer.pad_token_id
    scored = (labels != -100) & (labels != pad_id)
    predictions = np.where(scored, predictions, pad_id)
    labels = np.where(scored, labels, pad_id)

    # Decode predictions and labels back to text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    text_pairs = list(zip(decoded_preds, decoded_labels))
    if not text_pairs:
        return {name: 0.0 for name in metric_names}
    n_pairs = len(text_pairs)

    # BLEU (whitespace tokens, smoothed because the sequences are short).
    smoothing = SmoothingFunction().method1
    avg_bleu = sum(
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        for pred, ref in text_pairs
    ) / n_pairs

    # METEOR (expects pre-tokenized hypothesis and references).
    _ensure_wordnet()
    avg_meteor = sum(
        meteor_score([ref.split()], pred.split())
        for pred, ref in text_pairs
    ) / n_pairs

    # ROUGE (operates on the raw strings: reference first, prediction second).
    rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = [rouge.score(ref, pred) for pred, ref in text_pairs]
    avg_rouge = {
        key: sum(score[key].fmeasure for score in rouge_scores) / n_pairs
        for key in ("rouge1", "rouge2", "rougeL")
    }

    return {
        "bleu": avg_bleu,
        "meteor": avg_meteor,
        "rouge1": avg_rouge["rouge1"],
        "rouge2": avg_rouge["rouge2"],
        "rougeL": avg_rouge["rougeL"],
    }

def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted token ids before the Trainer stores them.

    Without this hook the evaluation loop keeps the full-vocabulary logits of
    every evaluation example until the metrics are computed, which is very
    large for a language model evaluated every few steps. Only the argmax over
    the vocabulary is needed to decode text.

    Args:
        logits: model logits for one evaluation batch (or a tuple whose first
            item is the logits tensor).
        labels: labels of the batch; unused, required by the hook signature.

    Returns:
        Tensor of token ids with the vocabulary dimension removed.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Build the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)


  trainer = Trainer(


In [13]:
# Run the fine-tuning loop, then write the final model to disk.
final_model_dir = "./results/best_model"

trainer.train()
trainer.save_model(final_model_dir)

print("Fine del training!")

KeyboardInterrupt: 

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Base checkpoint and the directory where the fine-tuned weights were saved.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
FINETUNED_MODEL_DIR = "./results/best_model"
# float16 here; torch.bfloat16 is an alternative on GPUs that support it.
half_precision = torch.float16

# Tokenizer of the base model, with EOS reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token

# Base model, placed automatically on the available GPU/CPU devices.
model_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=half_precision,
    device_map="auto",
)

# Load the saved LoRA weights on top of the base model (PeftModel), using the
# same dtype as the base model.
model = PeftModel.from_pretrained(
    model_base,
    FINETUNED_MODEL_DIR,
    torch_dtype=half_precision,
)

# Switch the model to evaluation mode.
model.eval()

# Code (or text) generation helper.
def generate_code(prompt, max_new_tokens=128, strip_prompt=False):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: input text (description, instructions, ...).
        max_new_tokens: maximum number of tokens to generate.
        strip_prompt: when True, return only the newly generated text. The
            default (False) keeps the previous behaviour of returning the
            prompt followed by the generation.

    Returns:
        The decoded text, with special tokens removed.
    """
    # Follow the model's own device instead of hard-coding "cuda", which fails
    # on a CPU-only runtime.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,          # sample instead of greedy decoding
            top_p=0.9,               # nucleus (top-p) sampling
            temperature=0.8,         # controls how "creative" the output is
            repetition_penalty=1.1,  # light penalty on repetitions
            # Passed explicitly so that generate() does not have to fall back
            # to a default padding token.
            pad_token_id=tokenizer.pad_token_id,
        )

    sequence = output_tokens[0]
    if strip_prompt:
        # The returned sequence starts with the prompt tokens: drop them.
        sequence = sequence[inputs["input_ids"].shape[1]:]
    return tokenizer.decode(sequence, skip_special_tokens=True)


# Quick manual check: generate code for one sample rule description.
test_prompt = "if the current hour is 17, send a tweet"
generated = generate_code(test_prompt, max_new_tokens=100)

print("PROMPT:\n", test_prompt)
print("\nCODICE GENERATO:\n", generated)
