<a href="https://colab.research.google.com/github/benedettoscala/ifttt-code-generator/blob/main/fine_tuning_mistral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
%%capture

# Install the fine-tuning stack; the %%capture cell magic hides pip's output.
# NOTE(review): transformers and datasets are pinned here, but the `-U`
# installs below (peft, accelerate, trl, bitsandbytes) can pull in a newer
# transformers as a dependency and silently override the pin - confirm the
# resolved versions with `pip list` after running this cell.
!pip install transformers==4.36.2
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install datasets==2.16.0
!pip install sentencepiece
!pip install -U bitsandbytes
!pip install rouge_score

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch
import pandas as pd
from datasets import Dataset


In [20]:
from google.colab import userdata
# Read the Hugging Face token from the Colab secret named HUGGINGFACE_TOKEN
# so the token value is never written in the notebook source.
secret_hf = userdata.get('HUGGINGFACE_TOKEN')
# NOTE(review): `$secret_hf` is expanded onto the shell command line, and the
# CLI output is stored with the notebook - clear this cell's output before
# sharing.
!huggingface-cli login --token $secret_hf

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `prova` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `prova`


# Preprocessing dei dati

In [21]:
# Clone the project repository and make it the working directory; later cells
# use paths relative to it (e.g. datasets/cleaned_and_combined.csv, ./results).
# NOTE(review): this cell is not idempotent - re-running it while already
# inside the repository would clone a nested copy and cd into it. Confirm
# before re-executing without a runtime restart.
!git clone https://github.com/benedettoscala/ifttt-code-generator
%cd ifttt-code-generator/
!git pull

Cloning into 'ifttt-code-generator'...
remote: Enumerating objects: 52, done.[K
remote: Counting objects: 100% (52/52), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 52 (delta 19), reused 21 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (52/52), 14.43 MiB | 16.95 MiB/s, done.
Resolving deltas: 100% (19/19), done.
/content/ifttt-code-generator
Already up to date.


In [22]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

In [23]:
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import BitsAndBytesConfig

# 1. Base model and quantization configuration.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    # `load_in_4bit` defaults to False; without it the bnb_4bit_* options
    # below are never applied and the model is loaded unquantized.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

# 2. Load the quantized base model and its tokenizer.
print("Caricamento del modello base...")
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

print("Caricamento del tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model)
# The tokenizer gets EOS as its padding token, with padding appended on the
# right-hand side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prepare the quantized model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# 3. LoRA configuration.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Adapters are attached to the attention projections and to gate_proj.
    # The module names were checked by the author against the model on
    # Hugging Face.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
)

model = get_peft_model(model, lora_config)

# Report how many parameters the LoRA adapters make trainable.
model.print_trainable_parameters()


Caricamento del modello base...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Caricamento del tokenizer...
trainable params: 23,068,672 || all params: 7,264,800,768 || trainable%: 0.3175


In [24]:
# Load and clean the dataset: keep only rows that have both a description and
# the target filter code, and drop exact (description, code) duplicates.
# The cleaning is chained instead of using inplace=True so that re-running
# the cell always starts again from the CSV contents.
csv_path = "datasets/cleaned_and_combined.csv"
df = (
    pd.read_csv(csv_path)
    .dropna(subset=["cleaned_description", "filter_code"])
    .drop_duplicates(subset=["cleaned_description", "filter_code"])
)

# 80/20 train/validation split with a fixed seed for reproducibility.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False keeps the shuffled pandas index from being stored as
# an extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

In [27]:
MAX_SEQ_LENGTH = 512  # upper bound on prompt + code tokens per example


def tokenize_function(examples):
    """Build causal-LM training features from a batch of dataset rows.

    A decoder-only model learns to continue a sequence, so each example is
    encoded as one sequence: the description tokens followed by the code
    tokens and EOS. The labels mirror ``input_ids`` but the description
    positions are set to -100 so the loss is computed on the code only.
    (Tokenizing description and code as two independent sequences, as done
    for encoder-decoder models, misaligns inputs and labels for a causal LM.)

    Args:
        examples: batch dict with "cleaned_description" and "filter_code"
            columns (lists of strings).

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", one unpadded
        list per example, truncated to MAX_SEQ_LENGTH tokens.
    """
    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for description, code in zip(examples["cleaned_description"], examples["filter_code"]):
        # Prompt is tokenized exactly as at inference time (tokenizer defaults).
        prompt_ids = tokenizer(description, add_special_tokens=True)["input_ids"]
        # Target code, followed by EOS so the model learns where to stop.
        code_ids = tokenizer(code, add_special_tokens=False)["input_ids"]
        code_ids = code_ids + [tokenizer.eos_token_id]

        input_ids = (prompt_ids + code_ids)[:MAX_SEQ_LENGTH]
        labels = ([-100] * len(prompt_ids) + code_ids)[:MAX_SEQ_LENGTH]

        features["input_ids"].append(input_ids)
        features["attention_mask"].append([1] * len(input_ids))
        features["labels"].append(labels)
    return features

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset  = eval_dataset.map(tokenize_function,  batched=True)


# Data collator: pads every batch dynamically to its longest sequence
# (rounded up to a multiple of 8) and pads the labels with -100 so padded
# positions never contribute to the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
)

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [28]:
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Training settings, grouped by concern.
training_args = TrainingArguments(
    # Output and checkpointing
    output_dir="./results",
    overwrite_output_dir=True,
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=False,
    # Schedule and batch sizing
    num_train_epochs=6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimizer and learning-rate schedule
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Mixed-precision flags of the Trainer are both left off
    fp16=False,
    bf16=False,
    # Logging and evaluation (no evaluation is run during training)
    logging_steps=5,
    evaluation_strategy="no",
    report_to="wandb",
)

def compute_metrics(eval_pred):
    """Compute average BLEU, METEOR and ROUGE between predictions and labels.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits of shape (batch, seq, vocab), or a tuple
            whose first element is one of those. ``labels`` are token ids
            where -100 marks positions to ignore.

    Returns:
        Dict with the mean "bleu", "meteor", "rouge1", "rouge2" and "rougeL"
        scores (all 0.0 when there are no examples).

    Note:
        NLTK's ``meteor_score`` needs the WordNet corpus to be available
        (``nltk.download("wordnet")``).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.ndim == 3:
        # Raw logits: take the most likely token, then shift by one because
        # the logits at position t are the prediction for token t + 1.
        predictions = predictions.argmax(axis=-1)[:, :-1]
        labels = labels[:, 1:]

    # -100 is not a valid token id; map ignored positions to the pad token so
    # that decoding with skip_special_tokens=True drops them.
    pad_id = tokenizer.pad_token_id
    ignored = labels == -100
    labels = np.where(ignored, pad_id, labels)
    if predictions.shape == labels.shape:
        predictions = np.where(ignored, pad_id, predictions)
    predictions = np.where(predictions < 0, pad_id, predictions)

    # Decode predictions and labels back to text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    pairs = list(zip(decoded_preds, decoded_labels))

    def _mean(values):
        # Guard against an empty evaluation set.
        return sum(values) / len(values) if values else 0.0

    # BLEU (whitespace tokens, smoothed so short outputs do not score 0).
    smoothing = SmoothingFunction().method1
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        for pred, ref in pairs
    ]

    # METEOR: NLTK expects pre-tokenized references and hypothesis.
    meteor_scores = [
        meteor_score([ref.split()], pred.split())
        for pred, ref in pairs
    ]

    # ROUGE
    rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = [rouge.score(ref, pred) for pred, ref in pairs]

    return {
        "bleu": _mean(bleu_scores),
        "meteor": _mean(meteor_scores),
        "rouge1": _mean([score["rouge1"].fmeasure for score in rouge_scores]),
        "rouge2": _mean([score["rouge2"].fmeasure for score in rouge_scores]),
        "rougeL": _mean([score["rougeL"].fmeasure for score in rouge_scores]),
    }

# Build the Trainer: model and hyper-parameters first, then the data
# pipeline, then the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

# Final save to ./results/best_model; the inference cell further down loads
# both the adapter and the tokenizer from this path.
trainer.save_model("./results/best_model")

print("Fine del training!")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113575844440978, max=1.0…

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the archive created by the zip cell
# is written under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# prompt: zip the results and upload it on drive

import os
import zipfile

# Folder whose contents are archived (paths inside the zip are relative to it).
directory_to_zip = "/content/ifttt-code-generator"  # Replace with the actual directory

# Destination archive on the mounted Drive.
zip_file_name = "/content/drive/MyDrive/ifttt_results.zip"

# Walk the folder tree and add every file under its relative path.
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as archive:
    for folder, _subdirs, filenames in os.walk(directory_to_zip):
        for filename in filenames:
            absolute_path = os.path.join(folder, filename)
            relative_path = os.path.relpath(absolute_path, directory_to_zip)
            archive.write(absolute_path, arcname=relative_path)

print(f"Successfully zipped the contents of '{directory_to_zip}' to '{zip_file_name}'")

In [7]:
import os

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Path of the fine-tuned adapter and identifier of the base model it extends.
finetuned_model_path = "./results/best_model"
basemodel_path = "mistralai/Mistral-7B-Instruct-v0.2"

# Folder handed to from_pretrained as `offload_folder`; create it if missing.
os.makedirs("./offload", exist_ok=True)

# Load the base model, then attach the fine-tuned adapter and tokenizer.
print("Caricamento del modello fine-tunato...")

bnb_config = BitsAndBytesConfig(
    # `load_in_4bit` defaults to False; without it the bnb_4bit_* options
    # below are never applied and the model is loaded unquantized.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

model = AutoModelForCausalLM.from_pretrained(
    basemodel_path,
    torch_dtype=torch.float16,         # or torch.bfloat16, depending on your setup
    quantization_config=bnb_config,    # 4-bit quantization
    device_map="auto",
    offload_folder="./offload"         # folder created above
)

# Wrap the base model with the adapter weights saved after training.
model = PeftModel.from_pretrained(
    model,
    finetuned_model_path,
)

# The tokenizer is loaded from the same directory as the adapter.
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)


Caricamento del modello fine-tunato...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [17]:

# Funzione per generare il codice IFTTT
def generate_ifttt_code(prompt, max_length=512, num_return_sequences=1):
    """Sample one or more completions of ``prompt`` from the fine-tuned model.

    Args:
        prompt: natural-language rule description to complete.
        max_length: maximum total length in tokens passed to ``generate``
            (this budget includes the prompt tokens).
        num_return_sequences: number of sampled completions to return.

    Returns:
        List of decoded strings, one per returned sequence. Each string
        contains the prompt followed by the generated continuation.
    """
    # Put the inputs on the model's own device instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        # Explicit padding id: same value generate() falls back to, but
        # without the "Setting `pad_token_id`..." warning on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Example prompt for IFTTT code generation.
prompt = "if the current hour is 17, send a tweet"
generated_code = generate_ifttt_code(prompt)

# Print the generated code.
print("\nCodice IFTTT generato:")
for code in generated_code:
    print(f"{code}")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Codice IFTTT generato:
if the current hour is 17, send a tweet {hour.send = 25.sendhour.send <hour>
