<a href="https://colab.research.google.com/github/benedettoscala/ifttt-code-generator/blob/main/nl2gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers evaluate datasets
!pip install rouge_score  # se vuoi ancora utilizzare eventuali metriche di testo, non strettamente necessario per GPT-2


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [3

In [2]:
import pandas as pd
import numpy as np
import torch
import os

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

import evaluate
import math

In [3]:
!git clone https://github.com/benedettoscala/ifttt-code-generator
%cd ifttt-code-generator/
!git pull

Cloning into 'ifttt-code-generator'...
remote: Enumerating objects: 107, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 107 (delta 56), reused 25 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (107/107), 14.60 MiB | 20.56 MiB/s, done.
Resolving deltas: 100% (56/56), done.
/content/ifttt-code-generator
Already up to date.


In [5]:
# Carica il dataset
csv_path = "datasets/cleaned_and_combined.csv"
df = pd.read_csv(csv_path)

# Rimuovi righe con valori mancanti e duplicati
df.dropna(subset=["cleaned_description", "filter_code"], inplace=True)
df.drop_duplicates(subset=["cleaned_description", "filter_code"], inplace=True)

# Esempio: unisci description e code in un'unica stringa.
# Usa un token o un separatore specifico per aiutare il modello a distinguere
# le due parti (es: "###" o "<|endoftext|>")
def create_text_prompt(desc, code):
    return f"Description:\n{desc}\n###\nCode:\n{code}"

df["text"] = df.apply(
    lambda row: create_text_prompt(row["cleaned_description"], row["filter_code"]),
    axis=1
)

# Divisione train/test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Conversione in Dataset Hugging Face
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print("Train set size:", len(dataset["train"]))
print("Test set size:", len(dataset["test"]))

Train set size: 134
Test set size: 34


In [6]:
model_checkpoint = "gpt2"  # o "gpt2-medium", "gpt2-large", ecc.

tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
# Imposta un token di padding se non definito
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Imposta una lunghezza massima (ad esempio 512, da valutare in base a GPT-2)
max_length = 256

def tokenize_function(examples):
    # Ritorna un unico dict con input_ids e attention_mask
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

tokenized_datasets


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 134
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34
    })
})

In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal Language Modeling
)


In [8]:
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

# Aggiungiamo eventuali token se abbiamo aggiunto un token di pad
model.resize_token_embeddings(len(tokenizer))


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [9]:
def compute_metrics(eval_preds):
    """
    eval_preds for causal LM: (logits, labels)
    """
    logits, labels = eval_preds
    # Shift i logit e i label per allinearli al "next token prediction"
    # In pratica, i label[i] corrispondono ai logit[i-1] in un modello auto-regressivo.
    # Tuttavia, la Trainer di Transformers a volte gestisce questo automaticamente.
    # Se vogliamo un calcolo manuale più accurato, possiamo farlo qui.
    # Per semplicità, usiamo la cross-entropy media della Trainer.
    # Trainer => "eval_loss" e poi perplexity = exp(eval_loss).
    return {}

# In alternativa, definiremo la perplexity usando un trucco successivo:
# -> la Trainer di HF fornisce "eval_loss" che possiamo sfruttare
#    dopo la chiamata a trainer.evaluate() per calcolare perplexity.


In [10]:
training_args = TrainingArguments(
    output_dir="./gpt2-ifttt",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",   # Esegui evaluation alla fine di ogni epoca
    save_strategy="epoch",         # Salva un checkpoint a ogni epoca
    num_train_epochs=3,            # Cambia secondo le tue necessità
    per_device_train_batch_size=4, # Batch size, adattalo alla tua GPU
    per_device_eval_batch_size=4,
    logging_steps=50,
    save_total_limit=2,            # Tieni solo gli ultimi 2 checkpoint
    fp16=torch.cuda.is_available(), # Usa half precision se possibile
    report_to="none",               # Disabilita WandB o altri logger
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # se vuoi, altrimenti None
)


  trainer = Trainer(


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
results = trainer.evaluate()
print("Final eval_loss:", results["eval_loss"])
print("Perplexity:", math.exp(results["eval_loss"]))


In [None]:
from transformers import pipeline

# Carica (o ri-carica) il modello fine-tunato
# Se hai salvato i checkpoint in `./gpt2-ifttt`, puoi caricare da lì
inference_model = GPT2LMHeadModel.from_pretrained("./gpt2-ifttt")
inference_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-ifttt")

# Pipeline di text generation
generator = pipeline(
    "text-generation",
    model=inference_model,
    tokenizer=inference_tokenizer,
    pad_token_id=inference_tokenizer.eos_token_id
)

# Esempio di prompt: solo la "descrizione"
prompt = "Description:\nCreate an applet that saves new photos from my phone to Google Drive.\n###\nCode:\n"

results = generator(prompt, max_length=100, num_return_sequences=1)
print(results[0]["generated_text"])
