<a href="https://colab.research.google.com/github/benedettoscala/ifttt-code-generator/blob/main/nl2gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers evaluate datasets
!pip install rouge_score  # se vuoi ancora utilizzare eventuali metriche di testo, non strettamente necessario per GPT-2


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [3

In [12]:
import os
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

import evaluate
import nltk
import math

# Se non hai la risorsa 'punkt' di NLTK installata, esegui:
nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!git clone https://github.com/benedettoscala/ifttt-code-generator
%cd ifttt-code-generator/
!git pull

Cloning into 'ifttt-code-generator'...
remote: Enumerating objects: 110, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 110 (delta 58), reused 25 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (110/110), 14.61 MiB | 18.44 MiB/s, done.
Resolving deltas: 100% (58/58), done.
/content/ifttt-code-generator
Already up to date.


In [4]:
# Carica il dataset
csv_path = "datasets/cleaned_and_combined.csv"
df = pd.read_csv(csv_path)

# Rimuovi righe con valori mancanti e duplicati
df.dropna(subset=["cleaned_description", "filter_code"], inplace=True)
df.drop_duplicates(subset=["cleaned_description", "filter_code"], inplace=True)

# Esempio: unisci description e code in un'unica stringa.
# Usa un token o un separatore specifico per aiutare il modello a distinguere
# le due parti (es: "###" o "<|endoftext|>")
def create_text_prompt(desc, code):
    return f"Description:\n{desc}\n###\nCode:\n{code}"

df["text"] = df.apply(
    lambda row: create_text_prompt(row["cleaned_description"], row["filter_code"]),
    axis=1
)

# Divisione train/test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Conversione in Dataset Hugging Face
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print("Train set size:", len(dataset["train"]))
print("Test set size:", len(dataset["test"]))

Train set size: 134
Test set size: 34


In [5]:
model_checkpoint = "gpt2"  # o "gpt2-medium", "gpt2-large", ecc.

tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
# Imposta un token di padding se non definito
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Imposta una lunghezza massima (ad esempio 512, da valutare in base a GPT-2)
max_length = 256

def tokenize_function(examples):
    # Ritorna un unico dict con input_ids e attention_mask
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

tokenized_datasets


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 134
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34
    })
})

In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal Language Modeling
)


In [7]:
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

# Aggiungiamo eventuali token se abbiamo aggiunto un token di pad
model.resize_token_embeddings(len(tokenizer))


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [8]:
rouge_score = evaluate.load("rouge")
bleu_score  = evaluate.load("bleu")
meteor_score = evaluate.load("meteor")

def postprocess_text(preds, labels):
    """
    - Rimuove spazi superflui
    - Segmenta in frasi per calcolare ROUGE in modo corretto
    """
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
    return preds, labels


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # In GPT-2, di solito non usiamo -100 come mask (come in seq2seq),
    # ma se presente, lo sostituiamo con il pad_token_id
    labels[labels == -100] = tokenizer.pad_token_id

    # Argmax sui logits per ottenere la sequenza predetta
    predictions = np.argmax(logits, axis=-1)

    # Decodifica in stringhe
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-processing (rimozione spazi, split in frasi)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Calcolo delle metriche
    # 1) ROUGE
    rouge_results = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    # 2) BLEU
    # La metrica BLEU in `evaluate` richiede `references` come lista di liste
    bleu_results = bleu_score.compute(
        predictions=decoded_preds,
        references=[[lbl] for lbl in decoded_labels]
    )
    # 3) METEOR
    meteor_results = meteor_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    # Organizza i risultati
    result = {}
    # ROUGE
    result["rouge1"] = round(rouge_results["rouge1"] * 100, 2)
    result["rouge2"] = round(rouge_results["rouge2"] * 100, 2)
    result["rougeL"] = round(rouge_results["rougeL"] * 100, 2)
    # BLEU
    result["bleu"] = round(bleu_results["bleu"] * 100, 2)
    # METEOR
    result["meteor"] = round(meteor_results["meteor"] * 100, 2)

    return result


In [32]:
training_args = TrainingArguments(
    output_dir="./gpt2-ifttt",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",   # Esegui evaluation alla fine di ogni epoca
    save_strategy="epoch",         # Salva un checkpoint a ogni epoca
    num_train_epochs=30,            # Cambia secondo le tue necessità
    per_device_train_batch_size=4, # Batch size, adattalo alla tua GPU
    per_device_eval_batch_size=4,
    logging_steps=50,
    save_total_limit=2,            # Tieni solo gli ultimi 2 checkpoint
    fp16=torch.cuda.is_available(), # Usa half precision se possibile
    report_to="none",               # Disabilita WandB o altri logger
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # se vuoi, altrimenti None
)


  trainer = Trainer(


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,Meteor
1,No log,1.917171,49.43,23.73,44.81,36.32,53.76
2,1.496000,1.778151,52.81,28.53,48.26,39.37,57.54
3,1.284000,1.782334,52.69,29.53,48.66,40.72,58.23
4,1.284000,1.736189,55.32,32.38,51.01,42.28,61.38
5,1.047300,1.804302,55.12,33.43,51.26,43.31,61.62
6,0.854300,1.846444,54.64,32.73,51.23,42.11,61.24
7,0.854300,1.919691,54.53,31.61,50.9,42.4,60.54
8,0.684900,1.917195,55.24,33.63,51.69,42.17,61.42
9,0.594400,1.957487,54.75,33.93,51.55,41.05,61.26
10,0.594400,2.001669,55.45,35.14,52.63,47.42,62.4


In [14]:
results = trainer.evaluate()
print("Final eval_loss:", results["eval_loss"])
print("Perplexity:", math.exp(results["eval_loss"]))


Final eval_loss: 1.914318561553955
Perplexity: 6.78231549140008


In [28]:
%cd ..


/content/ifttt-code-generator


In [31]:
from transformers import pipeline

# Carica (o ri-carica) il modello fine-tunato
# Se hai salvato i checkpoint in `./gpt2-ifttt`, puoi caricare da lì
inference_model = GPT2LMHeadModel.from_pretrained("gpt2-ifttt/checkpoint-102")
inference_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-ifttt/checkpoint-102")

# Pipeline di text generation
generator = pipeline(
    "text-generation",
    model=inference_model,
    tokenizer=inference_tokenizer,
    pad_token_id=inference_tokenizer.eos_token_id
)

# Esempio di prompt: solo la "descrizione"
prompt = "Description:\nCreate an applet that saves new photos from my phone to Google Drive.\n###\nCode:\n"

results = generator(prompt, max_length=256, num_return_sequences=1)
print(results[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Description:
Create an applet that saves new photos from my phone to Google Drive.
###
Code:
const pictures = NewPhotoInfo.newPhotoByTag.PhotoItemUrl; const photosToPhoto = pictures; var timeOfDay = Meta.currentUserTime.hour();  const albumName = pictures.indexOf("https://s",0);  const albumArtistName = pictures.join("https://s",0);   photosToPhoto.setPhotoUrl("https:/*");  Photos.savePhotoImageUrl("https:".format(albumName)) {    Dropbox.createPhotoList(albumName); }  const title = pictures.substring(0,6);    photosToPhoto.setPhotoUrl("https:";   Dropbox.createPhotoList(title));     Photos.savePhotoImageUrl("https:/*"; }  const activity = Video.createActivity.createPhotoList(osPhotoData);     IfNotifications.sendNotification("Not an Imprint photo") {    Photos.sendNotification.setLocation("https:".format(osPhotoData.searchString("\/")[0-9
