<a href="https://colab.research.google.com/github/benedettoscala/ifttt-code-generator/blob/main/try_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from peft import PeftModel
import os

In [9]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

####################################################################
# 1. FUNZIONE DI INFERENZA SINGOLO PROMPT CON GPT-2 (causal LM)
####################################################################
def generate_single_with_gpt2(
    model_path: str,
    prompt: str,
    max_length: int = 128,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.95
) -> str:
    """
    Esegue l'inferenza con un modello GPT-2 fine-tunato su un singolo prompt.
    """

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Caricamento tokenizer e modello
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

    # Tokenizzazione
    input_data = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    # Generazione
    output_ids = model.generate(
        **input_data,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )
    # Decodifica
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Pulizia RAM GPU
    del model, tokenizer
    torch.cuda.empty_cache()

    return generated_text

####################################################################
# 2. FUNZIONE DI INFERENZA SINGOLO PROMPT CON BART (seq2seq)
####################################################################
def generate_single_with_bart(
    model_path: str,
    prompt: str,
    max_length: int = 128,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.95
) -> str:
    """
    Esegue l'inferenza con un modello BART fine-tunato su un singolo prompt.
    Utilizza una pipeline di text2text-generation.
    """

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Creazione pipeline (text2text-generation)
    generator = pipeline(
        "text2text-generation",
        model=model_path,
        tokenizer=model_path,
        device=0 if device == "cuda" else -1
    )

    formatted_prompt = f"ifttt_prompt: {prompt}"

    # Generazione
    outputs = generator(
        formatted_prompt,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )

    generated_text = outputs[0]["generated_text"]

    del generator
    torch.cuda.empty_cache()

    return generated_text

####################################################################
# 3. FUNZIONE DI INFERENZA SINGOLO PROMPT CON MISTRAL (LoRA + QLoRA)
####################################################################
def generate_single_with_mistral(
    finetuned_model_path: str,
    base_model_path: str,
    prompt: str,
    max_length: int = 128,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.95
) -> str:
    """
    Esegue l'inferenza con Mistral 7B Instruct (o simile) usando LoRA + QLoRA (4-bit).
    Carica il modello base in 4-bit e vi applica l'adapter LoRA.
    Genera per un singolo prompt.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Creiamo la cartella di offload se non esiste
    if not os.path.exists("./offload"):
        os.makedirs("./offload")

    # Configurazione 4-bit
    bnb_config = BitsAndBytesConfig(
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False
    )

    # Caricamento del modello base in 4-bit
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
        offload_folder="./offload"
    )

    # Caricamento LoRA
    model = PeftModel.from_pretrained(base_model, finetuned_model_path)
    tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

    model.eval()

    # Tokenizzazione
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generazione
    output_ids = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )

    # Decodifica
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Pulizia della RAM
    del model, tokenizer, base_model
    torch.cuda.empty_cache()

    return generated_text

In [4]:
# Prompt di esempio
my_prompt = "When i recieve an email, send a message to my phone"

In [None]:


###################################################
# A) Inferenz con GPT-2 Fine-tunato
###################################################
model_gpt2_path = "/content/drive/Shareddrives/NLPMODELS/gpt2model-2/checkpoint-850"

generated_gpt2 = generate_single_with_gpt2(
    model_path=model_gpt2_path,
    prompt=my_prompt,
    max_length=128,
    temperature=1.0,
    top_k=50,
    top_p=0.95
)
print("\n[GPT-2] Generated Code:\n", generated_gpt2)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



[GPT-2] Generated Code:
 When i recieve an email, send a message to my phone
###
var re1 = Object.getOwnPropertyNames(Email).indexOf('Re1').toLowerCase()  var re2 = Object.getOwnPropertyNames(Email).indexOf('Re2').toLowerCase()   var incomingMessage = 'From: ' + email[re1].toLowerCase()   if (incomingMessage) {    Phone.sendMeEmail.skip()  }  else {   Phone.sendMeEmail.setMessage(`Its still early so we'll send an


In [7]:
###################################################
# B) Inferenz con BART Fine-tunato
###################################################
model_bart_path = "/content/drive/Shareddrives/NLPMODELS/nl2sql_bart_final/checkpoint-340"

generated_bart = generate_single_with_bart(
  model_path=model_bart_path,
  prompt=my_prompt,
  max_length=128,
  temperature=1.0,
  top_k=50,
  top_p=0.95
)
print("\n[BART] Generated Code:\n", generated_bart)

Device set to use cuda:0
Both `max_new_tokens` (=200) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



[BART] Generated Code:
 var hour = Meta.triggerTime.hour()  var timeHour = 24   if(hour%timeHour > 0 && hour%10 > 0 )    Email.sendMeEmail.skip()


In [7]:
###########################################
    # C) Inferenz con Mistral (LoRA + QLoRA)
    ###################################################
finetuned_model_path = "/content/drive/Shareddrives/NLPMODELS/mistral/checkpoint-20"
base_model_path = "mistralai/Mistral-7B-Instruct-v0.2"

generated_mistral = generate_single_with_mistral(
  finetuned_model_path=finetuned_model_path,
  base_model_path=base_model_path,
  prompt=my_prompt,
  max_length=128,
  temperature=1.0,
  top_k=50,
  top_p=0.95
)
print("\n[Mistral LoRA] Generated Code:\n", generated_mistral)


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[Mistral LoRA] Generated Code:
 When i recieve an email, send a message to my phone.
###
var subject = Trigger.EntrySubject  var message = "You have a new email from " + subject   IfNotifications.sendNotification.setMessage(message)   IfNotifications.sendNotification.sendNotification()   AndroidDevice.sendNotification.setMessage(message)   AndroidDevice.sendNotification.sendNotification()   iOSDevice.sendNotification.setMessage(message)   iOSDevice.sendNotification.sendNotification()   EmailDevice.sendNotification.setMessage(message)   EmailDevice.sendNotification.send
