<a href="https://colab.research.google.com/github/benedettoscala/ifttt-code-generator/blob/main/test_and_compare_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install -U bitsandbytes

In [4]:
!git clone https://github.com/benedettoscala/ifttt-code-generator
%cd ifttt-code-generator/
!git pull

Cloning into 'ifttt-code-generator'...
remote: Enumerating objects: 159, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 159 (delta 0), reused 0 (delta 0), pack-reused 157 (from 1)[K
Receiving objects: 100% (159/159), 14.76 MiB | 41.54 MiB/s, done.
Resolving deltas: 100% (93/93), done.
/content/ifttt-code-generator
Already up to date.


In [5]:
%cd ..

/content


In [7]:
import pandas as pd
import torch
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sklearn.model_selection import train_test_split
from google.colab import drive
from peft import PeftModel

drive.mount('/content/drive')

# Load the dataset and split it
df = pd.read_csv("ifttt-code-generator/datasets/cleaned_and_combined.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Extract test set prompts
prompts = test_df["cleaned_description"].tolist()
actual_codes = test_df["filter_code"].tolist()

# Function to generate text with GPT-2
def generate_with_gpt2(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")
    generated_codes = []

    for prompt in prompts:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
        output_ids = model.generate(input_ids, max_length=128, num_return_sequences=1)
        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        generated_codes.append(generated_text)

    del model
    del tokenizer
    torch.cuda.empty_cache()

    return generated_codes

# Function to generate text with BART
def generate_with_bart(model_path):
    generator = pipeline("text2text-generation", model=model_path, tokenizer=model_path)
    generated_codes = [generator(f"ifttt_prompt: {prompt}", max_length=128)[0]["generated_text"] for prompt in prompts]

    del generator
    torch.cuda.empty_cache()

    return generated_codes

# Function to generate text with Mistral

def generate_with_mistral(finetuned_model_path, basemodel_path):
    if not os.path.exists("./offload"):
        os.makedirs("./offload")




    print("Caricamento del modello fine-tunato...")
    bnb_config = BitsAndBytesConfig(
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False
    )

    model = AutoModelForCausalLM.from_pretrained(
        basemodel_path,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
        offload_folder="./offload"
    )

    model = PeftModel.from_pretrained(model, finetuned_model_path)
    tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

    generated_codes = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=1,
        )
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        generated_codes.append(decoded_outputs[0])

    del model
    del tokenizer
    torch.cuda.empty_cache()

    return generated_codes

# Generate with Mistral
finetuned_model_path = "/content/drive/Shareddrives/NLPMODELS/mistral/checkpoint-20"
basemodel_path = "mistralai/Mistral-7B-Instruct-v0.2"
generated_codes_mistral = generate_with_mistral(finetuned_model_path, basemodel_path)


# Generate with BART
model_bart_path = "/content/drive/Shareddrives/NLPMODELS/nl2sql_bart_final/checkpoint-340"
generated_codes_bart = generate_with_bart(model_bart_path)


# Generate with GPT-2
model_gpt2_path = "/content/drive/Shareddrives/NLPMODELS/gpt2model/checkpoint-340"
generated_codes_gpt2 = generate_with_gpt2(model_gpt2_path)



# Create a DataFrame with results
results_df = pd.DataFrame({
    "Prompt": prompts,
    "Generated Code GPT-2": generated_codes_gpt2,
    "Generated Code BART": generated_codes_bart,
    "Generated Code Mistral": generated_codes_mistral,
    "Actual Code": actual_codes
})


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Caricamento del modello fine-tunato...


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [8]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c6e2388e4256c737293942ba1dc9e9e7171c549d7da1eb3c419a8d806846e51e
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [34]:
results_df

Unnamed: 0,Prompt,Generated Code GPT-2,Generated Code BART,Generated Code Mistral,Actual Code
0,This applet will append Medium bookmarks to a ...,This applet will append Medium bookmarks to a ...,const pairs = Object.getOwnPropertyNames(Twitt...,This applet will append Medium bookmarks to a ...,const bookmark = (Medium.postBookmarkedByYou a...
1,If Netatmo weather station reports rain amount...,If Netatmo weather station reports rain amount...,if(parseFloat(Netatmo.rainYesterdayAmount.Meas...,If Netatmo weather station reports rain amount...,if(parseFloat(Netatmo.rainYesterdayAmount.Meas...
2,When a specific user posts a tweet that has a ...,When a specific user posts a tweet that has a ...,var tweet = Twitter.newTweetByUser.Text; var r...,When a specific user posts a tweet that has a ...,var tweet = Twitter.newTweetByUser.Text; var ...
3,When tomorrow's forecast has a low temperature...,When tomorrow's forecast has a low temperature...,if (parseInt(Weather.tomorrowsWeatherAtTime.Lo...,When tomorrow's forecast has a low temperature...,if (parseInt(Weather.tomorrowsWeatherAtTime.Lo...
4,Set a Yeelight Scene on exiting an area betwee...,Set a Yeelight Scene on exiting an area betwee...,if (Meta.currentUserTime.hour() >= 18 || Meta....,Set a Yeelight Scene on exiting an area betwee...,if (Meta.currentUserTime.hour() >= 18 || Meta....
5,If Boundary Alarm is disarmed and it is nightt...,If Boundary Alarm is disarmed and it is nightt...,var timeOfDay = Meta.currentUserTime.hour(); ...,If Boundary Alarm is disarmed and it is nightt...,var season = Meta.currentUserTime.month(); va...
6,Which ever color tier your latest Super Chat m...,Which ever color tier your latest Super Chat m...,"if (Youtube.newSuperchat.ColorTier == ""Light b...",Which ever color tier your latest Super Chat m...,"if (Youtube.newSuperchat.ColorTier == ""Light b..."
7,Send an SMS message when the Link collar is ch...,Send an SMS message when the Link collar is ch...,var minute = Meta.triggerTime.minute() var mi...,Send an SMS message when the Link collar is ch...,var minute = Meta.triggerTime.minute() var mi...
8,"We have your date night planned ahead of time,...","We have your date night planned ahead of time,...",let optionOne = Math.floor((Math.random() * Tr...,"We have your date night planned ahead of time,...",let optionOne = Math.floor((Math.random() * Tr...
9,This applet will save 1p for every 10 metres y...,This applet will save 1p for every 10 metres y...,var distance = parseInt(Strava.newActivityByYo...,This applet will save 1p for every 10 metres y...,var distance = parseInt(Strava.newActivityByYo...


In [30]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

def evaluate_generated_text(generated_codes, actual_codes):
    bleu_scores = []
    meteor_scores = []
    rouge_l_scores = []
    rouge_1_scores = []
    rouge_2_scores = []

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scorer_1 = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
    scorer_2 = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)

    for gen, ref in zip(generated_codes, actual_codes):
        gen_tokens = gen.split()
        ref_tokens = ref.split()

        # Calcolo BLEU (sentence-level)
        bleu = sentence_bleu([ref_tokens], gen_tokens)

        # Calcolo METEOR (sentence-level)
        meteor = single_meteor_score(ref_tokens, gen_tokens)

        # Calcolo ROUGE-L (f-measure)
        rouge_l = scorer.score(ref, gen)["rougeL"].fmeasure
        # calcolo ROUGE-1
        rouge_1 = scorer_1.score(ref, gen)["rouge1"].fmeasure
        # calcolo ROUGE-2
        rouge_2 = scorer_2.score(ref, gen)["rouge2"].fmeasure

        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        rouge_l_scores.append(rouge_l)
        rouge_1_scores.append(rouge_1)
        rouge_2_scores.append(rouge_2)

    # Media su tutte le frasi del dataset di test
    mean_bleu = sum(bleu_scores) / len(bleu_scores)
    mean_meteor = sum(meteor_scores) / len(meteor_scores)
    mean_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    mean_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    mean_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)

    return mean_bleu, mean_meteor, mean_rouge_l, mean_rouge_1, mean_rouge_2


In [28]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Evaluate models
gpt2_scores = evaluate_generated_text(generated_codes_gpt2, actual_codes)
bart_scores = evaluate_generated_text(generated_codes_bart, actual_codes)
mistral_scores = evaluate_generated_text(generated_codes_mistral, actual_codes)

metrics_df = pd.DataFrame(
    {
        "Metric": ["BLEU", "METEOR", "ROUGE-L", "ROUGE-1", "ROUGE-2"],
        "GPT-2": gpt2_scores,
        "BART": bart_scores,
        "Mistral": mistral_scores
    }
)


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [33]:
metrics_df

Unnamed: 0,Metric,GPT-2,BART,Mistral
0,BLEU,0.10691,0.418573,0.06266
1,METEOR,0.374292,0.616489,0.284503
2,ROUGE-L,0.300527,0.712151,0.282811
3,ROUGE-1,0.33631,0.721117,0.313391
4,ROUGE-2,0.232983,0.640528,0.193302
