## Generate evaluation scores for performance testing

This cell runs the model on 100 random samples from the dataset and scores each output using multiple metrics:

- **BLEU**: Measures n-gram overlap between generated and reference commit messages.
- **ROUGE-L**: Measures the longest common subsequence shared by the generated and reference messages (reported as an F-measure).
- **METEOR**: Similar to BLEU but accounts for synonyms using WordNet.
- **BERTScore**: Uses BERT embeddings to measure semantic similarity.
- **RAGAs (Answer Correctness)**: Evaluates how correct the generated message is in context, using the OpenAI API.

What this does:
- Loads the tokenizer and resizes the model’s embeddings.
- Defines a `safe_generate()` function to handle long inputs and decode outputs.
- Samples 100 rows from the cleaned dataset and generates commit messages.
- Computes all the metrics above and saves the results to a JSON file.
- Downloads the final file: `sample_metrics_with_ragas_and_bertscore.json`.

> **Note:** You must have the correct dataset path defined in the line:
> `df = pd.read_csv("data/cleaned_python_commit_dataset.csv")`.  
> Adjust it if your file is located elsewhere or named something else.

> Make sure you’ve pasted a working OpenAI API key into the `api_key` variable in the cell below (it is exported as `os.environ["OPENAI_API_KEY"]`) before running this.
> You’ll also need to have **active credits or a paid plan** on your OpenAI account — RAGAs uses GPT-based evaluation under the hood.


In [None]:
# ------------------------------------------------------------
# Eval on 100 samples — BLEU, ROUGE-L, METEOR, BERTScore, RAGAs Answer Correctness → JSON + download
# ------------------------------------------------------------
!pip install -q "nltk==3.8.1" rouge-score bert-score ragas datasets

import torch, pandas as pd, json, nltk, random, time, os
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from transformers import PreTrainedTokenizerFast
from tqdm import tqdm
from google.colab import files
from bert_score import BERTScorer
from ragas.metrics import answer_correctness
from ragas import evaluate
from datasets import Dataset

# Set OpenAI API Key
# Paste your key into `api_key`, or leave it empty to keep an OPENAI_API_KEY
# that is already exported in the environment. Assigning the empty string
# unconditionally would clobber a key the user set before running this cell.
api_key = ""
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
elif not os.environ.get("OPENAI_API_KEY"):
    # Warn early instead of failing only after all generations have run.
    print("Warning: OPENAI_API_KEY is not set; the RAGAs evaluation step needs it.")

# WordNet download for METEOR
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

# --- tokenizer setup --------------------------------------------------------------
# Build the tokenizer from the custom BPE file, then register the padding / end-of-message
# special tokens and the diff separator before resizing the model's embedding table.
tok = PreTrainedTokenizerFast(tokenizer_file="custom_bpe_tokenizer.json")
special_tokens = {"pad_token": "<pad>", "eos_token": "<endOfCommitMessage>"}
tok.add_special_tokens(special_tokens)
tok.add_tokens(["<endOfDiff>"])
model.resize_token_embeddings(len(tok))

# Context budget: reserve GEN_SPACE positions of the model's window for generated tokens.
MAX_CTX = model.config.n_positions
GEN_SPACE = 50
SAFE_LIM = MAX_CTX - GEN_SPACE

# Run inference on whichever device the model weights live on (e.g. 'cuda:0').
device = next(model.parameters()).device

# --- helper functions --------------------------------------------------------------
def safe_generate(model, tokenizer, diff_text, device="cuda"):
    """Generate a commit message for a single diff using greedy decoding.

    The prompt is ``diff_text`` followed by the ``<endOfDiff>`` separator.
    Prompts longer than ``SAFE_LIM`` tokens are truncated from the left so
    that ``GEN_SPACE`` positions stay free for the generated message.

    Args:
        model: Model exposing ``generate``; assumed to be decoder-only, i.e.
            the returned sequence starts with the prompt ids.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``pad_token_id``.
        diff_text: Raw diff text to summarise.
        device: Device the input tensor is moved to.

    Returns:
        The generated text up to (not including) the first
        ``<endOfCommitMessage>`` marker, stripped of surrounding whitespace.
    """
    sep, eos = "<endOfDiff>", "<endOfCommitMessage>"
    ids = tokenizer.encode(diff_text + sep)
    if len(ids) > SAFE_LIM:
        # Keep the tail so the trailing separator survives truncation.
        ids = ids[-SAFE_LIM:]
    inp = torch.tensor([ids]).to(device)
    out = model.generate(
        inp,
        max_length=len(ids) + GEN_SPACE,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.encode(eos)[0],
        do_sample=False,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3
    )[0].tolist()
    # Decode only the continuation. Splitting the full decoded text on the
    # separator picks the wrong segment when the diff itself contains
    # "<endOfDiff>", and raises IndexError if the separator is not found.
    txt = tokenizer.decode(out[len(ids):])
    return txt.split(eos, 1)[0].strip()

# Load your dataset
df = pd.read_csv("data/cleaned_python_commit_dataset.csv")

# --- sampling setup --------------------------------------------------------------
# Cap the sample size at the dataset size: DataFrame.sample raises ValueError
# when n exceeds the number of rows (sampling without replacement).
N_SAMPLES = 100
sample_df = (
    df.sample(n=min(N_SAMPLES, len(df)), random_state=42)
      .reset_index(drop=True)
      .fillna({"diff": "", "commit_message": ""})
)
# Force both text columns to str so the tokenizer and metrics never see non-string values.
sample_df["diff"] = sample_df["diff"].astype(str)
sample_df["commit_message"] = sample_df["commit_message"].astype(str)

# Scorers are built once and reused for every sample.
smooth_fn = SmoothingFunction().method1
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)

# Buffers to build a HuggingFace Dataset for RAGAs
ragas_examples = {"question": [], "answer": [], "ground_truth": []}
results = []

# Pair every diff with its reference message and wrap the pairs in a progress bar.
pairs = list(zip(sample_df["diff"], sample_df["commit_message"]))
bar = tqdm(pairs, total=len(sample_df), desc="Scoring")

for diff_text, ref in bar:
    hyp = safe_generate(model, tok, diff_text, device=device)

    # The lexical metrics are only computed for a non-blank reference;
    # a blank reference scores 0.0 on all three.
    if not ref.strip():
        bleu = rougeL = meteor = 0.0
    else:
        ref_tokens, hyp_tokens = ref.split(), hyp.split()
        bleu = sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smooth_fn)
        rougeL = rouge.score(ref, hyp)["rougeL"].fmeasure
        meteor = meteor_score([ref_tokens], hyp_tokens)

    # The scorer returns three values per batch; only the third (F1) is kept.
    _, _, f1_scores = bert_scorer.score([hyp], [ref])
    bert_f1 = f1_scores[0].item()

    # Show the current sample's scores next to the progress bar.
    bar.set_postfix(
        BLEU=f"{bleu:.4f}",
        ROUGE_L=f"{rougeL:.4f}",
        METEOR=f"{meteor:.4f}",
        BERTScore=f"{bert_f1:.4f}",
    )

    # Queue the sample for the RAGAs evaluation.
    for field, value in (("question", diff_text), ("answer", hyp), ("ground_truth", ref)):
        ragas_examples[field].append(value)

    # One record per sample for the JSON export.
    sample_metrics = {
        "bleu": bleu,
        "rouge_l": rougeL,
        "meteor": meteor,
        "bert_score_f1": bert_f1,
    }
    results.append({
        "diff": diff_text,
        "generated_commit": hyp,
        "label_commit": ref,
        **sample_metrics,
    })

bar.close()

# --- run RAGAs Answer Correctness with bounded retries ------------------------
# Retrying forever hangs the notebook on permanent failures (e.g. a missing or
# invalid API key), so give up after a fixed number of attempts and re-raise.
RAGAS_MAX_ATTEMPTS = 5
RAGAS_RETRY_DELAY = 5  # seconds between attempts

ragas_ds = Dataset.from_dict(ragas_examples)

for attempt in range(1, RAGAS_MAX_ATTEMPTS + 1):
    try:
        ragas_res = evaluate(ragas_ds, metrics=[answer_correctness])
    except Exception as e:
        if attempt == RAGAS_MAX_ATTEMPTS:
            # Out of attempts: surface the real error instead of looping.
            raise
        if isinstance(e, TimeoutError):
            print(f"Timeout occurred, retrying in {RAGAS_RETRY_DELAY} seconds...")
        else:
            print(f"Unexpected error: {e}. Retrying in {RAGAS_RETRY_DELAY} seconds...")
        time.sleep(RAGAS_RETRY_DELAY)
    else:
        break  # success

# Attach each RAGAs score to the result record at the same position.
df_corr = ragas_res.to_pandas()
for record, score in zip(results, df_corr["answer_correctness"]):
    record["answer_correctness"] = score

# --- save and download -------------------------------------------------------
# Serialise all per-sample records as pretty-printed UTF-8 JSON (non-ASCII kept as-is),
# then trigger a browser download of the file from Colab.
json_file = "sample_metrics_with_ragas_and_bertscore.json"
with open(json_file, "w", encoding="utf-8") as f:
    f.write(json.dumps(results, indent=2, ensure_ascii=False))

files.download(json_file)


## Write test scores to a .txt file for manual analysis

This cell takes the JSON file of model evaluation results (`sample_metrics_with_ragas_and_bertscore.json`) and converts it into a readable text file.

What it does:
- Loads all evaluation samples from the JSON file.
- For each sample, it writes:
  - The generated commit message
  - The ground-truth commit message
  - All evaluation scores (BLEU, ROUGE-L, METEOR, BERTScore-F1, AnswerCorrectness)
  - The associated Git diff
- Outputs everything in a clean, readable format to `all_diffs.txt`, with clear separators between samples.

This is useful if you want to manually review model outputs and compare them to the reference messages.

In [None]:
import json
import pathlib

in_path = pathlib.Path("sample_metrics_with_ragas_and_bertscore.json")  # JSON array written by the evaluation cell
out_path = pathlib.Path("all_diffs.txt")  # human-readable report

# Metric keys, in the order they appear on the "Scores" line.
score_keys = ("bleu", "rouge_l", "meteor", "bert_score_f1", "answer_correctness")

# 1) Read the full JSON array into memory
samples = json.loads(in_path.read_text(encoding="utf-8"))

# 2) Emit one readable block per sample
with out_path.open("w", encoding="utf-8") as dst:
    for number, sample in enumerate(samples, start=1):
        # A missing score is rendered as 0.0000.
        score_line = " / ".join(f"{sample.get(key, 0.0):.4f}" for key in score_keys)
        header = (
            f"---- SAMPLE #{number} ----\n"
            f"Generated commit : {sample.get('generated_commit','')}\n"
            f"Ground-truth     : {sample.get('label_commit','')}\n"
            "Scores (BLEU / ROUGE-L / METEOR / BERTScore-F1 / AnswerCorrectness) : "
            f"{score_line}\n"
            "Diff:\n"
        )
        dst.write(header)
        dst.write(sample.get("diff", ""))
        dst.write("\n\n")  # blank line between samples

print(f"Wrote {len(samples)} samples to {out_path.resolve()}")

## Create plots for results and save

This cell loads the evaluation results from `sample_metrics_with_ragas_and_bertscore.json` and visualizes the distribution of all five metrics across the 100 samples.

What it does:
- Loads the JSON into a pandas DataFrame.
- Extracts:  
  - `bleu`  
  - `rouge_l`  
  - `meteor`  
  - `bert_score_f1`  
  - `answer_correctness`  
- Plots a histogram for each metric to show how scores are distributed across the evaluated samples.
- Saves each plot as a separate PNG file:
  - `bleu4_distribution.png`
  - `rouge_l_distribution.png`
  - `meteor_distribution.png`
  - `bert_score_f1_distribution.png`
  - `answer_correctness_distribution.png`

This gives a quick visual sense of how well the model is performing across different evaluation dimensions.


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt

json_path = "sample_metrics_with_ragas_and_bertscore.json"   # <-- change if yours lives elsewhere

metric_columns = ["bleu", "rouge_l", "meteor", "bert_score_f1", "answer_correctness"]

# Load the per-sample records and keep only the metric columns; missing scores count as 0.
with open(json_path, encoding="utf-8") as f:
    df = pd.DataFrame(json.load(f))
scores = df[metric_columns].fillna(0.0)

# (column, display label, output file) for each histogram, in plotting order.
histogram_specs = [
    ("bleu", "BLEU‑4", "bleu4_distribution.png"),
    ("rouge_l", "ROUGE‑L F1", "rouge_l_distribution.png"),
    ("meteor", "METEOR", "meteor_distribution.png"),
    ("bert_score_f1", "BERTScore‑F1", "bert_score_f1_distribution.png"),
    ("answer_correctness", "Answer Correctness", "answer_correctness_distribution.png"),
]

# One figure per metric: 30-bin histogram, titled and saved as its own PNG.
for column, label, png_name in histogram_specs:
    plt.figure()
    plt.hist(scores[column], bins=30)
    plt.title(f"Distribution of {label} scores")
    plt.xlabel(label)
    plt.ylabel("Frequency")
    plt.savefig(png_name)