# 1. Demos

This notebook demonstrates a comparison between our trained model and two baselines on some simple demos. The baselines are:
- A baseline model (JetBrains-Research/cmg-codet5-without-history)
- DeepSeek V3

In [1]:
import json
import os

import pandas as pd
import rootutils
import torch

In [2]:
# Find the project root by searching from the current directory for the
# `.project-root` marker, and add it to the Python path so that the
# `src.*` imports in the following cells resolve.
ROOT = rootutils.setup_root(".", ".project-root", pythonpath=True)

In [3]:
from src.demo_inference import load_run, run_inference, fetch_git_changes, run_inference_llm

In [4]:
# Prefer the GPU whenever PyTorch reports one; otherwise run everything on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Our model: restored from a training checkpoint, with no explicit Hydra config path.
t5_efficient_extra_tiny_module, t5_efficient_extra_tiny_datamodule = load_run(
    device=device,
    config_path=None,
    checkpoint_path=ROOT
    / "logs/train/runs/2025-01-24_23-12-54/checkpoints/epoch_023-val_MRR_top5_0.6524.ckpt",
)

# CodeT5 baseline: built from the saved config of an evaluation run; no checkpoint is passed.
codet5_module, codet5_datamodule = load_run(
    device=device,
    config_path=ROOT / "logs/eval/runs/2025-01-25_11-45-44/.hydra/config.yaml",
    checkpoint_path=None,
)

# DeepSeek V3 baseline: likewise built from an evaluation-run config only.
deepseek_v3_module, deepseek_v3_datamodule = load_run(
    device=device,
    config_path=ROOT / "logs/eval/runs/2025-01-27_10-31-09/.hydra/config.yaml",
    checkpoint_path=None,
)

  checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))  # nosec B614
[32m2025-01-28 17:39:34.190[0m | [1mINFO    [0m | [36msrc.data.commit_chronicle.llm_api[0m:[36msetup[0m:[36m157[0m - [1mTotal tokens in dataset: 10710853 Tokens per sample: 650.641052120034[0m


### Example 1: Hello World

In [9]:
# Collect the per-file changes of the hello-world demo repository; the bare
# `changes` expression renders the resulting list as the cell output.
changes = fetch_git_changes(ROOT / "data/demos/01-hello-world")
changes

[{'change_type': 'MODIFY',
  'old_path': 'main.go',
  'new_path': 'main.go',
  'diff': 'diff --git a/main.go b/main.go\nindex 1b5cdd1..b2f344e 100644\n--- a/main.go\n+++ b/main.go\n@@ -7,10 +7,13 @@ type User struct {\n}\nfunc GetUserName(user *User) string {\n- return user.Name // BUG: If `user` is nil, this will panic!\n+ if user == nil {\n+ return "Unknown User" // FIX: Handle the nil case\n+ }\n+ return user.Name\n}\nfunc main() {\nvar user *User // user is nil\n- fmt.Println("User Name:", GetUserName(user)) // This will cause a runtime panic!\n+ fmt.Println("User Name:", GetUserName(user)) // Now it won\'t panic!\n}\n'}]

##### Our Model: T5 Efficient Extra Tiny

In [10]:
# Generate a commit message for `changes` with our T5 Efficient Extra Tiny checkpoint.
run_inference(t5_efficient_extra_tiny_module, t5_efficient_extra_tiny_datamodule, changes, device)

Unnamed: 0,input,prediction
0,main.go diff --git a/main.go b/main.go index 1...,Refactor GetUserName to return nil string


##### CMG CodeT5 Model

In [11]:
# Same changes, this time through the CMG CodeT5 baseline.
run_inference(codet5_module, codet5_datamodule, changes, device)

Unnamed: 0,input,prediction
0,main.go\ndiff --git a/main.go b/main.go\nindex...,Fix panic in main.go


##### DeepSeek V3

In [12]:
# Same changes through the DeepSeek V3 baseline. Unlike `run_inference`, the LLM
# helper is called without a device argument; the returned table is shown explicitly.
output = run_inference_llm(deepseek_v3_module, deepseek_v3_datamodule, changes)
display(output)

Unnamed: 0,system_content,user_content,diff,prediction
0,You are a helpful assistant that generates com...,Code changes:\nmain.go\ndiff --git a/main.go b...,main.go\ndiff --git a/main.go b/main.go\nindex...,Fix GetUserName to handle nil user case


### Example 2: Simple Bug Fix

In [27]:
# Collect the per-file changes of the simple-bug-fix demo repository; the bare
# `changes` expression renders the resulting list as the cell output.
changes = fetch_git_changes(ROOT / "data/demos/02-simple-bug-fix")
changes

[{'change_type': 'MODIFY',
  'old_path': 'main.go',
  'new_path': 'main.go',
  'diff': 'diff --git a/main.go b/main.go\nindex 4f59ffd..e19e180 100644\n--- a/main.go\n+++ b/main.go\n@@ -7,6 +7,9 @@ type User struct {\n}\nfunc GetUserName(user *User) string {\n+ if user == nil {\n+ return "Unknown"\n+ }\nreturn user.Name\n}\n'}]

##### Our Model: T5 Efficient Extra Tiny

In [28]:
# Generate a commit message for the bug-fix changes with our T5 Efficient Extra Tiny checkpoint.
run_inference(t5_efficient_extra_tiny_module, t5_efficient_extra_tiny_datamodule, changes, device)

Unnamed: 0,input,prediction
0,main.go diff --git a/main.go b/main.go index 4...,Add nil check to User struct


##### CMG CodeT5 Model

In [29]:
# Same bug-fix changes through the CMG CodeT5 baseline.
run_inference(codet5_module, codet5_datamodule, changes, device)

Unnamed: 0,input,prediction
0,main.go\ndiff --git a/main.go b/main.go\nindex...,Add nil check to GetUserName


##### DeepSeek V3

In [30]:
# Same bug-fix changes through the DeepSeek V3 baseline; the returned table is shown explicitly.
output = run_inference_llm(deepseek_v3_module, deepseek_v3_datamodule, changes)
display(output)

Unnamed: 0,system_content,user_content,diff,prediction
0,You are a helpful assistant that generates com...,Code changes:\nmain.go\ndiff --git a/main.go b...,main.go\ndiff --git a/main.go b/main.go\nindex...,Fix GetUserName to handle nil user case


### Example 3: New Feature

In [31]:
# Collect the per-file changes of the new-feature demo repository (this one touches
# more than one file); the bare `changes` expression renders the list as the cell output.
changes = fetch_git_changes(ROOT / "data/demos/03-new-feature")
changes

[{'change_type': 'MODIFY',
  'old_path': 'main.go',
  'new_path': 'main.go',
  'diff': 'diff --git a/main.go b/main.go\nindex 059a52c..cafd97b 100644\n--- a/main.go\n+++ b/main.go\n@@ -8,4 +8,11 @@ import (\nfunc main() {\nu := user.User{Name: "Alice", Email: "alice@example.com"}\nfmt.Println(u.GetDetails())\n+\n+ // Simulate login attempts\n+ u.IncrementLoginAttempts()\n+ u.IncrementLoginAttempts()\n+\n+ fmt.Println("After login attempts:")\n+ fmt.Println(u.GetDetails())\n}\n'},
 {'change_type': 'MODIFY',
  'old_path': 'user.go',
  'new_path': 'user.go',
  'diff': 'diff --git a/user.go b/user.go\nindex f8d3410..b6d881e 100644\n--- a/user.go\n+++ b/user.go\n@@ -5,8 +5,13 @@ import "fmt"\ntype User struct {\nName string\nEmail string\n+ LoginAttempts int\n}\nfunc (u *User) GetDetails() string {\n- return fmt.Sprintf("Name: %s, Email: %s", u.Name, u.Email)\n+ return fmt.Sprintf("Name: %s, Email: %s, Login Attempts: %d", u.Name, u.Email, u.LoginAttempts)\n+}\n+\n+func (u *User) IncrementL

##### Our Model: T5 Efficient Extra Tiny

In [32]:
# Generate a commit message for the new-feature changes with our T5 Efficient Extra Tiny checkpoint.
run_inference(t5_efficient_extra_tiny_module, t5_efficient_extra_tiny_datamodule, changes, device)

Unnamed: 0,input,prediction
0,main.go diff --git a/main.go b/main.go index 0...,Add login attempts to user.go


##### CMG CodeT5 Model

In [33]:
# Same new-feature changes through the CMG CodeT5 baseline.
run_inference(codet5_module, codet5_datamodule, changes, device)

Unnamed: 0,input,prediction
0,main.go\ndiff --git a/main.go b/main.go\nindex...,simulate login attempts


##### DeepSeek V3

In [34]:
# Same new-feature changes through the DeepSeek V3 baseline; the returned table is shown explicitly.
output = run_inference_llm(deepseek_v3_module, deepseek_v3_datamodule, changes)
display(output)

Unnamed: 0,system_content,user_content,diff,prediction
0,You are a helpful assistant that generates com...,Code changes:\nmain.go\ndiff --git a/main.go b...,main.go\ndiff --git a/main.go b/main.go\nindex...,Add LoginAttempts field and IncrementLoginAtte...


# 2. Model Evaluation Framework

The notebook implements two key evaluation functions:
- `generate_baseline_message()`: Generates commit messages using the baseline CodeT5 model
- `evaluate_messages()`: Uses an OpenAI chat model (`gpt-4o-mini`) to score a generated message against the target message on a scale of 1-10

The evaluation considers:
- The input code diff
- Messages from both models
- The target (actual) commit message

In [None]:
import json
import os

import openai
import pandas as pd
import rootutils
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Resolve the project root and expose it on the Python path so `src` is importable.
ROOT = rootutils.setup_root(".", ".project-root", pythonpath=True)

from src.demo_inference import load_run

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Our model, restored from a training checkpoint.
# NOTE(review): the demo section calls `load_run` with explicit `config_path=None`
# and `device=device`; here only the checkpoint is passed -- confirm that the
# defaults of `load_run` give the intended config and device.
checkpoint_path = (
    ROOT / "logs/train/runs/2025-01-24_23-12-54/checkpoints/epoch_023-val_MRR_top5_0.6524.ckpt"
)
our_model, datamodule = load_run(checkpoint_path)

# Baseline: the published CodeT5 commit-message model and its tokenizer,
# with the model moved onto the selected device.
baseline_tokenizer = AutoTokenizer.from_pretrained("JetBrains-Research/cmg-codet5-without-history")
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(
    "JetBrains-Research/cmg-codet5-without-history"
).to(device)

# Samples used later for the side-by-side evaluation.
csv_path = ROOT / "notebooks/comparisons.csv"
samples = pd.read_csv(csv_path)

# The API key is taken from the environment rather than being written into the notebook.
openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def generate_baseline_message(diff: str) -> str:
    """Produce a commit message for ``diff`` with the baseline seq2seq model.

    The diff is tokenized with truncation to at most 512 tokens, the resulting
    tensors are moved to the notebook-level ``device``, and the first generated
    sequence is decoded with special tokens stripped.

    Args:
        diff: The textual diff to describe.

    Returns:
        The decoded text of the first sequence returned by ``generate``.
    """
    encoded = baseline_tokenizer(diff, return_tensors="pt", truncation=True, max_length=512)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        generated = baseline_model.generate(**on_device)

    first_sequence = generated[0]
    return baseline_tokenizer.decode(first_sequence, skip_special_tokens=True)


def evaluate_messages(baseline_message: str, target_message: str, diff: str) -> dict:
    """Score a generated commit message against the target using an OpenAI chat model.

    The prompt embeds the diff, the generated message (passed as
    ``baseline_message`` and labelled "Model Message" in the prompt) and the
    target message, and asks for a 1-10 score with a short explanation.

    Args:
        baseline_message: The generated commit message to be scored.
        target_message: The reference commit message.
        diff: The code diff both messages describe.

    Returns:
        The parsed JSON object from the model's reply. The prompt requests the
        keys ``"score"`` and ``"explanation"``; their presence is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    prompt = f"""Given a code diff and two commit messages (one from a model and one target message), 
    evaluate the model message on a scale of 1-10 based on how well it captures the essence of the target message
    while maintaining clarity and relevance to the changes.

    Code diff:
    {diff}

    Model Message: {baseline_message}
    Target Message: {target_message}

    Provide your response in JSON format:
    {{
        "score": <score>,
        "explanation": "<brief explanation of the score>"
    }}
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        # JSON mode is selected with "json_object"; "json" is not an accepted
        # response_format type and makes the API reject the request.
        response_format={"type": "json_object"},
    )

    return json.loads(response.choices[0].message.content)

# 3. Statistical Analysis

The notebook performs comprehensive statistical analysis of the evaluation results, including:
- Basic descriptive statistics for both models' scores
- Analysis based on diff length (categorized into Very Short to Very Long)
- Score distributions and comparisons
- Correlation between diff length and model performance
- Win rate analysis showing the percentage of cases where each model performs better

In [None]:
# Load the per-sample evaluation scores. The columns used below are
# `tiny_score`, `baseline_score`, `score_difference` and `input`.
# NOTE(review): the path is relative to the working directory, whereas other
# cells build paths from ROOT -- confirm where `evaluation_results.csv` lives.
results_df = pd.read_csv("evaluation_results.csv")

# Descriptive statistics of each score column, side by side.
basic_stats = pd.DataFrame(
    {
        "Tiny Model": results_df["tiny_score"].describe(),
        "Baseline Model": results_df["baseline_score"].describe(),
        "Score Difference": results_df["score_difference"].describe(),
    }
)
print("Basic Statistics:")
print(basic_stats)

# Diff length is measured in characters of the model input.
results_df["diff_length"] = results_df["input"].str.len()

# Five equal-frequency (quantile) buckets, labelled from shortest to longest.
results_df["length_bin"] = pd.qcut(
    results_df["diff_length"], q=5, labels=["Very Short", "Short", "Medium", "Long", "Very Long"]
)

length_stats = (
    # `length_bin` is categorical: state `observed` explicitly so the result does
    # not depend on pandas' deprecated implicit default (and no FutureWarning is emitted).
    results_df.groupby("length_bin", observed=False)
    .agg(
        {
            "tiny_score": ["mean", "std", "count"],
            "baseline_score": ["mean", "std", "count"],
            "score_difference": "mean",
        }
    )
    .round(3)
)
print("\nScores by Diff Length:")
print(length_stats)

# 4. Visualization
## a. Results
The results are visualized through multiple plots:
- Histograms showing score distributions for both models
- A histogram showing the distribution of score differences
- Box plots comparing score distributions between models

These visualizations help understand the relative performance of both models.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Three histograms side by side: both models' scores and their difference.
hist_fig, hist_axes = plt.subplots(1, 3, figsize=(15, 5))
hist_panels = (
    ("tiny_score", "Tiny Model Score Distribution"),
    ("baseline_score", "Baseline Model Score Distribution"),
    ("score_difference", "Score Difference Distribution"),
)
for hist_ax, (hist_column, hist_title) in zip(hist_axes, hist_panels):
    sns.histplot(data=results_df, x=hist_column, bins=10, ax=hist_ax)
    hist_ax.set_title(hist_title)

hist_fig.tight_layout()
plt.show()

# Box plot of the two score columns, reshaped to long form (one row per score).
box_fig, box_ax = plt.subplots(figsize=(10, 6))
scores_melted = results_df[["tiny_score", "baseline_score"]].melt(
    var_name="Model", value_name="Score"
)
sns.boxplot(data=scores_melted, x="Model", y="Score", ax=box_ax)
box_ax.set_title("Score Distribution Comparison")
plt.show()

# Correlation of each model's score with the diff length in characters.
print("\nCorrelation between diff length and scores:")
correlations = pd.DataFrame(
    {
        model_label: results_df["diff_length"].corr(results_df[score_column])
        for model_label, score_column in (
            ("Tiny Model", "tiny_score"),
            ("Baseline Model", "baseline_score"),
        )
    },
    index=["Correlation with diff length"],
)
print(correlations)

# Share of samples (in percent) where the score difference is positive, negative or zero.
score_gap = results_df["score_difference"]
win_stats = {
    "Tiny Wins": (score_gap > 0).mean() * 100,
    "Baseline Wins": (score_gap < 0).mean() * 100,
    "Ties": (score_gap == 0).mean() * 100,
}
print("\nWin Rate Analysis (%):")
print(pd.Series(win_stats).round(2))

## b. Metrics
The notebook calculates and visualizes various metrics for the generated commit messages:
- BLEU, ROUGE-1/2/L, and METEOR scores for both models (a BERT model is loaded, but BERTScore is not computed in the cell below)
- A bar chart for each metric comparing the two models
- Comparison of average metrics between models

These metrics provide a comprehensive evaluation of the generated commit messages.

In [None]:
import nltk
import numpy as np
import pandas as pd
import sacrebleu
from nltk.translate import meteor_score
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# Fetch the WordNet corpus through NLTK (needed by the METEOR scorer used below).
nltk.download("wordnet")

# BERT tokenizer/model pair. In the code of this cell only `tokenizer` is used
# (to split messages into tokens before METEOR scoring).
# NOTE(review): `model` is not referenced by the visible code -- confirm it is
# still needed, e.g. for a BERTScore computation that is not implemented here.
bert_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
model = AutoModel.from_pretrained(bert_name)


def calculate_metrics(generated_messages, reference_messages):
    """Compute mean BLEU, ROUGE-1/2/L (F-measure) and METEOR over message pairs.

    Each generated message is scored against the reference at the same position,
    and the per-pair scores are averaged per metric.

    Args:
        generated_messages: Sized iterable of generated commit messages.
        reference_messages: Iterable of reference messages, paired by position.
            Pairs are formed with ``zip``, so extra items on either side are ignored.

    Returns:
        Dict with the keys ``"bleu"``, ``"rouge1"``, ``"rouge2"``, ``"rougeL"``
        and ``"meteor"`` mapped to the mean score; ``nan`` when there are no pairs.
    """
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    metrics = {"bleu": [], "rouge1": [], "rouge2": [], "rougeL": [], "meteor": []}

    for gen, ref in tqdm(
        zip(generated_messages, reference_messages), total=len(generated_messages)
    ):
        # BLEU: sacrebleu's corpus-level scorer applied to a one-sentence corpus.
        bleu = sacrebleu.corpus_bleu([gen], [[ref]], lowercase=True, tokenize="13a").score
        metrics["bleu"].append(bleu)

        # ROUGE: the scorer's signature is score(target, prediction), so the
        # reference goes first. The F-measure is symmetric, but precision and
        # recall would be swapped with the arguments the other way round.
        rouge_scores = scorer.score(ref, gen)
        metrics["rouge1"].append(rouge_scores["rouge1"].fmeasure)
        metrics["rouge2"].append(rouge_scores["rouge2"].fmeasure)
        metrics["rougeL"].append(rouge_scores["rougeL"].fmeasure)

        # METEOR on pre-tokenized text; tokens come from the module-level tokenizer.
        # NOTE(review): that tokenizer is a BERT tokenizer, so METEOR is matched on
        # its tokens rather than on whitespace-separated words -- confirm this is intended.
        meteor = meteor_score.meteor_score([tokenizer.tokenize(ref)], tokenizer.tokenize(gen))
        metrics["meteor"].append(meteor)

    # Avoid numpy's "mean of empty slice" warning when there was nothing to score.
    return {k: (np.mean(v) if v else float("nan")) for k, v in metrics.items()}


def plot_metrics(csv_path="comparisons.csv"):
    """Compute text-overlap metrics for both models and draw one bar chart per metric.

    Reads the comparison table, scores the ``t5-efficient-extra-tiny`` and
    ``baseline-cmg-codet5-without-history`` columns against ``target`` with
    ``calculate_metrics``, shows a two-bar figure for every metric and finally
    prints a summary table.

    Relies on ``plt`` (matplotlib.pyplot) having been imported by an earlier cell.

    Args:
        csv_path: Location of the comparison CSV. Defaults to ``"comparisons.csv"``,
            resolved against the current working directory.
    """
    df = pd.read_csv(csv_path)

    print("Calculating metrics...")
    tiny_metrics = calculate_metrics(df["t5-efficient-extra-tiny"], df["target"])
    baseline_metrics = calculate_metrics(df["baseline-cmg-codet5-without-history"], df["target"])

    metrics_df = pd.DataFrame({"Tiny": tiny_metrics, "Baseline": baseline_metrics})

    colors = {"Tiny": "#2ecc71", "Baseline": "#3498db"}

    for metric in metrics_df.index:
        fig, ax = plt.subplots(figsize=(8, 5))
        data = [metrics_df.loc[metric, "Tiny"], metrics_df.loc[metric, "Baseline"]]
        bars = ax.bar(
            ["Tiny", "Baseline"],
            data,
            color=[colors["Tiny"], colors["Baseline"]],
            alpha=0.8,
            width=0.6,
        )

        ax.set_title(f"{metric} Score Comparison", pad=20, fontsize=14, fontweight="bold")
        ax.set_ylabel("Score", fontsize=12)

        # Leave 20% headroom for the value labels. Skip when the scores are all
        # zero or NaN: a degenerate or NaN axis limit would warn or raise.
        upper = max(data) * 1.2
        if upper > 0:
            ax.set_ylim(0, upper)

        ax.grid(axis="y", linestyle="--", alpha=0.3)

        # Print each bar's value just above it.
        for bar in bars:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2.0,
                height,
                f"{height:.4f}",
                ha="center",
                va="bottom",
                fontsize=11,
            )

        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)

        fig.tight_layout()
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    print("\nMetric Summary:")
    print(metrics_df.to_string(float_format="{:.4f}".format))


plot_metrics()

# 5. Sample Evaluation
The notebook includes a practical demonstration using a sample diff:
- Generates commit messages using both models
- Shows the target (actual) commit message
- Provides a detailed evaluation comparing both generated messages

This gives a concrete example of how the models perform in practice.

In [None]:
from src.demo_inference import generate_commit_message
from src.evaluate_commits import CommitMessageEvaluator

# LLM-based judge; the API key is read from the environment.
evaluator = CommitMessageEvaluator(openai_api_key=os.getenv("OPENAI_API_KEY"))

# Work on the first row of the comparison samples loaded earlier.
sample = samples.iloc[0]
sample_diff = sample["input"]
sample_target = sample["target"]

# Show only the first 200 characters of the diff to keep the output short.
print("Sample 1 Diff:\n", sample_diff[:200] + "...\n")

our_message = generate_commit_message(our_model, sample_diff)
baseline_message = generate_baseline_message(sample_diff)

for label, text in (
    ("Our Model's Message:", our_message),
    ("Baseline Message:", baseline_message),
    ("Target Message:", sample_target),
):
    print(label, text)

# Ask the evaluator to compare both generated messages with the target.
evaluation = evaluator.evaluate_messages(
    tiny_message=our_message,
    baseline_message=baseline_message,
    target_message=sample_target,
    diff=sample_diff,
)
print("\nEvaluation:", json.dumps(evaluation, indent=2))