In [1]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("google/code_x_glue_ct_code_to_text", "java")

# Reduce size for faster iteration
import random
train_data = dataset["train"].select(random.sample(range(len(dataset["train"])), k=int(1.0 * len(dataset["train"]))))
val_data = dataset["validation"].select(random.sample(range(len(dataset["validation"])), k=int(1.0 * len(dataset["validation"]))))
print("Train size:", len(train_data))
print("Validation size:", len(val_data))


  from .autonotebook import tqdm as notebook_tqdm


Train size: 164923
Validation size: 5183


In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '<pad>', 'sep_token': '<sep>', 'bos_token': '<s>', 'eos_token': '</s>'})


4

In [3]:
def preprocess(example):
    code = example["code"]
    docstring = example["docstring"]
    full_text = f"<s> {code} </s> <sep> {docstring}"

    # Tokenize the full string
    tokens = tokenizer(
        full_text,
        padding="max_length",
        truncation=True,
        max_length=512
    )

    # Copy labels from input_ids
    labels = tokens["input_ids"][:]

    # Mask out the code portion
    sep_id = tokenizer.convert_tokens_to_ids("<sep>")
    try:
        sep_index = labels.index(sep_id)
    except ValueError:
        sep_index = 0  # fallback: mask entire sequence

    labels[:sep_index + 1] = [-100] * (sep_index + 1)
    tokens["labels"] = labels

    return tokens
train_data = train_data.map(preprocess, remove_columns=train_data.column_names)
val_data = val_data.map(preprocess, remove_columns=val_data.column_names)

# Convert to PyTorch tensors
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████████████████████████████████████████████████████████████████████| 164923/164923 [02:43<00:00, 1010.17 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:05<00:00, 1015.69 examples/s]


In [3]:
import torch

In [11]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import os

# Load and resize the pretrained model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Resize for new tokens

# Optional: Enable progress bar
os.environ["WANDB_DISABLED"] = "true"  # Disable WandB if accidentally enabled
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-docstring",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,                       # 5 epochs
    eval_strategy="epoch",                    # evaluate every epoch
    save_strategy="epoch",                    # save model every epoch
    logging_dir="./logs",                     # logs directory
    logging_steps=1,                          # log every step for progress bar
    disable_tqdm=False,                       # ✅ ensure tqdm is enabled
    report_to="none",                         # no external logging (e.g. WandB)
    logging_first_step=True
)

# Data collator to handle padding and masking
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)



if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# ✅ Start training with progress bar
trainer.train()


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


NameError: name 'torch' is not defined

In [9]:
# Save model and tokenizer
model.save_pretrained("./gpt2-docstring-model")
tokenizer.save_pretrained("./gpt2-docstring-model")


KeyboardInterrupt: 

In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-docstring-model")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-docstring-model")


In [25]:
def generate_docstring_few_shot(test_code, max_length=200):
    few_shot_prompt = """
<s> public int add(int a, int b) { return a + b; } </s> <sep> Adds two integers and returns the sum.

<s> public int multiply(int a, int b) { return a * b; } </s> <sep> Multiplies two integers and returns the product.

<s> public boolean isEven(int num) { return num % 2 == 0; } </s> <sep> Checks if a number is even.

<s> public String greet(String name) { return "Hello " + name; } </s> <sep> Greets the user by name.
"""

    # Append the new example
    prompt =  f"<s> {test_code} </s> <sep>"

    # Tokenize prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_len = input_ids.shape[1]

    # Generate continuation from after the prompt
    output_ids = model.generate(
        input_ids,
        max_length=input_len + 50,  # buffer for generation
        num_beams=9,
        no_repeat_ngram_size=4,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Only decode the newly generated tokens (after prompt)
    generated_ids = output_ids[0][input_len:]  # exclude prompt
    result = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return result


In [None]:
# java_code = "public boolean isEqual(int a, int b) { return a==b; }"
java_code = "public int multiply(int a, int b) { return a * b; }"
docstring = generate_docstring_few_shot(java_code)
print("Generated docstring:\n", docstring.split("\n")[0])


Generated docstring:
 Multiply two integers.


In [26]:
# java_code = "public boolean isEqual(int a, int b) { return a==b; }"
java_code = "boolean isPalindrome(String s) { return s.equals(new StringBuilder(s).reverse().toString()); }"
docstring = generate_docstring_few_shot(java_code)
print("Generated docstring:\n", docstring)


Generated docstring:
 Determines if the given string is a Palindrome.

@param s the string to test.
@return true if the string is Palindrome, false otherwise.
@since 1.0.0
@see #


In [20]:
# java_code = "public boolean isEqual(int a, int b) { return a==b; }"
java_code = "public String cleanAndLower(String input) { return input.trim().toLowerCase().replaceAll('[^a-z0-9 ]', ''); }"
docstring = generate_docstring_few_shot(java_code)
print("Generated docstring:\n", docstring.split("\n")[0])


Generated docstring:
 Removes all whitespace from the input string.


In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from tqdm import tqdm
import evaluate
import torch
import pandas as pd
import os

# Configs
model_dir = "/gpt2-docstring/checkpoint-123693/"
save_path = "codet5_val_predictions_2.csv"
save_every = 50  # save every N batches

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-docstring-model").to("mps" if torch.backends.mps.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-docstring-model")
model.eval()

# Load and tokenize data
dataset = load_dataset("code_x_glue_ct_code_to_text", "java")
val_data = dataset["validation"]
gold_summaries = val_data["docstring"]

def tokenize_fn(example):
    return tokenizer(example["code"], truncation=True, padding="max_length", max_length=512)

val_tokenized = val_data.map(tokenize_fn, batched=True)
val_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
val_loader = DataLoader(val_tokenized, batch_size=16)

# Batch inference with intermediate saving
generated_summaries = []
batch_size = 16
start_batch = 0

# Resume logic (optional)
if os.path.exists(save_path):
    df_existing = pd.read_csv(save_path)
    generated_summaries = df_existing["predicted_summary"].tolist()
    start_batch = len(generated_summaries) // batch_size
    print(f"⏩ Resuming from batch {start_batch} (already {len(generated_summaries)} predictions)")

# Inference loop
for i, batch in enumerate(tqdm(val_loader, desc="Generating summaries in batch")):
    if i < start_batch:
        continue  # skip already done batches

    input_ids = batch["input_ids"].to(model.device)
    attention_mask = batch["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=64,   # generate up to 64 tokens of summary
        num_beams=4
    )


    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    generated_summaries.extend(decoded)

    # Save every N batches
    if (i + 1) % save_every == 0 or (i + 1) == len(val_loader):
        print(f"💾 Saving progress at batch {i + 1}")
        df = pd.DataFrame({
            "gold_summary": gold_summaries[:len(generated_summaries)],
            "predicted_summary": generated_summaries
        })
        df.to_csv(save_path, index=False)

Generating train split: 100%|██████████| 164923/164923 [00:01<00:00, 139412.72 examples/s]
Generating validation split: 100%|██████████| 5183/5183 [00:00<00:00, 93917.07 examples/s]
Generating test split: 100%|██████████| 10955/10955 [00:00<00:00, 109530.52 examples/s]
Map: 100%|██████████| 5183/5183 [00:12<00:00, 425.51 examples/s]
Generating summaries in batch:   0%|          | 0/324 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating summaries in batch:   0%|          | 0/324 [08:56<?, ?it/s]


RuntimeError: Invalid buffer size: 6.14 GB

In [16]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from tqdm import tqdm
import evaluate
import torch
import pandas as pd
import os

# Configs
model_dir = "/gpt2-docstring/checkpoint-123693/"
save_path = "codet5_val_predictions.csv"
save_every = 50  # save every N batches

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-docstring-model").to("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-docstring-model")
model.eval()

# Load and tokenize data
dataset = load_dataset("code_x_glue_ct_code_to_text", "java")
val_data = dataset["validation"]
gold_summaries = val_data["docstring"]

def tokenize_fn(example):
    return tokenizer(example["code"], truncation=True, padding="max_length", max_length=512)

# Filter out long code snippets
def filter_by_token_length(example, tokenizer=tokenizer, max_input_tokens=384):
    tokens = tokenizer(example["code"], truncation=False)["input_ids"]
    return len(tokens) <= max_input_tokens

# Apply filtering
val_data = val_data.filter(lambda x: filter_by_token_length(x), batched=False)
print(f"🧹 Filtered validation size: {len(val_data)} examples")

val_tokenized = val_data.map(tokenize_fn, batched=True)
val_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
val_loader = DataLoader(val_tokenized, batch_size=16)

# Batch inference with intermediate saving
generated_summaries = []
batch_size = 16
start_batch = 0

# Resume logic (optional)
if os.path.exists(save_path):
    df_existing = pd.read_csv(save_path)
    generated_summaries = df_existing["predicted_summary"].tolist()
    start_batch = len(generated_summaries) // batch_size
    print(f"⏩ Resuming from batch {start_batch} (already {len(generated_summaries)} predictions)")

# Inference loop
for i, batch in enumerate(tqdm(val_loader, desc="Generating summaries in batch")):
    if i < start_batch:
        continue  # skip already done batches

    input_ids = batch["input_ids"].to(model.device)
    attention_mask = batch["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4)

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    generated_summaries.extend(decoded)

    # Save every N batches
    if (i + 1) % save_every == 0 or (i + 1) == len(val_loader):
        print(f"💾 Saving progress at batch {i + 1}")
        df = pd.DataFrame({
            "gold_summary": gold_summaries[:len(generated_summaries)],
            "predicted_summary": generated_summaries
        })
        df.to_csv(save_path, index=False)

🧹 Filtered validation size: 4336 examples


Map: 100%|██████████████████████████████████████████████████████████████████████████████| 4336/4336 [00:02<00:00, 1783.36 examples/s]


⏩ Resuming from batch 323 (already 5183 predictions)


Generating summaries in batch: 100%|█████████████████████████████████████████████████████████████| 271/271 [00:00<00:00, 3135.44it/s]


In [7]:
pip install bert_score

Python(83936) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting bert_score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting matplotlib (from bert_score)
  Using cached matplotlib-3.10.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib->bert_score)
  Downloading contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib->bert_score)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->bert_score)
  Using cached fonttools-4.57.0-cp312-cp312-macosx_10_13_universal2.whl.metadata (102 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->bert_score)
  Using cached kiwisolver-1.4.8-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.2 kB)
Collecting pillow>=8 (from matplotlib->bert_score)
  Using cached pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.9 kB)
Collecting pyparsing>=2.3.1 (from matplotlib->bert_score)
  Using cached pyparsing-3.2.3-py3-none-any.whl

In [25]:
import pandas as pd
import evaluate
import numpy as np
from datasets import Dataset

# Load predictions CSV
csv_path = "codet5_val_predictions_cleaned_2.csv"  # update if needed
df = pd.read_csv(csv_path)

# Extract predictions and references
predictions = df["predicted_summary"].astype(str).tolist()
references = df["gold_summary"].astype(str).tolist()

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Compute ROUGE
rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
print("📊 ROUGE Scores:")
for k, v in rouge_scores.items():
    print(f"{k}: {v:.4f}")

# Load metric and compute
bleu = evaluate.load("bleu")
result = bleu.compute(predictions=predictions, references=references)


# Compute Exact Match
exact_matches = sum([p.strip() == r.strip() for p, r in zip(predictions, references)])
exact_match_accuracy = exact_matches / len(references)
print(f"\n✅ Exact Match Accuracy: {exact_match_accuracy:.4f}")

📊 ROUGE Scores:
rouge1: 0.3065
rouge2: 0.1492
rougeL: 0.2683
rougeLsum: 0.2928

✅ Exact Match Accuracy: 0.0000


In [35]:
import pandas as pd
import evaluate
import numpy as np
from collections import Counter


# Load predictions CSV
df = pd.read_csv("codet5_val_predictions_cleaned_2.csv")

# Extract predictions and references
predictions = df["predicted_summary"].astype(str).tolist()
references = df["gold_summary"].astype(str).tolist()

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# ROUGE
rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
print("📊 ROUGE Scores:")
for k, v in rouge_scores.items():
    print(f"{k}: {v:.4f}")

# BLEU
bleu_score = bleu.compute(predictions=predictions, references=references)
print(f"\n📘 BLEU Score: {bleu_score['bleu']:.4f}")

# BERTScore
bert_score = bertscore.compute(predictions=predictions, references=references, lang="en", device="cuda")
print(f"\n🧠 BERTScore F1 (avg): {np.mean(bert_score['f1']):.4f}")

# Function to compute average token repetition per summary
def avg_token_repetition(predictions):
    rep_counts = []
    for text in predictions:
        tokens = text.strip().split()
        counts = Counter(tokens)
        repeated_tokens = sum(v for v in counts.values() if v > 1)
        rep_counts.append(repeated_tokens / max(1, len(tokens)))
    return np.mean(rep_counts)
repetition = avg_token_repetition(predictions)


# Exact Match
exact_matches = sum(p.strip() == r.strip() for p, r in zip(predictions, references))
exact_match_accuracy = exact_matches / len(references)
print(f"\n✅ Exact Match Accuracy: {exact_match_accuracy:.4f}")


📊 ROUGE Scores:
rouge1: 0.3065
rouge2: 0.1492
rougeL: 0.2683
rougeLsum: 0.2928

📘 BLEU Score: 0.1318


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🧠 BERTScore F1 (avg): 0.8429

✅ Exact Match Accuracy: 0.0000


In [36]:
all_results = []  # Initialize before appending

all_results.append({
        # "Version": name,
        "ROUGE-1": round(rouge_scores["rouge1"], 4),
        "ROUGE-2": round(rouge_scores["rouge2"], 4),
        "ROUGE-L": round(rouge_scores["rougeL"], 4),
        "BLEU": round(bleu_score["bleu"], 4),
        "BERTScore": round(np.mean(bert_score["f1"]), 4),
        "Exact Match": round(exact_match_accuracy, 4),
        "Avg Token Repetition": round(repetition, 4)
    })

# Display comparison table
results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values("ROUGE-L", ascending=False)
display(results_df)

Unnamed: 0,ROUGE-1,ROUGE-2,ROUGE-L,BLEU,BERTScore,Exact Match,Avg Token Repetition
0,0.3065,0.1492,0.2683,0.1318,0.8429,0.0,0.5025


In [45]:
# inference_pipeline.py

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from torch.utils.data import DataLoader
from typing import List, Dict, Optional
from tqdm import tqdm
import os


class CodeSummaryGenerator:
    def __init__(self, model_path: str, decoding_config: Dict,device):
        self.model_path = model_path
        self.decoding_config = decoding_config
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model =GPT2LMHeadModel.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
        self.model.eval()
        
    def preprocess_dataset(self, split: str = "validation", max_input_length: int = 512):
        dataset = load_dataset("code_x_glue_ct_code_to_text", "java")[split]

        def tokenize_fn(example):
            return self.tokenizer(
                example["code"],
                truncation=True,
                padding="max_length",
                max_length=max_input_length,
            )

        tokenized = dataset.map(tokenize_fn, batched=True)
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
        return tokenized, dataset["docstring"]

    def generate_summaries(self,
                            tokenized_data,
                            references: List[str],
                            save_path: str,
                            batch_size: int = 16,
                            save_every: int = 50):

        val_loader = DataLoader(tokenized_data, batch_size=batch_size)
        generated_summaries = []
        start_batch = 0

        if os.path.exists(save_path):
            df_existing = pd.read_csv(save_path)
            generated_summaries = df_existing["predicted_summary"].astype(str).tolist()
            start_batch = len(generated_summaries) // batch_size
            print(f"⏩ Resuming from batch {start_batch} (already {len(generated_summaries)} predictions)")

        for i, batch in enumerate(tqdm(val_loader, desc="Generating Summaries")):
            if i < start_batch:
                continue

            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    **self.decoding_config
                )

            decoded = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            generated_summaries.extend(decoded)

            if (i + 1) % save_every == 0 or (i + 1) == len(val_loader):
                print(f"💾 Saving at batch {i + 1}")
                df = pd.DataFrame({
                    "gold_summary": references[:len(generated_summaries)],
                    "predicted_summary": generated_summaries
                })
                df.to_csv(save_path, index=False)

    def generate_single(self, code_snippet: str):
        inputs = self.tokenizer(
            code_snippet,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        with torch.no_grad():
            output = self.model.generate(**inputs, **self.decoding_config)

        return self.tokenizer.decode(output[0], skip_special_tokens=True)


# Example usage in a script or notebook:
# # if _name_ == "_main_":
# model_path = GPT2LMHeadModel.from_pretrained("./gpt2-docstring-model")
# # tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-docstring-model")

# # Define all 4 decoding strategies
# decoding_configs = {
#     "baseline_beam": {
#         "max_new_tokens": 64,
#         "num_beams": 9,
#         "early_stopping": True
#     },
#     "beam_repetition": {
#         "max_new_tokens": 64,
#         "num_beams": 4,
#         "early_stopping": True,
#         "repetition_penalty": 1.3
#     },
#     "sampling_topk": {
#         "max_new_tokens": 64,
#         "do_sample": True,
#         "top_k": 50,
#         "temperature": 0.7,
#         "early_stopping": True,
#         "repetition_penalty": 1.2
#     }
# }

# Choose best config for final test/validation inference
chosen_config = decoding_configs["sampling_topk"]  # Based on best val results
save_path = "codet5_test_sampling_output_sampling.csv"
chosen_config = decoding_configs["baseline_beam"]  # Based on best val results
save_path = "codet5_test_sampling_output_beam.csv"
chosen_config = decoding_configs["beam_repetition"]  # Based on best val results
save_path = "codet5_test_sampling_output_beam_repeat.csv"

generator = CodeSummaryGenerator(model_path, chosen_config)
test_tokenized, references = generator.preprocess_dataset("test")
generator.generate_summaries(test_tokenized, references, save_path)

# Optional: generate on validation set with another config if needed
# val_generator = CodeSummaryGenerator(model_path, decoding_configs["beam_repetition"])
# val_tokenized, val_refs = val_generator.preprocess_dataset("validation")
# val_generator.generate_summaries(val_tokenized, val_refs, "/content/drive/MyDrive/codet5_val_beam_repetition.csv")

In [47]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_path = "./gpt2-docstring-model"  # ✅ just the path, not the model itself

chosen_config = {
    "max_new_tokens": 64,
    "num_beams": 4,
    "early_stopping": True,
    "repetition_penalty": 1.3
}
save_path = "codet5_test_sampling_output_beam_repeat.csv"
# .to("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

generator = CodeSummaryGenerator(model_path, chosen_config, device)
test_tokenized, references = generator.preprocess_dataset("test")
generator.generate_summaries(test_tokenized, references, save_path)


Map: 100%|████████████████████████████████████████████████████████████████████████████| 10955/10955 [00:07<00:00, 1386.95 examples/s]
Generating Summaries:   0%|                                                                                  | 0/685 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   0%|                                                                          | 1/685 [00:02<28:23,  2.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   0%|▏                                                                         | 2/685 [00:04<28:22,  2.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   0%|▎                                                                         | 3/685 [00:07<28:21,  2.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   1%|▍                  

💾 Saving at batch 50


Generating Summaries:   7%|█████▍                                                                   | 51/685 [02:11<28:14,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   8%|█████▌                                                                   | 52/685 [02:14<28:12,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   8%|█████▋                                                                   | 53/685 [02:17<28:11,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   8%|█████▊                                                                   | 54/685 [02:19<28:09,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:   8%|█████▊                                                                   | 55/685 [02:22<28:07,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 100


Generating Summaries:  15%|██████████▌                                                             | 101/685 [04:25<26:05,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  15%|██████████▋                                                             | 102/685 [04:28<26:02,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  15%|██████████▊                                                             | 103/685 [04:31<26:00,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  15%|██████████▉                                                             | 104/685 [04:33<25:56,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  15%|███████████                                                             | 105/685 [04:36<25:55,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 150


Generating Summaries:  22%|███████████████▊                                                        | 151/685 [06:39<23:52,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  22%|███████████████▉                                                        | 152/685 [06:42<23:49,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  22%|████████████████                                                        | 153/685 [06:45<23:44,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  22%|████████████████▏                                                       | 154/685 [06:48<23:40,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  23%|████████████████▎                                                       | 155/685 [06:50<23:38,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 200


Generating Summaries:  29%|█████████████████████▏                                                  | 201/685 [08:53<21:37,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  29%|█████████████████████▏                                                  | 202/685 [08:56<21:33,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  30%|█████████████████████▎                                                  | 203/685 [08:59<21:30,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  30%|█████████████████████▍                                                  | 204/685 [09:01<21:26,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  30%|█████████████████████▌                                                  | 205/685 [09:04<21:23,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 250


Generating Summaries:  37%|██████████████████████████▍                                             | 251/685 [11:07<19:19,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  37%|██████████████████████████▍                                             | 252/685 [11:10<19:16,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  37%|██████████████████████████▌                                             | 253/685 [11:12<19:12,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  37%|██████████████████████████▋                                             | 254/685 [11:15<19:09,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  37%|██████████████████████████▊                                             | 255/685 [11:18<19:05,  2.66s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 300


Generating Summaries:  44%|███████████████████████████████▋                                        | 301/685 [13:20<17:08,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  44%|███████████████████████████████▋                                        | 302/685 [13:23<17:05,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  44%|███████████████████████████████▊                                        | 303/685 [13:26<17:01,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  44%|███████████████████████████████▉                                        | 304/685 [13:28<16:59,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  45%|████████████████████████████████                                        | 305/685 [13:31<16:57,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 350


Generating Summaries:  51%|████████████████████████████████████▉                                   | 351/685 [15:34<14:55,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  51%|████████████████████████████████████▉                                   | 352/685 [15:37<14:50,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  52%|█████████████████████████████████████                                   | 353/685 [15:39<14:47,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  52%|█████████████████████████████████████▏                                  | 354/685 [15:42<14:45,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  52%|█████████████████████████████████████▎                                  | 355/685 [15:45<14:41,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 400


Generating Summaries:  59%|██████████████████████████████████████████▏                             | 401/685 [17:48<12:41,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  59%|██████████████████████████████████████████▎                             | 402/685 [17:50<12:37,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  59%|██████████████████████████████████████████▎                             | 403/685 [17:53<12:35,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  59%|██████████████████████████████████████████▍                             | 404/685 [17:56<12:32,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  59%|██████████████████████████████████████████▌                             | 405/685 [17:58<12:30,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 450


Generating Summaries:  66%|███████████████████████████████████████████████▍                        | 451/685 [20:01<10:30,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  66%|███████████████████████████████████████████████▌                        | 452/685 [20:04<10:25,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  66%|███████████████████████████████████████████████▌                        | 453/685 [20:07<10:22,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  66%|███████████████████████████████████████████████▋                        | 454/685 [20:09<10:18,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  66%|███████████████████████████████████████████████▊                        | 455/685 [20:12<10:16,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 500


Generating Summaries:  73%|████████████████████████████████████████████████████▋                   | 501/685 [22:15<08:14,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  73%|████████████████████████████████████████████████████▊                   | 502/685 [22:18<08:12,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  73%|████████████████████████████████████████████████████▊                   | 503/685 [22:21<08:09,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  74%|████████████████████████████████████████████████████▉                   | 504/685 [22:23<08:06,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  74%|█████████████████████████████████████████████████████                   | 505/685 [22:26<08:03,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 550


Generating Summaries:  80%|█████████████████████████████████████████████████████████▉              | 551/685 [24:29<05:59,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  81%|██████████████████████████████████████████████████████████              | 552/685 [24:32<05:56,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  81%|██████████████████████████████████████████████████████████▏             | 553/685 [24:34<05:53,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  81%|██████████████████████████████████████████████████████████▏             | 554/685 [24:37<05:50,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  81%|██████████████████████████████████████████████████████████▎             | 555/685 [24:40<05:47,  2.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 600


Generating Summaries:  88%|███████████████████████████████████████████████████████████████▏        | 601/685 [26:43<03:45,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  88%|███████████████████████████████████████████████████████████████▎        | 602/685 [26:46<03:43,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  88%|███████████████████████████████████████████████████████████████▍        | 603/685 [26:48<03:40,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  88%|███████████████████████████████████████████████████████████████▍        | 604/685 [26:51<03:37,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  88%|███████████████████████████████████████████████████████████████▌        | 605/685 [26:54<03:34,  2.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 650


Generating Summaries:  95%|████████████████████████████████████████████████████████████████████▍   | 651/685 [28:57<01:31,  2.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  95%|████████████████████████████████████████████████████████████████████▌   | 652/685 [29:00<01:28,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  95%|████████████████████████████████████████████████████████████████████▋   | 653/685 [29:02<01:26,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  95%|████████████████████████████████████████████████████████████████████▋   | 654/685 [29:05<01:23,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Summaries:  96%|████████████████████████████████████████████████████████████████████▊   | 655/685 [29:08<01:20,  2.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 

💾 Saving at batch 685





In [None]:
import pandas as pd
import evaluate
import numpy as np
from collections import Counter


# Load predictions CSV
df = pd.read_csv("codet5_val_predictions_cleaned_2.csv")

# Extract predictions and references
predictions = df["predicted_summary"].astype(str).tolist()
references = df["gold_summary"].astype(str).tolist()

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# ROUGE
rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
print("📊 ROUGE Scores:")
for k, v in rouge_scores.items():
    print(f"{k}: {v:.4f}")

# BLEU
bleu_score = bleu.compute(predictions=predictions, references=references)
print(f"\n📘 BLEU Score: {bleu_score['bleu']:.4f}")

# BERTScore
bert_score = bertscore.compute(predictions=predictions, references=references, lang="en", device="cuda")
print(f"\n🧠 BERTScore F1 (avg): {np.mean(bert_score['f1']):.4f}")

# Function to compute average token repetition per summary
def avg_token_repetition(predictions):
    rep_counts = []
    for text in predictions:
        tokens = text.strip().split()
        counts = Counter(tokens)
        repeated_tokens = sum(v for v in counts.values() if v > 1)
        rep_counts.append(repeated_tokens / max(1, len(tokens)))
    return np.mean(rep_counts)
repetition = avg_token_repetition(predictions)


# Exact Match
exact_matches = sum(p.strip() == r.strip() for p, r in zip(predictions, references))
exact_match_accuracy = exact_matches / len(references)
print(f"\n✅ Exact Match Accuracy: {exact_match_accuracy:.4f}")
