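**Goal:** Replicate the evaluation of the baseline models: per-category precision, recall and F1, average runtime, average GFLOPs, and the resulting submission score.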

In [None]:
import json
import time
from tqdm.auto import tqdm

import numpy as np
import pandas as pd

import torch

from datasets import load_dataset
from setfit import SetFitModel

In [None]:
# Hook tqdm into pandas so the `progress_apply`-style methods become available.
# NOTE(review): none of the cells visible here call those methods — confirm
# whether this registration is still needed.
tqdm.pandas()

In [None]:
# Number of times inference is repeated on each test split. The accumulated
# runtime and GFLOPs are divided by this value to obtain per-run averages.
NUM_EXPERIMENTS = 10

In [None]:
# Category names for each language. The evaluation loop looks a name up by the
# row index of the transposed prediction matrix, so the order matters.
# NOTE(review): presumably this matches the column order of the dataset's
# label vectors — confirm against the dataset card.
labels = {
    "java": [
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    "python": [
        "Usage",
        "Parameters",
        "DevelopmentNotes",
        "Expand",
        "Summary",
    ],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Classreferences",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Languages to evaluate, in the insertion order of `labels`.
langs = list(labels)

In [None]:
# Load the competition dataset. The evaluation loop reads the "<lang>_test"
# splits and their "combo" (model input) and "labels" (ground truth) columns.
dataset = load_dataset("NLBSE/nlbse25-code-comment-classification")

In [None]:
def _binary_scores(truth, pred):
    """Return ``(precision, recall, f1)`` for one binary category.

    Args:
        truth: 1-D array of ground-truth values (0 or 1).
        pred: 1-D array of predicted values (0 or 1), same length as *truth*.

    Returns:
        A tuple of three floats. A metric whose denominator is zero (e.g.
        precision when nothing was predicted positive) is reported as 0.0
        rather than NaN, so that it still counts in later averages.
    """
    tp = int(np.sum((truth == 1) & (pred == 1)))
    fp = int(np.sum((truth == 0) & (pred == 1)))
    fn = int(np.sum((truth == 1) & (pred == 0)))

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = (2 * tp) / (2 * tp + fp + fn) if tp + fp + fn else 0.0
    return precision, recall, f1


# Accumulators over all languages and all repeated runs.
total_flops = 0
total_time = 0
scores = []

for lan in langs:
    # Loading the baseline model from the Hub
    model = SetFitModel.from_pretrained(f"NLBSE/nlbse25_{lan}")

    # Starting the profiler as context; the timed region lives inside it.
    with torch.profiler.profile(with_flops=True) as p:
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments.
        begin = time.perf_counter()

        # Running multiple inference experiments; only the last prediction is
        # kept for scoring.
        for _ in range(NUM_EXPERIMENTS):
            # Calling the model. The transpose puts one category per row.
            # NOTE(review): assumes the model returns a (samples, categories)
            # tensor — confirm against the SetFit model head.
            y_pred = model(dataset[f"{lan}_test"]["combo"]).numpy().T

        total_time = total_time + (time.perf_counter() - begin)

    # Calculating the total of FLOPs used, converted to GFLOPs
    total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)

    # Getting the ground truth, transposed the same way as the predictions
    y_true = np.array(dataset[f"{lan}_test"]["labels"]).T

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shape mismatch for {lan}: labels {y_true.shape} vs predictions {y_pred.shape}"
        )

    for i, (truth_row, pred_row) in enumerate(zip(y_true, y_pred)):
        precision, recall, f1 = _binary_scores(truth_row, pred_row)

        scores.append({
            "lan": lan,
            "cat": labels[lan][i],
            "precision": precision,
            "recall": recall,
            "f1": f1
        })

scores = pd.DataFrame(scores)

print("Compute in GFLOPs:", total_flops / NUM_EXPERIMENTS)
print("Avg. runtime in seconds:", total_time / NUM_EXPERIMENTS)

In [None]:
# Display the per-language, per-category metrics table as the cell output.
scores

In [None]:
# Upper bounds used to normalise the efficiency terms of the submission score.
max_avg_runtime = 5
max_avg_flops = 5000


def score(avg_f1, avg_runtime, avg_flops):
    """Return the weighted submission score of a model.

    The formula is::

        submission_score = 0.60 * avg_f1
            + 0.20 * (max_avg_runtime - avg_runtime) / max_avg_runtime
            + 0.20 * (max_avg_flops - avg_flops) / max_avg_flops

    Args:
        avg_f1: Mean F1 over all categories.
        avg_runtime: Measured average runtime, on the same scale as
            ``max_avg_runtime``.
        avg_flops: Measured average compute, on the same scale as
            ``max_avg_flops``.

    Returns:
        The combined score. The efficiency terms become negative when a
        measurement exceeds its maximum.
    """
    f1_term = 0.6 * avg_f1
    runtime_term = 0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime)
    flops_term = 0.2 * ((max_avg_flops - avg_flops) / max_avg_flops)
    return f1_term + runtime_term + flops_term

# Mean F1 over every (language, category) row of the metrics table.
avg_f1 = float(scores["f1"].mean())

# Per-run averages of the totals accumulated across all languages.
avg_runtime, avg_flops = total_time / NUM_EXPERIMENTS, total_flops / NUM_EXPERIMENTS

submission_score = score(avg_f1, avg_runtime, avg_flops)

print("Submission score:", round(submission_score, 3))

In [None]:
# Flat mapping of every reported number, written out as the baseline JSON.
scoring_details = {}

# One "<lang>_<category>_f1" entry per row of the metrics table.
# Single quotes inside the f-string keep this valid on Python < 3.12, where
# reusing the enclosing quote character is a syntax error.
for _, row in scores.iterrows():
    scoring_details[f"{row['lan']}_{row['cat']}_f1"] = row["f1"]

# One "<lang>_avg_f1" entry per language (mean F1 over its categories).
scoring_details.update(
    scores.groupby("lan").f1.mean().add_suffix("_avg_f1").to_dict()
)

scoring_details["avg_f1"] = avg_f1
scoring_details["avg_runtime"] = avg_runtime
scoring_details["avg_flops"] = avg_flops
scoring_details["submission_score"] = submission_score

with open("../competition/baseline.json", "w", encoding="utf-8") as f:
    json.dump(scoring_details, f, indent=4)