**Goal:** Evaluate the selected model candidate for each language. The candidates may use different embedding models and classification heads.

In [1]:
import json
import time
from tqdm.auto import tqdm

import numpy as np
import pandas as pd

import torch

from datasets import load_dataset
from setfit import SetFitModel

In [2]:
# Register tqdm with pandas so that the `progress_apply`-style methods become available.
tqdm.pandas()

In [3]:
# Number of times inference over each test split is repeated; the accumulated
# runtime and FLOPs are divided by this value to report averages.
NUM_EXPERIMENTS = 10
# Template for a model directory name, filled in order with:
# alias, strategy, batch size, epochs, number of iterations, head.
MODEL_NAME = "{}-s{}-bs{}-e{}-i{}-h{}"
# Template for the on-disk model location, filled in order with: model name, language.
MODEL_PATH = "../optimized_models/{}/{}"

In [4]:
# Category names per language. The position of a name matters: entry i is used
# to label row i of the transposed prediction matrix during evaluation.
labels = {
    "java": [
        "summary",
        "Ownership",
        "Expand",
        "usage",
        "Pointer",
        "deprecation",
        "rational",
    ],
    "python": [
        "Usage",
        "Parameters",
        "DevelopmentNotes",
        "Expand",
        "Summary",
    ],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Classreferences",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

# Languages to evaluate, in the insertion order of the mapping above.
langs = list(labels)

In [5]:
# Settings shared by all languages; each value is substituted into the
# MODEL_NAME template to locate the trained model on disk.
strategy = "MO"
batch_size = 32
num_epochs = "5-10-10"
head = "LR"  #TODO: Change the head in name for optimized models

# Per-language settings that are also part of the model name:
# "alias" identifies the candidate and "num_iterations" fills the "i{}" slot.
model_keys = {
    "java": dict(alias="aml6", num_iterations=40),
    "python": dict(alias="aml6", num_iterations=20),
    "pharo": dict(alias="pml3", num_iterations=60),
}


In [6]:
# Load the competition dataset; the evaluation below reads its "<lan>_test"
# splits and their "combo" (model input) and "labels" (ground truth) columns.
dataset = load_dataset("NLBSE/nlbse25-code-comment-classification")

In [8]:
def _binary_scores(y_true_row, y_pred_row):
    """Return (precision, recall, f1) for one binary category.

    Both arguments are 1-D sequences of 0/1 values of equal length.
    A metric whose denominator is zero (no predicted and/or no actual
    positives) is reported as 0.0 instead of producing NaN or raising
    ZeroDivisionError, so it still counts towards the averaged F1.
    """
    y_true_row = np.asarray(y_true_row)
    y_pred_row = np.asarray(y_pred_row)
    assert len(y_pred_row) == len(y_true_row)

    # Confusion-matrix counts (true negatives are not needed by any metric).
    tp = int(np.sum((y_true_row == 1) & (y_pred_row == 1)))
    fp = int(np.sum((y_true_row == 0) & (y_pred_row == 1)))
    fn = int(np.sum((y_true_row == 1) & (y_pred_row == 0)))

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) else 0.0
    return precision, recall, f1


# Accumulators over all languages: compute in GFLOPs and wall-clock seconds.
total_flops = 0
total_time = 0
score_records = []

for lan in langs:
    model_name = MODEL_NAME.format(
        model_keys[lan]["alias"],
        strategy,
        batch_size,
        num_epochs,
        model_keys[lan]["num_iterations"],
        head,
    )
    print("Model to evaluate:", model_name)

    # Loading the optimized model for this language from the local model directory
    model = SetFitModel.from_pretrained(MODEL_PATH.format(model_name, lan))

    # Starting the profiler as context so the FLOPs of every run are recorded
    with torch.profiler.profile(with_flops=True) as p:
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments.
        begin = time.perf_counter()

        # Running multiple inference experiments. The column lookup is kept
        # inside the loop on purpose: it is part of the measured runtime.
        for _ in range(NUM_EXPERIMENTS):
            # Transposed so that each row holds the predictions of one category
            y_pred = model(dataset[f"{lan}_test"]["combo"]).cpu().numpy().T

        total_time = total_time + (time.perf_counter() - begin)

    # Calculating the total of FLOPs used (converted to GFLOPs)
    total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)

    # Getting the ground truth in the same (category, sample) layout
    y_true = np.array(dataset[f"{lan}_test"]["labels"]).T

    assert y_pred.shape == y_true.shape, (
        f"{lan}: prediction shape {y_pred.shape} != ground-truth shape {y_true.shape}"
    )
    assert len(y_pred) == len(labels[lan]), (
        f"{lan}: {len(y_pred)} predicted categories but {len(labels[lan])} label names"
    )

    for cat, true_row, pred_row in zip(labels[lan], y_true, y_pred):
        precision, recall, f1 = _binary_scores(true_row, pred_row)
        score_records.append({
            "lan": lan,
            "cat": cat,
            "precision": precision,
            "recall": recall,
            "f1": f1
        })

scores = pd.DataFrame(score_records)

print("Compute in GFLOPs:", total_flops / NUM_EXPERIMENTS)
print("Avg. runtime in seconds:", total_time / NUM_EXPERIMENTS)

Model to evaluate: aml6-sMO-bs32-e5-10-10-i40-hLR
Model to evaluate: aml6-sMO-bs32-e5-10-10-i20-hLR
Model to evaluate: pml3-sMO-bs32-e5-10-10-i60-hLR


  pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid 

Compute in GFLOPs: 2081.452659456
Avg. runtime in seconds: 4.9896280527114865


In [9]:
# Display the per-language, per-category precision / recall / F1 table.
scores

Unnamed: 0,lan,cat,precision,recall,f1
0,java,summary,0.872998,0.855381,0.8641
1,java,Ownership,1.0,1.0,1.0
2,java,Expand,0.415094,0.431373,0.423077
3,java,usage,0.929293,0.853828,0.889964
4,java,Pointer,0.811927,0.961957,0.880597
5,java,deprecation,1.0,0.666667,0.8
6,java,rational,0.157895,0.352941,0.218182
7,python,Usage,0.760331,0.760331,0.760331
8,python,Parameters,0.860656,0.820312,0.84
9,python,DevelopmentNotes,0.392157,0.487805,0.434783


In [10]:
# Upper bounds used to normalise the measured average runtime (seconds)
# and the measured average compute (GFLOPs).
max_avg_runtime = 5
max_avg_flops = 5000


def score(avg_f1, avg_runtime, avg_flops):
    """Combine classification quality and efficiency into one submission score.

    submission_score = 0.6 * avg_f1
                     + 0.2 * (max_avg_runtime - avg_runtime) / max_avg_runtime
                     + 0.2 * (max_avg_flops - avg_flops) / max_avg_flops

    Args:
        avg_f1: F1 score averaged over all categories.
        avg_runtime: measured average runtime.
        avg_flops: measured average compute.

    Returns:
        The weighted score. The efficiency terms become negative when a
        measurement exceeds its upper bound.
    """
    runtime_term = (max_avg_runtime - avg_runtime) / max_avg_runtime
    flops_term = (max_avg_flops - avg_flops) / max_avg_flops
    return 0.6 * avg_f1 + 0.2 * runtime_term + 0.2 * flops_term

# Macro-average F1 over every (language, category) row of the scores table.
avg_f1 = float(scores.f1.mean())
# The totals were accumulated over NUM_EXPERIMENTS repeated runs, so divide
# to obtain the per-run averages.
avg_runtime = total_time / NUM_EXPERIMENTS
avg_flops = total_flops / NUM_EXPERIMENTS

submission_score = score(avg_f1, avg_runtime, avg_flops)

print("Submission score:", round(submission_score, 3))

Submission score: 0.523


In [11]:
# Collect every reported number in one flat mapping for the results file.
scoring_details = {}

# One "<lan>_<cat>_f1" entry per category. Single quotes are used inside the
# f-string because reusing the outer quote type is a SyntaxError before
# Python 3.12.
for _, row in scores.iterrows():
    scoring_details[f"{row['lan']}_{row['cat']}_f1"] = float(row["f1"])

# One "<lan>_avg_f1" entry per language.
scoring_details = {
    **scoring_details,
    **scores.groupby("lan").f1.mean().add_suffix("_avg_f1").to_dict()
}

# Overall aggregates and the final submission score.
scoring_details["avg_f1"] = avg_f1
scoring_details["avg_runtime"] = avg_runtime
scoring_details["avg_flops"] = avg_flops
scoring_details["submission_score"] = submission_score

with open("../competition/final.json", "w") as f:
    json.dump(scoring_details, f, indent=4)