In [None]:
import pandas as pd
from codebleu import calc_codebleu


def calc_experiment_codebleu(file):
    print(file)
    df = pd.read_json(file, orient='records', lines=True, dtype=False)
    
    if df.shape[0] > 0:
        list_of_references =[[pred] for pred in df["reference"].to_list()]
        hypotheses = df["prediction"].to_list()
        code_bleu_score = calc_codebleu(list_of_references, hypotheses, "java", tokenizer=lambda x: x)
    else:
        code_bleu_score = {"codebleu": 0.0}
    
    return code_bleu_score["codebleu"]


In [4]:
import os
from pathlib import Path
from tqdm.auto import tqdm

import fnmatch
import os

root, dirs, files = next(os.walk("../data"))
datasets = dirs

for dataset in datasets:
    dataset_path = Path(root, dataset, "fixed")
    
    paths = []
    for d_root, dirnames, filenames in os.walk(dataset_path):
        for filename in fnmatch.filter(filenames, '*00001-of-00001.jsonl'):
            paths.append( Path(d_root, filename))

    scores_data = {}
    for path in tqdm(paths):
        print(path)
        method = path.parts[4]
        model = path.parts[5] + "/" + path.parts[6]
        print(dataset, method, model)
    
        code_bleu_score = calc_experiment_codebleu(path)
        scores_data.setdefault(model, {})
        scores_data[model][method] = code_bleu_score
    
    scores_df = pd.DataFrame(scores_data)
    scores_df.T.to_csv(Path(root, dataset, "scores.csv"), index_label = 'model')

  0%|          | 0/50 [00:00<?, ?it/s]

../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
humaneval-x pre-trained metal-llama/CodeLlama-7b-hf
../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-3b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-7b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-15b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoderbase
../data/humaneval-x/

  0%|          | 0/50 [00:00<?, ?it/s]

../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-3b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-7b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-15b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoderbase
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-train