## CodeBLEU

In [None]:
import pandas as pd
from codebleu import calc_codebleu
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a


# Shared sacrebleu "13a" tokenizer instance; calc_experiment_codebleu calls it
# and splits the result on whitespace to obtain tokens.
tokenizer = Tokenizer13a()


def calc_experiment_codebleu(file):
    """Compute the corpus-level CodeBLEU score for one JSON-lines results file.

    Args:
        file: Path (or path string) of a JSON-lines file whose records carry a
            ``reference`` and a ``prediction`` field.

    Returns:
        The ``"codebleu"`` entry of the dict returned by ``calc_codebleu``
        (called with language ``"java"``), or ``0.0`` when the file holds no
        records.
    """
    print(file)
    records = pd.read_json(file, orient="records", lines=True, dtype=False)

    # Nothing to score: report zero without calling calc_codebleu.
    if len(records.index) == 0:
        return 0.0

    def split_tokens(code):
        # Run the notebook-wide tokenizer, then split its output on whitespace.
        return tokenizer(code).split()

    result = calc_codebleu(
        records["reference"].to_list(),
        records["prediction"].to_list(),
        "java",
        tokenizer=split_tokens,
    )
    return result["codebleu"]


In [None]:
import os
from pathlib import Path
from tqdm.auto import tqdm

import fnmatch
import os

# Every immediate sub-directory of ../data is treated as one dataset.
root, dirs, files = next(os.walk("../data"))
datasets = dirs

for dataset in datasets:
    dataset_path = Path(root, dataset, "fixed")

    # Collect every matching result file below <dataset>/fixed, at any depth.
    paths = []
    for d_root, dirnames, filenames in os.walk(dataset_path):
        for filename in fnmatch.filter(filenames, '*00001-of-00001.jsonl'):
            paths.append(Path(d_root, filename))

    # scores_data[model][method] -> CodeBLEU score
    scores_data = {}
    for path in tqdm(paths):
        print(path)
        # Expected layout below "fixed": <method>/<org>/<model-name>/<file>.
        # Index relative to dataset_path so the labels do not depend on how
        # many components the data root itself has.
        rel_parts = path.relative_to(dataset_path).parts
        if len(rel_parts) < 4:
            # Too shallow to carry method/org/model; skipping avoids writing a
            # row labelled with the file name.
            print(f"Skipping {path}: expected <method>/<org>/<model>/<file> below {dataset_path}")
            continue
        method, org, model_name = rel_parts[:3]
        model = org + "/" + model_name
        print(dataset, method, model)

        code_bleu_score = calc_experiment_codebleu(path)
        scores_data.setdefault(model, {})
        scores_data[model][method] = code_bleu_score

    # One row per model, one column per method.
    scores_df = pd.DataFrame(scores_data)
    scores_df.T.to_csv(Path(root, dataset, "codebleu_scores.csv"), index_label = 'model')

  0%|          | 0/65 [00:00<?, ?it/s]

../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
humaneval-x pre-trained metal-llama/CodeLlama-7b-hf
../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-3b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-7b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-15b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoderbase
../data/humaneval-x/

  0%|          | 0/65 [00:00<?, ?it/s]

../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-3b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-7b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-15b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoderbase
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-train

## CrystalBLEU

In [25]:
import sys
import fractions

# Handle on the stock class. It is patched in place below rather than replaced
# by a subclass: the standard library's Fraction methods look up the module
# global ``Fraction`` at call time (``super(Fraction, cls)``), so rebinding
# ``fractions.Fraction`` to a subclass breaks ordinary construction and
# arithmetic for every user of the class.
_OriginalFraction = fractions.Fraction


def _fraction_accepts_normalize(fraction_cls):
    """Return True when ``fraction_cls(n, d, _normalize=False)`` is accepted."""
    try:
        fraction_cls(0, 1, _normalize=False)
    except TypeError:
        # The constructor rejects the keyword (the 3.12+ case this cell targets).
        return False
    return True


def _add_normalize_support(fraction_cls):
    """Make ``fraction_cls.__new__`` understand ``_normalize``, in place.

    With ``_normalize=True`` (the default) construction is delegated unchanged
    to the stock constructor. With ``_normalize=False`` the terms are coerced
    with ``int`` and stored exactly as given, without gcd reduction.

    Raises (from the installed constructor):
        ZeroDivisionError: if ``_normalize=False`` and the denominator is 0.
    """
    stock_new = fraction_cls.__new__

    def __new__(cls, numerator=0, denominator=None, _normalize=True):
        if _normalize:
            return stock_new(cls, numerator, denominator)
        numerator = int(numerator)
        denominator = 1 if denominator is None else int(denominator)
        if denominator == 0:
            raise ZeroDivisionError(f"Fraction({numerator}, 0)")
        # Allocate without running the stock constructor so nothing is reduced.
        self = object.__new__(cls)
        self._numerator = numerator
        self._denominator = denominator
        return self

    fraction_cls.__new__ = staticmethod(__new__)


# Feature-detect instead of checking the version: interpreters whose Fraction
# already accepts ``_normalize`` are left untouched, and re-running this cell
# does not stack a second patch on top of the first.
if not _fraction_accepts_normalize(_OriginalFraction):
    _add_normalize_support(_OriginalFraction)

# Backward-compatible name: Fraction with ``_normalize=False`` support.
# NOTE(review): presumably required by the BLEU code used further down in this
# notebook — confirm which dependency still passes ``_normalize``.
PatchedFraction = _OriginalFraction

# Monkey-patch globally (the class object is unchanged, so modules that
# imported Fraction earlier see the new constructor as well).
fractions.Fraction = PatchedFraction

In [26]:
from datasets import load_dataset

# Load the "train" split of andstor/methods2test_small (config
# "fm+fc+c+m+f+t+tc"); the n-gram extraction cell reads each example's
# "source" and "target" fields.
ds = load_dataset("andstor/methods2test_small", "fm+fc+c+m+f+t+tc", split="train")

In [77]:
from collections import Counter
from nltk.util import ngrams
from tqdm.auto import tqdm

# 2. Extract trivially shared n-grams
# Number of most frequent n-grams to keep.
k = 500
# Count every n-gram of length 1-4 over the whitespace-tokenized examples.
# Counts go straight into the Counter, so the full list of n-grams for the
# whole corpus is never held in memory at once.
frequencies = Counter()
for example in tqdm(ds):
    # NOTE(review): source and target are joined without a separator, so the
    # last token of "source" fuses with the first token of "target" unless one
    # side has whitespace at the seam — confirm this is intended.
    tokenized_example = (example["source"] + example["target"]).split()
    for n in range(1, 5):
        frequencies.update(ngrams(tokenized_example, n))
# Keep the k most frequent n-grams together with their counts.
trivially_shared_ngrams = dict(frequencies.most_common(k))
trivially_shared_ngrams

  0%|          | 0/7440 [00:00<?, ?it/s]

{('public',): 82080,
 ('{',): 50147,
 ('}',): 49421,
 ('private',): 44575,
 ('=',): 43989,
 ('String',): 38410,
 ('static',): 34505,
 ('void',): 33148,
 ('final',): 28326,
 ('public', 'void'): 21369,
 ('public', 'static'): 17902,
 ('@Override',): 17553,
 ('int',): 16766,
 ('@Override', 'public'): 16280,
 ('new',): 15645,
 ('class',): 15105,
 ('private', 'static'): 13048,
 ('}', '}'): 12780,
 ('boolean',): 12713,
 ('if',): 9970,
 ('=', 'new'): 9465,
 ('return',): 9433,
 ('static', 'final'): 8770,
 ('+',): 8738,
 ('//',): 8045,
 ('final', 'String'): 7930,
 ('private', 'static', 'final'): 7534,
 ('}', 'class'): 7515,
 ('@Test',): 7003,
 ('@Test', 'public'): 6517,
 ('@Test', 'public', 'void'): 6479,
 ('protected',): 5054,
 ('public', 'String'): 4850,
 ('throws',): 4733,
 ('static', 'final', 'String'): 4683,
 ('@Mock',): 4646,
 ('@Override', 'public', 'void'): 4640,
 ('private', 'void'): 4593,
 ('public', 'boolean'): 4499,
 ('{', 'public'): 4317,
 ('}', 'public'): 4162,
 ('static', 'String'

In [78]:
import pandas as pd
from crystalbleu import corpus_bleu

def calc_experiment_crystalbleu(file):
    """Compute the corpus-level CrystalBLEU score for one JSON-lines results file.

    References and predictions are tokenized by whitespace, and the n-grams in
    the notebook-level ``trivially_shared_ngrams`` are passed to
    ``corpus_bleu`` via its ``ignoring`` argument.

    Args:
        file: Path (or path string) of a JSON-lines file whose records carry a
            ``reference`` and a ``prediction`` field.

    Returns:
        The value returned by ``corpus_bleu``, or ``0.0`` when the file holds
        no records.
    """
    print(file)
    df = pd.read_json(file, orient='records', lines=True, dtype=False)

    # Nothing to score.
    if df.shape[0] == 0:
        return 0.0

    # Kept local: the token lists are no longer published as notebook globals.
    # Each sample gets a one-element list holding its single tokenized reference.
    list_of_references = [[ref.split()] for ref in df["reference"].to_list()]
    hypotheses = [hyp.split() for hyp in df["prediction"].to_list()]
    return corpus_bleu(list_of_references, hypotheses, ignoring=trivially_shared_ngrams)


In [80]:
# Spot-check the scorer on a single results file before the full sweep.
calc_experiment_crystalbleu("../data/humaneval-x/fixed/lora/bigcode/starcoder2-15b/00001-of-00001.jsonl")

../data/humaneval-x/fixed/lora/bigcode/starcoder2-15b/00001-of-00001.jsonl


0.17597010362474716

In [81]:
import os
from pathlib import Path
from tqdm.auto import tqdm
from pygments.lexers.jvm import JavaLexer
# NOTE(review): `lexer` is not referenced by any code visible in this notebook
# (tokenization here uses str.split) — confirm whether it is still needed.
lexer = JavaLexer()

import fnmatch
import os

# Every immediate sub-directory of ../data is treated as one dataset.
root, dirs, files = next(os.walk("../data"))
datasets = dirs

for dataset in datasets:
    dataset_path = Path(root, dataset, "fixed")

    # Collect every matching result file below <dataset>/fixed, at any depth.
    paths = []
    for d_root, dirnames, filenames in os.walk(dataset_path):
        for filename in fnmatch.filter(filenames, '*00001-of-00001.jsonl'):
            paths.append(Path(d_root, filename))

    # scores_data[model][method] -> CrystalBLEU score
    scores_data = {}
    for path in tqdm(paths):
        print(path)
        # Expected layout below "fixed": <method>/<org>/<model-name>/<file>.
        # Index relative to dataset_path so the labels do not depend on how
        # many components the data root itself has.
        rel_parts = path.relative_to(dataset_path).parts
        if len(rel_parts) < 4:
            # Too shallow to carry method/org/model; skipping avoids writing a
            # row labelled with the file name.
            print(f"Skipping {path}: expected <method>/<org>/<model>/<file> below {dataset_path}")
            continue
        method, org, model_name = rel_parts[:3]
        model = org + "/" + model_name
        print(dataset, method, model)

        crystal_bleu_score = calc_experiment_crystalbleu(path)
        scores_data.setdefault(model, {})
        scores_data[model][method] = crystal_bleu_score

    # One row per model, one column per method.
    scores_df = pd.DataFrame(scores_data)
    scores_df.T.to_csv(Path(root, dataset, "crystalbleu_scores.csv"), index_label = 'model')

  0%|          | 0/65 [00:00<?, ?it/s]

../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
humaneval-x pre-trained metal-llama/CodeLlama-7b-hf
../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-3b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-7b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-15b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoderbase
../data/humaneval-x/

  0%|          | 0/65 [00:00<?, ?it/s]

../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-3b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-7b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-15b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoderbase
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-train

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


../data/methods2test_runnable/fixed/lora/meta-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
methods2test_runnable lora meta-llama/CodeLlama-7b-hf
../data/methods2test_runnable/fixed/lora/meta-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
