## CodeBLEU

In [8]:
import pandas as pd
from codebleu import calc_codebleu


def calc_experiment_codebleu(file):
    """Score one experiment's predictions against their references with CodeBLEU.

    Args:
        file: Path to a JSON-lines file whose records provide a "reference"
            and a "prediction" field.

    Returns:
        Whatever ``calc_codebleu`` returns for the file (the complete result,
        not just its "codebleu" entry), or ``{"codebleu": 0.0}`` when the file
        holds no records.
    """
    print(file)
    records = pd.read_json(file, orient='records', lines=True, dtype=False)

    # Nothing to score: report a zero score instead of calling the metric.
    if len(records.index) == 0:
        return {"codebleu": 0.0}

    # Every prediction has exactly one reference, wrapped in its own list.
    references = [[ref] for ref in records["reference"].to_list()]
    predictions = records["prediction"].to_list()

    # The identity tokenizer hands the raw strings to the metric unchanged.
    return calc_codebleu(references, predictions, "java", tokenizer=lambda text: text)


In [9]:
import os
from pathlib import Path
from tqdm.auto import tqdm

import fnmatch
import os

# Every immediate sub-directory of ../data is treated as one dataset.
root, dirs, files = next(os.walk("../data"))
datasets = dirs

for dataset in datasets:
    dataset_path = Path(root, dataset, "fixed")

    # Collect every single-shard prediction file anywhere below <dataset>/fixed.
    paths = []
    for d_root, dirnames, filenames in os.walk(dataset_path):
        for filename in fnmatch.filter(filenames, '*00001-of-00001.jsonl'):
            paths.append(Path(d_root, filename))

    scores_data = {}
    for path in tqdm(paths):
        print(path)
        # Files live at <method>/<org>/<model-name>/<file> below dataset_path.
        # Reading the components relative to dataset_path (instead of using
        # absolute indices into path.parts) keeps this correct if the data
        # root is ever moved to a different directory depth.
        relative_parts = path.relative_to(dataset_path).parts
        method = relative_parts[0]
        model = relative_parts[1] + "/" + relative_parts[2]
        print(dataset, method, model)

        code_bleu_score = calc_experiment_codebleu(path)
        scores_data.setdefault(model, {})
        scores_data[model][method] = code_bleu_score

    # scores_data is {model: {method: score}}; transposing puts one model per
    # row and one method per column in the CSV.
    scores_df = pd.DataFrame(scores_data)
    scores_df.T.to_csv(Path(root, dataset, "codebleu_scores.csv"), index_label = 'model')

  0%|          | 0/50 [00:00<?, ?it/s]

../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
humaneval-x pre-trained metal-llama/CodeLlama-7b-hf
../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-3b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-7b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-15b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoderbase
../data/humaneval-x/

  0%|          | 0/60 [00:00<?, ?it/s]

../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-3b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-7b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-15b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoderbase
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-train

## CrystalBLEU

In [1]:
import sys
import fractions

_OriginalFraction = fractions.Fraction


def _make_normalize_aware_new(stdlib_new):
    """Wrap the stdlib ``Fraction.__new__`` so it accepts ``_normalize`` again.

    Args:
        stdlib_new: The original ``Fraction.__new__`` to delegate to for
            ordinary (normalised) construction.

    Returns:
        A replacement ``__new__`` taking ``(cls, numerator, denominator,
        _normalize)``. With ``_normalize=False`` the numerator and denominator
        are stored exactly as given (converted with ``int``), without
        reducing the fraction.
    """

    def _new(cls, numerator=0, denominator=None, _normalize=True):
        if _normalize:
            # Ordinary construction: all parsing/validation stays in the stdlib.
            return stdlib_new(cls, numerator, denominator)
        # Allocate without going through the stdlib constructor so the value
        # is kept unreduced.
        self = object.__new__(cls)
        self._numerator = int(numerator)
        self._denominator = int(1 if denominator is None else denominator)
        return self

    # Marker that makes re-running this cell a no-op instead of wrapping twice.
    _new._accepts_normalize = True
    return _new


# Only Python 3.12+ lacks the ``_normalize`` keyword; older versions support it
# natively and are left untouched (so ``_normalize=False`` is honoured there
# instead of being silently dropped).
#
# The constructor is patched in place rather than rebinding
# ``fractions.Fraction`` to a subclass: the stdlib implementation looks up the
# global name ``Fraction`` at call time (``super(Fraction, cls)``), so
# replacing that global with a subclass makes normalised construction recurse
# forever and breaks modules that imported the original class earlier.
if sys.version_info >= (3, 12) and not getattr(
    _OriginalFraction.__new__, "_accepts_normalize", False
):
    _OriginalFraction.__new__ = staticmethod(
        _make_normalize_aware_new(_OriginalFraction.__new__)
    )

# Backward-compatible name: the patched class *is* the stdlib class.
PatchedFraction = _OriginalFraction

# Monkey-patch globally (a no-op rebinding, kept so the effect stays explicit).
fractions.Fraction = PatchedFraction

In [2]:
from datasets import load_dataset

# Load the "train" split of the "fm+fc+c+m+f+t+tc" configuration of
# andstor/methods2test_small; `ds` is the corpus the later cells iterate over.
ds = load_dataset("andstor/methods2test_small", "fm+fc+c+m+f+t+tc", split="train")

<function random_array at 0x118059c60>
<function random at 0x118059f80>
<function rand at 0x11805a0c0>


In [3]:
from collections import Counter
from nltk.util import ngrams
from tqdm.auto import tqdm

# 2. Extract trivially shared n-grams
k = 500
# Each example's "source" and "target" are concatenated and split into single
# characters, so the n-grams counted here are character n-grams of length 1-4.
# Counts are accumulated straight into the Counter instead of first
# materialising every n-gram of the corpus in one list; the resulting
# frequencies (and the order of ties in most_common) are the same, but peak
# memory no longer grows with the total number of n-grams.
frequencies = Counter()
for example in tqdm(ds):
    tokenized_example = list(example["source"] + example["target"])
    for n in range(1, 5):
        frequencies.update(ngrams(tokenized_example, n))
# The k most frequent n-grams, mapped to their corpus frequency
trivially_shared_ngrams = dict(frequencies.most_common(k))
trivially_shared_ngrams

<function bandwidth at 0x119a91e40>
<function issymmetric at 0x119a92020>
<function ishermitian at 0x119a92160>
<function eig at 0x119a92c00>
<function eigh at 0x119a92200>
<function eig_banded at 0x119a92e80>
<function eigvals at 0x119a92fc0>
<function eigvalsh at 0x119a93060>
<function eigvals_banded at 0x119a931a0>
<function eigvalsh_tridiagonal at 0x119a93380>
<function eigh_tridiagonal at 0x119a934c0>
<function hessenberg at 0x119a936a0>
<function svd at 0x119a93920>
<function svdvals at 0x119a93a60>
<function diagsvd at 0x119a93ba0>
<function orth at 0x119a93ce0>
<function null_space at 0x119a93e20>
<function subspace_angles at 0x119a93f60>
<function solve at 0x119b28180>
<function solve_triangular at 0x119b28720>
<function _solve_banded at 0x119b289a0>
<function solveh_banded at 0x119b28ae0>
<function inv at 0x119b28ea0>
<function lstsq at 0x119b29080>
<function pinv at 0x119b291c0>
<function pinvh at 0x119b29260>
<function matrix_balance at 0x119b29440>
<function lu_factor at 0

  0%|          | 0/7440 [00:00<?, ?it/s]

{(' ',): 3257024,
 (' ', ' '): 1911033,
 (' ', ' ', ' '): 1606153,
 ('e',): 1413885,
 (' ', ' ', ' ', ' '): 1342178,
 ('t',): 1155823,
 ('i',): 900838,
 ('r',): 832190,
 ('a',): 802696,
 ('n',): 740576,
 ('s',): 671809,
 ('o',): 663116,
 ('l',): 575262,
 ('c',): 479518,
 ('\n',): 453940,
 ('p',): 396352,
 ('u',): 380090,
 ('d',): 336830,
 ('\n', ' '): 294695,
 ('\n', ' ', ' '): 294663,
 (')',): 292613,
 ('(',): 292566,
 ('g',): 282594,
 ('\n', ' ', ' ', ' '): 261810,
 ('e', 'r'): 238406,
 (';',): 227272,
 (';', '\n'): 223428,
 ('i', 'n'): 220113,
 ('b',): 209595,
 ('a', 't'): 209456,
 ('m',): 208926,
 ('t', 'e'): 206482,
 ('S',): 196412,
 ('r', 'i'): 186431,
 ('v',): 180265,
 (')', ';'): 172118,
 ('o', 'n'): 171449,
 (')', ';', '\n'): 170896,
 ('.',): 157344,
 (';', '\n', ' '): 156451,
 (';', '\n', ' ', ' '): 156436,
 ('f',): 154582,
 ('s', 't'): 153122,
 ('i', 'c'): 151004,
 ('t', 'i'): 146332,
 (' ', 'p'): 143269,
 ('h',): 138359,
 (',',): 137368,
 ('e', 's'): 134229,
 ('n', 't'): 13

In [6]:
import pandas as pd
from crystalbleu import corpus_bleu

def calc_experiment_crystalbleu(file):
    """Score one experiment's predictions against their references with CrystalBLEU.

    Args:
        file: Path to a JSON-lines file whose records provide a "reference"
            and a "prediction" field.

    Returns:
        The value returned by ``corpus_bleu`` for the file, or ``0.0`` when
        the file holds no records.

    Note:
        Reads the notebook-level ``trivially_shared_ngrams`` mapping, which
        must already be defined when this function is called.
    """
    print(file)
    records = pd.read_json(file, orient='records', lines=True, dtype=False)

    # Nothing to score: report zero instead of calling the metric.
    if len(records.index) == 0:
        return 0.0

    # Every prediction has exactly one reference, wrapped in its own list.
    references = [[ref] for ref in records["reference"].to_list()]
    predictions = records["prediction"].to_list()
    return corpus_bleu(references, predictions, ignoring=trivially_shared_ngrams)


In [7]:
import os
from pathlib import Path
from tqdm.auto import tqdm

import fnmatch
import os

# Every immediate sub-directory of ../data is treated as one dataset.
root, dirs, files = next(os.walk("../data"))
datasets = dirs

for dataset in datasets:
    dataset_path = Path(root, dataset, "fixed")

    # Collect every single-shard prediction file anywhere below <dataset>/fixed.
    paths = []
    for d_root, dirnames, filenames in os.walk(dataset_path):
        for filename in fnmatch.filter(filenames, '*00001-of-00001.jsonl'):
            paths.append(Path(d_root, filename))

    scores_data = {}
    for path in tqdm(paths):
        print(path)
        # Files live at <method>/<org>/<model-name>/<file> below dataset_path.
        # Reading the components relative to dataset_path (instead of using
        # absolute indices into path.parts) keeps this correct if the data
        # root is ever moved to a different directory depth.
        relative_parts = path.relative_to(dataset_path).parts
        method = relative_parts[0]
        model = relative_parts[1] + "/" + relative_parts[2]
        print(dataset, method, model)

        # NOTE: despite its name, this variable holds the CrystalBLEU score;
        # the name is kept unchanged for compatibility.
        code_bleu_score = calc_experiment_crystalbleu(path)
        scores_data.setdefault(model, {})
        scores_data[model][method] = code_bleu_score

    # scores_data is {model: {method: score}}; transposing puts one model per
    # row and one method per column in the CSV.
    scores_df = pd.DataFrame(scores_data)
    scores_df.T.to_csv(Path(root, dataset, "crystalbleu_scores.csv"), index_label = 'model')

  0%|          | 0/50 [00:00<?, ?it/s]

../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
humaneval-x pre-trained metal-llama/CodeLlama-7b-hf
../data/humaneval-x/fixed/pre-trained/metal-llama/CodeLlama-7b-hf/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-3b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-7b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoder2-15b
../data/humaneval-x/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/humaneval-x/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
humaneval-x pre-trained bigcode/starcoderbase
../data/humaneval-x/

  0%|          | 0/60 [00:00<?, ?it/s]

../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-3b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-3b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-7b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-7b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoder2-15b
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoder2-15b/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
methods2test_runnable pre-trained bigcode/starcoderbase
../data/methods2test_runnable/fixed/pre-trained/bigcode/starcoderbase/00001-of-00001.jsonl
../data/methods2test_runnable/fixed/pre-train