# Tables

## Setup

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_DIR = Path('../data/')

In [2]:
# Row identifier -> display name. The keys are matched against the part of each
# CSV index entry after the first '/', and their order fixes the row order.
models = {
    "codegen-350M-multi": "CodeGen-350M-multi",
    "codegen2-1B_P": "CodeGen2-1B",
    "starcoder2-3b": "StarCoder2-3B",
    "codegen2-3_7B_P": "CodeGen2-3.7B",
    "CodeLlama-7b-hf": "CodeLlama-7B",
    "codegen2-7B_P": "CodeGen2-7B",
    "starcoder2-7b": "StarCoder2-7B",
    "starcoderbase": "StarCoderBase",
    "starcoder2-15b": "StarCoder2-15B",
    "codegen2-16B_P": "CodeGen2-16B",
}

# Dataset directory name (under DATA_DIR) -> LaTeX markup used as its heading.
datasets = {
    "methods2test_runnable": r"\textsc{Methods2Test\textsubscript{runnable}}",
    "humaneval-x": r"\textsc{HumanEval-X\textsubscript{java}}",
}

# Metric CSV files (without the ".csv" suffix) that are loaded per dataset.
columns = {
    "methods2test_runnable": [
        "valid_syntax",
        "scores",
        "passing_rate",
        "coverage_instruction",
        "coverage_branch",
    ],
    "humaneval-x": [
        "valid_syntax",
        "scores",
        "passing_rate",
    ],
}

# Tuning-method column name -> label printed in the LaTeX tables.
methods = {
    "pre-trained": "None",
    "fine-tuning": "Fine-tuning",
    "lora": "LoRA",
    "ia3": r"(IA)\textsuperscript{3}",
    "prompt-tuning": "Prompt tuning",
}

## Table 2

In [3]:
# Read the per-model parameter counts, key the rows by the part of the
# identifier after the first '/', and keep only the configured models in the
# order given by `models`.
params_by_model = pd.read_csv(DATA_DIR / "params_data.csv", index_col=0)
params_by_model.index = params_by_model.index.str.split("/").str[1]
model_trainable_params = params_by_model.loc[list(models)]

# The baseline column is stored as the string '0' for every model.
model_trainable_params["pre-trained"] = "0"
model_trainable_params

Unnamed: 0_level_0,pre-trained,fine-tuning,prompt-tuning,lora,ia3
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
codegen-350M-multi,0,356712448,20480,1310720,143360
codegen2-1B_P,0,1015306240,40960,2097152,229376
starcoder2-3b,0,3030371328,61440,4546560,468480
codegen2-3_7B_P,0,3641174016,81920,4194304,458752
CodeLlama-7b-hf,0,6738546688,81920,8388608,614400
codegen2-7B_P,0,6862858240,81920,8388608,917504
starcoder2-7b,0,7173923840,92160,7340032,753664
starcoderbase,0,15517456384,122880,8028160,1239040
starcoder2-15b,0,15655899136,122880,12124160,1249280
codegen2-16B_P,0,16032155648,122880,13369344,1462272


In [4]:
import pandas as pd
from pathlib import Path

# dataset name -> frame holding every metric that could be loaded for it.
frames_by_dataset = {}

for dataset in datasets:
    # metric name -> frame (rows: models, columns: whatever the CSV provides).
    metric_frames = {}

    for metric in columns[dataset]:
        csv_path = DATA_DIR / dataset / f"{metric}.csv"
        try:
            metric_df = pd.read_csv(csv_path, index_col=0)
        except FileNotFoundError:
            # A missing metric file is reported and left out of the result.
            print(f"Missing file: {csv_path} — skipping.")
            continue

        # Key rows by the identifier part after the first '/', then keep only
        # the configured models in their configured order.
        metric_df.index = metric_df.index.str.split('/').str[1]
        metric_frames[metric] = metric_df.loc[list(models)]

    if metric_frames:
        # Passing the mapping makes its keys the new outer column level.
        frames_by_dataset[dataset] = pd.concat(metric_frames, axis=1)

# Combine all datasets side by side; the dataset name becomes the outermost
# column level. Values are rounded to two decimals for presentation.
if frames_by_dataset:
    data = pd.concat(frames_by_dataset, axis=1).astype(float).round(2)
else:
    data = pd.DataFrame()

data

Unnamed: 0_level_0,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,...,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x
Unnamed: 0_level_1,valid_syntax,valid_syntax,valid_syntax,valid_syntax,valid_syntax,scores,scores,scores,scores,scores,...,scores,scores,scores,scores,scores,passing_rate,passing_rate,passing_rate,passing_rate,passing_rate
Unnamed: 0_level_2,lora,ia3,prompt-tuning,pre-trained,fine-tuning,pre-trained,fine-tuning,ia3,prompt-tuning,lora,...,pre-trained,fine-tuning,ia3,prompt-tuning,lora,pre-trained,fine-tuning,ia3,prompt-tuning,lora
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
codegen-350M-multi,0.96,0.95,0.96,0.96,0.98,0.24,0.3,0.24,0.23,0.26,...,0.36,0.33,0.36,0.33,0.39,0.07,0.04,0.07,0.05,0.04
codegen2-1B_P,0.38,0.02,0.7,0.0,0.75,0.0,0.15,0.26,0.26,0.04,...,0.0,0.04,0.0,0.25,0.01,0.0,0.0,0.0,0.0,0.0
starcoder2-3b,0.98,0.94,0.93,0.93,0.96,0.17,0.3,0.29,0.17,0.31,...,0.43,0.5,0.42,0.43,0.42,0.34,0.31,0.36,0.27,0.24
codegen2-3_7B_P,0.42,0.0,0.22,0.0,0.41,0.0,0.11,0.0,0.26,0.11,...,0.0,0.26,0.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0
CodeLlama-7b-hf,0.98,0.98,0.97,0.98,0.98,0.31,0.32,0.31,0.3,0.34,...,0.49,0.5,0.48,0.46,0.43,0.33,0.34,0.32,0.28,0.31
codegen2-7B_P,0.99,0.98,0.98,0.99,0.98,0.28,0.31,0.29,0.28,0.31,...,0.47,0.44,0.49,0.44,0.47,0.23,0.13,0.21,0.18,0.2
starcoder2-7b,0.97,0.95,0.93,0.92,0.97,0.17,0.31,0.32,0.33,0.31,...,0.4,0.44,0.42,0.51,0.52,0.37,0.36,0.34,0.37,0.35
starcoderbase,0.92,0.92,0.91,0.92,0.97,0.17,0.34,0.17,0.19,0.17,...,0.43,0.48,0.43,0.23,0.39,0.3,0.33,0.3,0.16,0.3
starcoder2-15b,0.98,0.93,0.98,0.93,0.98,0.2,0.34,0.2,0.33,0.34,...,0.37,0.49,0.37,0.54,0.46,0.41,0.37,0.41,0.45,0.39
codegen2-16B_P,0.99,0.98,0.98,0.98,0.99,0.3,0.33,0.3,0.3,0.33,...,0.48,0.38,0.48,0.47,0.47,0.2,0.12,0.2,0.21,0.22


In [5]:
# model -> list of (dataset, metric, method) triples marking, for every metric,
# the method(s) that reach that model's maximum value. Ties are all recorded,
# so a row whose values are all equal contributes every method.
best_method_data = {}

for dataset in datasets:
    for column in columns[dataset]:
        for model in models:
            per_method = data[dataset, column].loc[model]
            top_value = per_method.max()
            winners = per_method[per_method == top_value].index.tolist()

            # Only create the entry when at least one method matched.
            if winners:
                best_method_data.setdefault(model, []).extend(
                    (dataset, column, winner) for winner in winners
                )


  model_data = data[dataset, column].loc[model]


In [6]:
# model -> list of (dataset, metric, method) triples whose value is strictly
# below the same model's 'pre-trained' value for that metric.
decreased_performance_data = {}

for dataset in datasets:
    for column in columns[dataset]:
        for model in models:
            per_method = data[dataset, column].loc[model]
            reference = per_method.loc['pre-trained']
            worse_methods = per_method[per_method < reference].index.tolist()

            # Only create the entry when at least one method regressed.
            if worse_methods:
                decreased_performance_data.setdefault(model, []).extend(
                    (dataset, column, worse) for worse in worse_methods
                )

  model_data = data[dataset, column].loc[model]


In [7]:
# model -> list of (dataset, metric, method) triples to highlight because the
# (model, method) pair has a 'valid_syntax' value below 0.5 on that dataset.
bad_syntactical_performance_data = {}

for dataset in datasets:
    for method in methods:
        for model in models:
            syntax_rate = data[dataset, "valid_syntax", method].loc[model]
            if syntax_rate < 0.5:
                # Every metric of the dataset is flagged, 'valid_syntax' included.
                for column in columns[dataset]:
                    flagged = bad_syntactical_performance_data.setdefault(model, [])
                    flagged.append((dataset, column, method))

In [8]:
def escape_latex(text):
    """Prefix each '_', '%', '&' and '$' in ``text`` with a backslash.

    Other characters (including backslashes and braces) are left untouched,
    so existing LaTeX markup in ``text`` passes through unchanged.
    """
    for special in ('_', '%', '&', '$'):
        text = text.replace(special, '\\' + special)
    return text

In [9]:
def int_to_letter(n):
    """Return the character at 1-based offset ``n`` from 'a' (1 -> 'a', 2 -> 'b')."""
    first_code = ord('a')
    return chr(first_code + (n - 1))

In [10]:
# Assemble the LaTeX source of the evaluation summary table, one output line per
# list entry. Column layout: two leading columns (method label, trainable
# parameter count) followed by one group of metric columns per dataset. Every
# span and rule below is derived from `columns`, so it follows the configuration.
n_leading_cols = 2

# Header text for each metric key that appears in `columns`.
metric_labels = {
    'valid_syntax': 'Valid syntax',
    'scores': 'CodeBLEU',
    'passing_rate': 'pass@1',
    'coverage_instruction': 'Instr. Cov.',
    'coverage_branch': 'Branch Cov.',
}

table = []

table.append("\\begin{table*}[htbp]")
table.append("\\begin{threeparttable}")
table.append("    \\newcolumntype{Y}{>{\\centering\\arraybackslash}X}")
table.append("    \\newcolumntype{R}{>{\\raggedright\\arraybackslash}X}")
table.append("    \\newcolumntype{L}{>{\\raggedleft\\arraybackslash}X}")
table.append("    \\centering")
table.append("    \\footnotesize")
table.append("    \\caption{Evaluation metrics experiment results using different tuning methods across various models.}\\label{tab:eval-summary}")

# Column specification: one centred "Y" column per metric, with a white spacer
# between consecutive dataset groups.
row = "    \\begin{tabularx}{\\textwidth}{lr!{\\color{white}\\hspace{.5em}}"
for i, dataset in enumerate(datasets, start=1):
    row += "Y" * len(columns[dataset])
    if i < len(datasets):
        row += "!{\\color{white}\\hspace{1em}}"
row += "}"
table.append(row)

# Panel labels: (a) spans the leading columns, (b), (c), ... span the datasets.
row = "        \\multicolumn{" + str(n_leading_cols) + "}{c}{\\normalsize\\textbf{(a)}} & "
for i, dataset in enumerate(datasets, start=2):
    row += "\\multicolumn{" + str(len(columns[dataset])) + "}{c}{\\normalsize\\textbf{(" + str(int_to_letter(i)) + ")}}"
    if i < len(datasets) + 1:
        row += " & "
row += "\\\\[.5em]"
table.append(row)

# Rules under the panel labels: one for the leading columns, one per dataset.
row = "        \\cmidrule(lr){1-" + str(n_leading_cols) + "}"
first_col = n_leading_cols + 1
for dataset in datasets:
    last_col = first_col + len(columns[dataset]) - 1
    row += "\\cmidrule(lr){" + f"{first_col}-{last_col}" + "}"
    first_col = last_col + 1
table.append(row)

# Two-row header cells for the leading columns plus one heading per dataset.
row = "        \\multirow{2}{*}{\\textbf{Method}} & \\multirow{2}{*}{\\parbox[t]{1cm}{\\centering \\textbf{Trainable\\\\params}}}"
for dataset in datasets:
    row += " & \\multicolumn{" + str(len(columns[dataset])) + "}{c}{\\textbf{" + escape_latex(datasets[dataset]) + "}}"
row += "\\\\"
table.append(row)

# Rules under the dataset headings only (the leading columns use \multirow).
row = "        "
first_col = n_leading_cols + 1
for dataset in datasets:
    last_col = first_col + len(columns[dataset]) - 1
    row += "\\cmidrule(lr){" + f"{first_col}-{last_col}" + "}"
    first_col = last_col + 1
table.append(row)

# Rotated metric names; the first two cells stay empty for the leading columns.
row = "        &"
for dataset in datasets:
    for column in columns[dataset]:
        # Metrics without a configured label fall back to their escaped key.
        label = metric_labels.get(column, escape_latex(column))
        row += " & \\rotatebox[origin=l]{90}{" + label + "}"
row += "\\\\"
table.append(row)

table.append("        \\hline")

total_cols = n_leading_cols + sum(len(columns[dataset]) for dataset in datasets)

for model in models.keys():
    # Full-width shaded row naming the model, followed by one row per method.
    table.append("        \\multicolumn{" + str(total_cols) + "}{l}{\\cellcolor{gray!10}{\\textbf{" + models[model] + "}}} \\bigstrut \\\\*")

    decreased_cells = decreased_performance_data.get(model, [])
    best_cells = best_method_data.get(model, [])
    bad_syntax_cells = bad_syntactical_performance_data.get(model, [])

    for method in methods:
        col = []
        for dataset in datasets:
            for column in columns[dataset]:
                cell_key = (dataset, column, method)
                try:
                    value = data[dataset, column, method].loc[model]
                except KeyError:
                    value = "N/A"

                # Decorations nest: parentheses innermost, then bold, then colour.
                if cell_key in decreased_cells:
                    value = f"({value})"

                if cell_key in best_cells:
                    value = f"\\textbf{{{value}}}"

                if cell_key in bad_syntax_cells:
                    value = f"\\cellcolor{{red!10}}{{{value}}}"

                col.append(f"{value}")
        row = " & ".join(col)
        # str() keeps the concatenation valid whether the count was read as an
        # integer or is already a string (such as the '0' baseline).
        params = str(model_trainable_params[method].loc[model])
        row = "        " + methods[method] + " & " + params + " & " + row + " \\\\"
        table.append(row)
    table.append("")

table.append("       \\bottomrule")
table.append("    \\end{tabularx}")
table.append("    \\begin{tablenotes}[flushleft]\\small")
table.append("      \\item \\textbf{Bold}: best-performing training method per model. (Parentheses): decreased performance compared to baseline. \\colorbox{red!10}{Red}: $<$ 50\\% syntactical valid samples.")
table.append("    \\end{tablenotes}")
table.append("\\end{threeparttable}")
table.append("\\end{table*}")

methods2test_runnable
Adding dataset methods2test_runnable with 5 columns
humaneval-x


In [11]:
# Write the assembled table to ../tables/eval_summary.tex (relative to the
# current working directory), creating the directory when it is missing.
table_path = Path.cwd().parent / 'tables' / 'eval_summary.tex'
table_path.parent.mkdir(parents=True, exist_ok=True)

result = "\n".join(table)
with table_path.open('w') as handle:
    handle.write(result)

## Table 3

In [12]:
# Lower-case substrings searched for by contains_keyword.
keywords = ["assert", "verify", "fail"]

def contains_keyword(text):
    """Return True when the lower-cased ``text`` contains any entry of ``keywords``."""
    return any(keyword in text.lower() for keyword in keywords)

In [13]:
# Status key -> short label. The key order is the order in which the status
# percentages are emitted for each row of the status table.
statuses = dict(
    [
        ("success", "Succ."),
        ("failed", "Failed"),
        ("error", "Interrupt"),
        ("compilation error", "CompErr"),
        ("no_assertions", "NoAssert"),
    ]
)

In [14]:
import pandas as pd

# Build the LaTeX table of test execution statuses for StarCoder2-7B on
# methods2test_runnable: one row per tuning method, one column per entry of
# `statuses`, each cell being the share of records with that status.
table = []

table.append("\\begin{table}[htbp]")
table.append("    \\newcolumntype{Y}{>{\\centering\\arraybackslash}X}")
table.append("    \\centering")
table.append("    \\caption{Test execution statuses for StarCoder2-7B.}")
table.append("    \\label{tab:test-statuses}")
table.append("    \\small")
table.append("    \\begin{tabularx}{\\columnwidth}{lYYYYY}")
table.append("    \\toprule")
table.append("    \\textbf{Method} & \\textbf{Succ.} & \\textbf{Failed} & \\textbf{Interrupt} & \\textbf{CompErr} & \\textbf{NoAssert} \\\\")
table.append("    \\midrule")

for method in methods:
    col = []

    # Execution records for this method; rows with status "exception" are dropped.
    path = DATA_DIR / "methods2test_runnable/coverage" / method / "bigcode/starcoder2-7b/jacoco.jsonl"
    df = pd.read_json(path, lines=True, dtype=False).set_index("id")
    df = df[df["status"] != "exception"]

    # Generated predictions for the same method. The path is built explicitly
    # rather than by string replacement, so a "coverage" component elsewhere in
    # DATA_DIR cannot be rewritten by accident.
    gen_data_path = DATA_DIR / "methods2test_runnable/fixed" / method / "bigcode/starcoder2-7b/00001-of-00001.jsonl"
    gen_df = pd.read_json(gen_data_path, lines=True, dtype=False).set_index("id")

    # Successful runs whose prediction contains none of the keywords are
    # counted under 'no_assertions' instead of 'success'.
    ids = gen_df[~gen_df["prediction"].apply(contains_keyword)].index.tolist()
    keys = df.index.intersection(ids)
    no_assertions_df = df.loc[(df.index.isin(keys)) & (df["status"] == "success")]
    no_assertions_count = no_assertions_df.shape[0]

    values = df['status'].value_counts().sort_index()
    # .get() tolerates a method for which no run succeeded.
    values.loc['success'] = values.get('success', 0) - no_assertions_count
    values.loc['no_assertions'] = no_assertions_count

    # Shares are taken over every remaining status; statuses that never
    # occurred are then filled in as 0 so each row has all columns.
    total = values.sum()
    shares = values / total if total else values * 0.0
    percentages = shares.reindex(list(statuses), fill_value=0.0)

    for status in statuses.keys():
        # Round to the nearest whole percent. Truncating with int() after the
        # float multiplication would turn e.g. 0.29 * 100 into 28.
        value = str(int(round(float(percentages.loc[status]) * 100))) + "\\%"
        col.append(value)

    row = " & ".join(col)
    row = methods[method] + " & " + row
    table.append("    " + row + " \\\\")

table.append("    \\bottomrule")
table.append("    \\end{tabularx}")
table.append("\\end{table}")

In [15]:
# Write the assembled table to ../tables/test_execution_example.tex (relative
# to the current working directory), creating the directory when it is missing.
table_path = Path.cwd().parent / 'tables' / 'test_execution_example.tex'
table_path.parent.mkdir(parents=True, exist_ok=True)

result = "\n".join(table)
with table_path.open('w') as handle:
    handle.write(result)