In [12]:
import pandas as pd
import numpy as np
from pathlib import Path

# Root directory of the input CSV files; a relative path, so it is resolved
# against the working directory the notebook kernel runs in.
DATA_DIR = Path('../data/')

In [48]:
# Model identifiers (used to select rows of the loaded CSV frames) mapped to
# the display names printed in the table. Insertion order is the order in
# which the models are iterated when the table rows are generated.
models = {
    'codegen-350M-multi': 'CodeGen-350M-multi',
    'codegen2-1B_P': 'CodeGen2-1B',
    'starcoder2-3b': 'StarCoder2-3B',
    'codegen2-3_7B_P': 'CodeGen2-3.7B',
    'CodeLlama-7b-hf': 'CodeLlama-7B',
    'codegen2-7B_P': 'CodeGen2-7B',
    'starcoder2-7b': 'StarCoder2-7B',
    'starcoderbase': 'StarCoderBase',
    'starcoder2-15b': 'StarCoder2-15B',
    'codegen2-16B_P': 'CodeGen2-16B',
}

# Dataset sub-directory names (joined onto DATA_DIR) mapped to the LaTeX label
# used as the dataset's column-group header.
datasets = {
    'humaneval-x': '\\textsc{HumanEval-X\\textsubscript{java}}',
# Disabled dataset entry; uncomment to include it in the loaded data and the table.
#    'methods2test_runnable': '\\textsc{Methods2Test\\textsubscript{runnable}}',
}

# Metric keys. The order must match the order in which the per-metric frames
# are concatenated when `data` is built, because this list is passed as `keys`.
columns = ['valid_syntax', 'scores', 'coverage_runnable', 'coverage_instruction', 'coverage_branch']

# Tuning-method keys (column names in the loaded frames) mapped to the LaTeX
# label printed at the start of each table row.
methods = {
    'pre-trained': 'None',
    'fine-tuning': 'Fine-tuning',
    'prompt-tuning': 'Prompt tuning',
    'lora': 'LoRA',
    'ia3': '(IA)\\textsuperscript{3}',
}

In [49]:
# Trainable-parameter counts: one row per model, one column per tuning method.
params_csv = DATA_DIR / 'params_data.csv'
model_trainable_params = pd.read_csv(params_csv, index_col=0)

# Index labels are split on '/' and only the second segment is kept, so they
# can be matched against the keys of `models`.
short_names = model_trainable_params.index.str.split('/').str.get(1)
model_trainable_params.index = short_names

# Keep only the configured models, in the order they are listed in `models`.
model_trainable_params = model_trainable_params.loc[list(models)]
model_trainable_params

Unnamed: 0_level_0,pre-trained,fine-tuning,prompt-tuning,lora,ia3
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
codegen-350M-multi,356712448,356712448,20480,1310720,143360
codegen2-1B_P,1015306240,1015306240,40960,2097152,229376
starcoder2-3b,3030371328,3030371328,61440,4546560,468480
codegen2-3_7B_P,3641174016,3641174016,81920,4194304,458752
CodeLlama-7b-hf,6738546688,6738546688,81920,8388608,614400
codegen2-7B_P,6862858240,6862858240,81920,8388608,917504
starcoder2-7b,7173923840,7173923840,92160,7340032,753664
starcoderbase,15517456384,15517456384,122880,8028160,1239040
starcoder2-15b,15655899136,15655899136,122880,12124160,1249280
codegen2-16B_P,16032155648,16032155648,122880,13369344,1462272


In [50]:

def _read_metric_by_repo_id(csv_path):
    """Load a metric CSV and return its rows for the configured models.

    The first CSV column becomes the index. Each index label is split on '/'
    and only the second segment is kept; the rows named by the keys of
    `models` are then selected, in `models` order.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    frame.index = frame.index.str.split('/').str[1]
    return frame.loc[models.keys()]


def _read_coverage(csv_path):
    """Load a header-less coverage CSV and return its rows for the configured models.

    The file is read with its first column as the index, transposed, and then
    indexed by its 'model' column before the rows named by the keys of
    `models` are selected, in `models` order.
    """
    transposed = pd.read_csv(csv_path, index_col=0, header=None).T
    return transposed.set_index('model').loc[models.keys()]


data = {}

for dataset in datasets:
    dataset_dir = DATA_DIR / dataset

    valid_syntax = _read_metric_by_repo_id(dataset_dir / 'valid_syntax.csv')
    scores = _read_metric_by_repo_id(dataset_dir / 'scores.csv')

    coverage_runnable = _read_coverage(dataset_dir / 'coverage_runnable.csv')
    coverage_instruction = _read_coverage(dataset_dir / 'coverage_instruction.csv')
    coverage_branch = _read_coverage(dataset_dir / 'coverage_branch.csv')

    # The frames are listed in the same order as `columns`, which supplies the
    # metric level of the resulting column MultiIndex.
    data[dataset] = pd.concat(
        [valid_syntax, scores, coverage_runnable, coverage_instruction, coverage_branch],
        axis=1,
        keys=columns,
    )

# Join all datasets side by side; the dict keys become the outermost column level.
data = pd.concat(data, axis=1)
data = data.astype(float).round(4)
data


Unnamed: 0_level_0,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x
Unnamed: 0_level_1,valid_syntax,valid_syntax,valid_syntax,valid_syntax,valid_syntax,scores,scores,scores,scores,scores,...,coverage_instruction,coverage_instruction,coverage_instruction,coverage_instruction,coverage_instruction,coverage_branch,coverage_branch,coverage_branch,coverage_branch,coverage_branch
Unnamed: 0_level_2,pre-trained,prompt-tuning,lora,fine-tuning,ia3,pre-trained,fine-tuning,ia3,prompt-tuning,lora,...,fine-tuning,pre-trained,ia3,prompt-tuning,lora,fine-tuning,pre-trained,ia3,prompt-tuning,lora
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
codegen-350M-multi,1.0,1.0,1.0,1.0,1.0,0.3603,0.3291,0.3591,0.3274,0.3906,...,1.0,0.9733,0.9869,0.9769,0.9656,0.8333,0.8977,0.9432,0.9167,0.8929
codegen2-1B_P,0.0,0.0793,0.0854,0.0549,0.0,0.0,0.0359,0.0,0.2547,0.0117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
starcoder2-3b,1.0,1.0,1.0,1.0,1.0,0.4273,0.4974,0.4245,0.4317,0.417,...,0.989,0.994,0.9919,0.9967,0.996,0.8549,0.8567,0.8381,0.7585,0.5884
codegen2-3_7B_P,0.0,0.0,0.4024,0.7378,0.0,0.0,0.2621,0.0,0.0,0.1384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CodeLlama-7b-hf,0.9939,0.9939,0.9939,1.0,1.0,0.487,0.5002,0.4807,0.4623,0.4297,...,0.9895,0.9833,0.9877,0.9838,0.9961,0.8637,0.8446,0.8472,0.8225,0.7247
codegen2-7B_P,1.0,1.0,1.0,1.0,1.0,0.474,0.4407,0.4906,0.4409,0.4645,...,0.9975,0.9831,0.9806,0.9855,0.991,0.7,0.8145,0.795,0.694,0.7535
starcoder2-7b,1.0,1.0,1.0,1.0,1.0,0.403,0.4398,0.4216,0.5058,0.5186,...,0.9915,0.9907,0.9935,0.994,0.9938,0.908,0.8316,0.8839,0.8602,0.8706
starcoderbase,0.9878,0.7805,0.9878,1.0,0.9878,0.4342,0.4832,0.4343,0.2312,0.3911,...,0.9916,0.9907,0.9907,0.9872,0.9976,0.752,0.8148,0.8148,0.6762,0.7389
starcoder2-15b,1.0,1.0,1.0,0.9939,1.0,0.3728,0.4896,0.3729,0.5349,0.4646,...,0.9952,0.9893,0.9895,0.9957,0.9932,0.813,0.8769,0.8789,0.8289,0.8288
codegen2-16B_P,1.0,1.0,1.0,0.9817,1.0,0.4793,0.3783,0.4795,0.4676,0.4714,...,0.9952,0.9866,0.9866,0.9746,0.9862,0.7807,0.8056,0.8056,0.7107,0.8206


In [57]:
# For every (dataset, metric, model), record each tuning method that attains
# the highest value. Result: model -> list of (dataset, metric, method).
best_method_data = {}

for dataset in datasets:
    for column in columns:
        metric_frame = data[dataset, column]
        for model in models:
            model_data = metric_frame.loc[model]
            top_value = model_data.max()
            # Ties are kept: when several methods share the maximum (including
            # the case where all values are equal), each of them is recorded.
            top_methods = model_data[model_data == top_value].index.tolist()
            for method_index in top_methods:
                best_method_data.setdefault(model, []).append((dataset, column, method_index))

best_method_data

{'codegen-350M-multi': [('humaneval-x', 'valid_syntax', 'pre-trained'),
  ('humaneval-x', 'valid_syntax', 'prompt-tuning'),
  ('humaneval-x', 'valid_syntax', 'lora'),
  ('humaneval-x', 'valid_syntax', 'fine-tuning'),
  ('humaneval-x', 'valid_syntax', 'ia3'),
  ('humaneval-x', 'scores', 'lora'),
  ('humaneval-x', 'coverage_runnable', 'pre-trained'),
  ('humaneval-x', 'coverage_runnable', 'ia3'),
  ('humaneval-x', 'coverage_instruction', 'fine-tuning'),
  ('humaneval-x', 'coverage_branch', 'ia3')],
 'codegen2-1B_P': [('humaneval-x', 'valid_syntax', 'lora'),
  ('humaneval-x', 'scores', 'prompt-tuning'),
  ('humaneval-x', 'coverage_runnable', 'fine-tuning'),
  ('humaneval-x', 'coverage_runnable', 'pre-trained'),
  ('humaneval-x', 'coverage_runnable', 'ia3'),
  ('humaneval-x', 'coverage_runnable', 'prompt-tuning'),
  ('humaneval-x', 'coverage_runnable', 'lora'),
  ('humaneval-x', 'coverage_instruction', 'fine-tuning'),
  ('humaneval-x', 'coverage_instruction', 'pre-trained'),
  ('humaneval-

In [58]:
# For every (dataset, metric, model), record each tuning method whose value is
# strictly below that model's 'pre-trained' value.
# Result: model -> list of (dataset, metric, method).
decreased_performance_data = {}

for dataset in datasets:
    for column in columns:
        metric_frame = data[dataset, column]
        for model in models:
            model_data = metric_frame.loc[model]
            # The untuned model is the baseline every method is compared to.
            baseline_value = model_data.loc['pre-trained']
            below_baseline = model_data < baseline_value
            for method_index in model_data[below_baseline].index.tolist():
                decreased_performance_data.setdefault(model, []).append((dataset, column, method_index))

decreased_performance_data

{'starcoderbase': [('humaneval-x', 'valid_syntax', 'prompt-tuning'),
  ('humaneval-x', 'scores', 'prompt-tuning'),
  ('humaneval-x', 'scores', 'lora'),
  ('humaneval-x', 'coverage_runnable', 'prompt-tuning'),
  ('humaneval-x', 'coverage_instruction', 'prompt-tuning'),
  ('humaneval-x', 'coverage_branch', 'fine-tuning'),
  ('humaneval-x', 'coverage_branch', 'prompt-tuning'),
  ('humaneval-x', 'coverage_branch', 'lora')],
 'starcoder2-15b': [('humaneval-x', 'valid_syntax', 'fine-tuning'),
  ('humaneval-x', 'coverage_runnable', 'fine-tuning'),
  ('humaneval-x', 'coverage_runnable', 'lora'),
  ('humaneval-x', 'coverage_branch', 'fine-tuning'),
  ('humaneval-x', 'coverage_branch', 'prompt-tuning'),
  ('humaneval-x', 'coverage_branch', 'lora')],
 'codegen2-16B_P': [('humaneval-x', 'valid_syntax', 'fine-tuning'),
  ('humaneval-x', 'scores', 'fine-tuning'),
  ('humaneval-x', 'scores', 'prompt-tuning'),
  ('humaneval-x', 'scores', 'lora'),
  ('humaneval-x', 'coverage_runnable', 'fine-tuning'),


In [59]:
# Valid-syntax share below which a (model, method) pair is flagged.
MIN_VALID_SYNTAX = 0.5

# Result: model -> list of (dataset, metric, method) for every metric of each
# (model, method) pair whose valid-syntax share is below MIN_VALID_SYNTAX.
bad_syntactical_performance_data = {}

for dataset in datasets.keys():
    for method in methods.keys():
        for model in models.keys():
            # Deliberately not named `valid_syntax`: that name holds the
            # DataFrame assigned while the CSV files are loaded, and rebinding
            # it to a scalar here would leave stale state behind on re-runs.
            valid_syntax_rate = data[dataset, "valid_syntax", method].loc[model]
            if valid_syntax_rate < MIN_VALID_SYNTAX:
                # Every metric is flagged for this pair, 'valid_syntax' included.
                for column in columns:
                    bad_syntactical_performance_data.setdefault(model, []).append((dataset, column, method))


bad_syntactical_performance_data

{'codegen2-1B_P': [('humaneval-x', 'valid_syntax', 'pre-trained'),
  ('humaneval-x', 'scores', 'pre-trained'),
  ('humaneval-x', 'coverage_runnable', 'pre-trained'),
  ('humaneval-x', 'coverage_instruction', 'pre-trained'),
  ('humaneval-x', 'coverage_branch', 'pre-trained'),
  ('humaneval-x', 'valid_syntax', 'fine-tuning'),
  ('humaneval-x', 'scores', 'fine-tuning'),
  ('humaneval-x', 'coverage_runnable', 'fine-tuning'),
  ('humaneval-x', 'coverage_instruction', 'fine-tuning'),
  ('humaneval-x', 'coverage_branch', 'fine-tuning'),
  ('humaneval-x', 'valid_syntax', 'prompt-tuning'),
  ('humaneval-x', 'scores', 'prompt-tuning'),
  ('humaneval-x', 'coverage_runnable', 'prompt-tuning'),
  ('humaneval-x', 'coverage_instruction', 'prompt-tuning'),
  ('humaneval-x', 'coverage_branch', 'prompt-tuning'),
  ('humaneval-x', 'valid_syntax', 'lora'),
  ('humaneval-x', 'scores', 'lora'),
  ('humaneval-x', 'coverage_runnable', 'lora'),
  ('humaneval-x', 'coverage_instruction', 'lora'),
  ('humaneval-

In [60]:
def escape_latex(text):
    """Return `text` with '_', '%', '&' and '$' each prefixed by a backslash.

    Only these four characters are handled; every other character, including
    backslashes and braces, is passed through unchanged.
    """
    # A single translate pass is equivalent to chained replaces here, because
    # the replacements only introduce backslashes, which are never replaced.
    escapes = str.maketrans({
        '_': '\\_',
        '%': '\\%',
        '&': '\\&',
        '$': '\\$',
    })
    return text.translate(escapes)

In [63]:
# Build the LaTeX source of the evaluation-summary table; each list entry is
# one output line. Every LaTeX backslash is written as "\\" because these are
# normal (non-raw) string literals.
table = []

table.append("\\begin{table*}[htbp]")
table.append("\\begin{threeparttable}")
table.append("    \\newcolumntype{Y}{>{\\centering\\arraybackslash}X}")
table.append("    \\newcolumntype{R}{>{\\raggedright\\arraybackslash}X}")
table.append("    \\newcolumntype{L}{>{\\raggedleft\\arraybackslash}X}")
table.append("    \\centering")
table.append("    \\footnotesize")
# Note the doubled backslash before the second "textsc": a single "\t" would be
# interpreted by Python as a TAB character and corrupt the caption.
table.append("    \\caption{Comparison of syntactical validity and CodeBLEU scores from experiments using different tuning methods across various models on testing split of \\textsc{Methods2Test\\textsubscript{runnable}} and \\textsc{HumanEval-X\\textsubscript{java}} datasets. }\\label{tab:eval-summary}")

# Column specification: method label, parameter count, then five metric
# columns per dataset, with a spacer between consecutive datasets.
# With two datasets this yields: {lrLLLLL!{\color{white}\ }LLLLL}
row = "    \\begin{tabularx}{\\textwidth}{lr"
for i, dataset in enumerate(datasets, start=1):
    row += "L" * 5  # 5 columns for each dataset
    if i < len(datasets):
        row += "!{\\color{white}\\ }"  # Add a space for the vertical line
row += "}"
table.append(row)


table.append("        \\toprule")

# First header row: one multi-column group title per dataset.
row = "        \\multirow{2}{*}{\\textbf{Method}} & \\multirow{3}{*}{\\parbox[t]{1cm}{\\centering \\textbf{Trainable\\\\params}}}"
for dataset in datasets:
    row += " & \\multicolumn{5}{c}{\\textbf{" + datasets[dataset] + "}}"
row += "\\\\"
table.append(row)

# One partial rule under each dataset group, e.g. \cmidrule(lr){3-7}\cmidrule(lr){8-12}
# for two datasets (metric columns start at column 3).
row = "        "
for i, dataset in enumerate(datasets):
    index = 3 + (i * 5)
    row += "\\cmidrule(lr){" + f"{index}-{index + 4}" + "}"
table.append(row)

# Second header row: rotated metric names, repeated for every dataset, listed
# in the same order as `columns`.
row  = "        &"
for dataset in datasets:
    row += " & \\rotatebox[origin=l]{90}{Valid syntax} & \\rotatebox[origin=l]{90}{CodeBLEU} & \\rotatebox[origin=l]{90}{pass@1} & \\rotatebox[origin=l]{90}{Instr Cov} & \\rotatebox[origin=l]{90}{Branch Cov}"
row += "\\\\"
table.append(row)

table.append("        \\hline")


# Body: one shaded title row per model, followed by one row per tuning method.
for model in models.keys():
    table.append("        \\multicolumn{" + str(2+5*len(datasets)) + "}{l}{\\cellcolor{gray!10}{\\textbf{" + models[model] + "}}} \\bigstrut \\\\*")
    for method in methods:
        col = []
        for dataset in datasets:
            for column in columns:
                try:
                    value = data[dataset, column, method].loc[model]
                except KeyError:
                    # No measurement for this (dataset, metric, method, model).
                    value = "N/A"

                # Decorations nest from the inside out:
                # parentheses -> bold -> red cell background.
                if (dataset, column, method) in decreased_performance_data.get(model, []):
                    value = f"({value})"

                if (dataset, column, method) in best_method_data.get(model, []):
                    value = f"\\textbf{{{value}}}"

                if (dataset, column, method) in bad_syntactical_performance_data.get(model, []):
                    value = f"\\cellcolor{{red!10}}{{{value}}}"

                col.append(f"{value}")
        row = " & ".join(col)
        params = model_trainable_params[method].loc[model]
        # `params` is a numeric scalar when the CSV column is numeric, and
        # str + number raises TypeError, so convert it explicitly.
        row = "        " + methods[method] + " & " + str(params) + " & " + row + " \\\\"
        table.append(row)
    table.append("")

table.append("       \\bottomrule")
table.append("    \\end{tabularx}")
table.append("    \\begin{tablenotes}[flushleft]\\small")
table.append("      \\item \\textbf{Bold}: best-performing training method per model. (Parentheses): decreased performance compared to baseline. \\colorbox{red!10}{Red}: $<$ 50\\% syntactical valid samples. \\underline{Underline}: Other notable results (see in \\Cref{sec:syntax}).")
table.append("    \\end{tablenotes}")
table.append("\\end{threeparttable}")
table.append("\\end{table*}")

# Uncomment to preview the generated LaTeX in the notebook:
#print("\n".join(table))

In [62]:
# Serialize the table lines and write them to ../tables/eval_summary.tex
# (relative to the current working directory), creating the folder if needed.
result = "\n".join(table)

tables_dir = Path.cwd().parent / 'tables'
tables_dir.mkdir(parents=True, exist_ok=True)

table_path = tables_dir / 'eval_summary.tex'
table_path.write_text(result)