In [13]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory holding the input CSV files, given relative to the working directory.
DATA_DIR = Path('../data/')

In [14]:
# Model key (as it appears in the CSV index after the '<prefix>/' part is
# stripped) -> display name printed in the table. The key order is also the
# row order used when selecting models from the loaded frames.
models = {
    'codegen-350M-multi': 'CodeGen-350M-multi',
    'codegen2-1B_P': 'CodeGen2-1B',
    'starcoder2-3b': 'StarCoder2-3B',
    'codegen2-3_7B_P': 'CodeGen2-3.7B',
    'CodeLlama-7b-hf': 'CodeLlama-7B',
    'codegen2-7B_P': 'CodeGen2-7B',
    'starcoder2-7b': 'StarCoder2-7B',
    'starcoderbase': 'StarCoderBase',
    'starcoder2-15b': 'StarCoder2-15B',
    'codegen2-16B_P': 'CodeGen2-16B',
}

# Dataset key (sub-directory name under DATA_DIR) -> LaTeX label used in the
# table header.
datasets = {
    'methods2test_runnable': '\\textsc{Methods2Test\\textsubscript{runnable}}',
    'humaneval-x': '\\textsc{HumanEval-X\\textsubscript{java}}',
}

# Metric names. Each one is the stem of a CSV file in every dataset directory
# and the key of the matching column group in the combined frame; the order
# here is the column order within each dataset group of the table.
columns = ['valid_syntax', 'scores', 'coverage_runnable', 'coverage_instruction', 'coverage_branch']

# Tuning-method key (column name in the CSV files) -> label printed in the
# table. 'pre-trained' is the baseline the other methods are compared against.
methods = {
    'pre-trained': 'None',
    'fine-tuning': 'Fine-tuning',
    'lora': 'LoRA',
    'ia3': '(IA)\\textsuperscript{3}',
    'prompt-tuning': 'Prompt tuning',
}

In [15]:
# Notable results to call out (both on the valid-syntax metric):
#   - CodeGen2-3.7B with LoRA
#   - CodeGen2-1B with prompt tuning

In [16]:
# Results singled out as notable. Each entry is
# (dataset key, model key, method key, metric name); the components must match
# the keys of `datasets`, `models` and `methods` and an entry of `columns`.
# Method keys are lower-case ('lora', not 'LoRA'), otherwise the entry can
# never match a cell of the results frame.
notable_results = [
    ('methods2test_runnable', 'codegen2-1B_P', 'prompt-tuning', 'valid_syntax'),
    ('methods2test_runnable', 'codegen2-3_7B_P', 'lora', 'valid_syntax'),
]

In [17]:
# Trainable-parameter counts: one row per model, one column per tuning method.
params_raw = pd.read_csv(DATA_DIR / 'params_data.csv', index_col=0)
# Keep only the second '/'-separated component of each index label so that the
# labels line up with the keys of `models`.
params_raw.index = params_raw.index.str.split('/').str[1]

model_trainable_params = (
    params_raw
    .loc[list(models)]                 # select and order rows like `models`
    .assign(**{'pre-trained': '0'})    # baseline column is set to the string '0'
)
model_trainable_params

Unnamed: 0_level_0,pre-trained,fine-tuning,prompt-tuning,lora,ia3
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
codegen-350M-multi,0,356712448,20480,1310720,143360
codegen2-1B_P,0,1015306240,40960,2097152,229376
starcoder2-3b,0,3030371328,61440,4546560,468480
codegen2-3_7B_P,0,3641174016,81920,4194304,458752
CodeLlama-7b-hf,0,6738546688,81920,8388608,614400
codegen2-7B_P,0,6862858240,81920,8388608,917504
starcoder2-7b,0,7173923840,92160,7340032,753664
starcoderbase,0,15517456384,122880,8028160,1239040
starcoder2-15b,0,15655899136,122880,12124160,1249280
codegen2-16B_P,0,16032155648,122880,13369344,1462272


In [18]:

def load_metric(dataset_dir, metric):
    """Load ``<dataset_dir>/<metric>.csv`` with one row per entry of `models`.

    The first CSV column becomes the index. Only the second '/'-separated
    component of each index label is kept (presumably the model name of an
    ``<org>/<model>`` identifier -- confirm against the CSV files), and the
    rows are then selected and ordered by the keys of `models`.

    Raises:
        KeyError: if a key of `models` is missing from the file's index.
    """
    frame = pd.read_csv(dataset_dir / f'{metric}.csv', index_col=0)
    frame.index = frame.index.str.split('/').str[1]
    return frame.loc[list(models)]


# One frame per dataset; its columns are grouped by metric, in `columns` order.
per_dataset = {
    dataset: pd.concat(
        [load_metric(DATA_DIR / dataset, metric) for metric in columns],
        axis=1,
        keys=columns,
    )
    for dataset in datasets
}

# Combine all datasets: columns are indexed by (dataset, metric, method).
data = pd.concat(per_dataset, axis=1).astype(float).round(2)
data


Unnamed: 0_level_0,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,methods2test_runnable,...,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x,humaneval-x
Unnamed: 0_level_1,valid_syntax,valid_syntax,valid_syntax,valid_syntax,valid_syntax,scores,scores,scores,scores,scores,...,coverage_instruction,coverage_instruction,coverage_instruction,coverage_instruction,coverage_instruction,coverage_branch,coverage_branch,coverage_branch,coverage_branch,coverage_branch
Unnamed: 0_level_2,lora,ia3,prompt-tuning,pre-trained,fine-tuning,pre-trained,fine-tuning,ia3,prompt-tuning,lora,...,pre-trained,fine-tuning,ia3,prompt-tuning,lora,pre-trained,fine-tuning,ia3,prompt-tuning,lora
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
codegen-350M-multi,0.96,0.95,0.96,0.96,0.98,0.24,0.3,0.24,0.23,0.26,...,0.97,1.0,0.99,0.98,0.97,0.9,0.83,0.94,0.92,0.89
codegen2-1B_P,0.38,0.02,0.7,0.0,0.75,0.0,0.15,0.26,0.26,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
starcoder2-3b,0.98,0.94,0.93,0.93,0.96,0.17,0.3,0.29,0.17,0.31,...,0.99,0.99,0.99,1.0,1.0,0.85,0.85,0.83,0.76,0.57
codegen2-3_7B_P,0.42,0.0,0.22,0.0,0.41,0.0,0.11,0.0,0.26,0.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CodeLlama-7b-hf,0.98,0.98,0.97,0.98,0.98,0.31,0.32,0.31,0.3,0.34,...,0.98,0.99,0.99,0.98,1.0,0.84,0.86,0.85,0.82,0.72
codegen2-7B_P,0.99,0.98,0.98,0.99,0.98,0.28,0.31,0.29,0.28,0.31,...,0.98,1.0,0.98,0.99,0.99,0.81,0.7,0.8,0.69,0.75
starcoder2-7b,0.97,0.95,0.93,0.92,0.97,0.17,0.31,0.32,0.33,0.31,...,0.99,0.99,0.99,0.99,0.99,0.83,0.9,0.88,0.86,0.87
starcoderbase,0.92,0.92,0.91,0.92,0.97,0.17,0.34,0.17,0.19,0.17,...,0.99,0.99,0.99,0.99,1.0,0.81,0.75,0.81,0.68,0.74
starcoder2-15b,0.98,0.93,0.98,0.93,0.98,0.2,0.34,0.2,0.33,0.34,...,0.99,1.0,0.99,1.0,0.99,0.88,0.81,0.88,0.83,0.83
codegen2-16B_P,0.99,0.98,0.98,0.98,0.99,0.3,0.33,0.3,0.3,0.33,...,0.99,1.0,0.99,0.97,0.99,0.81,0.78,0.81,0.71,0.82


In [19]:
# model -> list of (dataset, column, method) cells holding the row maximum.
# Ties are all recorded, so when every method has the same value for a
# (dataset, column, model) combination, every method is listed.
best_method_data = {}

for dataset in datasets:
    for column in columns:
        metric_frame = data[dataset, column]
        for model in models:
            values_by_method = metric_frame.loc[model]
            top_value = values_by_method.max()
            for method_index, value in values_by_method.items():
                if value == top_value:
                    best_method_data.setdefault(model, []).append((dataset, column, method_index))

best_method_data

{'codegen-350M-multi': [('methods2test_runnable',
   'valid_syntax',
   'fine-tuning'),
  ('methods2test_runnable', 'scores', 'fine-tuning'),
  ('methods2test_runnable', 'coverage_runnable', 'lora'),
  ('methods2test_runnable', 'coverage_instruction', 'pre-trained'),
  ('methods2test_runnable', 'coverage_instruction', 'ia3'),
  ('methods2test_runnable', 'coverage_branch', 'fine-tuning'),
  ('humaneval-x', 'valid_syntax', 'prompt-tuning'),
  ('humaneval-x', 'valid_syntax', 'fine-tuning'),
  ('humaneval-x', 'valid_syntax', 'lora'),
  ('humaneval-x', 'valid_syntax', 'pre-trained'),
  ('humaneval-x', 'valid_syntax', 'ia3'),
  ('humaneval-x', 'scores', 'lora'),
  ('humaneval-x', 'coverage_runnable', 'pre-trained'),
  ('humaneval-x', 'coverage_runnable', 'ia3'),
  ('humaneval-x', 'coverage_instruction', 'fine-tuning'),
  ('humaneval-x', 'coverage_branch', 'ia3')],
 'codegen2-1B_P': [('methods2test_runnable', 'valid_syntax', 'fine-tuning'),
  ('methods2test_runnable', 'scores', 'ia3'),
  ('me

In [20]:
# model -> list of (dataset, column, method) cells whose value is strictly
# below the value of the 'pre-trained' method for the same dataset and column.
decreased_performance_data = {}

for dataset in datasets:
    for column in columns:
        metric_frame = data[dataset, column]
        for model in models:
            values_by_method = metric_frame.loc[model]
            baseline_value = values_by_method.loc['pre-trained']
            for method_index, value in values_by_method.items():
                if value < baseline_value:
                    decreased_performance_data.setdefault(model, []).append((dataset, column, method_index))

decreased_performance_data

{'codegen-350M-multi': [('methods2test_runnable', 'valid_syntax', 'ia3'),
  ('methods2test_runnable', 'scores', 'prompt-tuning'),
  ('methods2test_runnable', 'coverage_instruction', 'fine-tuning'),
  ('methods2test_runnable', 'coverage_instruction', 'prompt-tuning'),
  ('methods2test_runnable', 'coverage_instruction', 'lora'),
  ('humaneval-x', 'scores', 'fine-tuning'),
  ('humaneval-x', 'scores', 'prompt-tuning'),
  ('humaneval-x', 'coverage_runnable', 'fine-tuning'),
  ('humaneval-x', 'coverage_runnable', 'prompt-tuning'),
  ('humaneval-x', 'coverage_runnable', 'lora'),
  ('humaneval-x', 'coverage_branch', 'fine-tuning'),
  ('humaneval-x', 'coverage_branch', 'lora')],
 'CodeLlama-7b-hf': [('methods2test_runnable',
   'valid_syntax',
   'prompt-tuning'),
  ('methods2test_runnable', 'scores', 'prompt-tuning'),
  ('methods2test_runnable', 'coverage_runnable', 'prompt-tuning'),
  ('methods2test_runnable', 'coverage_branch', 'fine-tuning'),
  ('methods2test_runnable', 'coverage_branch', '

In [21]:
# model -> list of (dataset, column, method) cells belonging to a
# (dataset, method) run whose 'valid_syntax' value is below 0.5.
# Every metric of such a run is recorded, 'valid_syntax' itself included.
bad_syntactical_performance_data = {}

for dataset in datasets:
    for method in methods:
        syntax_by_model = data[dataset, "valid_syntax", method]
        for model in models:
            if syntax_by_model.loc[model] < 0.5:
                for column in columns:
                    bad_syntactical_performance_data.setdefault(model, []).append((dataset, column, method))

bad_syntactical_performance_data

{'codegen2-1B_P': [('methods2test_runnable', 'valid_syntax', 'pre-trained'),
  ('methods2test_runnable', 'scores', 'pre-trained'),
  ('methods2test_runnable', 'coverage_runnable', 'pre-trained'),
  ('methods2test_runnable', 'coverage_instruction', 'pre-trained'),
  ('methods2test_runnable', 'coverage_branch', 'pre-trained'),
  ('methods2test_runnable', 'valid_syntax', 'lora'),
  ('methods2test_runnable', 'scores', 'lora'),
  ('methods2test_runnable', 'coverage_runnable', 'lora'),
  ('methods2test_runnable', 'coverage_instruction', 'lora'),
  ('methods2test_runnable', 'coverage_branch', 'lora'),
  ('methods2test_runnable', 'valid_syntax', 'ia3'),
  ('methods2test_runnable', 'scores', 'ia3'),
  ('methods2test_runnable', 'coverage_runnable', 'ia3'),
  ('methods2test_runnable', 'coverage_instruction', 'ia3'),
  ('methods2test_runnable', 'coverage_branch', 'ia3'),
  ('humaneval-x', 'valid_syntax', 'pre-trained'),
  ('humaneval-x', 'scores', 'pre-trained'),
  ('humaneval-x', 'coverage_runnab

In [22]:
# Replacement for each of LaTeX's ten special characters.
_LATEX_ESCAPES = str.maketrans({
    '\\': '\\textbackslash{}',
    '&': '\\&',
    '%': '\\%',
    '$': '\\$',
    '#': '\\#',
    '_': '\\_',
    '{': '\\{',
    '}': '\\}',
    '~': '\\textasciitilde{}',
    '^': '\\textasciicircum{}',
})


def escape_latex(text):
    """Escape LaTeX special characters in plain text.

    Handles ``\\ & % $ # _ { } ~ ^``. The replacement is done in a single pass
    over the input, so the backslashes and braces introduced by one
    replacement are never escaped again by another.

    Args:
        text: Plain (not yet escaped) string.

    Returns:
        The string with every special character replaced by its LaTeX escape.
    """
    return text.translate(_LATEX_ESCAPES)

In [23]:
# Assemble the LaTeX source of the evaluation summary table as a list of lines.
# Layout: a "Method" column, a "Trainable params" column, then one group of
# metric columns per dataset. Each model gets a shaded heading row followed by
# one row per tuning method.

# Header label of each metric, keyed by its entry in `columns`.
metric_labels = {
    'valid_syntax': 'Valid syntax',
    'scores': 'CodeBLEU',
    'coverage_runnable': 'pass@1',
    'coverage_instruction': 'Instr Cov',
    'coverage_branch': 'Branch Cov',
}
n_metrics = len(columns)

# Cells to underline, as (dataset, model, method, column). Method names are
# lower-cased so an entry spelled e.g. 'LoRA' still matches the 'lora' key.
notable_cells = {
    (nr_dataset, nr_model, nr_method.lower(), nr_column)
    for nr_dataset, nr_model, nr_method, nr_column in notable_results
}

table = [
    "\\begin{table*}[htbp]",
    "\\begin{threeparttable}",
    "    \\newcolumntype{Y}{>{\\centering\\arraybackslash}X}",
    "    \\newcolumntype{R}{>{\\raggedright\\arraybackslash}X}",
    "    \\newcolumntype{L}{>{\\raggedleft\\arraybackslash}X}",
    "    \\centering",
    "    \\footnotesize",
    # Every backslash is doubled: a single one before "textsc" would be read
    # by Python as the TAB escape and corrupt the LaTeX command.
    "    \\caption{Comparison of syntactical validity and CodeBLEU scores from experiments using different tuning methods across various models on testing split of \\textsc{Methods2Test\\textsubscript{runnable}} and \\textsc{HumanEval-X\\textsubscript{java}} datasets. }\\label{tab:eval-summary}",
]

# Column specification: "lr", then one run of L columns per dataset, with a
# thin white spacer between neighbouring dataset groups.
group_spec = "!{\\color{white}\\ }".join(["L" * n_metrics] * len(datasets))
table.append("    \\begin{tabularx}{\\textwidth}{lr" + group_spec + "}")

table.append("        \\toprule")

# First header row: one multi-column heading per dataset.
row = "        \\multirow{2}{*}{\\textbf{Method}} & \\multirow{3}{*}{\\parbox[t]{1cm}{\\centering \\textbf{Trainable\\\\params}}}"
for dataset in datasets:
    row += " & \\multicolumn{" + str(n_metrics) + "}{c}{\\textbf{" + datasets[dataset] + "}}"
row += "\\\\"
table.append(row)

# Partial rules under the dataset headings; metric columns start at column 3.
row = "        "
for group_index in range(len(datasets)):
    first_col = 3 + group_index * n_metrics
    row += "\\cmidrule(lr){" + f"{first_col}-{first_col + n_metrics - 1}" + "}"
table.append(row)

# Second header row: the rotated metric names, repeated for every dataset.
metric_header = "".join(
    " & \\rotatebox[origin=l]{90}{" + metric_labels.get(column, escape_latex(column)) + "}"
    for column in columns
)
table.append("        &" + metric_header * len(datasets) + "\\\\")

table.append("        \\hline")

for model in models:
    table.append("        \\multicolumn{" + str(2 + n_metrics * len(datasets)) + "}{l}{\\cellcolor{gray!10}{\\textbf{" + models[model] + "}}} \\bigstrut \\\\*")
    decreased_cells = decreased_performance_data.get(model, [])
    best_cells = best_method_data.get(model, [])
    bad_syntax_cells = bad_syntactical_performance_data.get(model, [])

    for method in methods:
        cells = []
        for dataset in datasets:
            for column in columns:
                try:
                    value = data[dataset, column, method].loc[model]
                except KeyError:
                    value = "N/A"

                cell_key = (dataset, column, method)

                # Decorations nest from the inside out; \cellcolor is applied
                # last so that it is the first token of the cell.
                if cell_key in decreased_cells:
                    value = f"({value})"

                if cell_key in best_cells:
                    value = f"\\textbf{{{value}}}"

                if (dataset, model, method, column) in notable_cells:
                    value = f"\\underline{{{value}}}"

                if cell_key in bad_syntax_cells:
                    value = f"\\cellcolor{{red!10}}{{{value}}}"

                cells.append(f"{value}")

        # The parameter count may be a numpy integer or a string ('0' for the
        # baseline); convert explicitly so the concatenation cannot fail.
        params = str(model_trainable_params[method].loc[model])
        table.append("        " + methods[method] + " & " + params + " & " + " & ".join(cells) + " \\\\")
    table.append("")

table.append("       \\bottomrule")
table.append("    \\end{tabularx}")
table.append("    \\begin{tablenotes}[flushleft]\\small")
table.append("      \\item \\textbf{Bold}: best-performing training method per model. (Parentheses): decreased performance compared to baseline. \\colorbox{red!10}{Red}: $<$ 50\\% syntactical valid samples. \\underline{Underline}: Other notable results (see in \\Cref{sec:syntax}).")
table.append("    \\end{tablenotes}")
table.append("\\end{threeparttable}")
table.append("\\end{table*}")

In [24]:
# Write the assembled table to <parent of cwd>/tables/eval_summary.tex,
# creating the target directory first if it does not exist yet.
tables_dir = Path.cwd().parent / 'tables'
tables_dir.mkdir(parents=True, exist_ok=True)

table_path = tables_dir / 'eval_summary.tex'
result = "\n".join(table)
with table_path.open('w') as f:
    f.write(result)