This script analyzes the results of the evaluated responses. It computes the following metrics:<br>
successful build rate: percentage of additional error count == 0<br>
pass@1 rate: percentage of additional_failure_count == 0<br>
CodeBLEU: average CodeBLEU score<br>

In [2]:
import os
import numpy as np
import pandas as pd
import re
from scipy import stats

import javalang

from config import config_dict, logger




In [3]:
def margin_of_error(p_hat, n, confidence_level=0.95):
  """Calculates the margin of error for a proportion.

  Args:
    p_hat: The sample proportion.
    n: The sample size.
    confidence_level: The confidence level.

  Returns:
    The margin of error.
  """

  z = stats.norm.ppf(1 - (confidence_level / 2))
  se = np.sqrt(p_hat * (1 - p_hat) / n)
  return z * se

In [4]:
def count_tokens_in_label(label):
    label = str(label)
    label = label.replace('<EOT>', '')
    try:
        return len(list(javalang.tokenizer.tokenize(label)))
    except:
        return len(re.split(',| |_|-|=|\.|\+|\*', label))

def analyze_results_file(results_df, base_modules=False):
    logger.info("Analyzing results")

    successful_build_rate = (results_df['pass'] == 0).sum() / len(results_df)
    successful_build_rate_moe = margin_of_error(successful_build_rate, len(results_df))

    no_error_rate = ((results_df['pass'] == 0) & (results_df['error_count'] == 0)).sum() / len(results_df)
    no_error_rate_moe = margin_of_error(no_error_rate, len(results_df))

    pass_at_1_rate = ((results_df['pass'] == 0) & (results_df['failure_count'] == 0)).sum() / len(results_df)
    pass_at_1_rate_moe = margin_of_error(pass_at_1_rate, len(results_df))


    results_dict = {}
    columns_to_average = ['ngram_match_score', 'weighted_ngram_match_score', 'syntax_match_score', 'dataflow_match_score', 'label_token_count', 'completion_token_count']

    if base_modules:
        average_code_bleu = 1
        average_code_bleu_moe = 0
        
        for column in columns_to_average:
            results_dict[f'average_{column}'] = np.nan

    else:
        average_code_bleu = results_df['codebleu'].mean()
        average_code_bleu_moe = results_df['codebleu'].std() #  is this how you calculate the standard error for a bounded continuous variable?
        results_df['label_token_count'] = results_df['label'].apply(count_tokens_in_label)
        results_df['completion_token_count'] = results_df['completion'].apply(count_tokens_in_label)
        
        for column in columns_to_average:
            results_dict[f'average_{column}'] = results_df[column].mean()

    results_dict['successful_build_rate'] = successful_build_rate
    results_dict['successful_build_rate_moe'] = successful_build_rate_moe
    results_dict['no_error_rate'] = no_error_rate
    results_dict['no_error_rate_moe'] = no_error_rate_moe
    results_dict['pass_at_1_rate'] = pass_at_1_rate
    results_dict['pass_at_1_rate_moe'] = pass_at_1_rate_moe
    results_dict['average_code_bleu'] = average_code_bleu
    results_dict['average_code_bleu_moe'] = average_code_bleu_moe

    return results_dict

In [6]:
results_dict = {}
evaluated_responses_directory ='../data/run_20231127_170530/evaluated_responses/'
base_module_results_path = os.path.join(evaluated_responses_directory, 'base_module_build_results_237c7f40.json')
logger.info(base_module_results_path)
modules_results_df = pd.read_json(base_module_results_path).transpose()
modules_results_df['pass'] = modules_results_df['pass'].apply(lambda x: 0 if x else 1)
results_dict['base_modules'] = analyze_results_file(modules_results_df, True)

output_files = [
    'codellama-7b_tokens_output.csv', 
    'codellama-7b_lines_output.csv',  
    'codellama-7b_methods_output.csv', 
    '20231202_lines_codellama-7b-lora-20231107-201630_output.csv', 
    '20231202_methods_codellama-7b-lora-20231107-201630_output.csv', 
    '20231202_tokens_codellama-7b-lora-20231107-201630_output.csv'
    ]

for file in output_files:
    logger.info(file)
    results_file = os.path.join(evaluated_responses_directory, file)
    results_df = pd.read_csv(results_file)
    model = file.replace('.csv', '')
    results_dict[model] = analyze_results_file(results_df)

    if 'lines' in model:
        results_dict[model]['prompt_type'] = 'lines' 
    elif 'tokens' in model:
        results_dict[model]['prompt_type'] = 'tokens' 
    elif 'methods' in model:
        results_dict[model]['prompt_type'] = 'methods' 

results_df = pd.DataFrame(results_dict).transpose().sort_index()
results_summary_path = os.path.join(evaluated_responses_directory, 'results_summary.csv')
# results_df.to_csv(results_summary_path)
results_df

2023-12-09 17:00:24,447 - INFO - ../data/run_20231127_170530/evaluated_responses/base_module_build_results_237c7f40.json
2023-12-09 17:00:24,468 - INFO - Analyzing results
2023-12-09 17:00:24,471 - INFO - codellama-7b_tokens_output.csv
2023-12-09 17:00:24,530 - INFO - Analyzing results
2023-12-09 17:00:24,703 - INFO - codellama-7b_lines_output.csv
2023-12-09 17:00:24,753 - INFO - Analyzing results
2023-12-09 17:00:24,915 - INFO - codellama-7b_methods_output.csv
2023-12-09 17:00:24,959 - INFO - Analyzing results
2023-12-09 17:00:25,374 - INFO - 20231202_lines_codellama-7b-lora-20231107-201630_output.csv
2023-12-09 17:00:25,424 - INFO - Analyzing results
2023-12-09 17:00:25,637 - INFO - 20231202_methods_codellama-7b-lora-20231107-201630_output.csv
2023-12-09 17:00:25,687 - INFO - Analyzing results
2023-12-09 17:00:26,106 - INFO - 20231202_tokens_codellama-7b-lora-20231107-201630_output.csv
2023-12-09 17:00:26,152 - INFO - Analyzing results


Unnamed: 0,average_ngram_match_score,average_weighted_ngram_match_score,average_syntax_match_score,average_dataflow_match_score,average_label_token_count,average_completion_token_count,successful_build_rate,successful_build_rate_moe,no_error_rate,no_error_rate_moe,pass_at_1_rate,pass_at_1_rate_moe,average_code_bleu,average_code_bleu_moe,prompt_type
20231202_lines_codellama-7b-lora-20231107-201630_output,0.338706,0.607381,0.659051,0.316521,12.138,72.379,0.34,0.000939,0.335,0.000936,0.33,0.000932,0.638087,0.238234,lines
20231202_methods_codellama-7b-lora-20231107-201630_output,0.226282,0.490709,0.524821,0.315732,68.443,103.227,0.341,0.00094,0.338,0.000938,0.331,0.000933,0.505914,0.245222,methods
20231202_tokens_codellama-7b-lora-20231107-201630_output,0.041936,0.197008,0.125959,0.0,2.001,79.894,0.494,0.000991,0.494,0.000991,0.494,0.000991,0.341226,0.069991,tokens
base_modules,,,,,,,0.970588,0.001285,0.970588,0.001285,0.970588,0.001285,1.0,0.0,
codellama-7b_lines_output,0.398411,0.578653,0.666216,0.298311,12.138,48.021,0.308,0.000915,0.304,0.000912,0.295,0.000904,0.648531,0.249855,lines
codellama-7b_methods_output,0.285209,0.45923,0.528883,0.272435,68.443,75.358,0.309,0.000916,0.305,0.000913,0.299,0.000908,0.51015,0.272875,methods
codellama-7b_tokens_output,0.054881,0.20547,0.119731,0.0,2.001,69.27,0.493,0.000991,0.493,0.000991,0.493,0.000991,0.345021,0.067985,tokens


In [69]:
results_df.iloc[[3,6,2,4,0,5,1],:]

Unnamed: 0,average_ngram_match_score,average_weighted_ngram_match_score,average_syntax_match_score,average_dataflow_match_score,average_label_token_count,average_completion_token_count,successful_build_rate,successful_build_rate_moe,no_error_rate,no_error_rate_moe,pass_at_1_rate,pass_at_1_rate_moe,average_code_bleu,average_code_bleu_moe,prompt_type
base_modules,,,,,,,0.970588,0.001285,0.970588,0.001285,0.970588,0.001285,1.0,0.0,
codellama-7b_output_tokens,0.054881,0.20547,0.119731,0.0,2.001,69.27,0.493,0.000991,0.493,0.000991,0.493,0.000991,0.345021,0.067985,tokens
20231202_tokens_codellama-7b-lora-20231107-201630_output,0.041936,0.197008,0.125959,0.0,2.001,79.894,0.494,0.000991,0.494,0.000991,0.494,0.000991,0.341226,0.069991,tokens
codellama-7b_output_lines,0.398411,0.578653,0.666216,0.298311,12.138,48.021,0.308,0.000915,0.304,0.000912,0.295,0.000904,0.648531,0.249855,lines
20231202_lines_codellama-7b-lora-20231107-201630_output,0.338706,0.607381,0.659051,0.316521,12.138,72.379,0.34,0.000939,0.335,0.000936,0.33,0.000932,0.638087,0.238234,lines
codellama-7b_output_methods,0.285209,0.45923,0.528883,0.272435,68.443,75.358,0.309,0.000916,0.305,0.000913,0.299,0.000908,0.51015,0.272875,methods
20231202_methods_codellama-7b-lora-20231107-201630_output,0.226282,0.490709,0.524821,0.315732,68.443,103.227,0.341,0.00094,0.338,0.000938,0.331,0.000933,0.505914,0.245222,methods
