This notebook computes the pass@k score across k result dataframes provided as input.

In [1]:
import pandas as pd

In [2]:
def calculate_pass_rate(df_list, rows):
    """
    Find, for every row position, the best 'Total Passed' value across the dataframes.

    The "pass rate" here is the raw count stored in the 'Total Passed' column.

    Args:
      df_list (list): Dataframes to compare; each must expose a 'Total Passed' column.
      rows (int): Number of leading row positions to examine in every dataframe.
    Returns:
      max_pass_rate_list (list): Highest 'Total Passed' value found at each row position
        (the running best starts at 0, so it is never lower than that).
      max_index_list (list): Position in df_list of the dataframe that supplied the best
        value. On ties the later dataframe wins; 0 is kept when nothing matches or
        beats the running best.
    """
    max_pass_rate_list, max_index_list = [], []
    for row_pos in range(rows):
        best_score, best_source = 0, 0
        for source_pos, frame in enumerate(df_list):
            score = frame['Total Passed'].iloc[row_pos]
            if score > best_score:
                best_score = score
                best_source = source_pos
            elif score == best_score:
                # A tie hands the row to the later dataframe.
                best_source = source_pos
        max_pass_rate_list.append(best_score)
        max_index_list.append(best_source)
    return max_pass_rate_list, max_index_list

In [3]:
def calculate_distribution(df):
    """
    Calculate per-test accuracy, recall and precision across the dataframe.

    Every column that is not one of the known bookkeeping columns is treated as a
    test-result column whose cells are strings of the form "[a, b]". A row counts as
    passed when a == 1, and the pairs are tallied as:
      [1, 1] -> true positive
      [0, 0] -> false positive
      [0, 1] -> false negative
    ([1, 0] only contributes to the passed count.)

    Args:
      df (pd.DataFrame): Results dataframe holding the per-test result columns.
    Returns:
      dist (dict): Maps each test column name to a dict with float values for
        'Accuracy' (passed rows / total rows), 'Recall' (tp / (tp + fn)) and
        'Precision' (tp / (tp + fp)). A ratio with a zero denominator, including
        accuracy on an empty dataframe, is reported as 0.0.
    """
    # Bookkeeping columns that do not hold per-test results.
    # 'Mismacthes' is spelled as it is matched against the input column names.
    skipped_columns = {
        'text', 'GT Code', 'Generated Code', 'Total Passed', 'True Positives',
        'False Positives', 'False Negatives', 'Accuracy', 'Recall', 'Precision',
        'Mismacthes',
    }
    total_rows = len(df)
    dist = {}
    for col in df.columns:
        if col in skipped_columns:
            continue
        # value_counts() leaves missing cells out, so they count as not passed.
        counts = df[col].value_counts().to_dict()
        tp = 0
        fp = 0
        fn = 0
        passed = 0
        for key, value in counts.items():
            # int() ignores surrounding whitespace, so "[1,1]" and "[1, 1]" both parse.
            flags = [int(part) for part in key.strip()[1:-1].split(',')]
            if flags[0] == 1:
                passed += value
            if flags[0] == 1 and flags[1] == 1:
                tp += value
            elif flags[0] == 0 and flags[1] == 0:
                fp += value
            elif flags[0] == 0 and flags[1] == 1:
                fn += value
        # Guard every ratio so an empty dataframe or an empty class yields 0.0
        # instead of raising ZeroDivisionError.
        acc = passed / total_rows if total_rows else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        dist[col] = { 'Accuracy': acc, 'Recall': recall, 'Precision': precision }
    return dist

In [4]:
def calculate_metrics_from_result_df(df_result):
    """
    Summarise a results dataframe as mean metrics plus a per-test distribution.

    Each mean is the column's sum divided by len(df_result), falling back to 0
    when the dataframe has no rows.

    Args:
      df_result (pd.DataFrame): Results with 'Accuracy', 'Recall' and 'Precision' columns.
    Returns:
      mean_accuracy (float): Average of the 'Accuracy' column over all rows.
      mean_recall (float): Average of the 'Recall' column over all rows.
      mean_precision (float): Average of the 'Precision' column over all rows.
      dist (dict): Result of calculate_distribution(df_result).
    """
    row_count = len(df_result)

    def _average(column):
        # The column is summed before the row count is checked, so a missing
        # column is reported even for an empty dataframe.
        column_total = df_result[column].sum()
        if row_count:
            return column_total / row_count
        return 0

    mean_accuracy = _average('Accuracy')
    mean_recall = _average('Recall')
    mean_precision = _average('Precision')

    return mean_accuracy, mean_recall, mean_precision, calculate_distribution(df_result)

In [5]:
# Paths of the result CSV files to compare, one per result dataframe.
# Add or remove entries to change k (sample paths for k=3 shown).
csv_paths = [
    'path/to/your/csvfile1.csv',
    'path/to/your/csvfile2.csv',
    'path/to/your/csvfile3.csv',
]

total_tests = 16 # Total number of semantic tests (fixed value)

# Read every CSV file into its own dataframe
df_list = [pd.read_csv(path) for path in csv_paths]
if not df_list:
    raise ValueError("csv_paths must contain at least one CSV file")

# k is derived from the inputs so the pass@k label below always matches the data
k = len(df_list)

# Number of rows in each dataframe; rows are compared by position, so every
# dataframe must have the same length
rows = len(df_list[0])
row_counts = [len(frame) for frame in df_list]
if any(count != rows for count in row_counts):
    raise ValueError(f"All result dataframes must have the same number of rows, got {row_counts}")

# Calculate the pass rates and indices
max_pass_rate_list, max_index_list = calculate_pass_rate(df_list, rows)

# Calculating number of rows where all tests passed
all_passed = sum(1 for best in max_pass_rate_list if best == total_tests)
# Avoid dividing by zero when the input files contain no rows
pass_at_k_pct = all_passed / rows * 100 if rows else 0.0
print(f"Total number of rows: {rows}")
print(f"Number of rows where all tests passed: {all_passed}")
print(f"Percentage of rows where all tests passed (pass@{k}): {pass_at_k_pct:.2f}%")

Total number of rows: 9
Number of rows where all tests passed: 3
Percentage of rows where all tests passed (pass@3): 33.33%


In [6]:
# Create a results dataframe consolidating the best results: for each row
# position, take that row from the dataframe with the highest pass rate
best_rows = [df_list[max_index_list[i]].iloc[i] for i in range(rows)]
df_consolidated = pd.DataFrame(best_rows)

df_consolidated.to_csv('path/to/your/consolidated_results.csv', index=False)

# The preview must be the cell's last expression for the notebook to render it
df_consolidated.head()

In [7]:
# Calculate metrics from the consolidated results dataframe
mean_acc, mean_rec, mean_prec, dist_by_test = calculate_metrics_from_result_df(df_consolidated)
print(f"Mean Accuracy: {mean_acc:.4f}")
print(f"Mean Recall: {mean_rec:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print("Distribution of test results:")
for key, value in dist_by_test.items():
    print(key, value)
# One column per test; the metric names ('Accuracy', 'Recall', 'Precision')
# become the row index
dist = pd.DataFrame(dist_by_test)
# Write the index as a labelled column: with index=False the rows of the CSV
# would carry no indication of which metric they hold
dist.to_csv('path/to/your/distribution_results.csv', index_label='Metric')

Mean Accuracy: 0.8264
Mean Recall: 0.7963
Mean Precision: 0.7685
Distribution of test results:
Information Description {'Accuracy': 1.0, 'Recall': 1.0, 'Precision': 1.0}
Definition Term {'Accuracy': 1.0, 'Recall': 1.0, 'Precision': 1.0}
Definition Meaning {'Accuracy': 1.0, 'Recall': 1.0, 'Precision': 1.0}
Definition Exclusions {'Accuracy': 0.8888888888888888, 'Recall': 0, 'Precision': 0}
Rule Entity {'Accuracy': 0.7777777777777778, 'Recall': 1.0, 'Precision': 1.0}
Rule Type {'Accuracy': 0.7777777777777778, 'Recall': 1.0, 'Precision': 1.0}
Rule Description {'Accuracy': 0.6666666666666666, 'Recall': 1.0, 'Precision': 1.0}
Rule Conditions {'Accuracy': 0.6666666666666666, 'Recall': 1.0, 'Precision': 0.5}
Exemption Description {'Accuracy': 0.8888888888888888, 'Recall': 0.0, 'Precision': 0}
Refines {'Accuracy': 0.6666666666666666, 'Recall': 0.0, 'Precision': 0}
Is Refined By {'Accuracy': 0.7777777777777778, 'Recall': 0.0, 'Precision': 0}
Follows {'Accuracy': 0.7777777777777778, 'Recall': 0.6