This notebook provides the code to run the semantic tests for a given test file. 

In [1]:
import unittest
from class_structure import *
from serialize import *
from test_statements import *
import pandas as pd

total_tests = 16

In [2]:
# Test cases for the class structure of a codebase

def test_all(self, gt_code, generated_code):
    """
    Run all tests on the generated code against the ground truth code.
    This function takes the following parameters:
    - self: The instance of the class containing the test methods
    - gt_code: The ground truth code to compare against
    - generated_code: The generated code to be tested
    Returns a tuple containing:
    - total_passed: Total number of tests passed
    - tp: True Positives
    - fp: False Positives
    - fn: False Negatives
    - mismatch: Mismatches
    """
    total_passed = 0 # tp + tn
    tp = 0
    fp = 0
    fn = 0
    mismatch = 0
    p, v = test_information_description(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_definition_term(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v=test_definition_meaning(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v=test_definition_exclusions(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v=test_rule_entity(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v=test_rule_type(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v=test_rule_description(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v=test_rule_conditions(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v=test_exemption_description(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_reference_relationship(self, gt_code, generated_code)
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_statement_relationship(self, gt_code, generated_code, relation='refines')
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_statement_relationship(self, gt_code, generated_code, relation='is_refined_by')
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_statement_relationship(self, gt_code, generated_code, relation='follows')
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_statement_relationship(self, gt_code, generated_code, relation='is_followed_by')
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_statement_relationship(self, gt_code, generated_code, relation='has_exception')
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    p, v = test_statement_relationship(self, gt_code, generated_code, relation='is_exception_to')
    total_passed+=p
    if not p and v==1: fn+=1
    elif not p and v==2: mismatch+=1
    elif not p and not v: fp+=1
    elif p and v: tp+=1
    return total_passed, tp, fp, fn, mismatch

In [3]:
def calculate_distribution(df):
    """
    Calculate the distribution of test results across the dataframe.
    
    Returns:
      dist (dict): A dictionary containing the distribution of test results.
    """
    dist = {}
    for col in df.columns:
        if col in ['text', 'GT Code', 'Generated Code', 'Total Passed', 'True Positives', 'False Positives', 'False Negatives', 'Accuracy', 'Recall', 'Precision', 'Mismacthes']:
            continue
        counts = df[col].value_counts().to_dict()
        tp = 0
        fp = 0
        fn = 0
        passed = 0
        for key, value in counts.items():
            key = [int(k) for k in key[1:-1].split(', ')]
            if key[0]==1:
                passed += value
            if key[0]==1 and key[1]==1:
                tp += value
            elif key[0]==0 and key[1]==0:
                fp += value
            elif key[0]==0 and key[1]==1:
                fn += value
        acc = passed / len(df)
        recall = tp / (tp + fn) if (tp + fn) else 0
        precision = tp / (tp + fp) if (tp + fp) else 0
        dist[col] = { 'Accuracy': acc, 'Recall': recall, 'Precision': precision }
    return dist

def create_result_df(gt_df, gen_df):
    """
    Create a results dataframe that includes:
      - The original text, GT code, and generated code.
      - Overall test metrics (Total Passed, True Positives, False Positives, False Negatives).
      - Calculated Accuracy (Total Passed/total_tests), Recall, and Precision per row.
      - A mapping of individual test results for each test type.
      
    Returns:
      df_result (pd.DataFrame): The dataframe containing all the results.
      mean_accuracy (float): Average accuracy over all rows.
      mean_recall (float): Average recall over all rows.
      mean_precision (float): Average precision over all rows.
    """
    results = []
    total_accuracy = 0
    total_recall = 0
    total_precision = 0

    for idx, gt_row in gt_df.iterrows():
        print(f"Row {idx}")
        # Retrieve corresponding values
        gt_code = gt_row['code']
        text = gt_row['text']
        gen_code = gen_df.loc[idx, 'code']
        
        # Create a test case instance to reuse for all tests on this row.
        test_case = unittest.TestCase()
        
        # Run the aggregated test function once per row
        total_passed, tp, fp, fn, mismatch = test_all(test_case, gt_code, gen_code)
        # Compute row metrics (with division safeguards)
        row_accuracy = total_passed / total_tests if total_tests else 0
        row_recall = tp / (tp + fn) if (tp + fn) else 0
        row_precision = tp / (tp + fp) if (tp + fp) else 0

        total_accuracy += row_accuracy
        total_recall += row_recall
        total_precision += row_precision
        
        # Create mapping of individual test results.
        test_map = {
            'Information Description': test_information_description(test_case, gt_code, gen_code),
            'Definition Term': test_definition_term(test_case, gt_code, gen_code),
            'Definition Meaning': test_definition_meaning(test_case, gt_code, gen_code),
            'Definition Exclusions': test_definition_exclusions(test_case, gt_code, gen_code),
            'Rule Entity': test_rule_entity(test_case, gt_code, gen_code),
            'Rule Type': test_rule_type(test_case, gt_code, gen_code),
            'Rule Description': test_rule_description(test_case, gt_code, gen_code),
            'Rule Conditions': test_rule_conditions(test_case, gt_code, gen_code),
            'Exemption Description': test_exemption_description(test_case, gt_code, gen_code),
            'Refines': test_statement_relationship(test_case, gt_code, gen_code, relation='refines'),
            'Is Refined By': test_statement_relationship(test_case, gt_code, gen_code, relation='is_refined_by'),
            'Follows': test_statement_relationship(test_case, gt_code, gen_code, relation='follows'),
            'Is Followed By': test_statement_relationship(test_case, gt_code, gen_code, relation='is_followed_by'),
            'Exceptions': test_statement_relationship(test_case, gt_code, gen_code, relation='has_exception'),
            'Is Exception To': test_statement_relationship(test_case, gt_code, gen_code, relation='is_exception_to'),
            'References': test_reference_relationship(test_case, gt_code, gen_code)
        }
        
        # Build row dictionary
        row_data = {
            'text': text,
            'GT Code': gt_code,
            'Generated Code': gen_code,
            'Total Passed': total_passed,
            'Mismacthes': mismatch,
            'True Positives': tp,
            'False Positives': fp,
            'False Negatives': fn,
            'Accuracy': row_accuracy,
            'Recall': row_recall,
            'Precision': row_precision
        }
        # Merge test map into the row's data
        row_data.update(test_map)
        results.append(row_data)
    # Create dataframe from results and compute overall averages.
    df_result = pd.DataFrame(results)
    n = len(gt_df)
    mean_accuracy = total_accuracy / n if n else 0
    mean_recall = total_recall / n if n else 0
    mean_precision = total_precision / n if n else 0
    
    return df_result, mean_accuracy, mean_recall, mean_precision

In [12]:
# Specify the paths to the ground truth and generated code files
gt_file = "path/to/ground_truth.csv"
gen_file = "path/to/generated_code.csv"

# Load the ground truth and generated code dataframes
gt_df = pd.read_csv(gt_file)
gen_df = pd.read_csv(gen_file)

In [13]:
# Ensure the 'code' column in both dataframes is properly formatted
gen_df['code'] = gen_df['code'].apply(
    lambda x: x.split("```")[1][7:].strip() if isinstance(x, str) and "```" in x else x
)

In [14]:
# Run the tests and create the result dataframe

df_result, m_accuracy, m_recall, m_precision = create_result_df(gt_df, gen_df)
print("Mean Accuracy: ", m_accuracy)
print("Mean Recall: ", m_recall)
print("Mean Precision: ", m_precision)

Row 0
Info description test case passed
Definition test case passed
Def meaning test case passed
Def exclusions test case passed
Rule entity test case passed
Rule type test case passed
Rule description test case passed
Rule condition test case passed
Exemption description test case passed
references test case passed
refines test case passed
is_refined_by test case passed
follows test case passed
is_followed_by test case passed
has_exception test case passed
is_exception_to test case passed
Info description test case passed
Definition test case passed
Def meaning test case passed
Def exclusions test case passed
Rule entity test case passed
Rule type test case passed
Rule description test case passed
Rule condition test case passed
Exemption description test case passed
refines test case passed
is_refined_by test case passed
follows test case passed
is_followed_by test case passed
has_exception test case passed
is_exception_to test case passed
references test case passed
Row 1
Info descr

In [15]:
# Save the result dataframe to a CSV file

df_result.to_csv("path/to/semantic_test_result.csv", index=False)
