This notebook contains the code to run structural tests that compare generated code against ground-truth code from a given test file.

In [1]:
import csv
from class_structure import *
from serialize import *
import unittest
import pandas as pd

In [2]:
def run_code_string(code_str: str) -> List[Section]:
    """
    Run a code snippet and collect the Section objects it binds to names.

    The snippet is executed with this module's globals as its global scope
    and a fresh, empty dict as its local scope, so every name the snippet
    assigns at top level ends up in that dict.

    Parameters:
      code_str (str): Python source to execute.

    Returns:
      list: Every value bound in the snippet's local scope that is a Section
            instance (one entry per bound name). An empty list is returned if
            executing the snippet raises any Exception, syntax errors included.
    """
    created = {}
    try:
        # NOTE(review): exec runs arbitrary code; only feed it trusted snippets.
        exec(code_str, globals(), created)
        return [obj for obj in created.values() if isinstance(obj, Section)]
    except Exception:  # SyntaxError is an Exception subclass, so it is covered here
        return []
    
def compare_strings_with_threshold(s1, s2, threshold=10):
    """
    Measure the Levenshtein (edit) distance between two strings and check it
    against a threshold.

    The distance is computed row by row: `previous` holds the distances from
    the first r-1 characters of s1 to every prefix of s2, and `current` is
    built from it for the first r characters.

    Parameters:
      s1 (str): First string.
      s2 (str): Second string.
      threshold (int): Largest edit distance still counted as "similar".

    Returns:
      tuple: (bool, int) - whether the distance is <= threshold, and the
             distance itself.
    """
    rows, cols = len(s1), len(s2)

    # Distance from the empty prefix of s1 to each prefix of s2.
    previous = list(range(cols + 1))

    for r in range(1, rows + 1):
        # First cell: turning r characters of s1 into the empty string.
        current = [r] + [0] * cols
        for c in range(1, cols + 1):
            cost = 0 if s1[r - 1] == s2[c - 1] else 1
            current[c] = min(
                previous[c] + 1,         # deletion
                current[c - 1] + 1,      # insertion
                previous[c - 1] + cost,  # substitution
            )
        previous = current

    edit_distance = previous[cols]
    return (edit_distance <= threshold, edit_distance)

# compare list of strings with threshold
def compare_list_strings_with_threshold(list1, list2, threshold=10):
    """
    Check two lists of strings for pairwise similarity by edit distance.

    Parameters:
      list1 (list): First list of strings.
      list2 (list): Second list of strings.
      threshold (int): Edit distance threshold passed to
                       compare_strings_with_threshold for every pair.

    Returns:
      bool: False if the lists differ in length or any pair at the same
            position is not similar; True otherwise (including two empty lists).
    """
    if len(list1) != len(list2):
        return False

    # all() stops at the first dissimilar pair, like an early return would.
    return all(
        compare_strings_with_threshold(left, right, threshold)[0]
        for left, right in zip(list1, list2)
    )

def compare_serialized_expr(expr1, expr2, threshold=10):
    """
    Recursively compare two serialized Expression dictionaries.

    Rules applied, in order:
      - The two dictionaries must have exactly the same set of keys.
      - "text": if both values are strings they are compared by edit
        distance; otherwise they must be equal.
      - "sectionNumber": both values are converted with str() and compared
        by edit distance.
      - "includes": the lists must have the same length; items are compared
        in order - dict pairs recursively, string pairs by edit distance,
        anything else by equality.
    Values stored under any other key are not compared; only the presence of
    the key matters.

    Parameters:
      expr1 (dict): First serialized expression.
      expr2 (dict): Second serialized expression.
      threshold (int): Edit distance threshold for string comparisons.

    Returns:
      bool: True if expr1 and expr2 match under the rules above; False otherwise.
    """
    def _strings_close(left, right):
        # Keep only the boolean verdict; the numeric distance is not needed here.
        return compare_strings_with_threshold(left, right, threshold)[0]

    if set(expr1.keys()) != set(expr2.keys()):
        return False

    if "text" in expr1 and "text" in expr2:
        text1, text2 = expr1["text"], expr2["text"]
        if isinstance(text1, str) and isinstance(text2, str):
            if not _strings_close(text1, text2):
                return False
        elif text1 != text2:
            return False

    if "sectionNumber" in expr1 and "sectionNumber" in expr2:
        number1 = str(expr1["sectionNumber"])
        number2 = str(expr2["sectionNumber"])
        if not _strings_close(number1, number2):
            return False

    if "includes" in expr1 and "includes" in expr2:
        children1, children2 = expr1["includes"], expr2["includes"]
        if len(children1) != len(children2):
            return False
        for child1, child2 in zip(children1, children2):
            if isinstance(child1, dict) and isinstance(child2, dict):
                # Nested serialized expression/statement: recurse.
                if not compare_serialized_expr(child1, child2, threshold):
                    return False
            elif isinstance(child1, str) and isinstance(child2, str):
                if not _strings_close(child1, child2):
                    return False
            elif child1 != child2:
                # Mixed or non-string, non-dict items fall back to equality.
                return False

    return True


def test_section_number(self, code_string1, code_string2, threshold=10):
    """
    Test the section numbers of two code snippets.

    Runs both snippets, serializes the Section objects they create, and checks
    that the two runs produced the same number of sections and that the
    'sectionNumber' values at matching positions are within `threshold` edits
    of each other.

    NOTE(review): with the default threshold of 10, short numbers such as
    "(1)" and "(2)" (edit distance 1) count as matching; pass a smaller
    threshold for a stricter check.

    Parameters:
      self: A unittest.TestCase-like object providing assertEqual and fail.
      code_string1 (str): The first code snippet as a string.
      code_string2 (str): The second code snippet as a string.
      threshold (int): Maximum edit distance for two section numbers to match.

    Returns:
      int: 1 if the test passes, 0 if it fails. Any exception raised along the
           way (including assertion failures) is printed and counted as a failure.
    """
    try:
        # 1. Run both snippets to get the sections they create
        list_s1 = run_code_string(code_string1)
        list_s2 = run_code_string(code_string2)

        # 2. Serialize them
        dict1 = [serialize_section(s) for s in list_s1]
        dict2 = [serialize_section(s) for s in list_s2]
        # compare length of dictionaries
        self.assertEqual(len(dict1), len(dict2), "The two code snippets did not produce the same number of sections.")

        # 3. Compare all section numbers once. The comparison already covers the
        # whole list, so there is no need to repeat it per section.
        numbers1 = [d['sectionNumber'] for d in dict1]
        numbers2 = [d['sectionNumber'] for d in dict2]
        if not compare_list_strings_with_threshold(numbers1, numbers2, threshold=threshold):
            # Explicit failure instead of a bare `assert`, which is stripped
            # when Python runs with -O.
            self.fail("The two code snippets did not produce the same section number")
        print("Section number test case passed")
        return 1
    except Exception as e:
        print(e)
        print("Section number test case failed")
        return 0

def test_section_name(self, gt_code, generated_code, threshold=10):
    """
    Test the section names of two code snippets.

    Runs both snippets, serializes the Section objects they create, and checks
    that the two runs produced the same number of sections and that the
    'sectionTitle' values at matching positions are within `threshold` edits
    of each other.

    Parameters:
      self: A unittest.TestCase-like object providing assertEqual and fail.
      gt_code (str): The ground truth code snippet as a string.
      generated_code (str): The generated code snippet as a string.
      threshold (int): Maximum edit distance for two section titles to match.

    Returns:
      int: 1 if the test passes, 0 if it fails. Any exception raised along the
           way (including assertion failures) is printed and counted as a failure.
    """
    try:
        # 1. Run both snippets to get the sections they create
        list_s1 = run_code_string(gt_code)
        list_s2 = run_code_string(generated_code)

        # 2. Serialize them
        dict1 = [serialize_section(s) for s in list_s1]
        dict2 = [serialize_section(s) for s in list_s2]

        # compare length of dictionaries
        self.assertEqual(len(dict1), len(dict2), "The two code snippets did not produce the same number of sections.")

        # 3. Compare all section titles once. The comparison already covers the
        # whole list, so there is no need to repeat it per section.
        titles1 = [d['sectionTitle'] for d in dict1]
        titles2 = [d['sectionTitle'] for d in dict2]
        if not compare_list_strings_with_threshold(titles1, titles2, threshold=threshold):
            # Explicit failure instead of a bare `assert`, which is stripped
            # when Python runs with -O.
            self.fail("The two code snippets did not produce the same section name")
        print("Section name test case passed")
        return 1
    except Exception as e:
        print("The two code snippets did not produce the same structure.")
        print(e)
        print("Section name test case failed")
        return 0

def test_section_subsections(self, gt_code, generated_code):
    """
    Check that two code snippets yield the same number of subsections.

    Both snippets are executed, the resulting sections are serialized, and for
    each pair of sections at the same position the lengths of their
    'subSections' lists are compared. Only the counts are compared, not the
    subsection contents.

    Parameters:
      self: A unittest.TestCase-like object providing assertEqual.
      gt_code (str): The ground truth code snippet as a string.
      generated_code (str): The generated code snippet as a string.

    Returns:
      int: 1 if the test passes, 0 if it fails (any exception is printed and
           treated as a failure).
    """
    try:
        # Execute both snippets first, then serialize.
        gt_sections = run_code_string(gt_code)
        gen_sections = run_code_string(generated_code)

        gt_serialized = list(map(serialize_section, gt_sections))
        gen_serialized = list(map(serialize_section, gen_sections))
        # NOTE(review): these dump the full serialized sections to stdout on
        # every call; they look like leftover debugging output.
        print(gt_serialized)
        print(gen_serialized)

        self.assertEqual(len(gt_serialized), len(gen_serialized), "The two code snippets did not produce the same number of sections.")

        # Section counts match at this point, so zip pairs every section.
        for gt_entry, gen_entry in zip(gt_serialized, gen_serialized):
            self.assertEqual(len(gt_entry['subSections']), len(gen_entry['subSections']), "The two code snippets did not produce the same number of subsections.")
        print("Subsection test case passed")
        return 1
    except Exception as err:
        print(err)
        print("Subsection test case failed")
        return 0

def test_section_expressions(self, gt_code, generated_code):
    """
    Check that two code snippets yield the same number of expressions.

    Both snippets are executed, the resulting sections are serialized, and for
    each pair of sections at the same position the lengths of their
    'expressions' lists are compared. Only the counts are compared, not the
    expression contents.

    Parameters:
      self: A unittest.TestCase-like object providing assertEqual.
      gt_code (str): The ground truth code snippet as a string.
      generated_code (str): The generated code snippet as a string.

    Returns:
      int: 1 if the test passes, 0 if it fails (any exception is printed and
           treated as a failure).
    """
    try:
        # Execute both snippets first, then serialize.
        gt_sections = run_code_string(gt_code)
        gen_sections = run_code_string(generated_code)

        gt_serialized = list(map(serialize_section, gt_sections))
        gen_serialized = list(map(serialize_section, gen_sections))
        # NOTE(review): these dump the full serialized sections to stdout on
        # every call; they look like leftover debugging output.
        print(gt_serialized)
        print(gen_serialized)

        self.assertEqual(len(gt_serialized), len(gen_serialized), "The two code snippets did not produce the same number of sections.")

        # Section counts match at this point, so zip pairs every section.
        for gt_entry, gen_entry in zip(gt_serialized, gen_serialized):
            self.assertEqual(len(gt_entry['expressions']), len(gen_entry['expressions']), "The two code snippets did not produce the same number of expressions.")
        print("#Expression test case passed")
        return 1
    except Exception as err:
        print(err)
        print("#Expression test case failed")
        return 0

def test_section_statements(self, gt_code, generated_code):
    """
    Check that two code snippets yield the same number of statements.

    Both snippets are executed, the resulting sections are serialized, and for
    each pair of sections at the same position the lengths of their
    'statements' lists are compared. Only the counts are compared, not the
    statement contents.

    Parameters:
      self: A unittest.TestCase-like object providing assertEqual.
      gt_code (str): The ground truth code snippet as a string.
      generated_code (str): The generated code snippet as a string.

    Returns:
      int: 1 if the test passes, 0 if it fails (any exception is printed and
           treated as a failure).
    """
    try:
        # Execute both snippets first, then serialize.
        gt_sections = run_code_string(gt_code)
        gen_sections = run_code_string(generated_code)

        gt_serialized = list(map(serialize_section, gt_sections))
        gen_serialized = list(map(serialize_section, gen_sections))

        self.assertEqual(len(gt_serialized), len(gen_serialized), "The two code snippets did not produce the same number of sections.")

        # Section counts match at this point, so zip pairs every section.
        for gt_entry, gen_entry in zip(gt_serialized, gen_serialized):
            self.assertEqual(len(gt_entry['statements']), len(gen_entry['statements']), "The two code snippets did not produce the same number of statements.")
        print("#Statement test case passed")
        return 1
    except Exception as err:
        print(err)
        print("#Statement test case failed")
        return 0

In [3]:
# test suite combining all tests
def test_all(self, gt_code, generated_code):
    """
    Run every structural check on one ground-truth/generated pair.

    The checks run in this order, each receiving the same `self`:
      - Section number
      - Section name
      - Section subsections
      - Section expressions
      - Section statements

    Parameters:
      self: Object forwarded as the first argument of every check.
      gt_code (str): The ground truth code snippet as a string.
      generated_code (str): The generated code snippet as a string.

    Returns:
      int: The total number of passed tests (each check contributes 1 or 0).
    """
    checks = (
        test_section_number,
        test_section_name,
        test_section_subsections,
        test_section_expressions,
        test_section_statements,
    )
    passed = 0
    for check in checks:
        passed += check(self, gt_code, generated_code)
    return passed

In [13]:
# Global variable to define the total number of tests.
# Used as the denominator when turning a sample's passed-test count into an
# accuracy, and as the target value for "all tests passed" on a sample.
total_tests = 5

def calculate_mean_accuracy_df(self, gt_df, gen_df):
    """
    Calculate the mean accuracy of the generated code against the ground truth code.

    For every row position i, the 'code' value of gt_df and gen_df at that
    position is run through test_all; the fraction of passed tests
    (passed / total_tests) is averaged over all rows of gt_df.

    Parameters:
      self: Unused; a fresh unittest.TestCase is created for every row.
      gt_df (pd.DataFrame): DataFrame containing ground truth code in a 'code' column.
      gen_df (pd.DataFrame): DataFrame containing generated code in a 'code' column.
                             Rows are paired with gt_df by position.

    Returns:
      float: The mean accuracy of the generated code, or 0.0 when gt_df has
             no rows (instead of failing with a division by zero).
    """
    n_samples = len(gt_df)
    if n_samples == 0:
        # Nothing to evaluate; dividing by the row count below would raise.
        return 0.0

    accuracy_sum = 0
    for i in range(n_samples):
        gt_code = gt_df.iloc[i]['code']
        generated_code = gen_df.iloc[i]['code']
        accuracy_sum += test_all(unittest.TestCase(), gt_code, generated_code) / total_tests
    return accuracy_sum / n_samples

def list_passed_df(self, gt_df, gen_df):
    """
    Count the passed test cases for each row of the dataframes.

    Rows of gt_df and gen_df are paired by position; for each pair the 'code'
    values are run through test_all with a fresh unittest.TestCase.

    Parameters:
      self: Unused.
      gt_df (pd.DataFrame): DataFrame containing ground truth code in a 'code' column.
      gen_df (pd.DataFrame): DataFrame containing generated code in a 'code' column.

    Returns:
      list: One integer per row - the number of tests that row passed.
      int: How many rows passed all tests (count equal to total_tests).
    """
    per_row_passed = []
    for row in range(len(gt_df)):
        gt_code = gt_df.iloc[row]['code']
        generated_code = gen_df.iloc[row]['code']
        per_row_passed.append(test_all(unittest.TestCase(), gt_code, generated_code))

    fully_passed = sum(1 for n_passed in per_row_passed if n_passed == total_tests)
    return per_row_passed, fully_passed

def create_map_sample(self, gt_code, gen_code):
    """
    Build a per-test result map for a single sample.

    Each structural check is run on the pair of snippets with its own fresh
    unittest.TestCase, and its 1/0 result is stored under a fixed key.

    Parameters:
      self: Unused.
      gt_code (str): The ground truth code snippet as a string.
      gen_code (str): The generated code snippet as a string.

    Returns:
      dict: Keys "section_number", "section_name", "section_subsections",
            "section_expressions" and "section_statements" (in that order),
            each mapped to the result of the corresponding check.
    """
    checks = (
        ("section_number", test_section_number),
        ("section_name", test_section_name),
        ("section_subsections", test_section_subsections),
        ("section_expressions", test_section_expressions),
        ("section_statements", test_section_statements),
    )
    return {
        label: check(unittest.TestCase(), gt_code, gen_code)
        for label, check in checks
    }

def create_map_df(self, gt_df, gen_df):
    """
    Build the per-test result map for every row of the dataframes.

    Rows of gt_df and gen_df are paired by position. Before each row is
    evaluated, its position is printed as a progress marker.

    Parameters:
      self: Unused.
      gt_df (pd.DataFrame): DataFrame containing ground truth code in a 'code' column.
      gen_df (pd.DataFrame): DataFrame containing generated code in a 'code' column.

    Returns:
      list: One dictionary per row, as produced by create_map_sample.
    """
    row_results = []
    for row in range(len(gt_df)):
        print("test case: ", row)
        gt_snippet = gt_df.iloc[row]['code']
        gen_snippet = gen_df.iloc[row]['code']
        sample_result = create_map_sample(unittest.TestCase(), gt_snippet, gen_snippet)
        row_results.append(sample_result)
    return row_results

def create_result_df(self, gt_df, gen_df):
    """
    Create a result DataFrame containing the results of all tests for each row.

    The test suite is run once per row (via create_map_df); the per-row total,
    the number of fully passing rows and the mean accuracy are all derived
    from those results rather than by re-running the tests.

    Parameters:
      self: Unused.
      gt_df (pd.DataFrame): DataFrame containing ground truth data; its 'text'
                            and 'code' columns are read.
      gen_df (pd.DataFrame): DataFrame containing generated code in a 'code'
                             column. Rows are paired with gt_df by position.

    Returns:
      pd.DataFrame: One row per sample with columns 'Text', 'GT Code',
                    'Generated Code', 'Total Passed', followed by one 1/0
                    column per test.
      float: The mean accuracy of the generated code (0.0 when there are no rows).
    """
    # Run every check exactly once per row; each entry maps test name -> 1/0.
    row_results = create_map_df(unittest.TestCase(), gt_df, gen_df)
    n_rows = len(row_results)

    totals = [sum(result.values()) for result in row_results]
    count = sum(1 for total in totals if total == total_tests)
    if n_rows:
        mean_accuracy = sum(total / total_tests for total in totals) / n_rows
    else:
        mean_accuracy = 0.0

    df_result = pd.DataFrame(row_results)
    # add text and code columns to the result dataframe; reset the index so the
    # values are attached by position even if the inputs are not indexed 0..n-1
    df_result['Text'] = gt_df['text'].reset_index(drop=True)
    df_result['GT Code'] = gt_df['code'].reset_index(drop=True)
    df_result['Generated Code'] = gen_df['code'].iloc[:n_rows].reset_index(drop=True)
    df_result['Total Passed'] = totals
    # move text and code columns to the front
    cols = df_result.columns.tolist()
    cols = cols[-4:] + cols[:-4]
    df_result = df_result[cols]
    print("Total passed test cases: ", count)
    return df_result, mean_accuracy

In [None]:
# Specify the paths to your ground truth and generated code files
# (the values below are placeholders - replace them before running).
gt_file = "path/to/ground_truth.csv"
gen_file = "path/to/generated_code.csv"

# Load the ground truth and generated code dataframes.
# The evaluation helpers read the 'text' and 'code' columns of gt_df and the
# 'code' column of gen_df, pairing rows of the two frames by position.
gt_df = pd.read_csv(gt_file)
gen_df = pd.read_csv(gen_file)

In [7]:
# Inspect the ground truth dataframe (create_result_df reads its 'text' and 'code' columns)
gt_df.head()

Unnamed: 0,text,code,tags
0,(1) This section applies to any person who con...,"s = Section(""(1)"")\ni=Information(s, Expressio...",['#condition']
1,"(2) For purposes of this section, the followin...","s2 = Section(sectionNumber=""(2)"")\nsa=Section(...","['#definition', '#continuation']"
2,"(2) For purposes of this section, the followin...","s2 = Section(""(2)"")\nsb=Section(""(b)"")\ns2.add...","['#definition', '#exclusion', '#condition']"
3,(3) A person who conducts business in this sta...,"s3 = Section(""(3)"")\n\nr1 = Rule(s3, Expressio...","['#obligation', '#continuation', '#reference',..."
4,(4) Any person who conducts business in this s...,"s4 = Section(""(4)"")\nr4 = Rule(s4, Expression(...","['#obligation', '#condition']"
5,(5) Any notification required by this section ...,"s5 = Section(""5."")\nr1 = Rule(s5, Expression(s...","['#obligation', '#condition']"
6,(6) Any notice required by the provisions of t...,"s6 = Section(""(6)"")\nr6 = Rule(s6, Expression(...","['#continuation', '#reference', '#permission',..."
7,(7) Any person who conducts business in this s...,"s7 = Section(""(7)"")\n\nr1 = Rule(s7, Expressio...","['#obligation', '#follows', '#reference', '#re..."
8,(8) Failure to comply with the requirements of...,"s8 = Section(sectionNumber=""(8)"")\n\nr1 = Rule...","['#prohibition', '#penalty', '#condition']"


In [8]:
# Inspect the generated code dataframe before cleaning; in the output shown
# below, the 'code' values are still wrapped in ```python fences
gen_df.head()

Unnamed: 0,text,code,tags
0,(1) This section applies to any person who con...,"```python\ns = Section(""(1)"")\ni = Information...",['#condition']
1,"(2) For purposes of this section, the followin...","```python\ns2 = Section(sectionNumber=""(2)"")\n...","['#definition', '#continuation']"
2,"(2) For purposes of this section, the followin...","```python\ns2 = Section(""(2)"")\n\nsb = Section...","['#definition', '#continuation', '#exclusion']"
3,(3) A person who conducts business in this sta...,"```python\ns = Section(sectionNumber=""(3)"")\nr...","['#obligation', '#condition', '#prohibition', ..."
4,(4) Any person who conducts business in this s...,"```python\ns4 = Section(""(4)"")\nr4 = Rule(s4, ...","['#obligation', '#condition']"


In [9]:
# Ensure the 'code' column is properly formatted
def strip_code_fence(code):
    """
    Return the contents of the first ``` code fence in `code`.

    The language tag after the opening fence is removed by looking at the text
    itself rather than by skipping a fixed number of characters, so fences
    opened with "```python", "```py" or a bare "```" are all handled.

    Parameters:
      code: Cell value from the 'code' column.

    Returns:
      The fenced code with surrounding whitespace stripped, or `code`
      unchanged when it is not a string or contains no ``` fence.
    """
    if not isinstance(code, str) or "```" not in code:
        return code

    # Text between the first and second fence (or to the end if unclosed).
    fenced = code.split("```")[1]

    first_line, newline, remainder = fenced.partition("\n")
    tag = first_line.strip()
    if newline and (tag == "" or tag.isalnum()):
        # The opening line holds only an optional language tag: drop it.
        fenced = remainder
    elif fenced[:6].lower() == "python" and not fenced[6:7].strip():
        # "python" tag followed by code on the same line.
        fenced = fenced[6:]
    return fenced.strip()


gen_df['code'] = gen_df['code'].apply(strip_code_fence)

# Inspect the generated code dataframe after formatting
gen_df.head()

Unnamed: 0,text,code,tags
0,(1) This section applies to any person who con...,"s = Section(""(1)"")\ni = Information(s, Express...",['#condition']
1,"(2) For purposes of this section, the followin...","s2 = Section(sectionNumber=""(2)"")\n\nd_breach_...","['#definition', '#continuation']"
2,"(2) For purposes of this section, the followin...","s2 = Section(""(2)"")\n\nsb = Section(""(b)"")\ns2...","['#definition', '#continuation', '#exclusion']"
3,(3) A person who conducts business in this sta...,"s = Section(sectionNumber=""(3)"")\nr1 = Rule(s,...","['#obligation', '#condition', '#prohibition', ..."
4,(4) Any person who conducts business in this s...,"s4 = Section(""(4)"")\nr4 = Rule(s4, Expression(...","['#obligation', '#condition']"


In [14]:
# Create the result DataFrame and calculate mean accuracy.
# A throwaway unittest.TestCase instance fills the leading `self` parameter
# that create_result_df declares.
df_result, m_accuracy = create_result_df(unittest.TestCase(), gt_df, gen_df)
print("Mean Accuracy: ", m_accuracy)

test case:  0
Section number test case passed
Section name test case passed
[{'sectionNumber': '(1)', 'sectionTitle': '', 'subSections': [], 'expressions': [{'text': "this section applies to any person who conducts business in this state and who, in the ordinary course of the person's business functions, owns, licenses or maintains personal information of any resident of this state.", 'includes': [], 'sectionNumber': '(1)'}], 'statements': []}]
[{'sectionNumber': '(1)', 'sectionTitle': '', 'subSections': [], 'expressions': [{'text': "this section applies to any person who conducts business in this state and who, in the ordinary course of the person's business functions, owns, licenses or maintains personal information of any resident of this state", 'includes': [], 'sectionNumber': '(1)'}], 'statements': []}]
Subsection test case passed
[{'sectionNumber': '(1)', 'sectionTitle': '', 'subSections': [], 'expressions': [{'text': "this section applies to any person who conducts business in 

In [15]:
# Save the result DataFrame to a CSV file (placeholder path - replace before
# running); index=False leaves the row index out of the file.
df_result.to_csv("path/to/result.csv", index=False)