# Prediction Analysis

In [1]:
import os
from datasets import Dataset
from evaluate import load
from metric import make_metric
from util import color_source
from IPython.display import HTML

# --- Locations on disk (all relative to the notebook's working directory) ---
INPUT_PATH = "../input"
ROOT_PATH = os.path.join(INPUT_PATH, "Project_CodeNet")
GENERATED_PATH = os.path.join(INPUT_PATH, "repair")

# --- Which dataset / model pair is being analysed ---
DATASET = "alexjercan/AoC"
MODEL = "gpt-3.5-turbo"

# --- Arguments forwarded to the code-execution metric further down ---
num_sequences = 2
timeout = 2

# Only the part after the last "/" of each identifier is used in the file name.
data_name = DATASET.rsplit("/", 1)[-1]
model_name = MODEL.rsplit("/", 1)[-1]

# The evaluation results are read from a single Arrow file inside the ".data" directory.
_eval_dir = os.path.join(GENERATED_PATH, f"evaluation_data_{data_name}_{model_name}.data")
EVAL_PATH = os.path.join(_eval_dir, "data-00000-of-00001.arrow")

# Bare expression: the notebook echoes the resolved path as the cell output.
EVAL_PATH

'../input/repair/evaluation_data_AoC_gpt-3.5-turbo.data/data-00000-of-00001.arrow'

In [2]:
# Read the stored evaluation results from the Arrow file at EVAL_PATH.
evaluation_data = Dataset.from_file(EVAL_PATH)
# Bare expression: the notebook displays the dataset summary (features, row count).
evaluation_data

Dataset({
    features: ['year', 'day', 'part', 'pass', 'fail', 'test', 'change', 'i1', 'i2', 'j1', 'j2', 'language', 'index', 'predicted', 'html', 'any_correct', 'pass_bug_type', 'predicted_bug_type'],
    num_rows: 15
})

In [3]:
# Check if the predicted code is exactly the same as the ground truth.
# The metric is loaded once and reused for every subset instead of being
# re-loaded on each loop iteration.
exact_match = load("exact_match")


def first_prediction_exact_match(data):
    """Return the exact-match score of each row's first prediction.

    Args:
        data: a dataset exposing a "predicted" column (indexable entries, of
            which only element 0 is scored) and a "pass" column holding the
            reference source.

    Returns:
        The value stored under the "exact_match" key of the metric result.
    """
    scores = exact_match.compute(
        predictions=[p[0] for p in data["predicted"]], references=data["pass"]
    )
    return scores["exact_match"]


# Per bug type: restrict the data to the rows whose reference bug type matches.
for bug_type in ["input", "algorithm", "output"]:
    ev_data = evaluation_data.filter(lambda example: example["pass_bug_type"] == bug_type)
    if len(ev_data) == 0:
        # No example of this bug type; nothing to score or report.
        continue
    print(f"Exact match ({bug_type}): {first_prediction_exact_match(ev_data)}")

# Overall score across all rows.
print(f"Exact match: {first_prediction_exact_match(evaluation_data)}")

Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

Exact match (algorithm): 0.0


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

Exact match: 0.0


In [4]:
# Check if the prediction passes the test cases
for bug_type in ["input", "algorithm", "output"]:
    # A fresh metric is built for every evaluation, as before.
    code_eval = make_metric(DATASET)
    ev_data = evaluation_data.filter(lambda example: example["pass_bug_type"] == bug_type)
    if len(ev_data) == 0:
        # No example of this bug type; nothing to score or report.
        continue
    # Score only the rows of the current bug type. The previous version passed
    # the full `evaluation_data` here, so every per-type figure was just the
    # overall score printed under a per-type label.
    # The second return value is not needed for the per-type report; it is bound
    # to a named throwaway rather than `_`, which IPython uses for the last output.
    result, _per_type_tests = code_eval(ev_data, num_sequences, timeout)
    print(f"Code eval ({bug_type}): {result}")

# Overall score across all rows; `test_results` is kept for later inspection.
code_eval = make_metric(DATASET)
result, test_results = code_eval(evaluation_data, num_sequences, timeout)
print(f"Code eval: {result}")

Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_AoC_gpt-3.5-turbo.data/cache-8e0dc23a1ca5c674.arrow
Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_AoC_gpt-3.5-turbo.data/cache-38c4f06bb8177e0b.arrow
Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_AoC_gpt-3.5-turbo.data/cache-f8f026e4687b9c31.arrow


Code eval (algorithm): {'pass@1': 0.26666666666666666}
Code eval: {'pass@1': 0.26666666666666666}


In [5]:
# Check if the model modified the part of the code that it should:
# compare element 0 of every "predicted_bug_type" entry with the reference
# "pass_bug_type" of the same row.
exact_match = load("exact_match")

# NOTE(review): `[0]` assumes each "predicted_bug_type" entry is a sequence of
# labels (presumably one per generated candidate). If the entries are plain
# strings this picks their first character instead — confirm against the data.
first_predicted_types = [types[0] for types in evaluation_data["predicted_bug_type"]]
reference_types = evaluation_data["pass_bug_type"]

result = exact_match.compute(predictions=first_predicted_types, references=reference_types)
# Bare expression: the notebook displays the metric result.
result

{'exact_match': 0.0}

In [10]:
# Render the stored HTML of every example whose "any_correct" flag is truthy.
print("Showing only execution correct results")
# The two columns are walked in lockstep; the unused enumerate() index was dropped.
for any_correct, html in zip(evaluation_data["any_correct"], evaluation_data["html"]):
    if any_correct:
        display(HTML(html))

Showing only execution correct results


In [11]:
# Render the stored HTML of every example whose "any_correct" flag is falsy.
print("Showing only execution wrong results")
# The two columns are walked in lockstep; the unused enumerate() index was dropped.
for any_correct, html in zip(evaluation_data["any_correct"], evaluation_data["html"]):
    if not any_correct:
        display(HTML(html))

Showing only execution wrong results
