# Prediction Analysis

In [1]:
import os
from datasets import Dataset
from evaluate import load
from metric import make_metric
from util import color_source
from IPython.display import HTML

INPUT_PATH = "../input"
ROOT_PATH = os.path.join(INPUT_PATH, "Project_CodeNet")
GENERATED_PATH = os.path.join(INPUT_PATH, "repair")

DATASET = "alexjercan/bugnet"
MODEL = "gpt-3.5-turbo"

num_sequences = 2
timeout = 2

data_name = DATASET.split("/")[-1]
model_name = MODEL.split("/")[-1]
EVAL_PATH = os.path.join(GENERATED_PATH, f"evaluation_data_{data_name}_{model_name}.data", "data-00000-of-00001.arrow")

EVAL_PATH

'../input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/data-00000-of-00001.arrow'

In [2]:
evaluation_data = Dataset.from_file(EVAL_PATH)
evaluation_data

Dataset({
    features: ['problem_id', 'language', 'original_status', 'fail', 'pass', 'change', 'i1', 'i2', 'j1', 'j2', 'error', 'stderr', 'description', 'input', 'output', 'index', 'predicted', 'html', 'any_correct', 'pass_bug_type', 'predicted_bug_type'],
    num_rows: 100
})

In [3]:
# Check if the predicted code is exactly the same as the ground truth
for bug_type in ["input", "algorithm", "output"]:
    exact_match = load("exact_match")
    ev_data = evaluation_data.filter(lambda example: example["pass_bug_type"] == bug_type)
    if len(ev_data) == 0:
        continue
    result = exact_match.compute(
        predictions=[p[0] for p in ev_data["predicted"]], references=ev_data["pass"]
    )
    print(f"Exact match ({bug_type}): {result['exact_match']}")
    
exact_match = load("exact_match")
result = exact_match.compute(
    predictions=[p[0] for p in evaluation_data["predicted"]], references=evaluation_data["pass"]
)
print(f"Exact match: {result['exact_match']}")

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Exact match (input): 0.0


Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Exact match (algorithm): 0.0


Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Exact match (output): 0.0
Exact match: 0.0


In [4]:
# Check if the prediction passes the test cases
for bug_type in ["input", "algorithm", "output"]:
    code_eval = make_metric(DATASET)
    ev_data = evaluation_data.filter(lambda example: example["pass_bug_type"] == bug_type)
    if len(ev_data) == 0:
        continue
    result, _ = code_eval(evaluation_data, num_sequences, timeout)
    print(f"Code eval ({bug_type}): {result}")

code_eval = make_metric(DATASET)
result, test_results = code_eval(evaluation_data, num_sequences, timeout)
print(f"Code eval: {result}")

Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-20a5307177b127ab.arrow


Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-efd9eadf9a2a3012.arrow
Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-3725bef521e42010_*_of_00004.arrow
Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-1b42c61222fedb6f_*_of_00004.arrow


Code eval (input): {'pass@1': 0.01}


Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-2dc57d5db687fb4c.arrow
Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-3725bef521e42010_*_of_00004.arrow
Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-1b42c61222fedb6f_*_of_00004.arrow


Code eval (algorithm): {'pass@1': 0.01}


Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-3725bef521e42010_*_of_00004.arrow
Loading cached processed dataset at /home/alex/Documents/bug-detection/input/repair/evaluation_data_bugnet_gpt-3.5-turbo.data/cache-1b42c61222fedb6f_*_of_00004.arrow


Code eval (output): {'pass@1': 0.01}
Code eval: {'pass@1': 0.01}


In [5]:
# Check if the model modified the part of the code that it should
exact_match = load("exact_match")
result = exact_match.compute(
    predictions=[p[0] for p in evaluation_data["predicted_bug_type"]], references=evaluation_data["pass_bug_type"]
)
result

{'exact_match': 0.0}

In [6]:
print("Showing only execution correct results")
for i, (any_correct, html) in enumerate(zip(evaluation_data["any_correct"], evaluation_data["html"])):
    if not any_correct:
        continue

    display(HTML(html))

Showing only execution correct results


In [7]:
print("Showing only execution wrong results")
for i, (any_correct, html) in enumerate(zip(evaluation_data["any_correct"], evaluation_data["html"])):
    if any_correct:
        continue

    display(HTML(html))

Showing only execution wrong results
