# Codex predicted solutions

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML

# Root folder of the "bugnet" input data, relative to the notebook location.
GENERATED_ROOT_PATH = os.path.join("..", "input", "bugnet")

# The two CSV files that live directly inside the root folder.
GENERATED_PAIRS_PATH, CODEX_PAIRS_PATH = (
    os.path.join(GENERATED_ROOT_PATH, csv_name)
    for csv_name in ("generated_pairs.csv", "codex_results.csv")
)

In [2]:
# keep_default_na=False stops pandas from turning empty cells (or strings such
# as "NA") into NaN, so every text column stays a plain string column.
codex_df = pd.read_csv(
    filepath_or_buffer=CODEX_PAIRS_PATH,
    keep_default_na=False,
)

codex_df.head()

Unnamed: 0,problem_id,language,original_status,original_src,changed_src,change,i1,i2,j1,j2,error,codex_predicted,execute_output
0,p00001,C++,Time Limit Exceeded,#include <stdio.h>\nint main(void) {\n int a ...,#include <stdio.h>\nint main(void) {\n int a ...,replace,3,4,3,4,TLE,\n#include <stdio.h>\nint main(void) {\n int ...,
1,p00001,C++,Runtime Error,#include <algorithm>\n#include <iostream>\nusi...,#include <algorithm>\n#include <iostream>\nusi...,replace,9,10,9,10,-11,\n#include <algorithm>\n#include <iostream>\nu...,
2,p00001,C++,Runtime Error,#include <algorithm>\n#include <cstdio>\n\nusi...,#include <algorithm>\n#include <cstdio>\n\nusi...,replace,11,12,11,12,-11,\n#include <algorithm>\n#include <cstdio>\n\nu...,
3,p00001,C++,Time Limit Exceeded,#include <algorithm>\n#include <bitset>\n#incl...,#include <algorithm>\n#include <bitset>\n#incl...,replace,35,38,35,36,TLE,\n#include <iostream>\nusing namespace std;\n\...,
4,p00001,C++,Time Limit Exceeded,#include <iostream>\nusing namespace std;\n\ni...,#include <iostream>\nusing namespace std;\n\ni...,replace,19,20,19,20,TLE,\n#include <iostream>\nusing namespace std;\n\...,


In [3]:
# Exact-match accuracy: the prediction must reproduce the fixed source.
#
# The predictions shown by `codex_df.head()` all start with a "\n" that the
# reference `changed_src` does not have, so a raw `==` can never succeed.
# Surrounding whitespace is stripped on both sides so that only the code
# itself is compared.
predicted_src = codex_df["codex_predicted"].str.strip()
reference_src = codex_df["changed_src"].str.strip()
codex_df["correct_exact"] = predicted_src == reference_src

# The mean of a boolean column is the fraction of True values (NaN when the
# frame is empty, instead of a division by zero).
correct = codex_df["correct_exact"].mean()
print(f"The exact accuracy of the codex api is {correct}")

The exact accuracy of the codex api is 0.0


In [4]:
# Root folder of the derived Project_CodeNet data, relative to the notebook.
DERIVED_PATH = os.path.join("..", "input", "Project_CodeNet", "derived")

def id2inout(problem_id: str, name: str = "input") -> str:
    """Build ``<DERIVED_PATH>/input_output/data/<problem_id>/<name>.txt``.

    Args:
        problem_id: Folder name of the problem below ``input_output/data``.
        name: File stem; ``"input"`` by default, callers pass ``"output"``
            to get the sibling file.

    Returns:
        The joined path as a string. Nothing is checked on disk.
    """
    file_name = f"{name}.txt"
    problem_dir = os.path.join(DERIVED_PATH, "input_output", "data", problem_id)
    return os.path.join(problem_dir, file_name)

def check(row: pd.Series) -> bool:
    """Tell whether the row's ``execute_output`` equals the reference output.

    The reference is the ``output`` file returned by ``id2inout`` for the
    row's ``problem_id``. The comparison is an exact string equality, so any
    whitespace difference counts as a mismatch.

    Args:
        row: Record providing ``problem_id`` and ``execute_output``.

    Returns:
        True when both texts are identical, False otherwise.

    Raises:
        OSError: If the reference output file cannot be opened.
    """
    reference_path = id2inout(row["problem_id"], name="output")
    with open(reference_path, "r") as handle:
        expected = handle.read()

    return expected == row["execute_output"]

# Row-wise evaluation: one boolean per prediction (axis=1 means "per row").
codex_df["correct_execute"] = codex_df.apply(check, axis=1)

# Fraction of rows whose execution output matched the reference output.
correct = codex_df["correct_execute"].sum()
total = len(codex_df)
print(f"The execute accuracy of the codex api is {correct / total}")

The execute accuracy of the codex api is 0.14


In [5]:
# Per-language exact-match statistics: "sum" counts the correct rows,
# "count" all rows, and "accuracy" is their ratio.
codex_lang_df = (
    codex_df.groupby("language")["correct_exact"]
    .agg(["sum", "count"])
    .assign(accuracy=lambda stats: stats["sum"] / stats["count"])
)

print("Exact match accuracy")
codex_lang_df

Exact match accuracy


Unnamed: 0_level_0,sum,count,accuracy
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C++,0,100,0.0
Python,0,100,0.0


In [6]:
# Per-language execution statistics: "sum" counts the correct rows,
# "count" all rows, and "accuracy" is their ratio.
codex_lang_df = (
    codex_df.groupby("language")["correct_execute"]
    .agg(["sum", "count"])
    .assign(accuracy=lambda stats: stats["sum"] / stats["count"])
)

print("Execute accuracy")
codex_lang_df

Execute accuracy


Unnamed: 0_level_0,sum,count,accuracy
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C++,0,100,0.0
Python,28,100,0.28


In [7]:
def get_bug_type(row: pd.Series) -> str:
    """Label the changed lines of a row as input, output or algorithm code.

    The lines ``j1:j2`` (half-open slice) of ``changed_src`` are joined and
    searched for language-specific substrings. Input markers are tested
    first, then output markers; anything else is labelled ``"algorithm"``.
    This is a plain substring heuristic, so an identifier that merely
    contains a marker also matches.

    Args:
        row: Record providing ``changed_src``, ``j1``, ``j2`` and ``language``.

    Returns:
        One of ``"input"``, ``"output"`` or ``"algorithm"``.

    Raises:
        NotImplementedError: If ``language`` is neither "Python" nor "C++".
    """
    # language -> (substrings marking input code, substrings marking output code)
    markers_by_language = {
        "Python": (("input",), ("print",)),
        "C++": (("cin", "scanf"), ("cout", "printf")),
    }

    changed_lines = row["changed_src"].splitlines()[row["j1"] : row["j2"]]
    snippet = "\n".join(changed_lines)
    language = row["language"]

    if language not in markers_by_language:
        raise NotImplementedError(f"{language} not implemented yet")

    input_markers, output_markers = markers_by_language[language]
    if any(marker in snippet for marker in input_markers):
        return "input"
    if any(marker in snippet for marker in output_markers):
        return "output"
    return "algorithm"

# Label every row with the kind of code its fix touched (axis=1 means "per row").
codex_df["type"] = codex_df.apply(get_bug_type, axis=1)

In [8]:
# Exact-match statistics broken down by language and bug type: "sum" counts
# the correct rows, "count" all rows, and "accuracy" is their ratio.
codex_lang_df = (
    codex_df.groupby(["language", "type"])["correct_exact"]
    .agg(["sum", "count"])
    .assign(accuracy=lambda stats: stats["sum"] / stats["count"])
)

print("Exact match accuracy")
codex_lang_df

Exact match accuracy


Unnamed: 0_level_0,Unnamed: 1_level_0,sum,count,accuracy
language,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C++,algorithm,0,26,0.0
C++,input,0,70,0.0
C++,output,0,4,0.0
Python,algorithm,0,17,0.0
Python,input,0,72,0.0
Python,output,0,11,0.0


In [9]:
# Execution statistics broken down by language and bug type: "sum" counts
# the correct rows, "count" all rows, and "accuracy" is their ratio.
codex_lang_df = (
    codex_df.groupby(["language", "type"])["correct_execute"]
    .agg(["sum", "count"])
    .assign(accuracy=lambda stats: stats["sum"] / stats["count"])
)

print("Execute accuracy")
codex_lang_df

Execute accuracy


Unnamed: 0_level_0,Unnamed: 1_level_0,sum,count,accuracy
language,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C++,algorithm,0,26,0.0
C++,input,0,70,0.0
C++,output,0,4,0.0
Python,algorithm,3,17,0.176471
Python,input,19,72,0.263889
Python,output,6,11,0.545455


In [10]:
def color_source(source_code: str, i1: int, i2: int, color: str = 'red'):
    """Render source code as an HTML ``<pre>`` block with highlighted lines.

    Every character is wrapped in its own ``<span>``. Spaces are shown as
    "•" and line feeds as "↵" (both light grey) so whitespace-only changes
    stay visible; all other characters are black. Characters on highlighted
    lines use ``color`` instead.

    Args:
        source_code: The text to render.
        i1: Index (0-based) of the first highlighted line.
        i2: Index one past the last highlighted line. The range is half-open,
            matching the ``[j1:j2]`` slice used by ``get_bug_type`` on the
            same columns (presumably difflib-style opcodes, to be confirmed
            against the code that generated the CSV). Passing ``-1, -1``
            highlights nothing.
        color: CSS colour used for highlighted lines.

    Returns:
        An HTML string of the form ``<pre>...</pre>``.
    """
    # Local import: keeps this cell runnable on its own.
    import html

    spans = []
    for i, line_str in enumerate(source_code.splitlines(keepends=True)):
        highlighted = i1 <= i < i2
        for char in line_str:
            norm_color = 'black'
            if char == ' ':
                char = "•"
                norm_color = 'lightgrey'
            elif char == '\n':
                char = "↵\n"
                norm_color = 'lightgrey'
            else:
                # Without escaping, text such as "<stdio.h>" is parsed as an
                # HTML tag by the browser and vanishes from the rendering.
                char = html.escape(char)
            span_color = color if highlighted else norm_color
            spans.append(f'<span style="color:{span_color};">{char}</span>')

    # Join once at the end instead of growing a string inside the loop.
    return "<pre>" + "".join(spans) + "</pre>"

In [11]:
# (target column, source column, first-line column, last-line column, colour)
html_specs = (
    ("original_src_html", "original_src", "i1", "i2", "red"),
    ("changed_src_html", "changed_src", "j1", "j2", "green"),
)
for html_col, src_col, start_col, end_col, highlight in html_specs:
    # The lambda is consumed inside this iteration, so it always sees the
    # current loop values.
    codex_df[html_col] = codex_df.apply(
        lambda row: color_source(
            row[src_col], row[start_col], row[end_col], color=highlight
        ),
        axis="columns",
    )

# The prediction has no known changed range: -1/-1 highlights no line.
codex_df["codex_src_html"] = codex_df["codex_predicted"].map(
    lambda predicted: color_source(predicted, -1, -1, color="green")
)

In [12]:
# Show up to ten predictions that match the reference source exactly.
codex_correct_df = codex_df[codex_df["correct_exact"]]

# (heading, column holding the pre-rendered HTML)
sections = (
    ("Original Source Code", "original_src_html"),
    ("Changed Source Code", "changed_src_html"),
    ("Codex Prediction", "codex_src_html"),
)

for i in range(min(len(codex_correct_df), 10)):
    display(HTML(f"<h2>Example {i}</h2>"))

    for heading, html_column in sections:
        display(HTML(f"<h6>{heading}</h6>"))
        display(HTML(codex_correct_df[html_column].iloc[i]))

In [13]:
# Show up to ten predictions whose execution output matched the reference.
codex_correct_df = codex_df[codex_df["correct_execute"]]

# (heading, column holding the pre-rendered HTML)
sections = (
    ("Original Source Code", "original_src_html"),
    ("Changed Source Code", "changed_src_html"),
    ("Codex Prediction", "codex_src_html"),
)

for i in range(min(len(codex_correct_df), 10)):
    display(HTML(f"<h2>Example {i}</h2>"))

    for heading, html_column in sections:
        display(HTML(f"<h6>{heading}</h6>"))
        display(HTML(codex_correct_df[html_column].iloc[i]))

In [14]:
# Show the first examples of the full table, whether correct or not.
# (heading, column holding the pre-rendered HTML)
sections = (
    ("Original Source Code", "original_src_html"),
    ("Changed Source Code", "changed_src_html"),
    ("Codex Prediction", "codex_src_html"),
)

# Cap at the number of available rows: a fixed range(10) makes `.iloc[i]`
# raise IndexError when the table holds fewer than ten rows.
for i in range(min(len(codex_df), 10)):
    display(HTML(f"<h2>Example {i}</h2>"))

    for heading, html_column in sections:
        display(HTML(f"<h6>{heading}</h6>"))
        display(HTML(codex_df[html_column].iloc[i]))