In [1]:
import pandas as pd
import os
import autopep8

In [2]:
def read_file_to_str(filepath):
  with open(filepath, 'r') as f:
    return f.read()

In [3]:
problem_solutions_path = 'data/solutions'
problems = [os.path.splitext(filename)[0] for filename in os.listdir(problem_solutions_path)]

problem_solutions = {}
for problem in problems:
    problem_solutions[problem] = read_file_to_str(f'{problem_solutions_path}/{problem}.py')

In [4]:
codex_solution_df = pd.read_csv('data/codex-solutions-2023-01-18.csv')
codex_solution_df.columns

Index(['priming', 'extra_with_indentation_priming', 'scrambled_lines',
       'prompt', 'codex_output', 'problem'],
      dtype='object')

Compute basic stats

In [5]:
# extract lines from code into a set of whitespace-stripped lines
def extract_lines(code: str):
    return {x.strip() for x in code.strip().split('\n') if x.strip() != ''}


In [6]:
sdf = codex_solution_df # [s]olution [d]ata [f]rame
sdf.priming.fillna('', inplace=True)
for a, priming in enumerate(sdf.priming.unique()):
    for b, extra_indent_priming in enumerate(sdf.extra_with_indentation_priming.unique()):
        i = a * len(sdf.extra_with_indentation_priming.unique()) + b
        df = sdf[(sdf.priming == priming) & (sdf.extra_with_indentation_priming == extra_indent_priming)].copy()
        df['solution'] = df['problem'].apply(lambda problem: problem_solutions[problem])
        df['output_matches_solution'] = df['codex_output'].str.strip() == df['solution'].str.strip()
        codex_lines = df['codex_output'].apply(extract_lines)
        correct_lines = df['solution'].apply(extract_lines)
        df['unwanted_modification'] = correct_lines != codex_lines # TODO: handle distractors where appropriate

        # output matches solution
        oms = df.output_matches_solution
        print(f'V{i+1} percentage of outputs matching solution: {oms.sum()}/{oms.count()} = {oms.mean():.5f}')

        # unwanted modification (altered or added lines)
        um = df.unwanted_modification
        print(f'V{i+1} percentage of unwanted modifications: {um.sum()}/{um.count()} = {um.mean():.5f}')
        print()
        # df['altered_or_added_lines'] = codex_lines - correct_lines


V1 percentage of outputs matching solution: 11/240 = 0.04583
V1 percentage of unwanted modifications: 33/240 = 0.13750

V2 percentage of outputs matching solution: 0/0 = nan
V2 percentage of unwanted modifications: 0/0 = nan

V3 percentage of outputs matching solution: 17/240 = 0.07083
V3 percentage of unwanted modifications: 31/240 = 0.12917

V4 percentage of outputs matching solution: 20/240 = 0.08333
V4 percentage of unwanted modifications: 29/240 = 0.12083

V5 percentage of outputs matching solution: 16/240 = 0.06667
V5 percentage of unwanted modifications: 28/240 = 0.11667

V6 percentage of outputs matching solution: 14/240 = 0.05833
V6 percentage of unwanted modifications: 29/240 = 0.12083

V7 percentage of outputs matching solution: 17/240 = 0.07083
V7 percentage of unwanted modifications: 26/240 = 0.10833

V8 percentage of outputs matching solution: 19/240 = 0.07917
V8 percentage of unwanted modifications: 39/240 = 0.16250

V9 percentage of outputs matching solution: 17/240 = 0

Basic stats using PEP8 formatted code

In [7]:
sdf = codex_solution_df # [s]olution [d]ata [f]rame
pep8_sdf = pd.DataFrame()
sdf.priming.fillna('', inplace=True)
for a, priming in enumerate(sdf.priming.unique()):
    for b, extra_indent_priming in enumerate(sdf.extra_with_indentation_priming.unique()):
        i = a * len(sdf.extra_with_indentation_priming.unique()) + b
        df = sdf[(sdf.priming == priming) & (sdf.extra_with_indentation_priming == extra_indent_priming)].copy()
        df['solution'] = df['problem'].apply(lambda problem: problem_solutions[problem])

        # format code with autopep8 so that we can compare it without worrying about whitespace within lines
        df['pep8_codex_output'] = df['codex_output'].apply(autopep8.fix_code)
        df['pep8_solution'] = df['solution'].apply(autopep8.fix_code)
        df['pep8_output_matches_solution'] = df['pep8_codex_output'] == df['pep8_solution']
        codex_lines = df['pep8_codex_output'].apply(extract_lines)
        correct_lines = df['pep8_solution'].apply(extract_lines)
        df['pep8_unwanted_modification'] = correct_lines != codex_lines

        # output matches solution
        oms = df.pep8_output_matches_solution
        print(f'V{i+1} percentage of outputs matching solution: {oms.sum()}/{oms.count()} = {oms.mean():.5f}')

        # unwanted modification (altered or added lines)
        um = df.pep8_unwanted_modification
        print(f'V{i+1} percentage of unwanted modifications: {um.sum()}/{um.count()} = {um.mean():.5f}')
        print()

        df['pep8_altered_or_added_lines'] = codex_lines - correct_lines
        pep8_sdf = pd.concat([pep8_sdf, df])

V1 percentage of outputs matching solution: 102/240 = 0.42500
V1 percentage of unwanted modifications: 27/240 = 0.11250

V2 percentage of outputs matching solution: 0/0 = nan
V2 percentage of unwanted modifications: 0/0 = nan

V3 percentage of outputs matching solution: 106/240 = 0.44167
V3 percentage of unwanted modifications: 25/240 = 0.10417

V4 percentage of outputs matching solution: 114/240 = 0.47500
V4 percentage of unwanted modifications: 23/240 = 0.09583

V5 percentage of outputs matching solution: 88/240 = 0.36667
V5 percentage of unwanted modifications: 22/240 = 0.09167

V6 percentage of outputs matching solution: 56/240 = 0.23333
V6 percentage of unwanted modifications: 22/240 = 0.09167

V7 percentage of outputs matching solution: 87/240 = 0.36250
V7 percentage of unwanted modifications: 20/240 = 0.08333

V8 percentage of outputs matching solution: 109/240 = 0.45417
V8 percentage of unwanted modifications: 34/240 = 0.14167

V9 percentage of outputs matching solution: 85/240

Inspect altered lines

In [8]:
df = pep8_sdf
row = df[df.pep8_altered_or_added_lines.apply(len) > 0].iloc[0]

print('#Codex output:')
print(row['pep8_codex_output'])
# print('#Prompt:')
# print(row.prompt)
print('#Solution:')
print(row['pep8_solution'])
print('#Altered or added lines:')
print(row.pep8_altered_or_added_lines)

#Codex output:
def check_guess(guess, target):
    if guess < target:
        return "too low"
    elif guess == target:
        return 'correct'
    else:
        return 'too high'

#Solution:
def check_guess(guess, target):
    if guess < target:
        return 'too low'
    elif guess == target:
        return 'correct'
    else:
        return 'too high'

#Altered or added lines:
{'return "too low"'}


Compare solutions and outputs

In [9]:
problem_i = 0
output_i = -1
row = df.iloc[output_i]
print(row.problem)
print('#Codex output:')
print(row.codex_output)
print('#Solution:')
print(problem_solutions[row.problem])

Ericson2017figure4
#Codex output:
def avgValuesInRange(numList, start, end):
if (end - start + 1) >= 1:
sum = 0
for index in range(start,end+1):
value = numList[index]
sum = sum + value
return sum / (end - start + 1)
return 0

#Solution:
def avgValuesInRange(numList, start, end):
	sum = 0
	for index in range(start,end+1):
		value = numList[index]
		sum = sum + value
	if (end - start + 1) >= 1:
		return sum / (end - start + 1)
	return 0



Quick test to verify Codex output correctness

In [10]:
def avgValuesInRange2(numList, start, end):
	sum = 0
	for index in range(start,end-1):
		value = numList[index]
		sum = sum + value
	if (end - start + 1) >= 1:
		return sum / (end - start + 1)
	return 0

def avgValuesInRange(numList, start, end):
    if (end - start + 1) >= 1:
        sum = 0
        for index in range(start,end-1):
            value = numList[index]
            sum = sum + value
        return sum / (end - start + 1)
    return 0

print(avgValuesInRange([1,2,3,6,5,6,7,8,9,10], 2, 5))
avgValuesInRange2([1,2,3,6,5,6,7,8,9,10], 2, 5)

2.25


2.25