In [1]:
import pandas as pd
import os
import autopep8

In [2]:
def read_file_to_str(filepath):
  with open(filepath, 'r') as f:
    return f.read()

In [3]:
problem_solutions_path = 'data/solutions'
problems = [os.path.splitext(filename)[0] for filename in os.listdir(problem_solutions_path)]

problem_solutions = {}
for problem in problems:
    problem_solutions[problem] = read_file_to_str(f'{problem_solutions_path}/{problem}.py')

In [13]:
# codex_solution_df = pd.read_csv('CODEXParsons/data/codex-solutions-2023-01-18.csv')
codex_solution_df = pd.read_csv('data/codex-solutions-2023-01-18.csv')
codex_solution_df2 = pd.read_csv('data/codex-solutions-2023-01-21.csv')
codex_solution_df = pd.concat([codex_solution_df, codex_solution_df2])
codex_solution_df.columns

Index(['priming', 'extra_with_indentation_priming', 'scrambled_lines',
       'prompt', 'codex_output', 'problem'],
      dtype='object')

Compute basic stats

In [14]:
# extract lines from code into a set of whitespace-stripped lines
def extract_lines(code: str):
    return {x.strip() for x in code.strip().split('\n') if x.strip() != ''}

def remove_empty_lines(code: str):
    return '\n'.join([x for x in code.strip().split('\n') if x.strip() != ''])


In [15]:
sdf = codex_solution_df # [s]olution [d]ata [f]rame
sdf.priming.fillna('', inplace=True)
sdf.codex_output.fillna('', inplace=True)
for a, priming in enumerate(sdf.priming.unique()):
    for b, extra_indent_priming in enumerate(sdf.extra_with_indentation_priming.unique()):
        i = a * len(sdf.extra_with_indentation_priming.unique()) + b
        df = sdf[(sdf.priming == priming) & (sdf.extra_with_indentation_priming == extra_indent_priming)].copy()
        df['solution'] = df['problem'].apply(lambda problem: problem_solutions[problem])
        df['output_matches_solution'] = df['codex_output'].str.strip() == df['solution'].str.strip()
        codex_lines = df['codex_output'].apply(extract_lines)
        correct_lines = df['solution'].apply(extract_lines)
        df['unwanted_modification'] = correct_lines != codex_lines # TODO: handle distractors where appropriate

        # output matches solution
        oms = df.output_matches_solution
        print(f'V{i+1} percentage of outputs matching solution: {oms.sum()}/{oms.count()} = {oms.mean():.5f}')

        # unwanted modification (altered or added lines)
        um = df.unwanted_modification
        print(f'V{i+1} percentage of unwanted modifications: {um.sum()}/{um.count()} = {um.mean():.5f}')
        print()
        # df['altered_or_added_lines'] = codex_lines - correct_lines


V1 percentage of outputs matching solution: 11/240 = 0.04583
V1 percentage of unwanted modifications: 33/240 = 0.13750

V2 percentage of outputs matching solution: 17/240 = 0.07083
V2 percentage of unwanted modifications: 44/240 = 0.18333

V3 percentage of outputs matching solution: 17/240 = 0.07083
V3 percentage of unwanted modifications: 31/240 = 0.12917

V4 percentage of outputs matching solution: 20/240 = 0.08333
V4 percentage of unwanted modifications: 29/240 = 0.12083

V5 percentage of outputs matching solution: 16/240 = 0.06667
V5 percentage of unwanted modifications: 28/240 = 0.11667

V6 percentage of outputs matching solution: 14/240 = 0.05833
V6 percentage of unwanted modifications: 29/240 = 0.12083

V7 percentage of outputs matching solution: 17/240 = 0.07083
V7 percentage of unwanted modifications: 26/240 = 0.10833

V8 percentage of outputs matching solution: 19/240 = 0.07917
V8 percentage of unwanted modifications: 39/240 = 0.16250

V9 percentage of outputs matching soluti

Basic stats using PEP8 formatted code

In [16]:
sdf = codex_solution_df # [s]olution [d]ata [f]rame
pep8_sdf = pd.DataFrame()

sdf.priming.fillna('', inplace=True)
for a, priming in enumerate(sdf.priming.unique()):
    for b, extra_indent_priming in enumerate(sdf.extra_with_indentation_priming.unique()):
        i = a * len(sdf.extra_with_indentation_priming.unique()) + b
        df = sdf[(sdf.priming == priming) & (sdf.extra_with_indentation_priming == extra_indent_priming)].copy()
        df['solution'] = df['problem'].apply(lambda problem: problem_solutions[problem])

        # format code with autopep8 so that we can compare it without worrying about whitespace within lines
        df['pep8_codex_output'] = df['codex_output'].apply(autopep8.fix_code).apply(remove_empty_lines)
        df['pep8_solution'] = df['solution'].apply(autopep8.fix_code).apply(remove_empty_lines)
        df['pep8_output_matches_solution'] = df['pep8_codex_output'] == df['pep8_solution']

        codex_lines = df['pep8_codex_output'].apply(extract_lines)
        correct_lines = df['pep8_solution'].apply(extract_lines)
        df['pep8_unwanted_modification'] = (codex_lines - correct_lines).apply(len) > 0

        # output matches solution
        oms = df.pep8_output_matches_solution
        print(f'V{i+1} percentage of outputs matching solution: {oms.sum()}/{oms.count()} = {oms.mean():.5f}')

        # unwanted modification (altered or added lines)
        um = df.pep8_unwanted_modification
        print(f'V{i+1} percentage of unwanted modifications: {um.sum()}/{um.count()} = {um.mean():.5f}')
        print()

        df['pep8_altered_or_added_lines'] = codex_lines - correct_lines
        pep8_sdf = pd.concat([pep8_sdf, df])
        

V1 percentage of outputs matching solution: 105/240 = 0.43750
V1 percentage of unwanted modifications: 15/240 = 0.06250

V2 percentage of outputs matching solution: 101/240 = 0.42083
V2 percentage of unwanted modifications: 26/240 = 0.10833

V3 percentage of outputs matching solution: 106/240 = 0.44167
V3 percentage of unwanted modifications: 11/240 = 0.04583

V4 percentage of outputs matching solution: 114/240 = 0.47500
V4 percentage of unwanted modifications: 15/240 = 0.06250

V5 percentage of outputs matching solution: 88/240 = 0.36667
V5 percentage of unwanted modifications: 8/240 = 0.03333

V6 percentage of outputs matching solution: 56/240 = 0.23333
V6 percentage of unwanted modifications: 14/240 = 0.05833

V7 percentage of outputs matching solution: 87/240 = 0.36250
V7 percentage of unwanted modifications: 9/240 = 0.03750

V8 percentage of outputs matching solution: 109/240 = 0.45417
V8 percentage of unwanted modifications: 23/240 = 0.09583

V9 percentage of outputs matching sol

In [17]:
plot_df = pep8_sdf[pep8_sdf.extra_with_indentation_priming == True]

def remove_indentation(code: str):
    lines = code.split('\n')
    return '\n'.join([x.strip() for x in lines])

indentation_ignored_match = plot_df['pep8_solution'].apply(remove_indentation) == plot_df['pep8_codex_output'].apply(remove_indentation)
plot_df['pep8_indentation_ignored_match'] = indentation_ignored_match

_df = plot_df.groupby('priming')[['pep8_output_matches_solution', 'pep8_unwanted_modification']].mean().reset_index()
_df

print('Output deviates from setup')
_df: pd.DataFrame = plot_df.groupby(['priming', 'problem'])['pep8_unwanted_modification'].sum().reset_index()
_df = _df.pivot(index='problem', columns='priming', values='pep8_unwanted_modification')[sorted(_df.priming.unique())]
# add column and row for totals
_df['total'] = _df.sum(axis=1)
_df.loc['total'] = _df.sum(axis=0)
_df.sort_values('total', ascending=True, inplace=True)
_df.sort_values(by=_df.index[-1], axis=1, inplace=True)
_df.style.to_latex('plots-and-tables/deviation.tex', hrules=True, caption='Number of cases where output deviated from the setup (included a formatted line not present in the formatted scrambled set)', label='tab:pep8-deviation')
display(_df)

print('Output does not match the solution')
_df: pd.DataFrame = plot_df.groupby(['priming', 'problem'])['pep8_output_matches_solution'].agg(lambda x: x.apply(lambda x: not x).sum()).reset_index()
_df = _df.pivot(index='problem', columns='priming', values='pep8_output_matches_solution')[sorted(_df.priming.unique())]
# add column and row for totals
_df['total'] = _df.sum(axis=1)
_df.loc['total'] = _df.sum(axis=0)
_df.sort_values('total', ascending=True, inplace=True)
_df.sort_values(by=_df.index[-1], axis=1, inplace=True)
_df.style.to_latex('plots-and-tables/matches.tex', hrules=True, caption='Number of cases where formatted output does NOT match the formatted solution line by line.', label='tab:pep8-matches')
display(_df)

print('Output does NOT match the solution (indentation ignored)')
c = 'pep8_indentation_ignored_match'
_df: pd.DataFrame = plot_df.groupby(['priming', 'problem'])[c].agg(lambda x: x.apply(lambda x: not x).sum()).reset_index()
_df = _df.pivot(index='problem', columns='priming', values=c)[sorted(_df.priming.unique())]
# add column and row for totals
_df['total'] = _df.sum(axis=1)
_df.loc['total'] = _df.sum(axis=0)
_df.sort_values('total', ascending=True, inplace=True)
_df.sort_values(by=_df.index[-1], axis=1, inplace=True)
_df.style.to_latex('plots-and-tables/indent-ignored-matches.tex', hrules=True, caption='Number of cases where formatted output does NOT match the formatted solution line by line (discounting incorrent indentation).', label='tab:pep8-indent-ignored-matches')
_df

Output deviates from setup


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plot_df['pep8_indentation_ignored_match'] = indentation_ignored_match


priming,Reorder the lines,Reorder and indent the lines if needed,Put the lines in order to solve the problem,Reorder and indent the lines,Sequence the lines correctly,Produce the right answer,Unnamed: 7_level_0,total
problem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ericson2017figure1,0,0,0,0,0,0,0,0
Ericson2022figure3,0,0,0,0,0,0,0,0
Ericson2022figure4,0,0,0,0,0,0,0,0
Haynes_Magyar2022figure2,0,0,0,0,0,0,0,0
Haynes_Magyar2022figure4,0,0,0,0,0,0,0,0
Hou2022figure2,0,0,0,0,0,0,0,0
Ericson2022figure8,0,0,0,0,1,1,0,2
Karavirta2012Figure3,0,0,1,1,0,0,0,2
Weinmann2021figure1,0,0,0,0,0,3,4,7
Ericson2022figure2,1,2,2,1,2,2,1,11


Output does not match the solution


priming,Reorder and indent the lines,Unnamed: 2_level_0,Reorder the lines,Reorder and indent the lines if needed,Sequence the lines correctly,Put the lines in order to solve the problem,Produce the right answer,total
problem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ericson2022figure3,0,0,0,0,0,0,0,0
Weinmann2021figure1,3,9,4,3,3,3,7,32
Haynes_Magyar2022figure2,1,1,6,4,10,10,11,43
Ericson2022figure2,9,3,10,10,11,13,12,68
Hou2022figure2,5,9,9,15,12,15,15,80
Haynes_Magyar2022figure4,12,10,14,14,14,14,15,93
Ericson2022figure4,14,13,16,15,13,14,14,99
Ericson2022figure8,14,15,15,15,15,15,15,104
Ericson2017figure4,17,17,18,17,18,17,20,124
Ericson2018figure5,19,18,20,20,19,18,20,134


Output does NOT match the solution (indentation ignored)


priming,Reorder and indent the lines,Reorder and indent the lines if needed,Reorder the lines,Put the lines in order to solve the problem,Sequence the lines correctly,Unnamed: 6_level_0,Produce the right answer,total
problem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ericson2017figure1,0,0,0,0,0,0,0,0
Ericson2022figure3,0,0,0,0,0,0,0,0
Haynes_Magyar2022figure2,0,0,0,0,0,0,0,0
Hou2022figure2,1,1,1,1,1,0,1,6
Ericson2022figure2,1,2,1,3,2,1,2,12
Haynes_Magyar2022figure4,0,0,0,3,7,4,10,24
Weinmann2021figure1,3,3,3,3,3,9,7,31
Ericson2022figure4,14,15,16,14,12,12,11,94
Karavirta2012Figure3,14,12,12,16,13,15,15,97
Ericson2022figure8,14,15,15,15,15,15,15,104


Inspect altered lines

In [18]:
df = pep8_sdf
row = df[df.pep8_altered_or_added_lines.apply(len) > 0].iloc[0]

print('#Codex output:')
print(row['pep8_codex_output'])
# print('#Prompt:')
# print(row.prompt)
print('#Solution:')
print(row['pep8_solution'])
print('#Altered or added lines:')
print(row.pep8_altered_or_added_lines)

#Codex output:
def check_guess(guess, target):
    if guess < target:
        return "too low"
    elif guess == target:
        return 'correct'
    else:
        return 'too high'
#Solution:
def check_guess(guess, target):
    if guess < target:
        return 'too low'
    elif guess == target:
        return 'correct'
    else:
        return 'too high'
#Altered or added lines:
{'return "too low"'}


Compare solutions and outputs

In [19]:
problem_i = 0
output_i = -1
row = df.iloc[output_i]
print(row.problem)
print('#Codex output:')
print(row.codex_output)
print('#Solution:')
print(problem_solutions[row.problem])

Ericson2017figure4
#Codex output:
def avgValuesInRange(numList, start, end):
if (end - start + 1) >= 1:
sum = 0
for index in range(start,end+1):
value = numList[index]
sum = sum + value
return sum / (end - start + 1)
return 0

#Solution:
def avgValuesInRange(numList, start, end):
	sum = 0
	for index in range(start,end+1):
		value = numList[index]
		sum = sum + value
	if (end - start + 1) >= 1:
		return sum / (end - start + 1)
	return 0



Quick test to verify Codex output correctness

In [20]:
def avgValuesInRange2(numList, start, end):
	sum = 0
	for index in range(start,end-1):
		value = numList[index]
		sum = sum + value
	if (end - start + 1) >= 1:
		return sum / (end - start + 1)
	return 0

def avgValuesInRange(numList, start, end):
    if (end - start + 1) >= 1:
        sum = 0
        for index in range(start,end-1):
            value = numList[index]
            sum = sum + value
        return sum / (end - start + 1)
    return 0

print(avgValuesInRange([1,2,3,6,5,6,7,8,9,10], 2, 5))
avgValuesInRange2([1,2,3,6,5,6,7,8,9,10], 2, 5)

2.25


2.25