In [30]:
import pandas as pd

# Set max column width to None (no truncation)
pd.set_option('display.max_colwidth', None)

# Optionally, set display width to wrap columns (for wide DataFrames)
pd.set_option('display.width', 0)

In [31]:
base_df = pd.read_csv("../data/base-datasets-sanitized/base_4N_dataset_sanitized.csv")
base_df.dropna(subset=["erroneous_line_number"], inplace=True) # keep only rows with wrong answers
base_df["erroneous_line_number"] = base_df["erroneous_line_number"].astype(str)

In [32]:
def check_lines(df):
    temp = df[df["erroneous_line_number"] != "FA"]
    problematic_samples = []
    for index, row in temp.iterrows():
        correct_lines = row["correct_answer"].split("\n")
        wrong_lines = row["wrong_answer"].split("\n")
        if len(correct_lines) != len(wrong_lines):
            problematic_samples.append(row)
    return pd.DataFrame(problematic_samples)

check = check_lines(base_df)
problematic_indices = check["index"].tolist()
base_df = base_df[~base_df["index"].isin(problematic_indices)]

In [33]:
non_FA_df = base_df[base_df["erroneous_line_number"] != "FA"]
FA_df = base_df[base_df["erroneous_line_number"] == "FA"]
non_FA_df.loc[:, "eln"] = non_FA_df.apply(lambda row: int(row["erroneous_line_number"][1]) - 1, axis=1)
FA_df.loc[:, "eln"] = FA_df.apply(lambda row: int(row["solution_length"]) - 1, axis=1)

# merge the two
merged_df = pd.concat([non_FA_df, FA_df], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_FA_df.loc[:, "eln"] = non_FA_df.apply(lambda row: int(row["erroneous_line_number"][1]) - 1, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FA_df.loc[:, "eln"] = FA_df.apply(lambda row: int(row["solution_length"]) - 1, axis=1)


In [39]:
merged_df["eln"].value_counts()

eln
0    1517
1     912
2     575
3     308
4     122
5      53
6      19
7       7
8       2
Name: count, dtype: int64

In [40]:
merged_df["solution_length"].value_counts()

solution_length
4     993
3     891
5     764
6     459
7     185
8     100
2      87
9      32
10      4
Name: count, dtype: int64

In [41]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,index,tier,question,correct_answer,wrong_answer,error_type,erroneous_line_number,explanation,error_subtype,source,solution_length,relative_line_position,eln
0,0,1,tier4,"Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?","Weng earns 12/60 = $0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $10.\n#### 10","Weng earns 12/60 = $0.2 per minute.\nWorking 50 minutes, she earned 50 x 50 = $2500.\n#### 2500",conceptual_error,L2,Incorrect operand. The variable 'minutes_worked' (value: 50) was used instead of 'rate_per_minute' (value: 0.2).,incorrect_operand,programmatic,3,0.5,1
1,2,2,tier3,"Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?","In the beginning, Betty has only 100 / 2 = $50.\nBetty's grandparents gave her 15 * 2 = $30.\nThis means, Betty needs 100 - 50 - 30 - 15 = $5 more.\n#### 5","In the beginning, Betty has only 100 / 2 = $50.\nBetty's grandparents gave her 15 * 2 = $30.\nThis means, Betty needs 100 - 50 - 30 = $20 more.\n#### 20",conceptual_error,L3,forgot to consider the money Betty's parents gave her.,,manual,4,0.666667,2
2,4,3,tier3,"Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?","Maila read 12 x 2 = 24 pages today.\nSo she was able to read a total of 12 + 24 = 36 pages since yesterday.\nThere are 120 - 36 = 84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = 42 pages.\n#### 42","Maila read 12 x 2 = 42 pages today.\nSo she was able to read a total of 12 + 42 = 54 pages since yesterday.\nThere are 120 - 54 = 66 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 66/2 = 33 pages.\n#### 33",computational_error,L1,"The result of this computation should be 24, not 42. It appears two adjacent digits were swapped.",generate_digit_transposition_error,programmatic,5,0.0,0
3,6,4,tier1,James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?,He writes each friend 3*2=6 pages a week\nSo he writes 6*2=12 pages every week\nThat means he writes 12*52=624 pages a year\n#### 624,He writes each friend 3*2=6 pages a week\nSo he writes 6*2=12 pages every week\nThat means he writes 12*48=576 pages a year\n#### 576,conceptual_error,L3,each year has 52 weeks not 48.,,manual,4,0.666667,2
4,8,6,tier1,"Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?",He eats 32 from the largest pizzas because 2 x 16 = 32\nHe eats 16 from the small pizza because 2 x 8 = 16\nHe eats 48 pieces because 32 + 16 = 48\n#### 48,He eats 32 from the largest pizzas because 2 x 16 = 32\nHe eats 24 from the small pizza because 3 x 8 = 24\nHe eats 56 pieces because 32 + 24 = 56\n#### 56,conceptual_error,L2,he ate 2 small pizzas not 3.,,manual,4,0.333333,1


In [42]:
merged_df["error_type"].value_counts()

error_type
computational_error    1814
conceptual_error       1701
Name: count, dtype: int64

In [43]:
merged_df.columns

Index(['Unnamed: 0', 'index', 'tier', 'question', 'correct_answer',
       'wrong_answer', 'error_type', 'erroneous_line_number', 'explanation',
       'error_subtype', 'source', 'solution_length', 'relative_line_position',
       'eln'],
      dtype='object')

In [44]:
flawed_df = pd.DataFrame({
    "index": merged_df["index"],
    "tier": merged_df["tier"],
    "question": merged_df["question"],
    "answer": merged_df["wrong_answer"],
    "error_type": merged_df["error_type"],
    "explanation": merged_df["explanation"],
    "solution_length": merged_df["solution_length"],
    "eln": merged_df["eln"]
})

correct_df = pd.DataFrame({
    "index": merged_df["index"],
    "tier": merged_df["tier"],
    "question": merged_df["question"],
    "answer": merged_df["correct_answer"],
    "error_type": "correct",
    "explanation": None,
    "solution_length": merged_df["correct_answer"].apply(lambda x: len(str(x).split("\n"))),
    "eln": -1  # -1 indicates no erroneous line number for correct answers
})

In [45]:
final_df = pd.concat([flawed_df, correct_df], ignore_index=True)

In [46]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7030 entries, 0 to 7029
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            7030 non-null   int64 
 1   tier             7030 non-null   object
 2   question         7030 non-null   object
 3   answer           7030 non-null   object
 4   error_type       7030 non-null   object
 5   explanation      3515 non-null   object
 6   solution_length  7030 non-null   int64 
 7   eln              7030 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 439.5+ KB


In [47]:
final_df["error_type"].value_counts()

error_type
correct                3515
computational_error    1814
conceptual_error       1701
Name: count, dtype: int64

In [48]:
final_df.to_csv("../data/base-datasets-sanitized/line_classification.csv", index=False)
final_df.sample(20).to_csv("../data/base-datasets-sanitized/line_classification_sample.csv", index=False)

In [50]:
final_df["solution_length"].value_counts()

solution_length
4     2017
3     1747
5     1585
6      932
7      384
8      201
2       87
9       69
10       8
Name: count, dtype: int64

In [34]:
def check_error_lines(df):
    correct_mismatch = []
    wrong_mismatch = []
    for index, row in df.iterrows():
        correct_lines = row["correct_answer"].split("\n")
        wrong_lines = row["wrong_answer"].split("\n")
        eln = row["eln"]
        for i in range(eln):
            if correct_lines[i] != wrong_lines[i]:
                correct_mismatch.append(row)
                break

        if correct_lines[eln] == wrong_lines[eln]:
            wrong_mismatch.append(row)
            continue

    return {"correct_mismatch": pd.DataFrame(correct_mismatch), 
            "wrong_mismatch": pd.DataFrame(wrong_mismatch)}


In [35]:
mismatches = check_error_lines(non_FA_df)
print("Correct line mismatches:", len(mismatches['correct_mismatch']))
print("Wrong line mismatches:", len(mismatches['wrong_mismatch']))

Correct line mismatches: 228
Wrong line mismatches: 25


In [36]:
mismatch_indices = mismatches['correct_mismatch']["index"].tolist() + mismatches['wrong_mismatch']["index"].tolist()
non_FA_df = non_FA_df[~non_FA_df["index"].isin(mismatch_indices)]

In [37]:
# def make_partial_samples_non_FA(row):
#     """
#     For a given row, generate:
#     1. All correct partials (label 0)
#     2. All wrong partials starting from the erroneous line (label 1)
#     """
#     question = row["question"]
#     correct_lines = row["correct_answer"].split("\n")
#     wrong_lines = row["wrong_answer"].split("\n")
#     eln = row["eln"]

#     partials = []

#     # All correct partials (label 0)
#     for i in range(len(correct_lines)):
#         answer = "\n".join(correct_lines[:i+1])  # Include the i-th line
#         partials.append({
#             "index": row["index"],
#             "question": question,
#             "answer": answer,
#             "last_line": i,
#             "original_eln": eln,
#             "label": 0
#         })

#     # Wrong partials from the erroneous line onwards (label 1)
#     for i in range(eln, len(wrong_lines)):
#         answer = "\n".join(wrong_lines[:i+1])
#         partials.append({
#             "index": row["index"],
#             "question": question,
#             "answer": answer,
#             "last_line": i,
#             "original_eln": eln,
#             "label": 1
#         })

#     return pd.DataFrame(partials)

In [38]:
# Apply the function to each row and collect the resulting DataFrames in a list
partials_list = [make_partial_samples_non_FA(row) for _, row in non_FA_df.iterrows()]

# Concatenate all the DataFrames into a single DataFrame
all_partials_df = pd.concat(partials_list, ignore_index=True)

NameError: name 'make_partial_samples_non_FA' is not defined

In [None]:
all_partials_df.describe()

Unnamed: 0,index,last_line,original_eln,label
count,24482.0,24482.0,24482.0,24482.0
mean,3360.958582,2.125439,0.940119,0.442325
std,2220.359118,1.650399,1.292078,0.496673
min,1.0,0.0,0.0,0.0
25%,1299.25,1.0,0.0,0.0
50%,3178.0,2.0,0.0,0.0
75%,5264.0,3.0,2.0,1.0
max,7471.0,9.0,8.0,1.0


In [None]:
# all_partials_df.to_csv("../data/base-datasets-sanitized/partials.csv", index=False)

In [None]:
# sample_partials = all_partials_df.head(11)

In [None]:
# sample_partials.to_csv("../data/base-datasets-sanitized/sample_partials.csv", index=False)