# Start from here

In [17]:
import re
import json
from datasets import load_dataset
from pathlib import Path

In [18]:
dataset = load_dataset("openai/gsm8k", "main")

In [19]:
train_data = dataset["train"]

In [20]:
example = train_data[0]
print(example["question"])
print(example["answer"])


Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72


In [21]:
train_data

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

## Error in the final answer only

In [22]:
question = example["question"]
original_answer = example["answer"]

# Split into lines for L1, L2, ...
lines = original_answer.strip().splitlines()

# Find the line that contains '#### <number>'
for i, line in enumerate(lines):
    match = re.match(r"####\s*(-?\d+)", line.strip())
    if match:
        correct = int(match.group(1))
        flawed = correct + 1
        error_line_index = i
        break
else:
    print("❌ No final answer line found.")
    exit()

# Create modified lines
flawed_lines = lines.copy()
flawed_lines[error_line_index] = re.sub(r"####\s*\d+", f"#### {flawed}", lines[error_line_index])

# Recombine into modified answer
flawed_answer = "\n".join(flawed_lines)

# Compute error location as "L<N>"
error_location = f"L{error_line_index + 1}"

# Create label
label = {
    "verdict": "Flawed",
    "error_details": {
        "error_type": "computational_error",
        "erroneous_line_number": error_location,
        "explanation": f"The final answer is too high by 1. It should be {correct}, not {flawed}.",
        "error_in_text": lines[error_line_index],
        "correction_in_text": f"#### {correct}"
    }
}

# Output
print("📌 QUESTION:\n", question)
print("\n✅ ORIGINAL ANSWER:\n", original_answer)
print("\n❌ FLAWED ANSWER:\n", flawed_answer)
print("\n🧾 LABEL:\n", label)


📌 QUESTION:
 Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

✅ ORIGINAL ANSWER:
 Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

❌ FLAWED ANSWER:
 Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 73

🧾 LABEL:
 {'verdict': 'Flawed', 'error_details': {'error_type': 'computational_error', 'erroneous_line_number': 'L3', 'explanation': 'The final answer is too high by 1. It should be 72, not 73.', 'error_in_text': '#### 72', 'correction_in_text': '#### 72'}}


In [23]:
# Output path
output_path = Path("gsm8k_train_flawed_plus1_final_answer.jsonl")

with output_path.open("w", encoding="utf-8") as f_out:
    for i, ex in enumerate(dataset["train"]):
        q = ex["question"]
        a = ex["answer"]

        # Split into lines for L1, L2, ...
        lines = a.strip().splitlines()

        # Find the line with the final answer
        for idx, line in enumerate(lines):
            match = re.match(r"####\s*(-?\d+)", line.strip())
            if match:
                correct = int(match.group(1))
                flawed = correct + 1
                error_line_index = idx
                break
        else:
            print(i, "❌ No final answer line found.\n")
            print(a)
            continue  # skip if no final answer found

        # Copy and modify lines
        flawed_lines = lines.copy()
        flawed_lines[error_line_index] = re.sub(r"####\s*\d+", f"#### {flawed}", lines[error_line_index]) #f"#### {flawed}"

        # # Replace the final display number in any inline computation
        # flawed_lines = [
        #     re.sub(r"(>>)" + str(correct) + r"\b", r"\1" + str(flawed), line)
        #     for line in flawed_lines
        # ]

        flawed_answer = "\n".join(flawed_lines)

        error_location = f"L{error_line_index + 1}"

        # Create label
        label = {
            "verdict": "Flawed",
            "error_details": {
                "error_type": "computational_error",
                "erroneous_line_number": f"L{error_line_index + 1}",
                "explanation": f"The final answer is too high by 1. It should be {correct}, not {flawed}.",
                "error_in_text": lines[error_line_index],
                "correction_in_text": f"#### {correct}"
            }
        }

        # Final example
        entry = {
            "id": i,
            "question": q,
            "flawed_answer": flawed_answer,
            "label": label
        }
        f_out.write(json.dumps(entry) + "\n")
        
print(f"✅ Finished writing {output_path.name}")

✅ Finished writing gsm8k_train_flawed_plus1_final_answer.jsonl


In [24]:
with open("gsm8k_train_flawed_plus1_final_answer.jsonl") as f:
    print(len(f.readlines()))  # Should be >0


7473


In [25]:
# Output path
output_path = Path("gsm8k_test_flawed_plus1_final_answer.jsonl")

with output_path.open("w", encoding="utf-8") as f_out:
    for i, ex in enumerate(dataset["test"]):
        q = ex["question"]
        a = ex["answer"]

        # Split into lines for L1, L2, ...
        lines = a.strip().splitlines()

        # Find the line with the final answer
        for idx, line in enumerate(lines):
            match = re.match(r"####\s*(-?\d+)", line.strip())
            if match:
                correct = int(match.group(1))
                flawed = correct + 1
                error_line_index = idx
                break
        else:
            print(i, "❌ No final answer line found.\n")
            print(a)
            continue  # skip if no final answer found

        # Copy and modify lines
        flawed_lines = lines.copy()
        flawed_lines[error_line_index] = re.sub(r"####\s*\d+", f"#### {flawed}", lines[error_line_index]) #f"#### {flawed}"

        # # Replace the final display number in any inline computation
        # flawed_lines = [
        #     re.sub(r"(>>)" + str(correct) + r"\b", r"\1" + str(flawed), line)
        #     for line in flawed_lines
        # ]

        flawed_answer = "\n".join(flawed_lines)

        error_location = f"L{error_line_index + 1}"

        # Create label
        label = {
            "verdict": "Flawed",
            "error_details": {
                "error_type": "computational_error",
                "erroneous_line_number": f"L{error_line_index + 1}",
                "explanation": f"The final answer is too high by 1. It should be {correct}, not {flawed}.",
                "error_in_text": lines[error_line_index],
                "correction_in_text": f"#### {correct}"
            }
        }

        # Final example
        entry = {
            "id": i,
            "question": q,
            "flawed_answer": flawed_answer,
            "label": label
        }
        f_out.write(json.dumps(entry) + "\n")

print(f"✅ Finished writing {output_path.name}")

✅ Finished writing gsm8k_test_flawed_plus1_final_answer.jsonl


## Error in the second last line

In [26]:
question = example["question"]
original_answer = example["answer"]

# Split into lines for L1, L2, ...
lines = original_answer.strip().splitlines()

# Find the line that contains '#### <number>'
for i, line in enumerate(lines):
    match = re.match(r"####\s*(-?\d+)", line.strip())
    if match:
        correct = int(match.group(1))
        flawed = correct + 1
        error_line_index = i
        break
else:
    print("❌ No final answer line found.")
    exit()

# Create modified lines
flawed_lines = lines.copy()
flawed_lines[error_line_index] = re.sub(r"####\s*\d+", f"#### {flawed}", lines[error_line_index])

# Replace only the number after the final '>>' (the human-readable result)
flawed_lines = [
    re.sub(str(correct) + r"(>>)" + str(correct) + r"\b", str(flawed) + r">>" + str(flawed), line)
    for line in flawed_lines
]

# Recombine into modified answer
flawed_answer = "\n".join(flawed_lines)

# Compute error location as "L<N>"
error_location = f"L{error_line_index}"

# Create label
label = {
    "verdict": "Flawed",
    "error_details": {
        "error_type": "computational_error",
        "erroneous_line_number": error_location,
        "explanation": f"The answer is too high by 1. It should be {correct}, not {flawed}.",
        "error_in_text": lines[error_line_index],
        "correction_in_text": f"#### {correct}"
    }
}

# Output
print("📌 QUESTION:\n", question)
print("\n✅ ORIGINAL ANSWER:\n", original_answer)
print("\n❌ FLAWED ANSWER:\n", flawed_answer)
print("\n🧾 LABEL:\n", label)


📌 QUESTION:
 Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

✅ ORIGINAL ANSWER:
 Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

❌ FLAWED ANSWER:
 Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=73>>73 clips altogether in April and May.
#### 73

🧾 LABEL:
 {'verdict': 'Flawed', 'error_details': {'error_type': 'computational_error', 'erroneous_line_number': 'L2', 'explanation': 'The answer is too high by 1. It should be 72, not 73.', 'error_in_text': '#### 72', 'correction_in_text': '#### 72'}}


In [27]:
# Output path
output_path = Path("gsm8k_train_flawed_plus1_2nd_last.jsonl")

with output_path.open("w", encoding="utf-8") as f_out:
    for i, ex in enumerate(dataset["train"]):
        q = ex["question"]
        a = ex["answer"]

        # Split into lines for L1, L2, ...
        lines = a.strip().splitlines()

        # Find the line with the final answer
        for idx, line in enumerate(lines):
            match = re.match(r"####\s*(-?\d+)", line.strip())
            if match:
                correct = int(match.group(1))
                flawed = correct + 1
                error_line_index = idx
                break
        else:
            print(i, "❌ No final answer line found.\n")
            print(a)
            continue  # skip if no final answer found

        # Copy and modify lines
        flawed_lines = lines.copy()
        flawed_lines[error_line_index] = re.sub(r"####\s*\d+", f"#### {flawed}", lines[error_line_index]) #f"#### {flawed}"

        # Replace only the number after the final '>>' (the human-readable result)
        flawed_lines = [
            re.sub(str(correct) + r"(>>)" + str(correct) + r"\b", str(flawed) + r">>" + str(flawed), line)
            for line in flawed_lines
        ]

        flawed_answer = "\n".join(flawed_lines)

        error_location = f"L{error_line_index}"

        # Create label
        label = {
            "verdict": "Flawed",
            "error_details": {
                "error_type": "computational_error",
                "erroneous_line_number": f"L{error_line_index}",
                "explanation": f"The 2n last line value is too high by 1. It should be {correct}, not {flawed}.",
                "error_in_text": lines[error_line_index],
                "correction_in_text": f"#### {correct}"
            }
        }

        # Final example
        entry = {
            "id": i,
            "question": q,
            "flawed_answer": flawed_answer,
            "label": label
        }
        f_out.write(json.dumps(entry) + "\n")

print(f"✅ Finished writing {output_path.name}")

✅ Finished writing gsm8k_train_flawed_plus1_2nd_last.jsonl


In [28]:
# Output path
output_path = Path("gsm8k_test_flawed_plus1_2nd_last.jsonl")

with output_path.open("w", encoding="utf-8") as f_out:
    for i, ex in enumerate(dataset["test"]):
        q = ex["question"]
        a = ex["answer"]

        # Split into lines for L1, L2, ...
        lines = a.strip().splitlines()

        # Find the line with the final answer
        for idx, line in enumerate(lines):
            match = re.match(r"####\s*(-?\d+)", line.strip())
            if match:
                correct = int(match.group(1))
                flawed = correct + 1
                error_line_index = idx
                break
        else:
            print(i, "❌ No final answer line found.\n")
            print(a)
            continue  # skip if no final answer found

        # Copy and modify lines
        flawed_lines = lines.copy()
        flawed_lines[error_line_index] = re.sub(r"####\s*\d+", f"#### {flawed}", lines[error_line_index]) #f"#### {flawed}"

        # Replace only the number after the final '>>' (the human-readable result)
        flawed_lines = [
            re.sub(str(correct) + r"(>>)" + str(correct) + r"\b", str(flawed) + r">>" + str(flawed), line)
            for line in flawed_lines
        ]

        flawed_answer = "\n".join(flawed_lines)

        error_location = f"L{error_line_index}"

        # Create label
        label = {
            "verdict": "Flawed",
            "error_details": {
                "error_type": "computational_error",
                "erroneous_line_number": f"L{error_line_index}",
                "explanation": f"The 2n last line value is too high by 1. It should be {correct}, not {flawed}.",
                "error_in_text": lines[error_line_index],
                "correction_in_text": f"#### {correct}"
            }
        }

        # Final example
        entry = {
            "id": i,
            "question": q,
            "flawed_answer": flawed_answer,
            "label": label
        }
        f_out.write(json.dumps(entry) + "\n")

print(f"✅ Finished writing {output_path.name}")

✅ Finished writing gsm8k_test_flawed_plus1_2nd_last.jsonl


## Combine data

In [29]:
with open("gsm8k_train_flawed_plus1_final_answer.jsonl", "r") as f:
    lines = f.readlines()
    print(f"Total lines: {len(lines)}")
    print("First line:", lines[0] if lines else "No data")


Total lines: 7473
First line: {"id": 0, "question": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?", "flawed_answer": "Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 73", "label": {"verdict": "Flawed", "error_details": {"error_type": "computational_error", "erroneous_line_number": "L3", "explanation": "The final answer is too high by 1. It should be 72, not 73.", "error_in_text": "#### 72", "correction_in_text": "#### 72"}}}



In [30]:
original_dataset = load_dataset("openai/gsm8k", "main")

def load_jsonl(path):
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]

flawed_train_final_answer = load_jsonl("gsm8k_train_flawed_plus1_final_answer.jsonl")
flawed_test = load_jsonl("gsm8k_test_flawed_plus1_final_answer.jsonl")

combined_train_final_answer = []

# Add correct examples from original dataset
for ex in original_dataset["train"]:
    combined_train_final_answer.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples
for ex in flawed_train_final_answer:
    combined_train_final_answer.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined training set size: {len(combined_train_final_answer)}")

combined_test_final_answer = []

# Add correct examples from original test split
for ex in original_dataset["test"]:
    combined_test_final_answer.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples from test JSONL
for ex in flawed_test:
    combined_test_final_answer.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined test set size: {len(combined_test_final_answer)}")


Combined training set size: 14946
Combined test set size: 2638


In [31]:
original_dataset = load_dataset("openai/gsm8k", "main")

def load_jsonl(path):
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]

flawed_train_2nd_last = load_jsonl("gsm8k_train_flawed_plus1_2nd_last.jsonl")
flawed_test = load_jsonl("gsm8k_test_flawed_plus1_2nd_last.jsonl")

combined_train_2nd_last = []

# Add correct examples from original dataset
for ex in original_dataset["train"]:
    combined_train_2nd_last.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples
for ex in flawed_train_2nd_last:
    combined_train_2nd_last.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined training set size: {len(combined_train_2nd_last)}")

combined_test_2nd_last = []

# Add correct examples from original test split
for ex in original_dataset["test"]:
    combined_test_2nd_last.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples from test JSONL
for ex in flawed_test:
    combined_test_2nd_last.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined test set size: {len(combined_test_2nd_last)}")


Combined training set size: 14946
Combined test set size: 2638


In [32]:
import json

# Save combined training set
with open("combined_train_final_answer.json", "w") as train_file:
    json.dump(combined_train_final_answer, train_file, indent=4)

# Save combined test set
with open("combined_test_final_answer.json", "w") as test_file:
    json.dump(combined_test_final_answer, test_file, indent=4)

with open("combined_train_2nd_last.json", "w") as train_file:
    json.dump(combined_train_2nd_last, train_file, indent=4)

# Save combined test set
with open("combined_test_2nd_last.json", "w") as test_file:
    json.dump(combined_test_2nd_last, test_file, indent=4)

print("Combined datasets saved successfully!")

Combined datasets saved successfully!
