# Multiplication

### (0 to 99,999) times 1-digit multiplication

In [None]:
import random

def make_dataset(
    train_size=(100, 300, 600, 1000, 8000), # total: 10,000
    test_size=10000,
    validation_size=10000,
    seed=42,
    train_path="training.txt",
    test_path="testing.txt",
    validation_path="validation.txt"
):
    """Write disjoint train/test/validation files of ``a*b`` problems.

    ``a`` ranges over every integer with at most ``len(train_size)`` digits
    (0 to 99,999 for the default five buckets) and ``b`` over 0..9.

    Args:
        train_size: One entry per digit length of ``a`` (1-digit first).
            Every 1-digit pair is always used, so ``train_size[0]`` must be
            100 for the size check to pass; for the other buckets the entry
            is the number of pairs sampled from that bucket.
        test_size: Number of pairs sampled from the pairs not used for training.
        validation_size: Number of pairs sampled from the pairs used for
            neither training nor testing.
        seed: Seed passed to ``random.seed`` so the splits are reproducible.
        train_path: Output file for training lines, ``"a*b=product$"``.
        test_path: Output file for test prompts, ``"a*b="`` (no answer).
        validation_path: Output file for validation lines, ``"a*b=product$"``.

    Raises:
        ValueError: If ``train_size`` is empty, or a requested sample is
            larger than the pool it is drawn from.
        AssertionError: If the assembled training set does not contain
            ``sum(train_size)`` pairs.
    """
    if not train_size:
        raise ValueError("train_size must contain at least one bucket size")

    random.seed(seed)

    # 1) Build one bucket of (a, b) pairs per digit length of 'a', b in [0, 9].
    #    Bucket k holds the k-digit values of 'a'; 0 counts as a 1-digit number.
    num_digits = len(train_size)
    buckets = []
    for digits in range(1, num_digits + 1):
        low = 0 if digits == 1 else 10 ** (digits - 1)
        buckets.append(
            [(a, b) for a in range(low, 10 ** digits) for b in range(10)]
        )
    all_pairs = [pair for bucket in buckets for pair in bucket]

    # 2) Assemble the training set: every 1-digit pair, plus a random sample
    #    of the requested size from each longer bucket (shortest bucket first,
    #    which fixes the order of the RNG calls).
    selected = list(buckets[0])
    for bucket, size in zip(buckets[1:], train_size[1:]):
        selected.extend(random.sample(bucket, size))

    training = list(set(selected))
    assert len(training) == sum(train_size), "Training size mismatch!"

    # 3) Test and validation are drawn from what is left, so the three
    #    splits never share a pair.
    remaining = list(set(all_pairs) - set(training))
    testing   = random.sample(remaining, test_size)
    remaining_for_validation = list(set(remaining) - set(testing))
    validation = random.sample(remaining_for_validation, validation_size)

    # 4) Shuffle before writing (validation is already in sampled order)
    random.shuffle(training)
    random.shuffle(testing)

    # 5) Write out to the given paths
    with open(train_path, "w") as f_tr:
        for a, b in training:
            f_tr.write(f"{a}*{b}={a*b}$\n")

    with open(test_path, "w") as f_te:
        for a, b in testing:
            f_te.write(f"{a}*{b}=\n")

    # Honour the caller's validation_path instead of a fixed file name.
    with open(validation_path, "w") as f_val:
        for a, b in validation:
            f_val.write(f"{a}*{b}={a*b}$\n")

    print(f"Wrote {len(training)} shuffled lines (with answers) to '{train_path}'")
    print(f"Wrote {len(testing)} shuffled lines (no answers) to '{test_path}'")
    print(f"Wrote {len(validation)} shuffled lines (with answers) to '{validation_path}'")

if __name__ == "__main__":
    # Script entry point: build the (0 to 99,999) x 1-digit splits with the
    # default sizes, overriding only the seed and the three output file names.
    make_dataset(
        seed=42,
        train_path="0_to_99,999_times_1_digit_bal_train.txt",
        test_path="0_to_99,999_times_1_digit_bal_test.txt",
        validation_path="0_to_99,999_times_1_digit_bal_validation.txt"
    )


Wrote 10000 shuffled lines (with answers) to '0_to_99,999_times_1_digit_bal_train.txt'
Wrote 10000 shuffled lines (no answers) to '0_to_99,999_times_1_digit_bal_test.txt'
Wrote 10000 shuffled lines (with answers) to 'validation.txt'


### (0 to 999,999) times 1-digit multiplication

In [None]:
import random

def make_dataset(
    train_size=(100, 200, 400, 800, 1500, 7000), # total: 10,000
    test_size=3000,
    validation_size=3000,
    seed=42,
    train_path="training.txt",
    test_path="testing.txt",
    validation_path="validation.txt"
):
    """Write disjoint train/test/validation files of ``a*b`` problems.

    ``a`` ranges over every integer with at most ``len(train_size)`` digits
    (0 to 999,999 for the default six buckets) and ``b`` over 0..9.

    Args:
        train_size: One entry per digit length of ``a`` (1-digit first).
            Every 1-digit pair is always used, so ``train_size[0]`` must be
            100 for the size check to pass; for the other buckets the entry
            is the number of pairs sampled from that bucket.
        test_size: Number of pairs sampled from the pairs not used for training.
        validation_size: Number of pairs sampled from the pairs used for
            neither training nor testing.
        seed: Seed passed to ``random.seed`` so the splits are reproducible.
        train_path: Output file for training lines, ``"a*b=product$"``.
        test_path: Output file for test lines, ``"a*b=product"`` (answer
            included, but without the trailing ``$``).
        validation_path: Output file for validation lines, ``"a*b=product$"``.

    Raises:
        ValueError: If ``train_size`` is empty, or a requested sample is
            larger than the pool it is drawn from.
        AssertionError: If the assembled training set does not contain
            ``sum(train_size)`` pairs.
    """
    if not train_size:
        raise ValueError("train_size must contain at least one bucket size")

    random.seed(seed)

    # 1) Build one bucket of (a, b) pairs per digit length of 'a', b in [0, 9].
    #    Bucket k holds the k-digit values of 'a'; 0 counts as a 1-digit number.
    num_digits = len(train_size)
    buckets = []
    for digits in range(1, num_digits + 1):
        low = 0 if digits == 1 else 10 ** (digits - 1)
        buckets.append(
            [(a, b) for a in range(low, 10 ** digits) for b in range(10)]
        )
    all_pairs = [pair for bucket in buckets for pair in bucket]

    # 2) Assemble the training set: every 1-digit pair, plus a random sample
    #    of the requested size from each longer bucket (shortest bucket first,
    #    which fixes the order of the RNG calls).
    selected = list(buckets[0])
    for bucket, size in zip(buckets[1:], train_size[1:]):
        selected.extend(random.sample(bucket, size))

    training = list(set(selected))
    assert len(training) == sum(train_size), "Training size mismatch!"

    # 3) Test and validation are drawn from what is left, so the three
    #    splits never share a pair.
    remaining = list(set(all_pairs) - set(training))
    testing   = random.sample(remaining, test_size)
    remaining_for_validation = list(set(remaining) - set(testing))
    validation = random.sample(remaining_for_validation, validation_size)

    # 4) Shuffle before writing (validation is already in sampled order)
    random.shuffle(training)
    random.shuffle(testing)

    # 5) Write out to the given paths
    with open(train_path, "w") as f_tr:
        for a, b in training:
            f_tr.write(f"{a}*{b}={a*b}$\n")

    with open(test_path, "w") as f_te:
        for a, b in testing:
            f_te.write(f"{a}*{b}={a*b}\n")

    with open(validation_path, "w") as f_val:
        for a, b in validation:
            f_val.write(f"{a}*{b}={a*b}$\n")

    # The messages describe what was actually written: the test file does
    # carry answers, and the validation path is reported verbatim.
    print(f"Wrote {len(training)} shuffled lines (with answers) to '{train_path}'")
    print(f"Wrote {len(testing)} shuffled lines (with answers, no '$' terminator) to '{test_path}'")
    print(f"Wrote {len(validation)} shuffled lines (with answers) to '{validation_path}'")

if __name__ == "__main__":
    # Script entry point: build the (0 to 999,999) x 1-digit splits with the
    # default sizes, overriding only the seed and the three output file names.
    make_dataset(
        seed=42,
        train_path="0_to_six_digit_times_1_digit_bal_train.txt",
        test_path="0_to_six_digit_times_1_digit_bal_test.txt",
        validation_path="0_to_six_digit_times_1_digit_bal_validation.txt"
    )


Wrote 10000 shuffled lines (with answers) to '0_to_six_digit_times_1_digit_bal_train.txt'
Wrote 3000 shuffled lines (no answers) to '0_to_six_digit_times_1_digit_bal_test.txt'
Wrote 3000 shuffled lines (with answers) to '0_to_six_digit_times_1_digit_bal_validation.txt.txt'


# Addition 2 Operand

### 2 Operand 0-999, Without Zero Padding

In [2]:
import random


def format_train_val(a: int, b: int) -> str:
    """
    Render one training/validation example without zero padding.

    The line is 'a+b=sum' followed by the '$' end marker,
    e.g. format_train_val(229, 594) -> '229+594=823$'.
    """
    total = a + b
    return "".join((str(a), "+", str(b), "=", str(total), "$"))


def format_test(a: int, b: int) -> str:
    """
    Render one test example without zero padding.

    The answer and the '$' end marker are included, so the output has the
    same shape as the training format,
    e.g. format_test(252, 699) -> '252+699=951$'.

    NOTE(review): an earlier docstring described a prompt-only format
    ('252+699='); confirm which form the evaluation code expects.
    """
    total = a + b
    return "".join((str(a), "+", str(b), "=", str(total), "$"))


def main(
    train_size=10000,
    test_size=3000,
    val_size=3000,
    seed=42,
    train_file="train.txt",
    test_file="test.txt",
    val_file="val.txt"
):
    """
    Write train/test/validation files of 2-operand addition examples.

    All 1000 x 1000 ordered operand pairs are shuffled with `seed`; the first
    `train_size` pairs go to `train_file`, the next `test_size` to `test_file`
    and the next `val_size` to `val_file`, so the splits share no pair.
    The final message reports the requested sizes.
    """
    # Enumerate the whole operand grid, then shuffle it reproducibly.
    operand_pairs = [(left, right) for left in range(1000) for right in range(1000)]
    random.seed(seed)
    random.shuffle(operand_pairs)

    # Consecutive, non-overlapping windows of the shuffled list.
    test_end = train_size + test_size
    val_end = test_end + val_size
    train_pairs = operand_pairs[:train_size]
    test_pairs = operand_pairs[train_size:test_end]
    val_pairs = operand_pairs[test_end:val_end]

    # All three files are opened (and truncated) before anything is written.
    with open(train_file, "w") as f_train, \
         open(test_file,  "w") as f_test, \
         open(val_file,   "w") as f_val:
        f_train.writelines(format_train_val(a, b) + "\n" for a, b in train_pairs)
        f_test.writelines(format_test(a, b) + "\n" for a, b in test_pairs)
        f_val.writelines(format_train_val(a, b) + "\n" for a, b in val_pairs)

    print(f"Wrote {train_size} train, {test_size} test, {val_size} val examples.")


if __name__ == "__main__":
    main()


Wrote 10000 train, 3000 test, 3000 val examples.


### 2 Operand 0-999, Output Leading Digit Dropped

In [None]:
import random


def format_train_val(a: int, b: int) -> str:
    """
    Render one training/validation example with the sum's leading digit dropped.

    Operands are zero-padded to 3 digits. The sum is padded to at least
    4 digits and its first character is removed, so for sums below 10000 only
    the last three digits remain, e.g. format_train_val(229, 594) ->
    '229+594=823$' and format_train_val(999, 999) -> '999+999=998$'.
    """
    answer = format(a + b, "04d")[1:]
    return "{:03d}+{:03d}={}$".format(a, b, answer)


def format_test(a: int, b: int) -> str:
    """
    Render one test example with the sum's leading digit dropped.

    The output has the same shape as the training format: operands
    zero-padded to 3 digits, the sum padded to at least 4 digits with its
    first character removed, then '$',
    e.g. format_test(252, 699) -> '252+699=951$'.

    NOTE(review): an earlier docstring described a prompt-only format
    ('252+699='); confirm which form the evaluation code expects.
    """
    answer = format(a + b, "04d")[1:]
    return "{:03d}+{:03d}={}$".format(a, b, answer)


def main(
    train_size=10000,
    test_size=3000,
    val_size=3000,
    seed=42,
    train_file="train.txt",
    test_file="test.txt",
    val_file="val.txt"
):
    """
    Write train/test/validation files of 2-operand addition examples whose
    answers have the leading digit dropped.

    All 1000 x 1000 ordered operand pairs are shuffled with `seed`; the first
    `train_size` pairs go to `train_file`, the next `test_size` to `test_file`
    and the next `val_size` to `val_file`, so the splits share no pair.
    The final message reports the requested sizes.
    """
    # Enumerate the whole operand grid, then shuffle it reproducibly.
    operand_pairs = [(left, right) for left in range(1000) for right in range(1000)]
    random.seed(seed)
    random.shuffle(operand_pairs)

    # Consecutive, non-overlapping windows of the shuffled list.
    test_end = train_size + test_size
    val_end = test_end + val_size
    train_pairs = operand_pairs[:train_size]
    test_pairs = operand_pairs[train_size:test_end]
    val_pairs = operand_pairs[test_end:val_end]

    # All three files are opened (and truncated) before anything is written.
    with open(train_file, "w") as f_train, \
         open(test_file,  "w") as f_test, \
         open(val_file,   "w") as f_val:
        f_train.writelines(format_train_val(a, b) + "\n" for a, b in train_pairs)
        f_test.writelines(format_test(a, b) + "\n" for a, b in test_pairs)
        f_val.writelines(format_train_val(a, b) + "\n" for a, b in val_pairs)

    print(f"Wrote {train_size} train, {test_size} test, {val_size} val examples.")


if __name__ == "__main__":
    main()


Wrote 10000 train, 3000 test, 3000 val examples.


### 2 Operand 0-999, Zero Padding to Both Input & Output

In [None]:
import random


def format_train_val(a: int, b: int) -> str:
    """
    Render one training/validation example with zero padding on both sides.

    Operands are padded to 3 digits and the sum to 4 digits, followed by '$',
    e.g. format_train_val(229, 594) -> '229+594=0823$'.
    """
    return "{:03d}+{:03d}={:04d}$".format(a, b, a + b)


def format_test(a: int, b: int) -> str:
    """
    Render one test prompt: both operands zero-padded to 3 digits, ending at
    '=' with no answer, e.g. format_test(252, 699) -> '252+699='.
    """
    padded_operands = (format(value, "03d") for value in (a, b))
    return "+".join(padded_operands) + "="


def main(
    train_size=10000,
    test_size=3000,
    val_size=3000,
    seed=42,
    train_file="train.txt",
    test_file="test.txt",
    val_file="val.txt"
):
    """
    Write train/test/validation files of zero-padded 2-operand addition examples.

    All 1000 x 1000 ordered operand pairs are shuffled with `seed`; the first
    `train_size` pairs go to `train_file`, the next `test_size` to `test_file`
    (prompts only) and the next `val_size` to `val_file`, so the splits share
    no pair. The final message reports the requested sizes.
    """
    # Enumerate the whole operand grid, then shuffle it reproducibly.
    operand_pairs = [(left, right) for left in range(1000) for right in range(1000)]
    random.seed(seed)
    random.shuffle(operand_pairs)

    # Consecutive, non-overlapping windows of the shuffled list.
    test_end = train_size + test_size
    val_end = test_end + val_size
    train_pairs = operand_pairs[:train_size]
    test_pairs = operand_pairs[train_size:test_end]
    val_pairs = operand_pairs[test_end:val_end]

    # All three files are opened (and truncated) before anything is written.
    with open(train_file, "w") as f_train, \
         open(test_file,  "w") as f_test, \
         open(val_file,   "w") as f_val:
        f_train.writelines(format_train_val(a, b) + "\n" for a, b in train_pairs)
        f_test.writelines(format_test(a, b) + "\n" for a, b in test_pairs)
        f_val.writelines(format_train_val(a, b) + "\n" for a, b in val_pairs)

    print(f"Wrote {train_size} train, {test_size} test, {val_size} val examples.")


if __name__ == "__main__":
    main()


Wrote 10000 train, 3000 test, 3000 val examples.


# Addition 4 Operand

### Generate 4 Operand 0-to-999 without Padding

In [5]:
import random

# Number of examples written to each split by main(), and the value main()
# passes to random.seed().
TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def decode_idx(idx: int):
    """
    Split `idx` into its four base-1000 digits (a, b, c, d), d being the
    least significant. For 0 <= idx < 1000**4 every value lies in [0, 999].
    """
    idx, d = divmod(idx, 1000)
    idx, c = divmod(idx, 1000)
    a, b = divmod(idx, 1000)
    return a, b, c, d

def fmt(a, b, c, d):
    """Render 'a+b+c+d=sum$' with no zero padding on operands or sum."""
    total = a + b + c + d
    lhs = "+".join(str(operand) for operand in (a, b, c, d))
    return lhs + "=" + str(total) + "$"

def main():
    """
    Sample TRAIN_SIZE + TEST_SIZE + VAL_SIZE distinct 4-operand problems and
    write them, unpadded and with answers, to train.txt, test.txt and val.txt.
    """
    random.seed(SEED)

    # One draw of unique indices in [0, 10**12) covers all three splits, so
    # no example can appear in more than one file.
    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    sampled = random.sample(range(1000**4), total_needed)

    val_start = TRAIN_SIZE + TEST_SIZE
    train_idx = sampled[:TRAIN_SIZE]
    test_idx = sampled[TRAIN_SIZE:val_start]
    val_idx = sampled[val_start:]

    def render(indices):
        # Lazily turn each sampled index into one output line.
        return (fmt(*decode_idx(index)) + "\n" for index in indices)

    with open("train.txt", "w") as f_train, \
         open("test.txt",  "w") as f_test, \
         open("val.txt",   "w") as f_val:
        f_train.writelines(render(train_idx))
        f_test.writelines(render(test_idx))
        f_val.writelines(render(val_idx))

    print(f"Wrote {TRAIN_SIZE} train, {TEST_SIZE} test, {VAL_SIZE} val examples.")

if __name__ == "__main__":
    main()


Wrote 1000000 train, 10000 test, 10000 val examples.


### Generate 4 Operand 0-999, Output without leading digit

In [None]:
import random

# Number of examples written to each split by main(), and the value main()
# passes to random.seed().
TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def decode_idx(idx: int):
    """
    Split `idx` into its four base-1000 digits (a, b, c, d), d being the
    least significant. For 0 <= idx < 1000**4 every value lies in [0, 999].
    """
    low_digits = []
    for _ in range(3):
        # Peel off the least-significant base-1000 digit: d, then c, then b.
        idx, digit = divmod(idx, 1000)
        low_digits.append(digit)
    d, c, b = low_digits
    return idx, b, c, d

def fmt(a, b, c, d):
    """
    Render a 4-operand example with 3-digit zero-padded operands and the
    sum's leading digit dropped, followed by '$'.

    The sum is padded to at least 4 digits and its first character removed,
    e.g. fmt(1, 2, 3, 4) -> '001+002+003+004=010$'.
    """
    lhs = "+".join(f"{operand:03d}" for operand in (a, b, c, d))
    rhs = f"{a + b + c + d:04d}"[1:]
    return f"{lhs}={rhs}$"

def main():
    """
    Sample TRAIN_SIZE + TEST_SIZE + VAL_SIZE distinct 4-operand problems and
    write them (padded operands, leading digit of the sum dropped) to
    train.txt, test.txt and val.txt, one file after the other.
    """
    random.seed(SEED)

    # One draw of unique indices covers all three splits, so no example can
    # appear in more than one file.
    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    sampled = random.sample(range(1000**4), total_needed)

    val_start = TRAIN_SIZE + TEST_SIZE
    splits = {
        "train.txt": sampled[:TRAIN_SIZE],
        "test.txt": sampled[TRAIN_SIZE:val_start],
        "val.txt": sampled[val_start:],
    }

    for filename, indices in splits.items():
        with open(filename, "w") as f:
            f.writelines(fmt(*decode_idx(index)) + "\n" for index in indices)

    print(f"Wrote {TRAIN_SIZE} train, {TEST_SIZE} test, {VAL_SIZE} val examples.")

if __name__ == "__main__":
    main()


Wrote 1000000 train, 10000 test, 10000 val examples.


### Generate 4 Operands 0-to-999, Zero Padding to Both Input and Output

In [None]:
import random

# Number of examples written to each split by main(), and the value main()
# passes to random.seed().
TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def decode_idx(idx: int):
    """
    Split `idx` into its four base-1000 digits (a, b, c, d), d being the
    least significant. For 0 <= idx < 1000**4 every value lies in [0, 999].
    """
    thousands = idx // 1000
    millions = thousands // 1000
    a = millions // 1000
    b = millions % 1000
    c = thousands % 1000
    d = idx % 1000
    return a, b, c, d

def fmt(a, b, c, d):
    """
    Render a 4-operand example with operands zero-padded to 3 digits and the
    sum zero-padded to 4 digits, e.g. fmt(1, 2, 3, 4) -> '001+002+003+004=0010$'.
    """
    lhs = "+".join(format(operand, "03d") for operand in (a, b, c, d))
    rhs = format(a + b + c + d, "04d")
    return lhs + "=" + rhs + "$"

def main():
    """
    Sample TRAIN_SIZE + TEST_SIZE + VAL_SIZE distinct 4-operand problems and
    write them, zero-padded on both sides, to train.txt, test.txt and val.txt.
    """
    random.seed(SEED)

    # One draw of unique indices in [0, 10**12) covers all three splits, so
    # no example can appear in more than one file.
    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    sampled = random.sample(range(1000**4), total_needed)

    val_start = TRAIN_SIZE + TEST_SIZE
    train_idx = sampled[:TRAIN_SIZE]
    test_idx = sampled[TRAIN_SIZE:val_start]
    val_idx = sampled[val_start:]

    def render(indices):
        # Lazily turn each sampled index into one output line.
        return (fmt(*decode_idx(index)) + "\n" for index in indices)

    with open("train.txt", "w") as f_train, \
         open("test.txt",  "w") as f_test, \
         open("val.txt",   "w") as f_val:
        f_train.writelines(render(train_idx))
        f_test.writelines(render(test_idx))
        f_val.writelines(render(val_idx))

    print(f"Wrote {TRAIN_SIZE} train, {TEST_SIZE} test, {VAL_SIZE} val examples.")

if __name__ == "__main__":
    main()


Wrote 1000000 train, 10000 test, 10000 val examples.


### Generate 4 Operands 0-999, Balanced Digit (1,2,3 digit, each 1/3 chance)

In [None]:
import random
from pathlib import Path

def sample_operand():
    """
    Draw one operand whose digit length is balanced.

    A uniform draw in [0, 1) first picks the digit length with probability
    1/3 each: 1 digit ([0, 9]), 2 digits ([10, 99]) or 3 digits ([100, 999]).
    The operand is then drawn uniformly from the chosen range.
    """
    draw = random.random()
    if draw >= 2/3:
        bounds = (100, 999)
    elif draw >= 1/3:
        bounds = (10, 99)
    else:
        bounds = (0, 9)
    return random.randint(*bounds)

# Prompts (e.g. '12+3+456+7=') already emitted by generate_dataset(). It lives
# at module level and is never cleared, so successive calls cannot repeat a
# prompt that an earlier call already wrote.
seen = set()

def generate_dataset(num_samples: int,
                     output_path: Path,
                     include_answer: bool = True,
                     append_dollar: bool = False):
    """
    Write `num_samples` previously unseen 4-operand addition lines to `output_path`.

    Operands come from sample_operand(). A prompt ('a+b+c+d=') that is already
    in the module-level `seen` set is discarded and redrawn; accepted prompts
    are added to `seen`. With `include_answer` the sum is appended, and with
    `append_dollar` as well a '$' follows the sum. Missing parent directories
    of `output_path` are created.
    """
    rows = []
    while len(rows) < num_samples:
        operands = [sample_operand() for _ in range(4)]
        prompt = "+".join(str(operand) for operand in operands) + "="
        if prompt in seen:
            # Duplicate of something already written: draw again.
            continue
        seen.add(prompt)

        answer = ""
        if include_answer:
            answer = str(sum(operands)) + ("$" if append_dollar else "")
        rows.append(prompt + answer + "\n")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("".join(rows))

if __name__ == "__main__":
    # Script entry point. The three calls share the module-level `seen` set,
    # so train, val and test never contain the same prompt.
    # NOTE(review): the output paths are absolute paths into one user's
    # Google Drive folder; they must be edited to run this anywhere else.

    # Training split: answers followed by '$'.
    generate_dataset(
        num_samples=1_000_000,
        output_path=Path('/Users/perfectpeter/Library/CloudStorage/GoogleDrive-xzhao468@wisc.edu/My Drive/addition/data/train.txt'),
        include_answer=True,
        append_dollar=True
    )
    # Validation split: same line format as training.
    generate_dataset(
        num_samples=10_000,
        output_path=Path('/Users/perfectpeter/Library/CloudStorage/GoogleDrive-xzhao468@wisc.edu/My Drive/addition/data/val.txt'),
        include_answer=True,
        append_dollar=True
    )
    # Test split: prompts only (each line ends at '=').
    generate_dataset(
        num_samples=10_000,
        output_path=Path('/Users/perfectpeter/Library/CloudStorage/GoogleDrive-xzhao468@wisc.edu/My Drive/addition/data/test.txt'),
        include_answer=False
    )


## Generate Data for Statistical Measurement
### -- The data is used for calculating statistical metrics during training, e.g. mutual information

In [None]:
"""
4 Operand -- Input Drawn Uniformly from 100 to 999, Reject Output < 1000
"""
import random

def generate_examples(num_examples, low=100, high=999, min_sum=1000):
    """
    Return `num_examples` unpadded lines of the form 'a+b+c+d=total$'.

    Each operand is drawn with random.randint(low, high). A draw whose total
    is below `min_sum` is rejected and redrawn.
    """
    rows = []
    while len(rows) < num_examples:
        operands = [random.randint(low, high) for _ in range(4)]
        total = sum(operands)
        if total < min_sum:
            continue
        rows.append("+".join(map(str, operands)) + f"={total}$")
    return rows

def main(seed=42):
    """
    Write 3000 rejection-sampled 4-operand examples to
    '4_operand_addition_stats_measurement_data_plain.txt'.

    Args:
        seed: Value passed to random.seed() before sampling so that the
            measurement data is reproducible from run to run. Pass None to
            leave the RNG state untouched (the previous, unseeded behaviour).
    """
    if seed is not None:
        random.seed(seed)

    num_examples = 3000
    output_file = "4_operand_addition_stats_measurement_data_plain.txt"
    examples = generate_examples(num_examples)
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in examples)
    print(f"Wrote {num_examples} examples to '{output_file}'")

if __name__ == "__main__":
    main()


Wrote 3000 examples to '4_operand_addition_stats_measurement_data_plain.txt'


In [None]:
"""
4 Operands Input Uniformly Drawn from 0 to 999

Output w/o leading digit, Pad 0 to Both Input and Output
"""

import random

def generate_examples(num_examples, low=0, high=999, min_sum=0):
    """
    Return `num_examples` zero-padded lines of the form 'aaa+bbb+ccc+ddd=sss$'.

    Each operand is drawn with random.randint(low, high) and padded to
    3 digits. A draw whose total is below `min_sum` is rejected and redrawn.
    The total is padded to 4 digits and only its last 3 characters are kept.
    """
    rows = []
    while len(rows) < num_examples:
        operands = [random.randint(low, high) for _ in range(4)]
        total = sum(operands)
        if total < min_sum:
            continue
        lhs = "+".join(f"{operand:03d}" for operand in operands)
        rhs = f"{total:04d}"[-3:]
        rows.append(f"{lhs}={rhs}$")
    return rows

def main():
    """
    Seed the RNG with 42 and write 3000 padded, leading-digit-dropped
    4-operand examples to '4_operand_addition_stats_measurement_data_plain.txt'.
    """
    random.seed(42)
    num_examples = 3000
    output_file = "4_operand_addition_stats_measurement_data_plain.txt"

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in generate_examples(num_examples))

    print(f"Wrote {num_examples} examples to '{output_file}'")

if __name__ == "__main__":
    main()


Wrote 3000 examples to '4_operand_addition_stats_measurement_data_plain.txt'


In [None]:
"""
Reverse Results in Measurement Data
"""

import os

# Files to process. INPUT_FILENAMES[i] is read and its reversed-answer
# version is written to OUTPUT_FILENAMES[i]; the two lists are paired by
# position in the __main__ block.
INPUT_FILENAMES = [
    './4_operand_addition_stats_measurement_data_plain.txt'
]

OUTPUT_FILENAMES = [
    './4_operand_addition_stats_measurement_data_reversed.txt'
]

def reverse_results(input_path: str, output_path: str) -> None:
    """
    Copy `input_path` to `output_path`, reversing the answer of each example.

    For a line that contains '=' and ends with '$', the text between the
    first '=' and the trailing '$' is reversed character by character
    (whatever its length). Blank lines are dropped and any other line is
    copied unchanged. If `input_path` is not an existing file, an error is
    printed and nothing is written.
    """
    if not os.path.isfile(input_path):
        print(f"Error: '{input_path}' does not exist.")
        return

    with open(input_path, "r") as infile, open(output_path, "w") as outfile:
        for raw_line in infile:
            text = raw_line.rstrip("\n")
            if not text.strip():
                # Whitespace-only lines are not carried over.
                continue

            if text.endswith("$") and "=" in text:
                expression, _, answer = text.partition("=")
                # answer[:-1] strips the '$'; [::-1] reverses what is left.
                text = f"{expression}={answer[:-1][::-1]}$"

            outfile.write(text + "\n")

    print(f"Processed '{input_path}' and wrote results to '{output_path}'.")


if __name__ == "__main__":
    # Process the configured files pairwise: the i-th input is written to
    # the i-th output.
    for input_file, output_file in zip(INPUT_FILENAMES, OUTPUT_FILENAMES):
        reverse_results(input_file, output_file)

Processed './4_operand_addition_stats_measurement_data_plain.txt' and wrote results to './4_operand_addition_stats_measurement_data_reversed.txt'.


## Generate Output Most Significant Digit Randomized 4 Operands 0-999 Data

In [None]:
import torch
import copy
from typing import List, Dict
import re

# Configuration for the script at the bottom of this cell.
digits_per_num = 3  # Number of digits in each number
base_data_file = "./train_reverse.txt"  # input examples, read twice below
output_path = "./train_reverse_output_highest_digit_rand.txt"  # randomized copy is written here

def create_meta_for_addition(data):
    """
    Build character-level vocabulary metadata from `data`.

    The vocabulary is the sorted set of distinct characters in `data`. The
    returned dict holds 'vocab_size', 'vocab' (the sorted list), 'stoi'
    (character -> index) and 'itos' (index -> character).
    """
    vocab = sorted(set(data))

    # Fill both lookup directions in one pass over the vocabulary.
    stoi, itos = {}, {}
    for index, ch in enumerate(vocab):
        stoi[ch] = index
        itos[index] = ch

    return {
        'vocab_size': len(vocab),
        'vocab': vocab,
        'stoi': stoi,
        'itos': itos,
    }

def encode_addition(text, meta=None):
    """Encode text to a 1-D ``torch.long`` tensor using the metadata.

    Args:
        text: String whose characters are all keys of ``meta['stoi']``.
        meta: Metadata dict as returned by ``create_meta_for_addition``. When
            omitted, the module-level ``meta`` is looked up at call time.
            (A ``meta=meta`` default is evaluated when the function is
            defined, which fails if ``meta`` is assigned later in the file.)

    Raises:
        NameError: If ``meta`` is omitted and no module-level ``meta`` exists.
        KeyError: If ``text`` contains a character missing from the vocabulary.
    """
    if meta is None:
        try:
            meta = globals()["meta"]
        except KeyError:
            raise NameError(
                "encode_addition: pass `meta` or define a module-level `meta` first"
            ) from None
    return torch.tensor([meta['stoi'][c] for c in text], dtype=torch.long)

def decode_addition(tensor, meta=None):
    """Decode token ids back to text using the metadata.

    Args:
        tensor: A ``torch.Tensor`` of ids, or any iterable of plain ids.
        meta: Metadata dict as returned by ``create_meta_for_addition``. When
            omitted, the module-level ``meta`` is looked up at call time.
            (A ``meta=meta`` default is evaluated when the function is
            defined, which fails if ``meta`` is assigned later in the file.)

    Raises:
        NameError: If ``meta`` is omitted and no module-level ``meta`` exists.
        KeyError: If an id is missing from ``meta['itos']``.
    """
    if meta is None:
        try:
            meta = globals()["meta"]
        except KeyError:
            raise NameError(
                "decode_addition: pass `meta` or define a module-level `meta` first"
            ) from None
    itos = meta['itos']
    if isinstance(tensor, torch.Tensor):
        # Tensor elements are 0-d tensors; .item() yields the plain int key.
        return ''.join(itos[i.item()] for i in tensor)
    return ''.join(itos[i] for i in tensor)
    

def token_to_numeric(tensor, meta):
    """Convert a tensor of token ids to the numeric value of each digit token.

    Args:
        tensor: Integer tensor of token ids (indices into ``meta["vocab"]``).
        meta: Metadata dict whose ``"vocab"`` list maps id -> character.

    Returns:
        A ``torch.long`` tensor with the same shape as ``tensor``. Digit
        tokens map to their integer value; every non-digit token maps to 0.
    """
    # Build the lookup table from zeros rather than torch.empty so that
    # entries for non-digit tokens are well defined instead of holding
    # uninitialized memory.
    lookup_tensor = torch.zeros(len(meta["vocab"]), dtype=torch.long)
    for i, s in enumerate(meta["vocab"]):
        if s.isdigit():
            lookup_tensor[i] = int(s)
    return lookup_tensor[tensor]  # Same shape as tensor


def randomize_test_data(data: torch.Tensor, metadata, digits_per_num=3, randomize_digit_place=[0,1], seed=2025,
                        randomize="input", valid_carry=False, reverse_input=False, reverse_output=False) -> torch.Tensor:
    """
    Return a copy of `data` in which selected digit columns are shuffled across samples.

    A column is "randomized" by replacing it with a row permutation
    (torch.randperm) of the same column of the original data, so each column
    keeps its own set of values but loses its association with the rest of the row.

    Arguments:
        data: 2-order tensor of shape (sample_size, seq_len) holding tokenized examples that
            all have the same length, such as '437+357+579+984=7532' or
            '932+084+230+349=5951' (reverse output). Each operand plus its trailing
            separator occupies digits_per_num + 1 tokens and the output is asserted
            to occupy digits_per_num + 1 tokens as well.
        metadata: vocabulary metadata; only used (via token_to_numeric) when
            valid_carry is True.
        digits_per_num: number of digits in each operand.
        randomize_digit_place: list of digit places to randomize, 0 being the least
            significant digit; [0, 1] means the two least significant digits.
            The list is only read, never modified.
        seed: passed to torch.manual_seed, i.e. this call reseeds torch's global RNG.
        randomize: "input" randomizes operand digits, "output" randomizes output digits.
        valid_carry: only used when randomize == "input". If True and place 0 is being
            randomized, the columns of the most significant randomized place are shuffled
            jointly and only among rows whose digit sum at that place, floor-divided by 10,
            is equal; the remaining selected columns are shuffled independently.
        reverse_input: if True, digit place p of an operand is at token offset p
            (least significant digit first) instead of digits_per_num - 1 - p.
        reverse_output: if True, digit place p of the output is at token offset p
            instead of digits_per_num - p.

    Returns:
        A new tensor with the same shape as `data`; `data` itself is not modified.
    """
    assert isinstance(randomize_digit_place, list)
    L = len(randomize_digit_place)  # not used below
    n, T = data.shape  # n samples, T tokens per sample
    S = digits_per_num + 1  # tokens per operand group (digits + separator) and per output
    assert (T - S) % S == 0, "data format not conform to expectation, e.g., '437+357+579+984=7532'. "
    assert randomize in ["input", "output"], "randomize is either `input` or `output`."
    num_op = (T - S) // S  # number of operands
    torch.manual_seed(seed)

    # Token offsets (within an operand group / within the output) of the digit places to randomize.
    ids0 = [digits_per_num-1-id for id in randomize_digit_place] if not reverse_input else randomize_digit_place
    ids1 = [digits_per_num-id for id in randomize_digit_place] if not reverse_output else randomize_digit_place
    # Absolute column indices: every operand's selected digits, and the selected output digits
    # (the output starts at column S*num_op).
    ids_rand_input = torch.cat([torch.arange(num_op).long() * S + j for j in ids0])
    ids_rand_output = torch.tensor(ids1).long() + S*num_op
    new_data = copy.deepcopy(data)
    ids2 = []  # columns handled by the carry-preserving shuffle; stays empty unless that branch runs
    if randomize == "output":
        # Each selected output column gets its own independent row permutation.
        for col_id in ids_rand_output:
            new_data[:,col_id] = data[torch.randperm(n),col_id]
        return new_data
    if valid_carry: # if control for valid carry
        if 0 in randomize_digit_place: # if least significant digit is randomized
            # J: token offset of the most significant digit place being randomized
            J = max(randomize_digit_place) if reverse_input else digits_per_num-1-max(randomize_digit_place)
            ids2 = torch.arange(num_op).long() * S + J  # that digit's column in every operand
            # Per row: sum of the operands' digits at offset J, floor-divided by 10.
            all_carry = token_to_numeric(data[:,ids2], meta=metadata).sum(dim=1) // 10
            unique_carry = torch.unique(all_carry)
            for carry in unique_carry:
                # Shuffle the ids2 columns jointly (one permutation for all of them),
                # but only among the rows that share this carry value.
                ids_rand = (all_carry == carry)
                n_rand = ids_rand.sum().item()
                subset_ids = ids_rand.nonzero(as_tuple=True)[0]
                subset_data = new_data[ids_rand, :][:, ids2]
                subset_data = subset_data[torch.randperm(n_rand), :]
                ii, jj = torch.meshgrid(subset_ids, ids2, indexing='ij')
                new_data[ii, jj] = subset_data
        # randomize other digits independently
        for col_id in ids_rand_input:
            if col_id not in ids2:
                new_data[:,col_id] = data[torch.randperm(n),col_id]
    else: # if disregard carry
        for col_id in ids_rand_input:
            new_data[:,col_id] = data[torch.randperm(n),col_id]
    return new_data

def restore_original_format(decoded_line: str) -> str:
    """
    Strip zero padding from a decoded example.

    Trailing '$' characters are removed, then every operand and the result
    are passed through int(), so '098+007+120=2250' becomes '98+7+120=2250'
    and an all-zero field such as '000' becomes '0'. The returned string has
    no '$'.
    """
    body = decoded_line.rstrip('$')
    lhs, rhs = body.split('=')
    # int() drops every leading zero of a field.
    operands = '+'.join(str(int(part)) for part in lhs.split('+'))
    return f"{operands}={int(rhs)}"


# --- Script: randomize one output digit of every example in base_data_file ---

# First pass over the file: the raw text is only used to build the vocabulary.
with open(base_data_file, 'r') as f:
    train_data = f.read()

meta = create_meta_for_addition(train_data)

# Second pass: one stripped string per example.
with open(base_data_file, "r", encoding="utf-8") as f:
    lines = [line.rstrip() for line in f]

# Bring every example to a fixed width so that the examples can be stacked
# into a single tensor.
padded_lines = [] # zero-padded, '$' removed; an example padded_lines[6] is '932+084+230+349=5951'
for i in range(len(lines)):
    numbers = re.split(r'[+=]', lines[i])
    numbers[-1] = numbers[-1][:-1]  # drop the last character of the result (presumably the '$' terminator)
    for k, number in enumerate(numbers[:-1]):
        numbers[k] = '0' * (3-len(number)) + number  # left-pad each operand to 3 characters
    # The result is padded on the RIGHT to 4 characters.
    # NOTE(review): presumably because the base file stores results reversed - confirm.
    numbers[-1] = numbers[-1] + '0' * (4-len(numbers[-1]))
    padded_lines.append("+".join(numbers[:-1]) + "=" + numbers[-1])

# Encode every padded line and stack them into a (num_examples, seq_len) tensor.
base_data = torch.cat([encode_addition(padded_lines[i]).unsqueeze(0) for i in range(len(padded_lines))], dim=0)


# With reverse_output=True the digit place is used directly as the token
# offset inside the output, so [digits_per_num] selects the last output
# character; only that column is shuffled across examples.
randomized_data = randomize_test_data(
    data = base_data,
    metadata = meta,
    digits_per_num = digits_per_num,
    randomize_digit_place = [digits_per_num],
    seed = 2005,
    randomize = "output",
    valid_carry = False,
    reverse_input = False,
    reverse_output = True
)

# Decode each row, strip the zero padding again and re-append the '$' terminator.
# NOTE(review): restore_original_format() also applies int() to the result, which
# drops leading zeros; for a reversed result such as '0321' that changes the
# digits that are written - confirm this is intended.
with open(output_path, "w", encoding="utf-8") as out:
    for i in range(randomized_data.size(0)):
        decoded = decode_addition(randomized_data[i], meta)
        restored = restore_original_format(decoded)
        out.write(restored + "$\n")

print(f"Wrote {randomized_data.size(0)} lines to {output_path}")

Wrote 1000000 lines to ./train_reverse_output_highest_digit_rand.txt


## Addition with Scratchpad

In [None]:
import random

# Number of examples written to each split by main(), and the value main()
# passes to random.seed().
TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def decode_idx(idx: int):
    """
    Split `idx` into its four base-1000 digits (a, b, c, d), d being the
    least significant. For 0 <= idx < 1000**4 every value lies in [0, 999].
    """
    rest, d = divmod(idx, 1000)
    rest, c = divmod(rest, 1000)
    a, b = divmod(rest, 1000)
    return a, b, c, d

def fmt_augmented(a: int, b: int, c: int, d: int) -> str:
    """
    Render a 4-operand addition with a column-by-column scratchpad, e.g.
      391+257+417+623=[18->8#1] [18->8#1] [16->6#1] [1->1#0]$

    There is one block per column, ordered from the units column upward and
    separated by single spaces. Each block is [col_sum->digit#carry_out],
    where col_sum already includes the carry coming in from the previous
    column. Operands are printed without zero padding.
    """
    total = a + b + c + d
    # Use as many columns as the widest number, so a final carry gets its
    # own column whenever the sum is longer than the operands.
    width = max(len(str(total)), len(str(a)), len(str(b)), len(str(c)), len(str(d)))
    padded = [str(value).zfill(width) for value in (a, b, c, d)]

    blocks = []
    carry = 0
    # Walk the four digit strings in lockstep, least-significant digit first.
    for column in zip(*(reversed(text) for text in padded)):
        col_sum = sum(int(ch) for ch in column) + carry
        carry, digit = divmod(col_sum, 10)
        blocks.append(f"[{col_sum}->{digit}#{carry}]")

    return f"{a}+{b}+{c}+{d}=" + " ".join(blocks) + "$"

def main():
    """
    Sample TRAIN_SIZE + TEST_SIZE + VAL_SIZE distinct 4-operand problems and
    write them in scratchpad format to train.txt, test.txt and val.txt.
    """
    random.seed(SEED)

    # One draw of unique indices in [0, 1000**4) covers all three splits, so
    # no example can appear in more than one file.
    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    sampled = random.sample(range(1000**4), total_needed)

    val_start = TRAIN_SIZE + TEST_SIZE
    train_idx = sampled[:TRAIN_SIZE]
    test_idx = sampled[TRAIN_SIZE:val_start]
    val_idx = sampled[val_start:]

    def render(indices):
        # Lazily turn each sampled index into one output line.
        return (fmt_augmented(*decode_idx(index)) + "\n" for index in indices)

    with open("train.txt", "w") as f_train, \
         open("test.txt",  "w") as f_test, \
         open("val.txt",   "w") as f_val:
        f_train.writelines(render(train_idx))
        f_test.writelines(render(test_idx))
        f_val.writelines(render(val_idx))

    print(f"Wrote {TRAIN_SIZE} train, {TEST_SIZE} test, {VAL_SIZE} val examples.")

if __name__ == "__main__":
    main()


# Addition OOD

In [None]:
#!/usr/bin/env python3
"""
Create a .txt file with 1,110,000 addition training examples:
 - 1,000,000 examples with 4 operands
 - 100,000 examples with 3 operands
 - 10,000 examples with 2 operands

Each operand is drawn uniformly from 0..999 (inclusive).
Example line format:
306+404+42+870=1622$
236+890+980+656=2762$
"""

import random
import argparse
import sys

def generate_examples(n_operands: int, count: int):
    """
    Lazily yield `count` lines of the form 'x1+...+xk=sum$' plus a newline,
    with k = `n_operands` and each operand drawn by random.randint(0, 999).
    """
    for _ in range(count):
        drawn = [random.randint(0, 999) for _ in range(n_operands)]
        expression = "+".join(map(str, drawn))
        yield expression + "=" + str(sum(drawn)) + "$\n"

def main(output_path: str, seed: int | None = None):
    """Generate, shuffle and write the mixed-arity addition dataset.

    Produces 1,000,000 four-operand, 100,000 three-operand and 10,000
    two-operand examples (in that generation order) via generate_examples(),
    shuffles all of them together, and writes them to `output_path` as UTF-8.
    The RNG is seeded only when `seed` is not None.  Progress messages go to
    stderr; the final summary goes to stdout.
    """
    if seed is not None:
        random.seed(seed)

    # operand count -> number of examples; dict order is the generation order.
    counts = {4: 1_000_000, 3: 100_000, 2: 10_000}
    total = sum(counts.values())
    assert total == 1_110_000

    print(f"Generating examples: total {total} lines ...", file=sys.stderr)
    # Everything is held in memory so it can be shuffled in one pass.
    examples = []
    for n_operands, cnt in counts.items():
        print(f"  Generating {cnt} examples with {n_operands} operands...", file=sys.stderr)
        examples.extend(generate_examples(n_operands, cnt))

    print("Shuffling all examples...", file=sys.stderr)
    random.shuffle(examples)

    print(f"Writing to {output_path} ...", file=sys.stderr)
    chunk_size = 100_000
    with open(output_path, "w", encoding="utf-8") as sink:
        # Hand the lines over in slices rather than as one giant write.
        start = 0
        while start < len(examples):
            sink.writelines(examples[start:start + chunk_size])
            start += chunk_size

    print("Done.", file=sys.stderr)
    print(f"Wrote {len(examples)} examples to {output_path}.")

# Command-line entry point: parse the output filename and optional seed,
# then hand both to main().
# NOTE(review): when this cell runs under a Jupyter kernel, sys.argv may carry
# kernel arguments that parse_args() does not recognise - confirm this is only
# executed as a standalone script.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate addition training examples.")
    parser.add_argument("--out", "-o", default="addition_train_1110000.txt",
                        help="Output filename (default: addition_train_1110000.txt)")
    # Leaving --seed unset keeps the RNG unseeded, so each run differs.
    parser.add_argument("--seed", "-s", type=int, default=None,
                        help="Optional random seed (useful for reproducibility)")
    args = parser.parse_args()
    main(args.out, args.seed)


# Subtraction

### Uniform Sampling

In [None]:
import random


def format_train_val(a: int, b: int) -> str:
    """
    Format for training/validation: '229-594=-435$'
    """
    if a - b >= 0:
        line = f"{a}-{b}=" + f"+{a - b}$"
    else:
        line = f"{a}-{b}={a - b}$"
    return line


def format_test(a: int, b: int) -> str:
    """
    Format for testing: '252-699=-447$'

    Test lines use exactly the same layout as training/validation lines: the
    prompt 'a-b=', the explicitly signed difference ('+' for results >= 0),
    and a closing '$'.
    """
    difference = a - b
    sign = "-" if difference < 0 else "+"
    return f"{a}-{b}={sign}{abs(difference)}$"


def main(
    train_size=10000,
    test_size=3000,
    val_size=3000,
    seed=42,
    train_file="train.txt",
    test_file="test.txt",
    val_file="val.txt"
):
    """
    Write disjoint train/test/val subtraction datasets over operands 0..9999.

    Distinct (a, b) pairs are drawn uniformly at random, without replacement,
    from all 10000 * 10000 combinations, so no pair appears in more than one
    split.  Training and validation lines are rendered with
    format_train_val(), test lines with format_test().

    Args:
        train_size, test_size, val_size: number of examples per split.
        seed: value passed to random.seed() before sampling.
        train_file, test_file, val_file: output paths (overwritten).

    Raises:
        ValueError: raised by random.sample() when the requested total is
            negative or exceeds the number of available pairs.
    """
    operand_count = 10000  # operands range over 0..9999
    total = train_size + test_size + val_size

    random.seed(seed)
    # Sample pair *indices* rather than building and shuffling a list of all
    # 100,000,000 tuples, which costs gigabytes of memory to pick a few
    # thousand examples.  Index k encodes the pair (k // 10000, k % 10000).
    # NOTE: for a given seed this picks different pairs than a full shuffle
    # of the pair list would; the sampling distribution is the same.
    codes = random.sample(range(operand_count * operand_count), total)
    pairs = [divmod(code, operand_count) for code in codes]

    train_pairs = pairs[:train_size]
    test_pairs = pairs[train_size:train_size + test_size]
    val_pairs = pairs[train_size + test_size:]

    with open(train_file, "w") as f_train, \
         open(test_file,  "w") as f_test, \
         open(val_file,   "w") as f_val:
        f_train.writelines(format_train_val(a, b) + "\n" for a, b in train_pairs)
        f_test.writelines(format_test(a, b) + "\n" for a, b in test_pairs)
        f_val.writelines(format_train_val(a, b) + "\n" for a, b in val_pairs)

    print(f"Wrote {train_size} train, {test_size} test, {val_size} val examples.")


if __name__ == "__main__":
    main()


### Bucket Sampling

In [None]:
import random
from pathlib import Path

def sample_operand():
    """
    Returns one integer whose digit-length bucket is picked with unequal weights:
      - with probability 1/6 in [0, 9]     (1-digit)
      - with probability 1/3 in [10, 99]   (2-digit)
      - with probability 1/2 in [100, 999] (3-digit)
    Within the chosen bucket the value is uniform.
    """
    r = random.random()
    if r < 1/6:
        # r in [0, 1/6): 1-digit bucket
        lo, hi = 0, 9
    elif r < 1/2:
        # r in [1/6, 1/2): 2-digit bucket (width 1/3)
        lo, hi = 10, 99
    else:
        # r in [1/2, 1): 3-digit bucket
        lo, hi = 100, 999
    return random.randint(lo, hi)

seen = set()

def calculate(ops):
    """Return the first operand minus the second, i.e. ops[0] - ops[1]."""
    minuend = ops[0]
    subtrahend = ops[1]
    return minuend - subtrahend

def generate_dataset(num_samples: int,
                     output_path: Path,
                     include_answer: bool = True,
                     append_dollar: bool = False):
    """Write `num_samples` previously unseen subtraction problems to `output_path`.

    Operands come from sample_operand().  A problem is skipped when its prompt
    ("a-b=") is already in the module-level `seen` set, and every accepted
    prompt is added to that set, so problems are unique across all calls made
    in one session.

    Args:
        num_samples: number of lines to produce.
        output_path: destination file; missing parent directories are created.
        include_answer: append the signed difference ('+' for results >= 0).
        append_dollar: append '$' after the answer; has no effect when
            include_answer is False.
    """
    rows = []

    while len(rows) < num_samples:
        left, right = sample_operand(), sample_operand()
        prompt = f"{left}-{right}="
        if prompt in seen:
            # Already emitted by this or an earlier call - draw again.
            continue
        seen.add(prompt)

        if not include_answer:
            rows.append(prompt + "\n")
            continue

        difference = calculate([left, right])
        text = prompt + (f"+{difference}" if difference >= 0 else str(difference))
        if append_dollar:
            text += "$"
        rows.append(text + "\n")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("".join(rows))

# Entry point: write the training file and then the validation file.
# Both calls go through the same module-level `seen` set, so a prompt written
# to train.txt is never repeated in val.txt.
if __name__ == "__main__":
    generate_dataset(
        num_samples=10_000,
        output_path=Path('/Users/perfectpeter/Library/CloudStorage/GoogleDrive-llmfunexperiment@gmail.com/My Drive/addition/data/train.txt'),
        include_answer=True,
        append_dollar=True
    )
    generate_dataset(
        num_samples=3_000,
        output_path=Path('/Users/perfectpeter/Library/CloudStorage/GoogleDrive-llmfunexperiment@gmail.com/My Drive/addition/data/val.txt'),
        include_answer=True,
        append_dollar=True
    )
    # Test-set generation (prompts without answers) is currently disabled:
    # generate_dataset(
    #     num_samples=10_000,
    #     output_path=Path('/Users/perfectpeter/Library/CloudStorage/GoogleDrive-xzhao468@wisc.edu/My Drive/addition/data/test.txt'),
    #     include_answer=False
    # )


# Taking Maximum Task

### Length Bucket

In [None]:
import random

TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def sample_number():
    """
    Draw one number after first picking its digit count uniformly from 1-4.

    Ranges per digit count:
      1 digit  -> 0..9
      2 digits -> 10..99
      3 digits -> 100..999
      4 digits -> 1000..9999
    """
    digits = random.choice([1, 2, 3, 4])
    # A one-digit number may be 0; longer numbers never start with a zero.
    floor = 0 if digits == 1 else 10 ** (digits - 1)
    ceiling = 10 ** digits - 1
    return random.randint(floor, ceiling)

def fmt_max(a: int, b: int, c: int, d: int) -> str:
    """Return 'a,b,c,d=MAX$', where MAX is the largest input (no zero-padding)."""
    operands = (a, b, c, d)
    prompt = ",".join(str(value) for value in operands)
    return prompt + "=" + str(max(operands)) + "$"

def main():
    """Generate the maximum-finding train/test/val files.

    Seeds the RNG with SEED, draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE examples
    of four sample_number() values each, and splits them in draw order into
    train.txt, test.txt and val.txt (current directory), one fmt_max() line
    per example.  Examples are drawn independently, so the same example can
    occur more than once, within or across splits.
    """
    random.seed(SEED)

    test_end = TRAIN_SIZE + TEST_SIZE
    samples = [
        (sample_number(), sample_number(), sample_number(), sample_number())
        for _ in range(test_end + VAL_SIZE)
    ]

    def render(chunk):
        # One newline-terminated line per example.
        return (fmt_max(*quad) + "\n" for quad in chunk)

    with open("train.txt", "w") as train_out, \
         open("test.txt",  "w") as test_out, \
         open("val.txt",   "w") as val_out:
        train_out.writelines(render(samples[:TRAIN_SIZE]))
        test_out.writelines(render(samples[TRAIN_SIZE:test_end]))
        val_out.writelines(render(samples[test_end:]))

    print(f"Wrote {TRAIN_SIZE} train, {TEST_SIZE} test, {VAL_SIZE} val examples.")

if __name__ == "__main__":
    main()


Wrote 1000000 train, 10000 test, 10000 val examples.


In [6]:
!python sample.py \
    --input ./train.txt \
    --output ./train_eval.txt \
    --sample-size 10000

Total lines in input file: 1000000
Wrote 10000 sampled lines to './train_eval.txt'.


# Sorting

### Sort with Plain Numbers

In [5]:
import random

TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def sample_number():
    """
    Pick a digit count uniformly from {1, 2, 3, 4}, then a uniform number of that length.

    Ranges per digit count:
      1 -> 0..9
      2 -> 10..99
      3 -> 100..999
      4 -> 1000..9999
    """
    n_digits = random.choice([1, 2, 3, 4])
    upper = 10 ** n_digits - 1
    # Only the one-digit bucket starts at 0; the others start at 10**(n-1).
    lower = 10 ** (n_digits - 1) if n_digits != 1 else 0
    return random.randint(lower, upper)

def fmt_sort(a: int, b: int, c: int, d: int) -> str:
    """Format one sorting example as 'a,b,c,d=w,x,y,z$', with w..z the inputs in ascending order."""
    given = [a, b, c, d]
    prompt = ",".join(map(str, given))
    answer = ",".join(map(str, sorted(given)))
    return prompt + "=" + answer + "$"

def main():
    """Generate the plain-number sorting train/test/val files.

    Seeds the RNG with SEED, draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE examples
    of four sample_number() values each, and writes them in draw order to
    train.txt, test.txt and val.txt (current directory), one fmt_sort() line
    per example.
    """
    random.seed(SEED)

    test_end = TRAIN_SIZE + TEST_SIZE
    samples = [
        (sample_number(), sample_number(), sample_number(), sample_number())
        for _ in range(test_end + VAL_SIZE)
    ]

    def render(chunk):
        # One newline-terminated line per example.
        return (fmt_sort(*quad) + "\n" for quad in chunk)

    with open("train.txt", "w") as train_out, \
         open("test.txt",  "w") as test_out, \
         open("val.txt",   "w") as val_out:
        train_out.writelines(render(samples[:TRAIN_SIZE]))
        test_out.writelines(render(samples[TRAIN_SIZE:test_end]))
        val_out.writelines(render(samples[test_end:]))

    print(f"Wrote {TRAIN_SIZE} train, {TEST_SIZE} test, {VAL_SIZE} val examples.")

if __name__ == "__main__":
    main()


Wrote 1000000 train, 10000 test, 10000 val examples.


#### Generate test data where the 4 numbers in each example all have different digit lengths (one each of 1, 2, 3 and 4 digits)

In [None]:
import random

TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def sample_by_length(L: int) -> int:
    """Draw a uniform random integer that is written with exactly L digits.

    L == 1 covers 0..9 (zero included); any larger L covers
    10**(L-1) .. 10**L - 1.
    """
    smallest = 10 ** (L - 1) if L != 1 else 0
    largest = 10 ** L - 1
    return random.randint(smallest, largest)

def sample_numbers_all_lengths():
    """
    Build a 4-tuple holding one 1-digit, one 2-digit, one 3-digit and one
    4-digit number (drawn in that order), then shuffled into random positions.
    """
    picks = [sample_by_length(width) for width in (1, 2, 3, 4)]
    random.shuffle(picks)
    return tuple(picks)

def fmt_sort(a: int, b: int, c: int, d: int) -> str:
    """Format one sorting example as 'a,b,c,d=w,x,y,z$', with w..z the inputs in ascending order."""
    given = [a, b, c, d]
    prompt = ",".join(map(str, given))
    answer = ",".join(map(str, sorted(given)))
    return prompt + "=" + answer + "$"

def main():
    """Write test.txt with examples from sample_numbers_all_lengths().

    Seeds the RNG with SEED and draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    examples, but only the TEST_SIZE examples that follow the first
    TRAIN_SIZE draws are written (one fmt_sort() line each) to test.txt in
    the current directory.  No train or validation file is produced.
    """
    random.seed(SEED)

    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    # Every split's worth of examples is drawn even though only the test
    # slice is kept, so the test slice begins after TRAIN_SIZE draws.
    samples = [sample_numbers_all_lengths() for _ in range(total_needed)]
    test_samples = samples[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

    with open("test.txt",  "w") as test_out:
        test_out.writelines(fmt_sort(*quad) + "\n" for quad in test_samples)

    print(f"Wrote {TEST_SIZE} test examples.")

if __name__ == "__main__":
    main()


Wrote 10000 test examples.


#### Generate test data where all 4 numbers in each example are 4-digit numbers

In [2]:
import random

TRAIN_SIZE = 1_000_000
TEST_SIZE  = 10_000
VAL_SIZE   = 10_000
SEED       = 42

def sample_by_length(L: int) -> int:
    """Draw a uniform random integer that is written with exactly L digits.

    L == 1 covers 0..9 (zero included); any larger L covers
    10**(L-1) .. 10**L - 1.
    """
    largest = 10 ** L - 1
    smallest = 0 if L == 1 else 10 ** (L - 1)
    return random.randint(smallest, largest)

def sample_numbers_all_lengths():
    """
    Return a tuple (a,b,c,d) of four independently drawn 4-digit numbers
    (each in 1000..9999), shuffled into random positions.

    Despite the function name, this variant does NOT mix lengths: every
    number comes from sample_by_length(4).
    """
    nums = [sample_by_length(4) for _ in range(4)]
    # The shuffle is kept so the sequence of RNG calls, and therefore the
    # seeded output, is unchanged.
    random.shuffle(nums)
    return tuple(nums)

def fmt_sort(a: int, b: int, c: int, d: int) -> str:
    """Format one sorting example as 'a,b,c,d=w,x,y,z$', with w..z the inputs in ascending order."""
    given = [a, b, c, d]
    prompt = ",".join(map(str, given))
    answer = ",".join(map(str, sorted(given)))
    return prompt + "=" + answer + "$"

def main():
    """Write test.txt with examples from sample_numbers_all_lengths().

    Seeds the RNG with SEED and draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    examples, but only the TEST_SIZE examples that follow the first
    TRAIN_SIZE draws are written (one fmt_sort() line each) to test.txt in
    the current directory.  No train or validation file is produced.
    """
    random.seed(SEED)

    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    # Every split's worth of examples is drawn even though only the test
    # slice is kept, so the test slice begins after TRAIN_SIZE draws.
    samples = [sample_numbers_all_lengths() for _ in range(total_needed)]
    test_samples = samples[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

    with open("test.txt",  "w") as test_out:
        test_out.writelines(fmt_sort(*quad) + "\n" for quad in test_samples)

    print(f"Wrote {TEST_SIZE} test examples.")

if __name__ == "__main__":
    main()


Wrote 10000 test examples.


#### Generate test data. In each example, two numbers vary only in units, while the other two numbers are uniformly drawn from 0-9999

In [3]:
import random

TRAIN_SIZE = 0
TEST_SIZE  = 10_000
VAL_SIZE   = 0
SEED       = 42

def make_pair_same_top3() -> tuple[int, int]:
    """
    Build two integers in 0..9999 that differ only in their units digit.

    The thousands, hundreds and tens digits are each drawn uniformly from
    0..9 and shared by both numbers; the two units digits are distinct,
    sampled without replacement from 0..9.
    """
    # Draw order (thousands, hundreds, tens, then the units pair) is fixed so
    # that seeded runs reproduce the same pairs.
    thousands_digit = random.randint(0, 9)
    hundreds_digit = random.randint(0, 9)
    tens_digit = random.randint(0, 9)
    shared_prefix = (thousands_digit * 100 + hundreds_digit * 10 + tens_digit) * 10
    unit_a, unit_b = random.sample(range(10), 2)
    return shared_prefix + unit_a, shared_prefix + unit_b

def sample_uniform_0_9999() -> int:
    """Draw one integer from 0 through 9999 (both ends included), all equally likely."""
    lowest, highest = 0, 9999
    return random.randint(lowest, highest)

def sample_example() -> tuple[int, int, int, int]:
    """
    Assemble one four-number example: the near-identical pair returned by
    make_pair_same_top3() (drawn first) followed by two independent
    sample_uniform_0_9999() numbers, all shuffled into random positions.
    """
    near_pair = make_pair_same_top3()
    members = [*near_pair, sample_uniform_0_9999(), sample_uniform_0_9999()]
    random.shuffle(members)
    return tuple(members)

def fmt_sort(a: int, b: int, c: int, d: int) -> str:
    """Format one sorting example as 'a,b,c,d=w,x,y,z$', with w..z the inputs in ascending order."""
    given = [a, b, c, d]
    prompt = ",".join(map(str, given))
    answer = ",".join(map(str, sorted(given)))
    return prompt + "=" + answer + "$"

def main():
    """Write test.txt with sample_example() sorting problems.

    Seeds the RNG with SEED, draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    examples, and writes the TEST_SIZE examples that follow the first
    TRAIN_SIZE draws to test.txt (current directory), one fmt_sort() line
    each.  Only the test split is written.
    """
    random.seed(SEED)

    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    samples = [sample_example() for _ in range(total_needed)]
    test_samples = samples[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

    with open("test.txt", "w") as test_out:
        test_out.writelines(fmt_sort(*quad) + "\n" for quad in test_samples)

    print(f"Wrote {TEST_SIZE} test examples.")

if __name__ == "__main__":
    main()


Wrote 10000 test examples.


#### Generate test data. In each example, two numbers vary only in units and tens, while the other two numbers are uniformly drawn from 0-9999

In [4]:
import random

TRAIN_SIZE = 0
TEST_SIZE  = 10_000
VAL_SIZE   = 0
SEED       = 42

def make_pair_same_top3() -> tuple[int, int]:
    """
    Create two integers in 0..9999 that share the same thousands and
    hundreds digits and differ somewhere in their last two digits.

    The shared thousands and hundreds digits are each drawn uniformly from
    0..9.  The two-digit tails (tens and units together) are two distinct
    values sampled without replacement from 0..99, so the numbers may differ
    in the tens digit, the units digit, or both.

    Note: despite the name, only the top TWO digits are shared in this
    variant.
    """
    thou = random.randint(0, 9)
    hund = random.randint(0, 9)
    # Distinct tails guarantee the two numbers are never equal.
    tail1, tail2 = random.sample(range(100), 2)
    prefix = thou * 1000 + hund * 100
    return prefix + tail1, prefix + tail2

def sample_uniform_0_9999() -> int:
    """Draw one integer from 0 through 9999 (both ends included), all equally likely."""
    lowest, highest = 0, 9999
    return random.randint(lowest, highest)

def sample_example() -> tuple[int, int, int, int]:
    """
    Build one example: the pair from make_pair_same_top3() (in this cell the
    two numbers share their thousands and hundreds digits and differ in the
    last two digits) plus two independent uniform numbers in 0..9999.
    Shuffle positions.
    """
    p1, p2 = make_pair_same_top3()
    o1 = sample_uniform_0_9999()
    o2 = sample_uniform_0_9999()
    items = [p1, p2, o1, o2]
    # Shuffle so the related pair is not always in the first two slots.
    random.shuffle(items)
    return tuple(items)

def fmt_sort(a: int, b: int, c: int, d: int) -> str:
    """Format one sorting example as 'a,b,c,d=w,x,y,z$', with w..z the inputs in ascending order."""
    given = [a, b, c, d]
    prompt = ",".join(map(str, given))
    answer = ",".join(map(str, sorted(given)))
    return prompt + "=" + answer + "$"

def main():
    """Write test.txt with sample_example() sorting problems.

    Seeds the RNG with SEED, draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    examples, and writes the TEST_SIZE examples that follow the first
    TRAIN_SIZE draws to test.txt (current directory), one fmt_sort() line
    each.  Only the test split is written.
    """
    random.seed(SEED)

    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    samples = [sample_example() for _ in range(total_needed)]
    test_samples = samples[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

    with open("test.txt", "w") as test_out:
        test_out.writelines(fmt_sort(*quad) + "\n" for quad in test_samples)

    print(f"Wrote {TEST_SIZE} test examples.")

if __name__ == "__main__":
    main()


Wrote 10000 test examples.


#### Generate test data. In each example, two numbers vary in units, tens and hundreds, while the other two numbers are uniformly drawn from 0-9999

In [5]:
import random

TRAIN_SIZE = 0
TEST_SIZE  = 10_000
VAL_SIZE   = 0
SEED       = 42

def make_pair_same_top3() -> tuple[int, int]:
    """
    Create two integers in 0..9999 that share only their thousands digit.

    The shared thousands digit is drawn uniformly from 0..9.  The remaining
    three digits (hundreds, tens and units together) are two distinct values
    sampled without replacement from 0..999, so the numbers may differ in
    any of those three positions.

    Note: despite the name, only the TOP digit is shared in this variant.
    """
    thou = random.randint(0, 9)
    # Distinct tails guarantee the two numbers are never equal.
    tail1, tail2 = random.sample(range(1000), 2)
    prefix = thou * 1000
    return prefix + tail1, prefix + tail2

def sample_uniform_0_9999() -> int:
    """Draw one integer from 0 through 9999 (both ends included), all equally likely."""
    lowest, highest = 0, 9999
    return random.randint(lowest, highest)

def sample_example() -> tuple[int, int, int, int]:
    """
    Build one example: the pair from make_pair_same_top3() (in this cell the
    two numbers share only their thousands digit and differ in the last
    three digits) plus two independent uniform numbers in 0..9999.
    Shuffle positions.
    """
    p1, p2 = make_pair_same_top3()
    o1 = sample_uniform_0_9999()
    o2 = sample_uniform_0_9999()
    items = [p1, p2, o1, o2]
    # Shuffle so the related pair is not always in the first two slots.
    random.shuffle(items)
    return tuple(items)

def fmt_sort(a: int, b: int, c: int, d: int) -> str:
    """Format one sorting example as 'a,b,c,d=w,x,y,z$', with w..z the inputs in ascending order."""
    given = [a, b, c, d]
    prompt = ",".join(map(str, given))
    answer = ",".join(map(str, sorted(given)))
    return prompt + "=" + answer + "$"

def main():
    """Write test.txt with sample_example() sorting problems.

    Seeds the RNG with SEED, draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    examples, and writes the TEST_SIZE examples that follow the first
    TRAIN_SIZE draws to test.txt (current directory), one fmt_sort() line
    each.  Only the test split is written.
    """
    random.seed(SEED)

    total_needed = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    samples = [sample_example() for _ in range(total_needed)]
    test_samples = samples[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

    with open("test.txt", "w") as test_out:
        test_out.writelines(fmt_sort(*quad) + "\n" for quad in test_samples)

    print(f"Wrote {TEST_SIZE} test examples.")

if __name__ == "__main__":
    main()


Wrote 10000 test examples.


#### Generate test data. In each example, the 4 numbers have distinct thousands-place.

In [6]:
import random

TRAIN_SIZE = 0
TEST_SIZE  = 10_000
VAL_SIZE   = 0
SEED       = 42

def sample_example_four_thous_distinct() -> tuple[int, int, int, int]:
    """
    Return four numbers in 0..9999 whose thousands digits are all different.

    Four distinct thousands digits are sampled from 0..9; each is combined
    with its own uniformly drawn lower part in 0..999 (value = t*1000 +
    lower).  The four numbers are shuffled before being returned.
    """
    leading_digits = random.sample(range(10), 4)
    numbers = [lead * 1000 + random.randint(0, 999) for lead in leading_digits]
    random.shuffle(numbers)
    return tuple(numbers)

def fmt_sort(a: int, b: int, c: int, d: int) -> str:
    """Format one sorting example as 'a,b,c,d=w,x,y,z$', with w..z the inputs in ascending order."""
    given = [a, b, c, d]
    prompt = ",".join(map(str, given))
    answer = ",".join(map(str, sorted(given)))
    return prompt + "=" + answer + "$"

def main():
    """Write test.txt with sample_example_four_thous_distinct() sorting problems.

    Seeds the RNG with SEED, draws TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    examples, and writes the TEST_SIZE examples that follow the first
    TRAIN_SIZE draws to test.txt (current directory), one fmt_sort() line
    each.
    """
    random.seed(SEED)

    n_draws = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
    drawn = []
    for _ in range(n_draws):
        drawn.append(sample_example_four_thous_distinct())

    kept = drawn[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

    with open("test.txt", "w") as test_out:
        test_out.writelines(fmt_sort(*quad) + "\n" for quad in kept)

    print(f"Wrote {TEST_SIZE} test examples.")

if __name__ == "__main__":
    main()


Wrote 10000 test examples.


### Sort with Symbols

In [7]:
maximum_number_of_digts = 4
maximum_length = 4
num_examples = 3000
reverse_all = True

def pick_char_set(max_len):
    """Pick `max_len` consecutive uppercase letters to use as index hints.

    A starting letter is chosen uniformly at random from the 26 letters
    'A'..'Z'; the result continues alphabetically from there and wraps from
    'Z' back to 'A' when it runs off the end.

    Args:
        max_len: number of letters to return.

    Returns:
        A list of `max_len` single-character strings.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  # 26 characters
    # randrange excludes its upper bound.  randint(0, 26) could return 26,
    # which wrapped straight back to 'A' and made 'A' twice as likely a
    # starting letter as any other.
    start = random.randrange(len(alphabet))
    return [alphabet[(start + offset) % len(alphabet)] for offset in range(max_len)]

def uniform_distribution_sort_basic(maximum_number_of_digts, maximum_length, num_examples, reverse_all):
    """Generate `num_examples` symbol-labelled sorting problems.

    Each problem holds `maximum_length` numbers.  Every number gets a letter
    from pick_char_set(), a digit count drawn uniformly from
    1..maximum_number_of_digts, and a value uniform in
    10**(digits-1) .. 10**digits - 1 (so one-digit values are 1..9).  A
    problem is rendered as "L1:n1,L2:n2,...=La,Lb,..." where the right-hand
    side lists the letters in ascending order of their numbers (ties keep
    their left-to-right order).  With `reverse_all` the digits of each number
    are written reversed on the left-hand side; the ordering still uses the
    true values.  No trailing '$' is added.

    Returns:
        A list of the rendered problem strings.
    """
    dataset = []
    for _ in range(num_examples):
        labels = pick_char_set(maximum_length)
        labelled = []  # (letter, true value) pairs used to derive the answer
        fields = []    # "letter:digits" pieces of the prompt
        for pos in range(maximum_length):
            # First the digit count, then a value with exactly that many digits.
            width = random.randint(1, maximum_number_of_digts)
            value = random.randint(10 ** (width - 1), 10 ** width - 1)
            labelled.append((labels[pos], value))

            shown = str(value)[::-1] if reverse_all else str(value)
            fields.append(f"{labels[pos]}:{shown}")

        ranking = sorted(labelled, key=lambda pair: pair[1])
        answer = ",".join(letter for letter, _ in ranking)
        dataset.append(",".join(fields) + "=" + answer)

    return dataset

def bucket_uniform_distribution(maximum_number_of_digts, maximum_length, num_examples, reverse_all):
    """Generate sorting problems split evenly across list lengths 1..maximum_length.

    Each length gets num_examples // maximum_length problems from
    uniform_distribution_sort_basic(); the buckets are concatenated in order
    of increasing length (not shuffled here).
    """
    per_bucket = num_examples // maximum_length
    collected = []
    for length in range(1, maximum_length + 1):
        collected.extend(
            uniform_distribution_sort_basic(maximum_number_of_digts, length, per_bucket, reverse_all)
        )
    return collected

# Build the full symbol-sorting dataset using the settings defined at the top
# of this cell.
dataset = bucket_uniform_distribution(maximum_number_of_digts, maximum_length, num_examples, reverse_all)

# Preview the first ten entries. The list is still in bucket order here, so
# these all come from the shortest (length-1) bucket.
for i in range(0, 10):
    print(dataset[i])


file_path = "./val.txt"

# random.seed() with no argument reseeds from a non-deterministic source, so
# the order written below differs from run to run.
random.seed()
random.shuffle(dataset)
# One example per line; no '$' terminator is added here.
with open(file_path, 'w') as file:
    for entry in dataset:
        file.write(entry + '\n')
print(f"created: {file_path}")

T:4=T
F:1339=F
D:855=D
B:5077=B
J:8034=J
S:953=S
X:0553=X
U:3=U
W:759=W
V:7211=V
created: ./val.txt


### Strip Symbols & Replace with Numbers

In [13]:
#!/usr/bin/env python3
"""
replace_letters.py

Reads an input text file where each non-empty line has the format:
  LETTER1:NUMBER1,LETTER2:NUMBER2,... = LETTER_A,LETTER_B,...

Replaces each letter on both sides with its corresponding number (from the left-hand mapping),
preserves ordering, and writes lines to an output file. Each output line will end with a '$'.

Example input line:
  X:7,Y:2429,Z:766,A:6448=X,Z,A,Y

Produces:
  7,2429,766,6448=7,766,6448,2429$
"""

input_file = '/Users/perfectpeter/Library/CloudStorage/GoogleDrive-llmfunexperiment@gmail.com/My Drive/addition/data/4_operands_sorting_balanced_digit_symbol/test.txt'
output_file = "./test.txt"

import re
import sys

def process_line(line: str) -> str:
    """Replace the letters of one 'L1:n1,L2:n2,...=La,Lb,...' line with their numbers.

    The left-hand side defines a letter -> number mapping (numbers are kept
    as the digit strings found in the input); the output lists the left-hand
    numbers in their original order, '=', the right-hand letters replaced by
    their numbers, and a closing '$'.  Whitespace around tokens is ignored.

    Returns:
        The converted line, or '' when `line` is empty or only whitespace.

    Raises:
        ValueError: the line has no '=' or a left-hand token is not
            'letters:digits'.
        KeyError: a right-hand token was not defined on the left-hand side.
    """
    line = line.strip()
    if not line:
        return ''  # nothing to convert

    if '=' not in line:
        raise ValueError(f"Line missing '=': {line!r}")

    lhs, rhs = line.split('=', 1)

    pair_re = re.compile(r'^([A-Za-z]+)\s*:\s*([0-9]+)$')
    mapping = {}
    left_order = []
    for piece in lhs.split(','):
        piece = piece.strip()
        if not piece:
            continue  # tolerate empty segments such as a trailing comma
        matched = pair_re.match(piece)
        if matched is None:
            raise ValueError(f"Invalid mapping token: {piece!r} in line: {line!r}")
        label, number = matched.groups()
        mapping[label] = number
        left_order.append(label)

    # Left side: the mapped numbers in the order the letters were declared.
    left_numbers = [mapping[label] for label in left_order]

    # Right side: swap each letter (or multi-letter identifier) for its number.
    right_numbers = []
    for piece in rhs.split(','):
        piece = piece.strip()
        if not piece:
            continue
        if piece not in mapping:
            raise KeyError(f"Right-side token {piece!r} has no mapping in line: {line!r}")
        right_numbers.append(mapping[piece])

    return ','.join(left_numbers) + '=' + ','.join(right_numbers) + '$'

def main(infile: str, outfile: str):
    """Convert every non-blank line of `infile` with process_line() and write the results.

    Both files are handled as UTF-8.  Blank lines, and lines for which
    process_line() returns an empty string, are dropped.  A summary with the
    number of written lines is printed at the end.
    """
    converted = []
    with open(infile, 'r', encoding='utf-8') as src:
        for raw in src:
            text = raw.rstrip('\n')
            if text.strip():
                result = process_line(text)
                if result:
                    converted.append(result)

    with open(outfile, 'w', encoding='utf-8') as dst:
        dst.writelines(entry + '\n' for entry in converted)

    print(f"Wrote {len(converted)} lines to {outfile!r}")

if __name__ == '__main__':
    main(input_file, output_file)


Wrote 3000 lines to './test.txt'


# Sample Train Eval Data & Reverse Result Script

In [1]:
# Sample a subset of training examples as train_evaluation data
!python sample.py \
    --input ./train.txt \
    --output ./train_eval.txt \
    --sample-size 10000

Total lines in input file: 1110000
Wrote 10000 sampled lines to './train_eval.txt'.


In [3]:
import os

INPUT_FILENAMES = [
    './train.txt',
    './train_eval.txt',
    './test.txt',
    './val.txt'
]

OUTPUT_FILENAMES = [
    './train_reverse.txt',
    './train_eval_reverse.txt',
    './test_reverse.txt',
    './val_reverse.txt'
]

def reverse_results(input_path: str, output_path: str) -> None:
    """
    Copy `input_path` to `output_path`, reversing the answer part of each example.

    For every non-blank line that contains '=' and ends with '$', the text
    between the first '=' and the trailing '$' is reversed character by
    character (e.g. "12+34=46$" -> "12+34=64$").  The answer may have any
    length; signs and separators inside it are reversed along with the
    digits.  Blank lines are dropped, and lines that do not match the pattern
    are copied unchanged.

    If `input_path` is not an existing file, an error is printed and nothing
    is written.
    """
    if not os.path.isfile(input_path):
        print(f"Error: '{input_path}' does not exist.")
        return

    with open(input_path, "r") as infile, open(output_path, "w") as outfile:
        for raw_line in infile:
            line = raw_line.rstrip("\n")
            # Skip empty lines
            if not line.strip():
                continue

            # Expected format: <expression>=<answer>$
            if "=" in line and line.endswith("$"):
                left, right = line.split("=", 1)
                answer = right[:-1]  # drop the trailing '$'
                line = f"{left}={answer[::-1]}$"
            # Anything else is passed through untouched.

            outfile.write(line + "\n")

    print(f"Processed '{input_path}' and wrote results to '{output_path}'.")


if __name__ == "__main__":
    for input_file, output_file in zip(INPUT_FILENAMES, OUTPUT_FILENAMES):
        reverse_results(input_file, output_file)

Error: './train.txt' does not exist.
Error: './train_eval.txt' does not exist.
Processed './test.txt' and wrote results to './test_reverse.txt'.
Error: './val.txt' does not exist.
