In [1]:
import re
import sys
import random
from collections import defaultdict

In [2]:
def to_basic_coord_tag_words(x, y, grid, k):
    """
    Build the textual label for grid cell (x, y).

    The label has the form ``XX,YY:T`` where ``XX`` and ``YY`` are the
    coordinates zero-padded to two digits and ``T`` is ``k + n``. Here ``n``
    is the number of alphanumeric cells found immediately to the left of
    (x, y) on the same row, i.e. the 0-based position of the cell inside the
    alphanumeric run it belongs to.

    :param int x: x-coordinate
    :param int y: y-coordinate
    :param defaultdict grid: mapping from coords to the symbols located at them
    :param int k: offset added to the 0-based position to form the tag
    """
    run_length = 0
    col = x - 1
    # Walk leftwards until we fall off the grid (col < 0) or hit a cell that
    # does not hold a single alphanumeric character.
    while col >= 0 and re.fullmatch(r'[A-Za-z0-9]', grid[(col, y)]):
        run_length += 1
        col -= 1
    tag = k + run_length
    return f'{x:02d},{y:02d}:{tag}'

def to_basic_coord_tag_words_prefix_encoding(x, y, grid):
    """
    Build the label for grid cell (x, y) using position tags that start at 201.

    This is ``to_basic_coord_tag_words`` with the tag offset fixed to 201, so
    the tag is ``201 + n`` where ``n`` is the 0-based position of the cell
    within the alphanumeric run it belongs to. Starting the tags at 201 is
    intended to let each tag be represented by a single byte-pair-encoding
    token that differs from the tokens used to express x and y.

    :param int x: x-coordinate
    :param int y: y-coordinate
    :param defaultdict grid: mapping from coords to the symbols located at them
    """
    tag_base = 201
    return to_basic_coord_tag_words(x, y, grid, k=tag_base)

In [3]:
def spacify(x):
    """
    Given a number or string, return a string that contains all of the
    digits/characters of the original, but separated by single spaces. The
    result neither begins nor ends with whitespace.

    :param x: any value; it is converted with ``str`` before being split up
    """
    # Join once instead of growing the string character by character; the
    # final strip keeps the historical behaviour of also dropping whitespace
    # that was already at either end of the input.
    return ' '.join(str(x)).strip()

In [4]:
def look_at(grid, x, y):
    """
    Return the fragment of a 'look' action string for grid coordinate (x, y).

    The fragment is: a space, the coordinate label (with its position tag),
    another space, and the symbol currently stored at that coordinate.

    :param defaultdict grid: mapping from coords to the symbols located at them
    :param int x: x-coordinate
    :param int y: y-coordinate
    """
    coord_tag = to_basic_coord_tag_words_prefix_encoding(x, y, grid)
    symbol = grid[(x, y)]
    return f' {coord_tag} {symbol}'

In [5]:
def register(text, grid, x, y, char):
    """
    Write a particular character to a grid location and log the write.

    Return a 2-tuple. The first element is `text` extended with a space, the
    coordinate label (with its position tag), another space, and the newly
    written symbol. The second element is the grid itself; returning it is
    not strictly necessary, but reminds us that this function modifies the
    grid in place.

    :param str text: action string that we want to append the write action onto
    :param defaultdict grid: mapping from coords to the symbols located at them
    :param int x: x-coordinate
    :param int y: y-coordinate
    :param str char: character to write
    """
    # The label is computed before the cell is overwritten.
    coord_tag = to_basic_coord_tag_words_prefix_encoding(x, y, grid)
    grid[(x, y)] = char
    return (text + f' {coord_tag} {char}', grid)

In [6]:
def clean_position_ids(s):
    """
    Return a version of `s` with the position ids erased.

    A position id is matched as a space followed by three digits. A space is
    prepended first so that an id at the very start of `s` is matched too,
    and the result is stripped of surrounding whitespace.
    """
    padded = ' ' + s
    without_ids = re.sub(r' \d\d\d', '', padded)
    return without_ids.strip()

In [7]:
def generate_instruction(first, second):
    """ Generate an elaborate, step-by-step instruction for addition.

    The addends are laid out on a scratch grid, least-significant digit
    first, starting at column 71: the first addend on row 1, the second on
    row 2, and the digits of the sum on row 3. Every write to and look at the
    grid is appended to the returned action string.

    :param int first: the first addend
    :param int second: the second addend
    :return: the full action string, ending with the position-tagged result
    """
    first_col = 71
    top_row, bottom_row, answer_row = 1, 2, 3

    grid = defaultdict(lambda: '_')
    total = first + second
    first, second = str(first), str(second)
    width = max(len(first), len(second))

    text = 'write'

    # Lay out the first addend, least-significant digit in the first column.
    for offset, digit in enumerate(reversed(first)):
        text, grid = register(text, grid, first_col + offset, top_row, digit)
    text += f' {{ {len(first)} digits }}'
    text += ' { add }'

    # Lay out the second addend directly below it.
    for offset, digit in enumerate(reversed(second)):
        text, grid = register(text, grid, first_col + offset, bottom_row, digit)
    text += f' {{ {len(second)} digits }}'

    # Add column by column, stopping at the first column where both rows
    # are blank.
    carry = False
    col = first_col
    for _ in range(width + 1):
        top_symbol = grid[(col, top_row)]
        top_look = look_at(grid, col, top_row)
        bottom_look = look_at(grid, col, bottom_row)
        bottom_symbol = grid[(col, bottom_row)]
        text += ' look' + top_look + bottom_look
        if top_symbol == '_' and bottom_symbol == '_':
            break

        # A blank (or any non-digit) cell counts as 0.
        top_digit = int(top_symbol) if top_symbol.isdigit() else 0
        bottom_digit = int(bottom_symbol) if bottom_symbol.isdigit() else 0
        column_sum = top_digit + bottom_digit

        text += f' {{{top_look} +{bottom_look} = {spacify(column_sum)} }}'
        if carry:
            text += f' {{ {spacify(column_sum)} + 1 = {spacify(column_sum + 1)} }}'
            column_sum += 1

        text += ' write'
        text, grid = register(text, grid, col, answer_row,
                              str(column_sum % 10))

        carry = column_sum >= 10
        if carry:
            text += ' { carry the 1 }'
        col += 1

    text += ' { last }'

    # Both rows were blank in the last column looked at, so all that is left
    # is to write the final carry (or a leading 0) into the answer row.
    if carry:
        text += ' { 0 + 0 = 0 } { 0 + 1 = 1 }'
        leading_digit = '1'
    else:
        text += ' { 0 + 0 = 0 }'
        leading_digit = '0'
    text += ' write'
    text, grid = register(text, grid, col, answer_row, leading_digit)

    # Read the answer back from the most-significant column to the least.
    text += ' { read the answer } look'
    for answer_col in range(first_col + width, first_col - 1, -1):
        text += look_at(grid, answer_col, answer_row)

    text += f' {{ result is {convert_number(str(total).zfill(width + 1))} }}'

    return text

In [8]:
def convert_number(number):
    """ Convert a number into a string with positional encoding.

    Each digit is preceded by a position tag. The last (rightmost) digit gets
    tag 201 and the tag grows by one for every step to the left. Tags and
    digits are all separated by single spaces.

    :param number: the number (or digit string) to encode
    """
    digits = str(number)
    highest_tag = 201 + len(digits) - 1
    return ' '.join(
        f'{highest_tag - idx} {digit}' for idx, digit in enumerate(digits)
    )

In [9]:
def format(first, second):
    """ Build one dataset line for the addition ``first + second``.

    The line consists of the prompt ``Calculate <first> plus <second>`` (both
    numbers in positional encoding), the separator `` | ``, and the elaborate
    step-by-step instruction for the addition.

    Note that this function shadows the builtin ``format`` within this module.

    :param int first: the first addend
    :param int second: the second addend
    """
    prompt = f'Calculate {convert_number(first)} plus {convert_number(second)}'
    solution = generate_instruction(first, second)
    return f'{prompt} | {solution}'

In [10]:
def generate_file(filename, n_examples, min_digits, max_digits):
    """
    Generate pairs of random numbers and save one formatted example per line.

    For each addend a digit count d is drawn uniformly from
    [min_digits, max_digits], and then a d-digit number is drawn uniformly,
    so the dataset has a roughly equal proportion of d-digit numbers.
    One-digit numbers range over 0-9; for d >= 2 the range is
    10**(d-1) .. 10**d - 1 (no leading zeros). Each pair is turned into a
    line with instructions and the answer, and the lines are written to
    `filename`, replacing any existing content.

    :param str filename: path of the file to write
    :param int n_examples: number of lines (addition problems) to generate
    :param int min_digits: smallest number of digits of an addend (>= 1)
    :param int max_digits: largest number of digits of an addend
    :raises ValueError: if min_digits < 1 or max_digits < min_digits
    """
    # Validate before opening the file so that a bad call does not truncate
    # an existing dataset.
    if min_digits < 1:
        raise ValueError(f'min_digits must be at least 1, got {min_digits}')
    if max_digits < min_digits:
        raise ValueError(
            f'max_digits ({max_digits}) must not be smaller than '
            f'min_digits ({min_digits})'
        )

    with open(filename, 'w', encoding='utf-8') as f:
        for _ in range(n_examples):
            addends = []
            for _addend in range(2):
                n_digits = random.randint(min_digits, max_digits)
                # Computed arithmetically: the previous string-based bound
                # evaluated int('') and crashed for one-digit numbers.
                lowest = 0 if n_digits == 1 else 10 ** (n_digits - 1)
                highest = 10 ** n_digits - 1
                addends.append(random.randint(lowest, highest))
            line = format(addends[0], addends[1])

            f.write(f'{line}\n')

In [11]:
# Training split: 200 addition problems, each addend having 2 to 30 digits.
generate_file(
    'train_dataset.txt',
    n_examples=200,
    min_digits=2,
    max_digits=30,
)

In [12]:
# Validation split: 50 addition problems, each addend having 2 to 30 digits.
generate_file(
    'val_dataset.txt',
    n_examples=50,
    min_digits=2,
    max_digits=30,
)