# Data Generator

This notebook generates synthetic training data, with a focus on:

- Addition and subtraction operations (-10 to 10)
- Verification cases

In [1]:
import numpy as np
import pandas as pd
import random

## 1. Arithmetic Operations

Generate training pairs for basic arithmetic operations with built-in verification.

In [57]:
class ArithmeticGenerator:
    """
    Build random integer arithmetic examples as pandas DataFrames.

    Operands are drawn with ``randint`` from the inclusive range
    [min_val, max_val]. The supported operators are the keys of
    ``self.operations``; each maps to a two-argument callable that
    computes the expected result.
    """

    def __init__(self, min_val, max_val, seed=None):
        """
        Args:
            min_val: Smallest operand value (inclusive).
            max_val: Largest operand value (inclusive).
            seed: Optional seed. When given, this instance draws from its own
                ``random.Random(seed)`` so output is reproducible; when None,
                the module-level ``random`` functions are used.
        """
        self.min_val = min_val
        self.max_val = max_val
        # A private generator when seeded, otherwise the shared `random` module.
        self._rng = random if seed is None else random.Random(seed)
        # Must be a plain dict: a trailing comma after the literal would turn
        # this attribute into a 1-tuple and break lookups by operator symbol.
        self.operations = {
            '+': lambda x, y: x + y,
            '-': lambda x, y: x - y,
        }

    def _make_number_pairs(self, n_pairs):
        """
        Return a list of ``n_pairs`` (num1, num2) tuples drawn from the range.
        """
        return [
            (
                self._rng.randint(self.min_val, self.max_val),
                self._rng.randint(self.min_val, self.max_val),
            )
            for _ in range(n_pairs)
        ]

    def make_operation_data(self, op, n_pairs):
        """
        Generate ``n_pairs`` examples for a single operation.

        Args:
            op: Operator symbol; must be a key of ``self.operations``.
            n_pairs: Number of examples to generate.

        Returns:
            DataFrame with columns ``num1``, ``num2``, ``operation``, ``result``.

        Raises:
            ValueError: If ``op`` is not a supported operation.
        """
        if op not in self.operations:
            supported = ", ".join(repr(key) for key in self.operations)
            raise ValueError(
                f"Unsupported operation {op!r}; expected one of: {supported}"
            )

        compute = self.operations[op]
        pairs = self._make_number_pairs(n_pairs)
        data = {
            "num1": [left for left, _ in pairs],
            "num2": [right for _, right in pairs],
            "operation": op,  # scalar, broadcast to every row by pandas
            "result": [compute(left, right) for left, right in pairs],
        }
        return pd.DataFrame(data)

    def make_dataset(self, n_pairs_per_op):
        """
        Generate ``n_pairs_per_op`` examples for every supported operation.

        Frames are concatenated in the insertion order of ``self.operations``
        and re-indexed from 0.
        """
        frames = [
            self.make_operation_data(op, n_pairs_per_op)
            for op in self.operations
        ]
        return pd.concat(frames, ignore_index=True)

In [58]:
# Generate dataset
# Seed the module-level RNG that ArithmeticGenerator draws from, so that
# Restart Kernel -> Run All reproduces the same dataset every time.
random.seed(42)
ag = ArithmeticGenerator(-10, 10)
df = ag.make_dataset(100000)

In [59]:
# Sanity check: report the frame's dimensions, then show the first rows
# followed by a fixed-seed random sample of ten rows.
print(f"{df.shape=}")
first_rows = df.head()
sampled_rows = df.sample(n=10, random_state=42)
display(first_rows, sampled_rows)

df.shape=(200000, 4)


Unnamed: 0,num1,num2,operation,result
0,-5,4,+,-1
1,0,-8,+,-8
2,-2,-6,+,-8
3,-1,-9,+,-10
4,-4,-10,+,-14


Unnamed: 0,num1,num2,operation,result
119737,-6,-9,-,3
72272,-8,4,+,-4
158154,10,4,-,6
65426,1,8,+,9
30074,-6,1,+,-5
23677,-1,0,+,-1
134858,-1,-8,-,7
176418,8,0,-,8
132467,-7,-6,-,-1
4082,10,-2,+,8


In [None]:
# Save the dataset as a JSON array with one object per row.
# to_json's default orient ("columns") does not support index=False (pandas
# documents this, and some versions raise ValueError); orient="records"
# writes the rows without the index.
df.to_json("simple_arithmetic_pairs_10.json", orient="records")