# Augmentations of Raw Arithmetic Dataset

The following code is based on the `dataset.ipynb` notebook from the GOAT_7B GitHub repo.

The `templates/goat.json` template is taken directly from the GOAT_7B GitHub Repo.

<https://github.com/liutiedong/goat/tree/main>

## Numeric Representation Format

This is the original format used by the GOAT_7B model.

In [1]:
import json
import random

# Locations of the instruction templates and of the raw arithmetic records.
template_name = "./templates/goat.json"
dataset_name = "raw_dataset.json"

with open(template_name) as template_file:
    template = json.load(template_file)

with open(dataset_name, "rb") as dataset_file:
    data_original = json.load(dataset_file)

# (operator, leading phrase) pairs, tried in this order when an expression is
# rephrased as "the <phrase> of A and B".
phrase_rewrites = (
    ("+", "the sum of "),
    ("-", "the difference of "),
    ("*", "the product of "),
    ("/", "the quotient and remainder of "),
)

# Characters whose presence means the expression is still in symbolic form.
operator_markers = ("+", "-", "*", "/", "x")

data_converted = []

for record in data_original:
    expression = record["input"]

    # Add noise to the instruction so that the model is robust to diverse
    # question formats. Each perturbation is drawn independently.

    # 5%: rephrase as "the sum of A and B" (and likewise per operator).
    if random.random() < 0.05:
        for symbol, lead_in in phrase_rewrites:
            if " " + symbol + " " in expression:
                expression = lead_in + expression.replace(symbol, "and")

    # 50%: write multiplication with "x" instead of "*".
    if random.random() < 0.5:
        expression = expression.replace("*", "x")

    # 10%: spell the operators out as words.
    if random.random() < 0.1:
        expression = expression.replace("+", "plus")
        expression = expression.replace("-", "minus")
        expression = expression.replace(" x ", " times ")
        expression = expression.replace("*", "multiplied by")
        expression = expression.replace("/", "divided by")

    # 50%: strip all spaces, but only while an operator marker is still
    # present. The random draw happens first, whatever the expression holds.
    if random.random() < 0.5 and any(mark in expression for mark in operator_markers):
        expression = expression.replace(" ", "")

    # Pick one of the templates keyed "1".."500" and fill in the expression.
    template_key = str(random.randint(1, 500))
    prompt = template[template_key].format(input=expression)

    # Only the instruction carries the noise; the remaining fields are copied
    # from the raw record unchanged.
    data_converted.append(
        {
            "instruction": prompt,
            "input": record["input"],
            "output": record["output"],
            "answer": record["answer"],
        }
    )

print("Total:", len(data_converted))

with open("numRep_dataset.json", "w") as out_file:
    json.dump(data_converted, out_file, indent=4)

Total: 500000


## Word Problem Representation Format

**TODO:** The code and template for this format need to be remade.

## Alpha Character Representation Format

In [None]:
from num2words import num2words

def num_word_conv(matchobj):
    """Spell out the text of a regex match in words.

    Written as a replacement callback for ``re.sub``: the whole match
    (group 0) is handed to ``num2words`` and its result is returned.
    """
    matched_text = matchobj.group(0)
    return num2words(matched_text)

{'input': 'thirty-three * five', 'output': 'thirty-three * five = one hundred and sixty-five', 'answer': 'one hundred and sixty-five'}
{'input': 'five million, seven hundred and twenty-one thousand, one hundred and sixty-one + six thousand, five hundred and twenty-six', 'output': 'five million, seven hundred and twenty-one thousand, one hundred and sixty-one + six thousand, five hundred and twenty-six = five million, seven hundred and twenty-seven thousand, six hundred and eighty-seven', 'answer': 'five million, seven hundred and twenty-seven thousand, six hundred and eighty-seven'}


In [None]:
### Add natural language instruction to the generated arithmetic data using template
import random
import json
import re

template_name = "./templates/goat.json"

with open(template_name) as fp:
    template = json.load(fp)

dataset_name = "raw_dataset.json"

with open(dataset_name, "rb") as test_file:
    data_original = json.load(test_file)

# Runs of digits to be spelled out in words. Compiled once, outside the loop,
# and written as a raw string: "\d" in a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
digit_run = re.compile(r"\d+")

data_formatted = []

for instance in data_original:

    arithmetic = instance["input"]

    # Add noise to the instruction so that the model is robust to diverse
    # question formats. The operator already has a space on each side, so it
    # is swapped for a bare "and"; a padded " and " would leave double spaces.
    if random.random() < 0.05:
        if " + " in arithmetic:
            arithmetic = "the sum of " + arithmetic.replace("+", "and")

        if " - " in arithmetic:
            arithmetic = "the difference of " + arithmetic.replace("-", "and")

        if " * " in arithmetic:
            arithmetic = "the product of " + arithmetic.replace("*", "and")

        if " / " in arithmetic:
            arithmetic = "the quotient and remainder of " + arithmetic.replace("/", "and")

    # Any operator symbol still present is always written out as a word.
    arithmetic = arithmetic.replace("+", "plus").replace("-", "minus")
    arithmetic = arithmetic.replace("*", "multiplied by").replace("/", "divided by")

    # Pick one of the templates keyed "1".."500" and fill in the expression.
    num = random.randint(1, 500)
    instruction = template[str(num)].format(input=arithmetic)

    # Every field has its digit runs replaced by words. "input", "output" and
    # "answer" are converted from the raw record, so they keep their
    # operator symbols.
    output_dict = {
        "instruction": digit_run.sub(num_word_conv, instruction),
        "input": digit_run.sub(num_word_conv, instance["input"]),
        "output": digit_run.sub(num_word_conv, instance["output"]),
        "answer": digit_run.sub(num_word_conv, instance["answer"]),
    }

    data_formatted.append(output_dict)

print("Total:", len(data_formatted))

with open("alpha_rep_dataset.json", "w") as f:
    json.dump(data_formatted, f, indent=4)

Total: 100000
Instructions added!


## Alpha Character Representation Format of Instructions Only

In [None]:
from num2words import num2words

def num_word_conv(matchobj):
    """Spell out the text of a regex match in words.

    Written as a replacement callback for ``re.sub``: the whole match
    (group 0) is handed to ``num2words`` and its result is returned.
    """
    matched_text = matchobj.group(0)
    return num2words(matched_text)

In [None]:
import random
import re
import json

template_name = "./templates/goat.json"
with open(template_name) as fp:
    template = json.load(fp)

dataset_name = "raw_dataset_500K.json"
with open(dataset_name, "rb") as test_file:
    data_original = json.load(test_file)

# Runs of digits to be spelled out in words. Compiled once, outside the loop,
# and written as a raw string: "\d" in a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
digit_run = re.compile(r"\d+")

data_formatted = []

for instance in data_original:

    arithmetic = instance["input"]

    # Add noise to the instruction so that the model is robust to diverse
    # question formats: 5% of expressions are rephrased as
    # "the <phrase> of A and B".
    if random.random() < 0.05:
        if " + " in arithmetic:
            arithmetic = "the sum of " + arithmetic.replace("+", "and")

        if " - " in arithmetic:
            arithmetic = "the difference of " + arithmetic.replace("-", "and")

        if " * " in arithmetic:
            arithmetic = "the product of " + arithmetic.replace("*", "and")

        if " / " in arithmetic:
            arithmetic = "the quotient and remainder of " + arithmetic.replace("/", "and")

    # Any operator symbol still present is always written out as a word.
    arithmetic = arithmetic.replace("+", "plus").replace("-", "minus")
    arithmetic = arithmetic.replace("*", "multiplied by").replace("/", "divided by")

    # Pick one of the templates keyed "1".."500" and fill in the expression.
    num = random.randint(1, 500)
    instruction = template[str(num)].format(input=arithmetic)

    # Only the instruction and the input have their digit runs spelled out;
    # "output" and "answer" are copied from the raw record unchanged.
    output_dict = {
        "instruction": digit_run.sub(num_word_conv, instruction),
        "input": digit_run.sub(num_word_conv, instance["input"]),
        "output": instance["output"],
        "answer": instance["answer"],
    }

    data_formatted.append(output_dict)

print("Total:", len(data_formatted))

with open("alpha_rep_Instruct_dataset.json", "w") as f:
    json.dump(data_formatted, f, indent=4)

Total: 500000
Instructions added!
