In [1]:
import json
import re

In [None]:
# Load the dataset: the file is read line by line and every non-blank line is
# parsed as its own JSON document (JSON Lines layout).
data = []
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding (e.g. cp1252 on Windows).
with open('data/train.json', 'r', encoding='utf-8') as file:
    for line in file:
        if line.strip():  # Skip empty lines
            data.append(json.loads(line))

# data now holds one parsed JSON value per non-empty line. Later cells index
# each entry by key (record["Skeleton"]), so the entries are expected to be dicts.
print(f"Loaded {len(data)} records")

Loaded 498795 records


## ChatGPT Generated Code
Takes the imported records (a list of dictionaries), converts each equation to a Reverse Polish Notation (RPN) string, and stores that string under a separate key in the same record.

In [4]:
# Names that are always treated as function calls by the converter.
FUNCTIONS = {"sin", "cos", "exp", "log"}

# Binary operators, keyed by symbol. Each entry records how tightly the
# operator binds (larger number = binds tighter) and the side it groups toward.
OPERATORS = {
    symbol: {'precedence': precedence, 'associativity': associativity}
    for symbol, precedence, associativity in (
        ('+', 2, 'left'),
        ('-', 2, 'left'),
        ('*', 3, 'left'),
        ('/', 3, 'left'),
        ('**', 4, 'right'),
    )
}

def tokenize(expression):
    """
    Tokenizes the expression into numbers, identifiers, operators, parentheses,
    and argument separators.

    Recognized tokens, tried in this order at each position:
      * decimal numbers with a fractional part ("2.5"), then plain integers ("42")
      * identifiers (variables or function names): a letter/underscore followed
        by letters, digits, or underscores
      * the two-character power operator "**" (tried before single "*")
      * the single characters + - * / ( ) and ","

    The comma is emitted as its own token so that the argument-separator
    handling in the RPN converter can actually run; without it, "f(a+b,c)"
    would lose the boundary between the two arguments.

    Any character that matches none of the above (whitespace included) is
    skipped without error.

    Args:
        expression: The infix expression as a string.

    Returns:
        A list of token strings in source order; empty for an empty string.
    """
    token_pattern = r'\d+\.\d+|\d+|[A-Za-z_][A-Za-z0-9_]*|\*\*|[+\-*/(),]'
    return re.findall(token_pattern, expression)

def infix_to_rpn(expression, neg_token="neg"):
    """
    Converts an infix expression string to its Reverse Polish Notation (RPN) equivalent.

    The expression is split with tokenize() and converted with an operator
    stack and an output list (shunting-yard style), using the precedence and
    associativity stored in OPERATORS.

    Handling beyond plain binary operators:
      * Unary minus (a '-' at the start, or right after an operator, '(' or
        ',') is emitted as ``neg_token`` applied to a single operand, so
        "-x1" becomes "x1 neg" rather than the malformed "x1 -". It binds
        just below '**', so "-x**2" is "x 2 ** neg". Unary plus is a no-op
        and is dropped.
      * An identifier is treated as a function when it is listed in FUNCTIONS
        or is immediately followed by '(' (so "tan(x)" becomes "x tan").
      * Parentheses never appear in the result. An unmatched ')' is ignored
        and an unmatched '(' is discarded; no exception is raised for either.

    Args:
        expression: Infix expression string.
        neg_token: Token written to the output for unary minus.

    Returns:
        The RPN tokens joined by single spaces ("" for an empty expression).
    """
    number_re = re.compile(r'\d+(\.\d+)?')
    identifier_re = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

    # Internal stack marker for unary minus. It is neither an identifier nor a
    # key of OPERATORS, so it cannot collide with anything tokenize() returns.
    unary_minus = 'u-'

    # Unary minus binds tighter than '*' and '/' but looser than '**'.
    power_spec = OPERATORS.get('**')
    if power_spec is not None:
        unary_precedence = power_spec['precedence'] - 0.5
    else:
        unary_precedence = max(
            (spec['precedence'] for spec in OPERATORS.values()), default=0
        ) + 0.5

    def precedence_of(stack_item):
        if stack_item == unary_minus:
            return unary_precedence
        return OPERATORS[stack_item]['precedence']

    output = []
    stack = []

    def emit(stack_item):
        # Translate the internal unary marker to the caller-visible token.
        output.append(neg_token if stack_item == unary_minus else stack_item)

    tokens = tokenize(expression)
    # True whenever the next token should start an operand; this is what
    # distinguishes a unary sign from a binary '+' / '-'.
    expect_operand = True

    for index, token in enumerate(tokens):
        is_identifier = identifier_re.fullmatch(token) is not None
        if is_identifier or number_re.fullmatch(token):
            next_token = tokens[index + 1] if index + 1 < len(tokens) else None
            if token in FUNCTIONS or (is_identifier and next_token == '('):
                # Function name: held on the stack until its ')' is reached.
                stack.append(token)
            else:
                output.append(token)
                expect_operand = False
        elif token == ',':
            # Function argument separator: flush the finished argument's
            # pending operators, leaving the '(' in place.
            while stack and stack[-1] != '(':
                emit(stack.pop())
            expect_operand = True
        elif token in OPERATORS:
            if expect_operand and token in ('+', '-'):
                # Prefix sign. It has no left operand, so nothing is popped.
                if token == '-':
                    stack.append(unary_minus)
                continue
            spec = OPERATORS[token]
            while stack and (stack[-1] in OPERATORS or stack[-1] == unary_minus):
                top_precedence = precedence_of(stack[-1])
                if spec['associativity'] == 'left':
                    should_pop = spec['precedence'] <= top_precedence
                else:
                    should_pop = spec['precedence'] < top_precedence
                if not should_pop:
                    break
                emit(stack.pop())
            stack.append(token)
            expect_operand = True
        elif token == '(':
            stack.append(token)
            expect_operand = True
        elif token == ')':
            # Pop until the matching '('.
            while stack and stack[-1] != '(':
                emit(stack.pop())
            if stack:
                stack.pop()  # Remove the '('
            # Only function names are ever pushed as identifiers, so an
            # identifier on top of the stack is the function being called.
            if stack and identifier_re.fullmatch(stack[-1]):
                output.append(stack.pop())
            expect_operand = False

    # Flush whatever is left; a leftover '(' means unbalanced input and must
    # not leak into the RPN string.
    while stack:
        leftover = stack.pop()
        if leftover != '(':
            emit(leftover)

    return " ".join(output)

# Attach an "RPN" entry to every record that carries a "Skeleton" equation;
# records without that key are left untouched.
for entry in data:
    if "Skeleton" not in entry:
        continue
    entry["RPN"] = infix_to_rpn(entry["Skeleton"])

In [6]:
## Spot-check the RPN conversion by printing the first PREVIEW_COUNT records.
# The slice size now matches the stated intent (20); it was 200, which floods
# the cell output.
PREVIEW_COUNT = 20
for idx, record in enumerate(data[:PREVIEW_COUNT], 1):
    print(f"\nRecord {idx}:")
    print("Skeleton:")
    # "RPN" is only set for records that have a "Skeleton" key, so use .get()
    # to avoid a KeyError on records that were skipped by the conversion loop.
    print(record.get("Skeleton", "<missing>"))
    print("RPN:")
    print(record.get("RPN", "<not converted>"))


Record 1:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 2:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 3:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 4:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 5:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 6:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 7:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 8:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 9:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 10:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 11:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 12:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 13:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 14:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 15:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 16:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 * cos * C +

Record 17:
Skeleton:
C*cos(C*x1)+C
RPN:
C C x1 *