In [19]:
from pprint import pprint
import re 
import collections 

In [26]:
# Read the whole dataset into a single string (it is split into lines below).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('dataset.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [27]:
# One entry per line of the dataset: the stripped line split on '='.
samples = [row.strip().split('=') for row in raw_text.splitlines()]

In [28]:
# Preview the first ten parsed samples
samples[:10]

[['(7-3*z)*(-5*z-9)', '15*z**2-8*z-63'],
 ['-9*s**2', '-9*s**2'],
 ['(2-2*n)*(n-1)', '-2*n**2+4*n-2'],
 ['x**2', 'x**2'],
 ['(4-x)*(x-23)', '-x**2+27*x-92'],
 ['(7-5*c)*(3*c-17)', '-15*c**2+106*c-119'],
 ['-8*x*(3*x+14)', '-24*x**2-112*x'],
 ['-2*k*(5*k-9)', '-10*k**2+18*k'],
 ['(3*cos(c)-19)*(7*cos(c)+13)', '21*cos(c)**2-94*cos(c)-247'],
 ['-8*j*(-8*j-3)', '64*j**2+24*j']]

In [6]:
# Number of parsed samples (one per line of raw_text)
len(samples)

1000000

In [10]:
# Length, in characters, of the longest first field (the input side)
max(len(pair[0]) for pair in samples)

29

In [11]:
# Length, in characters, of the longest second field (the output side)
max(len(pair[1]) for pair in samples)

28

In [12]:
# Character-level vocabulary: how often each non-whitespace character occurs.
# Iterating a str yields single characters (not lines), so whitespace such as
# the newline separators is filtered out explicitly. The Counter is converted
# back to a plain dict so `vocab` keeps the same type and first-seen key order.
vocab = dict(
    collections.Counter(char for char in raw_text if not char.isspace())
)

In [14]:
# Pretty-print the per-character counts
pprint(vocab)

{'(': 1717013,
 ')': 1717013,
 '*': 6296348,
 '+': 1249605,
 '-': 2932951,
 '0': 621755,
 '1': 1551639,
 '2': 2739472,
 '3': 950998,
 '4': 952516,
 '5': 803947,
 '6': 854153,
 '7': 646955,
 '8': 799495,
 '9': 500412,
 '=': 1000000,
 'a': 284688,
 'c': 284521,
 'h': 244132,
 'i': 528183,
 'j': 244094,
 'k': 245042,
 'n': 566389,
 'o': 283088,
 's': 568438,
 't': 285215,
 'x': 243916,
 'y': 246024,
 'z': 245599}


In [16]:
# Number of distinct characters counted in vocab
len(vocab)

29

In [17]:
# Find the frequency of common patterns in the dataset (sin, cos etc.)
def freq(pattern, s):
    """Tally the matches of the regex ``pattern`` found in the string ``s``.

    Returns a list of ``(match, count)`` tuples ordered from the most to the
    least frequent, as produced by ``collections.Counter.most_common``.
    """
    matches = re.findall(pattern, s)
    tally = collections.Counter(matches)
    return tally.most_common()

In [29]:
# Single-character frequencies: "." matches any one character except a newline
freq(".", raw_text)

[('*', 6296348),
 ('-', 2932951),
 ('2', 2739472),
 ('(', 1717013),
 (')', 1717013),
 ('1', 1551639),
 ('+', 1249605),
 ('=', 1000000),
 ('4', 952516),
 ('3', 950998),
 ('6', 854153),
 ('5', 803947),
 ('8', 799495),
 ('7', 646955),
 ('0', 621755),
 ('s', 568438),
 ('n', 566389),
 ('i', 528183),
 ('9', 500412),
 ('t', 285215),
 ('a', 284688),
 ('c', 284521),
 ('o', 283088),
 ('y', 246024),
 ('z', 245599),
 ('k', 245042),
 ('h', 244132),
 ('j', 244094),
 ('x', 243916)]

### All tokens 
- **digits:** `0, 1, 2, 3, 4, 5, 6, 7, 8, 9`
- **variables:** `a, c, h, i, j, k, n, o, s, t, x, y, z`
- **parentheses:** `(, )`
- **math operators:** `*, **, +, -`
- **trig functions:** `sin, cos, tan`


In [30]:
# Token-level pattern. A raw string is used so that the regex escapes (\d, \w,
# \(, ...) reach the `re` module untouched instead of being treated as
# (invalid) Python string escapes. Alternatives are tried left to right, so the
# function names sin/cos/tan come before the single-character \w, and `\*+`
# consumes a run of asterisks so that `**` is counted as one token.
vocab_pattern = r"sin|cos|tan|\d|\w|\(|\)|\+|-|\*+"
freq(vocab_pattern, raw_text)

[('*', 4214642),
 ('-', 2932951),
 ('2', 2739472),
 ('(', 1717013),
 (')', 1717013),
 ('1', 1551639),
 ('+', 1249605),
 ('**', 1040853),
 ('4', 952516),
 ('3', 950998),
 ('6', 854153),
 ('5', 803947),
 ('8', 799495),
 ('7', 646955),
 ('0', 621755),
 ('9', 500412),
 ('s', 489862),
 ('i', 488935),
 ('n', 487884),
 ('y', 246024),
 ('t', 245958),
 ('z', 245599),
 ('a', 245431),
 ('c', 245193),
 ('k', 245042),
 ('h', 244132),
 ('j', 244094),
 ('x', 243916),
 ('o', 243760),
 ('cos', 39328),
 ('tan', 39257),
 ('sin', 39248)]

In [31]:
# Need to create a vocabulary for all these tokens. 