In [1]:
# Load the autoreload extension and reload imported modules before each cell
# runs (mode 2), so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import re
import os
from pathlib import Path

In [3]:
# Root directory of the mathematics_dataset v1.0 data, given relative to the
# current working directory. The per-difficulty sub-directories are joined
# onto this path when files are opened.
data_path = Path("../data/mathematics_dataset-v1.0/")

In [4]:
# List the files shipped in the dataset's `extrapolate` directory.
!ls ../data/mathematics_dataset-v1.0/extrapolate

algebra__polynomial_roots_big.txt
arithmetic__add_or_sub_big.txt
arithmetic__add_sub_multiple_longer.txt
arithmetic__div_big.txt
arithmetic__mixed_longer.txt
arithmetic__mul_big.txt
arithmetic__mul_div_multiple_longer.txt
comparison__closest_more.txt
comparison__kth_biggest_more.txt
comparison__sort_more.txt
measurement__conversion.txt
numbers__place_value_big.txt
numbers__round_number_big.txt
probability__swr_p_level_set_more_samples.txt
probability__swr_p_sequence_more_samples.txt


In [5]:
# Training-file names to inspect. Each entry is the name of one file from the
# `extrapolate` listing with its trailing variant suffix (`_big`, `_longer`,
# `_more`, `_more_samples`) removed; `measurement__conversion.txt` has no
# suffix and is kept unchanged.
train_files = [
    "algebra__polynomial_roots.txt",
    "arithmetic__add_or_sub.txt",
    "arithmetic__add_sub_multiple.txt",
    "arithmetic__div.txt",
    "arithmetic__mixed.txt",
    "arithmetic__mul.txt",
    "arithmetic__mul_div_multiple.txt",
    "comparison__closest.txt",
    "comparison__kth_biggest.txt",
    "comparison__sort.txt",
    "measurement__conversion.txt",
    "numbers__place_value.txt",
    "numbers__round_number.txt",
    "probability__swr_p_level_set.txt",
    "probability__swr_p_sequence.txt",
]

In [6]:
# Variant suffixes that appear on the `extrapolate` file names; "" presumably
# stands for files with no suffix (e.g. measurement__conversion.txt).
# NOTE(review): not read by any cell visible here — confirm it is used later.
feature_keys = ["big", "longer", "more", "", "more_samples"]
# Sub-directories of `data_path` that hold the training files, one per
# difficulty level.
difficulties = ['train-easy', 'train-medium', 'train-hard']

In [7]:
# Preview the first few lines of every training file at every difficulty
# level, under a banner naming the file.
N_PREVIEW_LINES = 6  # lines shown per file (the original `i > 5` cut-off)

for fn in train_files:
    banner = "=" * len(fn)
    print("\n" + banner)
    print(fn)
    print(banner)
    for difficulty in difficulties:
        print("\n" + difficulty + "\n")
        with open(data_path/difficulty/fn, 'r') as f:
            # Iterate the file object directly so only the previewed lines are
            # read; `f.readlines()` would load the whole file into memory.
            for i, line in enumerate(f):
                if i >= N_PREVIEW_LINES:
                    break
                print(line.strip())


algebra__polynomial_roots.txt

train-easy

Solve -3*h**2/2 - 24*h - 45/2 = 0 for h.
-15, -1
Factor -n**2/3 - 25*n - 536/3.
-(n + 8)*(n + 67)/3
Let c**3/9 - 11*c**2/3 + 35*c - 75 = 0. What is c?
3, 15

train-medium

Factor 2*k**2/5 - 44598*k/5 + 44596/5.
2*(k - 22298)*(k - 1)/5
Solve -3*w**3 + 1374*w**2 - 5433*w - 6810 = 0 for w.
-1, 5, 454
Suppose 27*u**3 - 7329*u**2 = 0. Calculate u.
0, 2443/9

train-hard

Determine b so that -18*b**5 + 66654*b**4 - 3739744*b**3 - 3836048*b**2 + 1662112*b + 1691744 = 0.
-1, -2/3, 2/3, 58, 3646
Find r, given that 3*r**3 + 7644783*r**2 = 0.
-2548261, 0
Suppose 290*b**5/9 - 6031124*b**4/9 + 10451403836*b**3/3 + 32021141536*b**2/9 - 62071651798*b/9 - 432556804/3 = 0. What is b?
-2, -3/145, 1, 10399

arithmetic__add_or_sub.txt

train-easy

What is -5 - 110911?
-110916
What is -0.188 + -0.814?
-1.002
Sum 259 and -46.
213

train-medium

Total of 0.06 and -1977321735.
-1977321734.94
Add together 2 and 436273715.
436273717
Calculate 4062037 + -0.07.
4062036.9


probability__swr_p_sequence.txt

train-easy

What is prob of sequence ccbc when four letters picked without replacement from nnscspb?
0
Three letters picked without replacement from {g: 3, w: 1, t: 7, u: 3}. Give prob of sequence tuw.
1/104
Three letters picked without replacement from dxaxxaaxxxaax. What is prob of sequence aad?
5/429

train-medium

Two letters picked without replacement from {p: 12, v: 6, k: 1}. Give prob of sequence vk.
1/57
What is prob of sequence dd when two letters picked without replacement from dnnonnodonvdnnv?
1/35
Calculate prob of sequence yovy when four letters picked without replacement from {o: 2, y: 4, v: 4}.
2/105

train-hard

Two letters picked without replacement from uxjqxjjqxuxqx. What is prob of sequence xj?
5/52
What is prob of sequence dd when two letters picked without replacement from cccadcdaddd?
2/11
Calculate prob of sequence fxf when three letters picked without replacement from {a: 4, u: 3, f: 2, x: 8}.
1/255


In [8]:
# Select a single training file by name.
# NOTE(review): no visible cell reads this value before a later loop rebinds
# `fn` — confirm whether this assignment is still needed.
fn = "arithmetic__add_or_sub.txt"


In [9]:
# Compiled regex for a signed integer or decimal literal: an optional sign,
# an optional "digits followed by a dot" prefix (kept in a non-capturing
# group so findall returns whole matches), then at least one digit.
float_pattern = re.compile(r"[+-]?(?:[0-9]*[.])?[0-9]+")

In [10]:
# Try the compiled pattern on one sample question; findall returns every
# matched number as a string, in order of appearance.
sample_question = "What is -0.188 + -0.814?"
matches = float_pattern.findall(sample_question)

In [11]:
# Show the list of number strings found by the regex.
print(matches)

['-0.188', '-0.814']


In [12]:
def mean_number_length(lines, pattern, reducer=None, sample_every=None):
    """Average a per-line "number length" feature over an iterable of lines.

    For each line, every match of ``pattern`` is collected and the character
    lengths of the matches (sign and decimal point included) are reduced to a
    single value; a line without any match contributes 0. The result is the
    mean of those per-line values over all lines.

    Args:
        lines: Iterable of strings (e.g. an open text file). Each line is
            stripped of surrounding whitespace before matching.
        pattern: Compiled regular expression whose matches are measured.
        reducer: Callable mapping a non-empty list of match lengths to one
            number. Defaults to the arithmetic mean; pass ``max`` to use the
            longest number on each line instead.
        sample_every: If a positive integer, print the matches of every
            ``sample_every``-th line (starting with the first) as a spot check.

    Returns:
        The mean per-line feature value, or 0. when ``lines`` yields nothing.
    """
    if reducer is None:
        reducer = lambda lengths: sum(lengths) / len(lengths)
    per_line = []
    for idx, raw in enumerate(lines):
        found = pattern.findall(raw.strip())
        if sample_every and idx % sample_every == 0:
            print(found)
        lengths = [len(tok) for tok in found]
        # Lines with no number get 0 rather than being skipped, so they still
        # count towards the denominator of the overall mean.
        per_line.append(reducer(lengths) if lengths else 0.)
    return sum(per_line) / len(per_line) if per_line else 0.


SAMPLE_EVERY = int(5e5)  # print the matches of one line out of this many

# Maps "<difficulty>/<file name>" to the mean per-line number length.
float_results = dict()
for fn in train_files:
    print("\n" + "=" * len(fn))
    print(fn)
    print("=" * len(fn))
    for difficulty in difficulties:
        print("\n" + difficulty + "\n")
        with open(data_path/difficulty/fn, 'r') as f:
            total_avg_len = mean_number_length(
                f, float_pattern, sample_every=SAMPLE_EVERY
            )
        float_results[difficulty + '/' + fn] = total_avg_len
        print('Average feature: ', total_avg_len)


algebra__polynomial_roots.txt

train-easy

['-3', '2', '2', '24', '45', '2', '0']
['35', '5', '45', '4', '165', '3', '55', '2', '30', '0']
['-3', '3', '4', '33', '2', '4', '3', '4', '33', '4']
Average feature:  1.6958307381347562

train-medium

['2', '2', '5', '44598', '5', '44596', '5']
['-3', '5', '987', '4', '1962', '3', '6', '2', '1965', '981', '0']
['8', '3', '20972', '2', '13760240', '20593200', '0']
Average feature:  2.4275948104422063

train-hard

['-18', '5', '66654', '4', '3739744', '3', '3836048', '2', '1662112', '1691744', '0']
['2', '5', '372', '4', '27080', '3', '202128', '2', '18926208', '149022720', '0']
['-2', '2', '13', '5624', '13', '1197120', '13']
Average feature:  3.2544570248489006

arithmetic__add_or_sub.txt

train-easy

['-5', '110911']
['0.3', '445115']
['-1.5', '227230']
Average feature:  5.206219456219456

train-medium

['0.06', '-1977321735']
['4', '54838336.46']
['0.4', '0.0952769']
Average feature:  7.7267046017046015

train-hard

['-0.5', '-178283837383

In [108]:
# Display the per-file feature values keyed by "<difficulty>/<file name>".
# NOTE(review): the values shown below (e.g. 2.7458 for
# train-easy/algebra__polynomial_roots.txt) differ from the averages printed
# by the loop that fills this dict (1.6958 for the same file), and the
# execution count jumps to 108 — this output presumably comes from a different
# run of that loop. Re-run the notebook top to bottom to confirm.
float_results

{'train-easy/algebra__polynomial_roots.txt': 2.7458457458457457,
 'train-medium/algebra__polynomial_roots.txt': 4.625938625938626,
 'train-hard/algebra__polynomial_roots.txt': 6.743892743892744,
 'train-easy/arithmetic__add_or_sub.txt': 5.868015618015618,
 'train-medium/arithmetic__add_or_sub.txt': 8.882789132789133,
 'train-hard/arithmetic__add_or_sub.txt': 11.964767214767214,
 'train-easy/arithmetic__add_sub_multiple.txt': 1.8752853752853753,
 'train-medium/arithmetic__add_sub_multiple.txt': 2.418852918852919,
 'train-hard/arithmetic__add_sub_multiple.txt': 2.8447160947160945,
 'train-easy/arithmetic__div.txt': 3.186057186057186,
 'train-medium/arithmetic__div.txt': 4.878652878652878,
 'train-hard/arithmetic__div.txt': 6.607759357759358,
 'train-easy/arithmetic__mixed.txt': 2.2798575298575297,
 'train-medium/arithmetic__mixed.txt': 2.8877466377466376,
 'train-hard/arithmetic__mixed.txt': 3.519106269106269,
 'train-easy/arithmetic__mul.txt': 4.583364083364083,
 'train-medium/arithmeti