In [1]:
import time
from datetime import datetime, date, timedelta
import random

In [2]:
def rand_date(start=date(1400, 1, 1), end=date(2400, 1, 1)):
    """Pick a random date between `start` and `end`.

    One `random.random()` draw (a float in [0, 1)) scales the `end - start`
    timedelta; the scaled offset is then added to `start`. Adding a timedelta
    to a `date` ignores the sub-day part, so the result is a whole date.
    """
    span = end - start
    offset = random.random() * span
    return start + offset

In [3]:
import numpy as np

In [4]:
def convert_to_vector(xdate):
    """Encode a date as a one-hot feature vector plus a one-hot weekday label.

    The feature vector `x` has 72 entries, laid out as consecutive one-hot
    groups: the four decimal digits of the year (10 slots each), the month
    (12 slots), then the tens and ones digits of the day (10 slots each).

    Digits are zero-based, so digit `d` sets slot `d` of its group; the month
    is one-based, so month `m` sets slot `m - 1`. (Applying the `- 1` shift to
    digits as well would push digit 0 to the last slot through negative-index
    wraparound.)

    Args:
        xdate: object exposing `year`, `month`, `day` and `weekday()` the way
            `datetime.date` does.

    Returns:
        Tuple `(x, y)` of float NumPy arrays: `x` with shape (72,) and `y`
        with shape (7,), where `y[xdate.weekday()]` is 1 (Monday is index 0
        for `datetime.date`).
    """
    year = xdate.year
    day = xdate.day

    # (value, number of slots in the group, value that maps to slot 0)
    fields = [
        (year // 1000, 10, 0),
        ((year // 100) % 10, 10, 0),
        ((year // 10) % 10, 10, 0),
        (year % 10, 10, 0),
        (xdate.month, 12, 1),
        (day // 10, 10, 0),
        (day % 10, 10, 0),
    ]

    x = np.zeros(sum(size for _, size, _ in fields))
    offset = 0
    for value, size, base in fields:
        x[offset + value - base] = 1
        offset += size

    y = np.zeros(7)
    y[xdate.weekday()] = 1
    return x, y

Encoding: One-hot, separate integers

Training: Month, year, other years, other decades

Raw

Start small, build up

One month, same month plus another, keep adding months until a year, then add new years
One month, add a month, add a year, add several months, add several years, add decades
Start without leap years, then include leap years

The whole idea is to go from simple to complex
Introduce new concepts before past concepts are solidified
Don't want to confuse things - easily possible

Simplest would be a single month in non leap-year
When introducing leap years, do half leap years and half non-leap years


In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import Callback
from keras import backend as K

Using TensorFlow backend.


In [6]:
# Fully connected classifier: 72 one-hot date inputs (6 groups of 10 plus a
# group of 12) -> three tanh layers of 100 units -> softmax over 7 weekdays.
model = Sequential([
    Dense(100, input_shape=(10*6 + 12,), activation='tanh'),
    Dense(100, activation='tanh'),
    Dense(100, activation='tanh'),
    Dense(7, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

def reset_model():
    """Re-run the weight initializers of the module-level `model` in place.

    For every layer that owns a kernel, the kernel variable's initializer op
    is run in the current backend session; the bias variable is re-initialised
    the same way when the layer has one, so no trained parameters survive.

    NOTE(review): optimizer state (e.g. Adam moment estimates) is not touched
    here -- confirm whether that also needs resetting between runs.
    """
    session = K.get_session()
    for layer in model.layers:
        if not hasattr(layer, 'kernel_initializer'):
            continue
        layer.kernel.initializer.run(session=session)
        # Layers built without a bias expose `bias` as None.
        bias = getattr(layer, 'bias', None)
        if bias is not None:
            bias.initializer.run(session=session)

In [16]:
def generate_random(batch_size):
    """Build one batch of encoded dates drawn with `rand_date()`'s defaults.

    Args:
        batch_size: number of samples to draw.

    Returns:
        Tuple `(xs, ys)` of NumPy arrays stacking the feature vectors and the
        weekday labels returned by `convert_to_vector`, one row per sample.
    """
    samples = [convert_to_vector(rand_date()) for _ in range(batch_size)]
    xs = np.array([x for x, _ in samples])
    ys = np.array([y for _, y in samples])
    return xs, ys


def get_range_gen(start_date, end_date):
    """Return a batch generator restricted to dates from `start_date` up to `end_date`.

    Args:
        start_date: lower bound passed to `rand_date`.
        end_date: upper bound passed to `rand_date`.

    Returns:
        A function `generate_range(batch_size)` returning `(xs, ys)` NumPy
        arrays: the stacked outputs of `convert_to_vector` for `batch_size`
        dates drawn with `rand_date(start_date, end_date)`.
    """
    def generate_range(batch_size):
        samples = [
            convert_to_vector(rand_date(start_date, end_date))
            for _ in range(batch_size)
        ]
        xs = np.array([x for x, _ in samples])
        ys = np.array([y for _, y in samples])
        return xs, ys
    return generate_range


def get_month_gen(year, month):
    """Return a batch generator covering a single calendar month.

    The range runs from the first day of `month` to the first day of the
    following month (rolling over into January of `year + 1` for December).

    Args:
        year: calendar year.
        month: month number, 1-12.

    Returns:
        The generator function produced by `get_range_gen` for that range.
    """
    start = date(year, month, 1)
    if month == 12:
        end = date(year + 1, 1, 1)
    else:
        end = date(year, month + 1, 1)
    # `get_range_gen` is the name this file defines; `get_range_generator`
    # does not exist and raised NameError.
    return get_range_gen(start, end)


def combine_gens(generators, p=None):
    """Wrap several batch generators into one that picks a generator per call.

    Args:
        generators: sequence of callables taking a batch size.
        p: optional selection probabilities, one per generator. When omitted,
            every generator gets the same probability `1/len(generators)`.

    Returns:
        A function `generate(batch_size)` that draws one generator with
        `np.random.choice` and returns whatever that generator returns.
    """
    weights = p if p is not None else [1/len(generators)] * len(generators)

    def generate(batch_size):
        chosen = np.random.choice(generators, p=weights)
        return chosen(batch_size)

    return generate


def get_months_gen(year, months, p=None):
    """Return a batch generator that mixes several months of one year.

    Args:
        year: calendar year shared by all months.
        months: iterable of month numbers (1-12); one sub-generator is built
            per entry, so a repeated month is weighted more heavily.
        p: optional per-month selection probabilities forwarded to
            `combine_gens`.

    Returns:
        The combined generator function produced by `combine_gens`.
    """
    # Use the helper names this file defines (`get_month_gen`, `combine_gens`);
    # the `*_generator` spellings do not exist and raised NameError.
    gens = [get_month_gen(year, month) for month in months]
    return combine_gens(gens, p)


def get_year_gen(year):
    """Return a batch generator for the range Jan 1 of `year` to Jan 1 of `year + 1`."""
    # `get_range_gen` is the name defined in this file; `get_range_generator`
    # raised NameError.
    return get_range_gen(date(year, 1, 1), date(year + 1, 1, 1))


def get_years_gen(start_year, end_year):
    """Return a batch generator for Jan 1 of `start_year` to Jan 1 of `end_year + 1`.

    `end_year` is therefore included in the sampled range.
    """
    # `get_range_gen` is the name defined in this file; `get_range_generator`
    # raised NameError.
    return get_range_gen(date(start_year, 1, 1), date(end_year + 1, 1, 1))


def get_years_gen_p(years, p=None):
    """Return a batch generator that mixes whole years with given probabilities.

    Args:
        years: iterable of calendar years; one sub-generator is built per year.
        p: optional per-year selection probabilities forwarded to
            `combine_gens`.

    Returns:
        The combined generator function produced by `combine_gens`.
    """
    gens = [get_year_gen(year) for year in years]
    # Combine the per-year generators, not the raw year numbers.
    return combine_gens(gens, p)

In [8]:
from collections import deque

In [23]:
def train_until_acc(model, data_gen, desired_acc, output=True, limit=1000000):
    """Train on generated batches until a rolling accuracy reaches a target.

    Batches of 32 samples are requested from `data_gen` and fed to
    `model.train_on_batch`. Training stops once the mean of the last (up to)
    10 recorded accuracies is at least `desired_acc`, or once more than
    `limit` samples have been consumed.

    Args:
        model: object whose `train_on_batch(xs, ys)` returns `(loss, acc)`.
        data_gen: callable taking a batch size and returning `(xs, ys)`.
        desired_acc: rolling-accuracy threshold that ends training.
        output: when true, print the rolling accuracy every 100 batches.
        limit: sample budget; exceeding it ends training (1 million by default).

    Returns:
        The number of samples consumed.
    """
    batch_size = 32
    num_samples = 0
    batch_num = 0

    accs = deque(maxlen=10)
    # Seed the window just below the target so the loop is entered and the
    # mean never divides by zero; the seed is pushed out after 10 batches.
    accs.append(desired_acc - 0.01)

    while sum(accs) / len(accs) < desired_acc:
        xs, ys = data_gen(batch_size)
        _loss, acc = model.train_on_batch(xs, ys)
        accs.append(acc)
        num_samples += batch_size
        batch_num += 1
        if output and batch_num % 100 == 0:
            print("Minibatch", batch_num, sum(accs) / len(accs))
        if num_samples > limit:
            break
    return num_samples


def run_schedule(schedule):
    """Train the module-level `model` through a curriculum, stage by stage.

    The model is reset first. Each schedule entry is a
    `(name, data_gen, accuracy)` tuple handed to `train_until_acc`; the
    samples used per stage and the overall total are printed.
    """
    reset_model()

    samples_per_stage = []
    for name, data_gen, accuracy in schedule:
        used = train_until_acc(model, data_gen, accuracy)
        samples_per_stage.append(used)
        print('Trained {} in {} samples'.format(name, used))
    print('Training complete after {} samples'.format(sum(samples_per_stage)))

In [None]:
from scipy.special import softmax

def get_schedule_generator(x):
    """Unfinished sketch: build curriculum schedules parameterised by a vector `x`.

    The schedule literals below draw stage probabilities (`p[...]`) and
    accuracy thresholds (`x[...]`) from consecutive slices of `x`.

    NOTE(review): as written this function cannot run to completion -- see
    the inline notes (use of `X` before it is bound, names that are not
    defined in this function, and an unreachable tail after the first
    `return`).
    """
    # Helper intended to yield softmax-normalised slices of the parameters.
    class P:
        def __init__(self, x):
            self.x = x
        def __getitem__(self, key):
            # NOTE(review): uses the enclosing `x`, not `self.x`, and the X
            # class below defines no `add_constraints` method.
            x.add_constraints()
            return softmax(x[key])
    
    # NOTE(review): `X` is bound by the class statement further down in this
    # same function, so it is a local name that is still unbound here.
    x = X(x)
    p = P(x)
    
    # NOTE(review): `get_month_generator` / `get_months_generator` are not
    # defined in the visible part of the file (the helpers there are named
    # `get_month_gen` / `get_months_gen`).
    schedule = [
        ('Aug', get_month_generator(2019, 8), 0.99)
    ]
    # Placeholder marking "take this many values from the parameter vector".
    class StagedValue:
        def __init__(self, num):
            self.num = num
    # Counts how many parameters have been requested; never instantiated in
    # this function.
    class Stager:
        def __init__(self):
            self.total = 0
            self.constraints = []
        def __getitem__(self, key):
            self.total += key
            new_constraints = None # TODO Implement new constraints
            self.constraints.append(new_constraints)
            return StagedValue(key)
    
    # TODO How do I get this for multiple situations
    schedule = [ # Have a schedule generator. Maybe iterate through after staging to activate
        ('Aug', get_month_generator(2019, 8), 0.99),
        ('Sep', get_months_generator(2019, [8, 9], p=p[2]), 0.99),
        ('All', get_month_generator(2019, [8, 9]), 0.99)
    ]
    
    # need to create function that accepts the right number of parameters and stuff
    # use sparse matrix to represent constraints
    # Cursor over the parameter vector: `x[k]` returns the next `k` values and
    # advances the cursor by `k`; `len(x)` is the number of values consumed.
    class X:
        def __init__(self, x):
            self.x = x
            self.current = 0
        def __getitem__(self, key):
            values = self.x[self.current: self.current+key]
            self.current += key
            return values
        def __len__(self):
            return self.current
        
    def get_schedule(x):
        x = X(x)
        processed_schedule = []
        # NOTE(review): unpacks 4-tuples, but the `schedule` entries above are
        # 3-tuples.
        for name, func, params, probability in schedule:
            # process into actual schedule
            new_params = []
            for param in params:
                if type(param) is StagedValue:
                    new_params.append(x[param.num])
                else:
                    new_params.append(param)
            # NOTE(review): passes `params`, so `new_params` is built but
            # never used -- presumably `new_params` was intended.
            schedule_item = (name, func(*params), probability)
            processed_schedule.append(schedule_item)
        return processed_schedule
    
    # NOTE(review): `constraints` and `x0` are not assigned anywhere in this
    # function. Everything after this return is unreachable.
    return get_schedule, constraints, x0
    
    
    
    schedule = [ # Have a schedule generator. Maybe iterate through after staging to activate
        ('Aug', get_month_generator(2019, 8), 0.99),
        ('Sep', get_months_generator(2019, [8, 9], p=p[2]), x[1]),
        ('Oct', get_month_generator(2019, [8, 9, 10], p=p[3]), x[1]),
        ('Oct', get_month_generator(2019, [8, 9, 10]), 0.99)
    ]
    
    # Twelve stages over all months of 2019; each stage takes 12 values from
    # `x` as month probabilities and 1 value as its accuracy threshold.
    schedule = [
        ('2019-1', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-2', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-3', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-4', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-5', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-6', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-7', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-8', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-9', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-10', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-11', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019-12', get_months_gen(2019, range(1, 13), p=x[12]), x[1]),
        ('2019', get_year_generator(2019), 0.99),
#         ('2020', get_years_gen(2019, 2020), x[1]),
#         ('2020s', get_years_gen(2019, 2029), x[1]),
#         ('2000-2100', get_years_gen(2000, 2100), x[1]),
#         ('1900-2100', get_years_gen(1900, 2100), x[1]),
#         ('1-2100', get_years_gen(1, 2100), x[1]),
    ]
    
    
    return schedule, len(x)


Optimize only the last two stages: the new one and the most recent one before it. That way there is a lag, but most of the schedule is already fixed. We can also save the model's progress so that each run finishes faster.

In [15]:
# Curriculum of (stage name, batch generator, accuracy threshold) tuples:
# one month of 2019 at a time, then the whole year, then widening year ranges.
schedule = [
    ('Jul', get_months_gen(2019, [7]), 0.99),
    ('Aug', get_months_gen(2019, [8]), 0.85),
    ('Sep', get_months_gen(2019, [9]), 0.85),
    ('Oct', get_months_gen(2019, [10]), 0.85),
    ('Nov', get_months_gen(2019, [11]), 0.85),
    ('Dec', get_months_gen(2019, [12]), 0.85),
    ('Jan', get_months_gen(2019, [1]), 0.85),
    ('Feb', get_months_gen(2019, [2]), 0.85),
    ('Mar', get_months_gen(2019, [3]), 0.85),
    ('Apr', get_months_gen(2019, [4]), 0.85),
    ('May', get_months_gen(2019, [5]), 0.85),
    ('Jun', get_months_gen(2019, [6]), 0.85),
    # `get_year_gen` is the helper defined in this file.
    ('2019', get_year_gen(2019), 0.85),
    ('2020', get_years_gen(2019, 2020), 0.85),
    ('2020s', get_years_gen(2019, 2029), 0.85),
    ('2000-2100', get_years_gen(2000, 2100), 0.85),
    ('1900-2100', get_years_gen(1900, 2100), 0.85),
    ('1-2100', get_years_gen(1, 2100), 0.85),
]

NameError: name 'get_years_gen' is not defined

In [None]:
# Reset the model and train it through every stage of `schedule`.
run_schedule(schedule)

In [26]:
from itertools import chain

# Start from freshly initialised weights (run_schedule() resets as well).
reset_model()

# Cumulative curriculum of (stage name, batch generator, accuracy threshold):
# each stage adds one more month of 2019, then the year ranges widen.
schedule = [
    ('July', get_month_gen(2019, 7), 0.85),
    ('Aug', get_months_gen(2019, range(7, 9)), 0.85),
    ('Sep', get_months_gen(2019, range(7, 10)), 0.85),
    ('Oct', get_months_gen(2019, range(7, 11)), 0.85),
    ('Nov', get_months_gen(2019, range(7, 12)), 0.85),
    ('Dec', get_months_gen(2019, range(7, 13)), 0.85),
    ('Jan', get_months_gen(2019, chain(range(1, 2), range(7, 13))), 0.85),
    ('Feb', get_months_gen(2019, chain(range(1, 3), range(7, 13))), 0.85),
    ('Mar', get_months_gen(2019, chain(range(1, 4), range(7, 13))), 0.85),
    ('Apr', get_months_gen(2019, chain(range(1, 5), range(7, 13))), 0.85),
    ('May', get_months_gen(2019, chain(range(1, 6), range(7, 13))), 0.85),
    ('Jun', get_months_gen(2019, chain(range(1, 7), range(7, 13))), 0.85),
    # NOTE(review): range(1, 8) and range(7, 13) both contain July, so July is
    # sampled twice as often as the other months here -- confirm intent.
    ('Jul-Dec', get_months_gen(2019, chain(range(1, 8), range(7, 13))), 0.85),
    ('2019', get_year_gen(2019), 0.85),
    ('2020', get_years_gen(2019, 2020), 0.85),
    ('2020s', get_years_gen(2019, 2029), 0.85),
    ('2000-2100', get_years_gen(2000, 2100), 0.85),
    ('1900-2100', get_years_gen(1900, 2100), 0.85),
    ('1-2100', get_years_gen(1, 2100), 0.85),
    ('1-2100', get_years_gen(1, 2100), 0.999),
]

# Baseline without any curriculum: the full date range from the start.
bad_schedule = [
    ('1-2100', get_years_gen(1, 2100), 0.85)
]

# Same stages with a lower per-stage threshold; the final stage keeps its
# original 0.999 target.
schedule_2 = [(name, gen, 0.5) for name, gen, _ in schedule]
schedule_2[-1] = schedule[-1]

schedule_3 = [(name, gen, 0.2) for name, gen, _ in schedule]
schedule_3[-1] = schedule[-1]

run_schedule(schedule)

# print('\n\n\n')
# reset_model()
# run_schedule(model, schedule_2)
# print('\n\n\n')
# reset_model()
# run_schedule(model, schedule_3)

Minibatch 100 0.5375
Trained July in 6208 samples
Trained Aug in 3072 samples
Trained Sep in 32 samples
Minibatch 100 0.571875
Trained Oct in 6016 samples
Trained Nov in 32 samples
Trained Dec in 64 samples
Trained Jan in 32 samples
Minibatch 100 0.79375
Trained Feb in 3328 samples
Trained Mar in 64 samples
Trained Apr in 32 samples
Trained May in 32 samples
Trained Jun in 1568 samples
Trained Jul-Dec in 32 samples
Trained 2019 in 3008 samples
Minibatch 100 0.75
Trained 2020 in 5312 samples
Minibatch 100 0.584375
Minibatch 200 0.778125
Minibatch 300 0.79375
Trained 2020s in 9856 samples
Minibatch 100 0.509375
Minibatch 200 0.6625
Minibatch 300 0.6875
Minibatch 400 0.75625
Minibatch 500 0.796875
Minibatch 600 0.7875
Trained 2000-2100 in 19904 samples
Trained 1900-2100 in 2944 samples
Minibatch 100 0.45625
Minibatch 200 0.5125
Minibatch 300 0.6125
Minibatch 400 0.65
Minibatch 500 0.65625
Minibatch 600 0.68125
Minibatch 700 0.696875
Minibatch 800 0.721875
Minibatch 900 0.734375
Minibatch 

In [29]:
# Inspect the weekday label (second element of the returned pair) for 2019-07-28.
convert_to_vector(date(2019,7,28))[1]

array([0., 0., 0., 0., 0., 0., 1.])

In [30]:
# Predict weekday probabilities for 2035-05-18; the feature vector is reshaped
# into a single-row batch before being passed to the model.
model.predict(convert_to_vector(date(2035,5,18))[0].reshape((1, -1)))

array([[3.21856205e-04, 4.16064868e-04, 3.76252714e-03, 1.97594166e-02,
        8.88420641e-01, 7.28261918e-02, 1.44934375e-02]], dtype=float32)

So far, it seems that a model with 0.85 threshold trains with 160k samples and a model with 0.5 threshold trains with 120k samples.

I tested a model with no grading. It got through 100k batches (3,200k samples) without improving its accuracy at all.

Can we train some type of model that takes in training histories and outputs the ideal training history?

In [117]:
# Fit once and look at the recorded accuracy history.
# NOTE(review): `xs` and `ys` are not assigned at module level in the visible
# cells -- presumably left over from interactive state; confirm before re-running.
stuff = model.fit(xs, ys)

stuff
print(stuff.history['acc'])

Epoch 1/1


Next steps:

- Overfit on a single sample of, say, 1000; then, as soon as its prediction is better than chance, move along to another set.
  - Compare to pure random
- Overfit on a sample of just a few, and then start throwing in new ones? - This was the whole strategy from before
  - This would work, but we want our sample to be better than just chance, how do we determine the sample?

1. Just do in order with random distributions up until a certain threshold
  - Test different thresholds for progression, 50% accuracy, 90%, etc
2. Change distribution to reflect the learning that needs to happen
  - If you get a certain month right 90% of the time, only show it 10% of the time or something like that - determine this distribution
3. Learn the distribution using some kind of statistics or model
4. Create an ml model that perfects this situation

Also, should be considering taking a month where we know the date and asking for another date. So if we know that today is the 18th and a Thursday, we can infer that tomorrow is the 19th and a Friday.