In [None]:
import tensorflow as tf
from   tensorflow import keras
from   tensorflow.keras import regularizers
from   tensorflow.keras import Sequential
from   tensorflow.keras.layers import Dropout, Dense

In [None]:
from   matplotlib import pyplot as plt

In [None]:
import errno
import glob
import json
import numpy as np
import os
import pandas as pd
import pathlib
import shutil
import subprocess
import tempfile
import uuid

In [None]:
# Local modules
import config
import datasets
import op_stats
import utils

In [None]:
# Globals
# Ticker symbol this notebook works on: it selects the dataset and pooled
# statistics that get loaded and names the per-ticker model directory.
TICKER = 'SPY'

In [None]:
# For saving the model
# PREFIX is the name prefix matched when globbing for session directories
# to clean up inside TICKER_MODEL_DIR.
PREFIX = 'model'
# Per-ticker directory under config.ML_MODELS_DIR that holds the model output.
TICKER_MODEL_DIR = os.path.join(config.ML_MODELS_DIR, TICKER)

In [None]:
# Set the values to be used for working with the data
BATCH_SIZE     = 512  # examples per batch for train/validation/test datasets
BUFFER_SIZE    = 100  # shuffle buffer applied to the training dataset
MAX_MARGIN     = 10  # passed to datasets.load_dataset as max_margin and checked against its metadata
MIN_PROFIT     = 1  # passed to datasets.load_dataset as min_profit and checked against its metadata
MIN_DATAPOINTS = 4*10**6  # requested total_datapoints; the load must return at least 99% of this
VAL_TEST_COUNT = 20000  # number of examples held out for validation, and again for test
MAX_EPOCHS     = 200  # upper bound on training epochs (early stopping may end sooner)

In [None]:
# Make sure the per-ticker model directory exists. exist_ok=True replaces the
# hand-written errno.EEXIST check and tolerates an already-existing directory.
os.makedirs(TICKER_MODEL_DIR, exist_ok=True)

# Remove session directories left behind by previous runs. On a freshly
# created directory the glob matches nothing, so this is a no-op.
for tmpdir in glob.glob(os.path.join(TICKER_MODEL_DIR, PREFIX + '*')):
    shutil.rmtree(tmpdir)

# Scratch directory for this session. It is created with the same PREFIX the
# cleanup above globs for, so the two cannot drift apart if PREFIX changes.
MODEL_DIR = tempfile.mkdtemp(prefix=PREFIX, dir=TICKER_MODEL_DIR)

In [None]:
# Fetch the labelled examples for TICKER using the notebook-level settings.
load_options = dict(
    max_margin=MAX_MARGIN,
    min_profit=MIN_PROFIT,
    total_datapoints=MIN_DATAPOINTS,
    loss_ratio=3,
    loss_pool_multiplier=2,
    verbose=True,
)
examples_dataset = datasets.load_dataset(TICKER, **load_options)
X, Y = examples_dataset.data, examples_dataset.labels
metadata = examples_dataset.metadata

# Sanity-check what came back against what was requested.
# 1. The dataset belongs to the ticker we asked for.
assert metadata['ticker'] == TICKER
# 2. We received at least 99% of the requested number of datapoints.
assert metadata['total_datapoints'] >= MIN_DATAPOINTS*0.99
# 3. The remaining settings were honoured.
assert metadata['max_margin'] == MAX_MARGIN
assert metadata['min_profit'] == MIN_PROFIT

In [None]:
# Collect the statistics
# Pooled per-feature means and variances for TICKER; they are saved next to
# the model and used to standardize X further down.
# NOTE(review): presumably pandas Series indexed by feature name (they are
# used via .index, .pow and .to_pickle below) -- confirm against op_stats.
pooled_means, pooled_variances = op_stats.pool_stats_from_stats_df(TICKER)

In [None]:
# Persist the normalization statistics into the session directory right away,
# one pickle per statistic, named after the statistic.
for stat_name, stat_values in (('means', pooled_means),
                               ('variances', pooled_variances)):
    stat_values.to_pickle(os.path.join(MODEL_DIR, stat_name))

# Standard deviation is the square root of the pooled variance.
pooled_stds = pooled_variances.pow(1/2)

In [None]:
# Display the pooled means as the cell output for a quick visual check.
pooled_means

In [None]:
# Anything that consumes this model has to feed its columns in the same
# order used for training, so record that order in the metadata.
feature_order = pooled_means.index.tolist()
metadata.update({'feature_order': feature_order})
print(feature_order)

In [None]:
# Standardize the features: (value - pooled mean) / pooled std, with the
# columns selected in feature_order.
# Always start from the raw frame in examples_dataset.data rather than from X
# itself, so that re-running this cell cannot normalize already-normalized
# values a second time.
X = (
    examples_dataset.data[feature_order] - pooled_means[feature_order]
) / pooled_stds[feature_order]

In [None]:
# Rows of X are examples and columns are features.
n_examples, n_features = X.shape
print('{} examples\n{} features'.format(*X.shape))

In [None]:
# Build the datasets for train, validation and test.
#
# Shuffle once with a buffer covering every example. reshuffle_each_iteration
# must be False here: take()/skip() below are re-evaluated every time one of
# the derived datasets is iterated, and with the default (True) a new
# permutation would be drawn on each pass. The "train", "validation" and
# "test" subsets would then contain different examples every epoch and leak
# into one another.
dataset = tf.data.Dataset.from_tensor_slices((X.values, Y.values)).shuffle(
    n_examples, reshuffle_each_iteration=False)

# Split up the data: everything except the two hold-out sets is for training.
n_train = n_examples - 2*VAL_TEST_COUNT
# A non-positive n_train would make take()/skip() silently return the wrong
# subsets, so fail loudly instead.
assert n_train > 0, 'not enough examples to hold out validation and test sets'

train_dataset    = dataset.take(n_train)
validate_dataset = dataset.skip(n_train)
test_dataset     = validate_dataset.take(VAL_TEST_COUNT).batch(BATCH_SIZE)
validate_dataset = validate_dataset.skip(VAL_TEST_COUNT).batch(BATCH_SIZE)

# The training dataset repeats forever, so fit() needs to be told how many
# batches make up one epoch.
STEPS_PER_EPOCH = n_train//BATCH_SIZE

# Reshuffle the training examples each epoch.
# NOTE(review): with the base order now fixed, BUFFER_SIZE alone determines
# how much the training order varies between epochs; consider a larger
# buffer if stronger shuffling is wanted.
train_dataset = train_dataset.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=True).batch(BATCH_SIZE).repeat()

In [None]:
# Inverse-time learning-rate decay starting at 0.001:
#   lr(step) = 0.001 / (1 + decay_rate * step / decay_steps)
# With decay_rate=1 the rate has halved after 7 epochs' worth of steps.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=7 * STEPS_PER_EPOCH,
    decay_rate=1,
    staircase=False,
)

# Keep only the best full model (not just its weights) seen so far, judged by
# the lowest validation loss, inside the session directory.
checkpoint_filepath = os.path.join(MODEL_DIR, 'checkpoint')
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

class MetadataSaver(keras.callbacks.Callback):
    """Rewrite the session's metadata file whenever validation loss improves.

    On every improvement the module-level ``metadata`` dict is updated with
    the epoch's validation accuracy and loss and dumped as JSON to
    ``MODEL_DIR/metadata``.
    """

    # Lowest validation loss seen so far. The class-level value is the
    # starting point; it is shadowed per instance on the first improvement.
    _best_loss = np.inf

    def on_epoch_end(self, epoch, logs=None):
        """Save the metadata if this epoch's ``val_loss`` is a new best.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dict supplied by Keras. Epochs without a
                ``val_loss`` entry are ignored.
        """
        val_loss = (logs or {}).get('val_loss')
        # Written as `not (loss < best)` rather than `loss >= best` so that a
        # NaN loss is rejected: every comparison with NaN is False, and the
        # old form let NaN through and made it the permanent "best".
        if val_loss is None or not val_loss < self._best_loss:
            return

        self._best_loss = val_loss
        metadata.update({
            'accuracy': float(logs['val_accuracy']),
            'loss': float(self._best_loss),
        })
        # Serialize before opening the file: opening with 'w' truncates it,
        # so a failure while building the payload must not wipe the
        # previously saved metadata.
        serialized = json.dumps(metadata)
        with open(os.path.join(MODEL_DIR, 'metadata'), 'w') as MF:
            MF.write(serialized)

def get_callbacks(name):
    """Return the list of Keras callbacks used while training.

    The list holds, in order: the shared module-level checkpoint callback, a
    new MetadataSaver, and a new EarlyStopping that halts training after 30
    epochs without improvement in ``val_loss``.

    Args:
        name: Accepted for the caller's convenience; currently unused.
    """
    metadata_saver = MetadataSaver()
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=30,
    )
    return [model_checkpoint_callback, metadata_saver, early_stopping]

# Fully connected network: ReLU hidden layers of decreasing width followed by
# a single linear output unit (a raw logit; no activation).
hidden_units = (512, 256, 64, 32, 16)
model = Sequential(
    # Only the first layer needs to be told the input width.
    [Dense(hidden_units[0], activation='relu', input_shape=(n_features,))]
    + [Dense(units, activation='relu') for units in hidden_units[1:]]
    + [Dense(1)]
)

# The network outputs raw logits (the loss is built with from_logits=True).
# The string metric 'accuracy' would threshold predictions at 0.5, which is
# only correct for probabilities; for logits the decision boundary is 0.0.
# The metric keeps the name 'accuracy' so the 'accuracy' / 'val_accuracy'
# keys in the training logs and history are unchanged.
model.compile(
    optimizer=keras.optimizers.Adam(lr_schedule),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
)

model.summary()

In [None]:
# Train for at most MAX_EPOCHS; the training dataset repeats indefinitely, so
# the length of an epoch is given explicitly.
fit_options = dict(
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=MAX_EPOCHS,
    validation_data=validate_dataset,
    callbacks=get_callbacks('testing'),
    verbose=1,
)
history = model.fit(train_dataset, **fit_options)

In [None]:
# Make some predictions to figure out what the percentiles are for outputs
# and add these percentiles to the saved model's metadata.
preds = model.predict(test_dataset)
percentiles = list(range(50, 100, 5)) + [99, 99.9, 99.99, 99.999]
percentile_values = np.percentile(preds, percentiles)

metadata_path = os.path.join(MODEL_DIR, 'metadata')
# Reload the metadata that was saved for the best epoch.
with open(metadata_path) as MF:
    metadata = json.load(MF)

# Coerce to plain floats so the values are JSON-serializable whatever numpy
# scalar type np.percentile returns.
metadata['percentiles'] = {
    pct: float(value) for pct, value in zip(percentiles, percentile_values)
}

# Serialize first and only then overwrite the file, so a serialization
# failure cannot leave a half-written metadata file behind.
serialized = json.dumps(metadata)
with open(metadata_path, 'w') as MF:
    MF.write(serialized)

In [None]:
# Archive this session's output into a uniquely named tarball in the ticker's
# model directory. Members are stored by bare file name (tar runs with
# -C MODEL_DIR), so the session directory itself is not part of the archive.
tarball_name = '{}.tar'.format(uuid.uuid4())
tarball_path = os.path.join(config.ML_MODELS_DIR, TICKER, tarball_name)
files_to_tar = [
    os.path.basename(entry) for entry in glob.glob('{}/*'.format(MODEL_DIR))
]
tar_command = ['tar', '-C', MODEL_DIR, '-cf', tarball_path]
tar_command.extend(files_to_tar)
subprocess.check_call(tar_command)

# The session directory is no longer needed once it has been archived.
shutil.rmtree(MODEL_DIR, ignore_errors=True)

In [None]:
# Test the model
# Evaluates the model as it stands in memory on the held-out test batches;
# the result is shown as the cell output.
model.evaluate(test_dataset)

In [None]:
# Plot the training curves. The 'val_*' series come from the validation set
# passed to fit(), not from the separate test set, so label them accordingly.
fig, (loss_ax, accuracy_ax) = plt.subplots(2, 1)

# Loss during training
loss_ax.set_title('Loss')
loss_ax.plot(history.history['loss'], label='train')
loss_ax.plot(history.history['val_loss'], label='validation')
loss_ax.legend()

# Accuracy during training
accuracy_ax.set_title('Accuracy')
accuracy_ax.plot(history.history['accuracy'], label='train')
accuracy_ax.plot(history.history['val_accuracy'], label='validation')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.legend()

# Keep the lower panel's title from colliding with the upper panel's ticks.
fig.tight_layout()
plt.show()