In [None]:
import tensorflow as tf
from   tensorflow import keras
from   tensorflow.keras import regularizers
from   tensorflow.keras import Sequential
from   tensorflow.keras.layers import Dropout, Dense

In [None]:
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

In [None]:
from   IPython import display
from   matplotlib import pyplot as plt

In [None]:
import errno
import glob
import json
import numpy as np
import os
import pandas as pd
import pathlib
import shutil
import subprocess
import tempfile
import uuid

In [None]:
# Local modules
import config
import utils

In [None]:
# Run configuration (tune these to change what gets trained)
TICKER = 'SPY'               # symbol whose spreads are loaded; also names the model sub-directory
EXPIRIES = ['2020-08-07']    # expiry dates whose spreads are loaded and concatenated
MAX_MARGIN = 500             # examples with open_margin above this are dropped
MIN_PROFIT = 100             # max_profit must reach this (plus the fee) for a positive label
DATA_SPLIT = 0.95            # fraction of the examples used for training

In [None]:
# TensorBoard log location: a "tensorboard_logs" path inside a freshly created
# temporary directory. The rmtree is a best-effort cleanup of anything already
# at that path (errors, including "does not exist", are ignored).
LOGDIR = pathlib.Path(tempfile.mkdtemp()).joinpath("tensorboard_logs")
shutil.rmtree(LOGDIR, ignore_errors=True)

In [None]:
# For saving the model
PREFIX = 'model'
TICKER_MODEL_DIR = os.path.join(config.ML_MODELS_DIR, TICKER)

# Create the per-ticker directory, including any missing parents. A directory
# left over from an earlier session is fine and is reused.
os.makedirs(TICKER_MODEL_DIR, exist_ok=True)

# Remove working directories left behind by earlier sessions. Only directories
# are removed: shutil.rmtree() raises on a plain file, so any file that happens
# to match the prefix is left alone.
for stale_path in glob.glob(os.path.join(TICKER_MODEL_DIR, PREFIX + '*')):
    if os.path.isdir(stale_path):
        shutil.rmtree(stale_path)

# Fresh working directory for this session; it uses the same PREFIX that the
# cleanup above matches on, so the next session will find and remove it.
MODEL_DIR = tempfile.mkdtemp(prefix=PREFIX, dir=TICKER_MODEL_DIR)

In [None]:
# Load the spreads for every configured expiry and stack them into one frame
data_df_list = []
for expiry in EXPIRIES:
    print(expiry)
    spreads_df = utils.load_spreads(TICKER, expiry, refresh=True, verbose=True)
    data_df_list.append(utils.sort_trades_df_columns(spreads_df))

# ignore_index=True gives the combined frame a fresh 0..n-1 index
data_df = pd.concat(data_df_list, ignore_index=True)
print('Loaded {} examples'.format(len(data_df)))

In [None]:
# Normalize everything that will be used for X.
# NOTE: this must happen before any examples are removed based on open_margin,
#       so that the statistics are computed over all of the data.
normalization_result = utils.normalize_metadata_columns(data_df)
normalized_df, means, stds = normalization_result

In [None]:
# Persist the normalization statistics right away, next to the model files
for stat_name, stat_values in (('means', means), ('stds', stds)):
    stat_values.to_pickle(os.path.join(MODEL_DIR, stat_name))

In [None]:
# Keep only the trades whose open margin we are willing to stomach.
# NOTE(review): this compares open_margin against the raw MAX_MARGIN value, so
# it assumes normalize_metadata_columns leaves open_margin unscaled - confirm.
within_margin = normalized_df['open_margin'] <= MAX_MARGIN
viable_trades_df = normalized_df.loc[within_margin]

# open_margin has done its job as a filter; it is not a model input.
# (To train on every trade regardless of margin, drop the column from
# normalized_df instead of viable_trades_df.)
examples_df = viable_trades_df.drop(columns=['open_margin'])

In [None]:
# Preview the first 10 examples that survived the open-margin filter
examples_df.head(10)

In [None]:
# Build the boolean labels: an example is positive when its max_profit covers
# the desired minimum profit plus the fee.
# pop() removes the column from examples_df in place, so this cell cannot be
# re-run without first rebuilding examples_df.
max_profit = examples_df.pop('max_profit')
labels = max_profit >= MIN_PROFIT + utils.calculate_fee()

In [None]:
# Sizes used when turning the examples into tf.data pipelines
BATCH_SIZE, BUFFER_SIZE = 512, 100

# Rows are examples, columns are model input features
n_examples = examples_df.shape[0]
n_features = examples_df.shape[1]

In [None]:
# Pair every feature row with its label and shuffle the whole set once.
# reshuffle_each_iteration=False is essential: the dataset is split with
# take()/skip() further down, and a shuffle that re-draws its order on every
# iteration would hand a different subset to each side of the split every
# epoch, leaking test examples into training.
dataset = tf.data.Dataset.from_tensor_slices(
    (examples_df.values, labels.values)
).shuffle(n_examples, reshuffle_each_iteration=False)

In [None]:
# Carve the shuffled dataset into a leading training slice and a trailing test
# slice; DATA_SPLIT is the training fraction (int() truncates toward zero).
n_train = int(len(examples_df) * DATA_SPLIT)
train_dataset, test_dataset = dataset.take(n_train), dataset.skip(n_train)

In [None]:
# Number of full batches in one pass over the training split. Clamped to at
# least 1: with fewer than BATCH_SIZE training examples the floor division
# yields 0, which would give the learning-rate schedule decay_steps=0 and ask
# fit() for zero steps per epoch.
STEPS_PER_EPOCH = max(1, n_train // BATCH_SIZE)

In [None]:
# Training pipeline: reshuffle within a small buffer each epoch, batch, and
# repeat indefinitely (fit() bounds each epoch via steps_per_epoch).
# This rebinds train_dataset to a pipeline built on itself, so re-running the
# cell stacks another shuffle/batch/repeat; re-run the split cell first.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
    .repeat()
)

# Validation pipeline: the held-out slice, batched only
validate_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
# Learning-rate schedule: starts at 0.001 and decays with the step count;
# decay_steps is five epochs' worth of training steps.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=5 * STEPS_PER_EPOCH,
    decay_rate=1,
    staircase=False,
)

def get_optimizer():
    """Return a new Adam optimizer driven by the module-level `lr_schedule`."""
    optimizer = keras.optimizers.Adam(lr_schedule)
    return optimizer

In [None]:
# Checkpoint the full model (not just the weights) inside MODEL_DIR, keeping
# only the epoch with the lowest monitored value.
# NOTE(review): 'loss' is the training loss, not 'val_loss' - confirm that
# selecting the best epoch on training loss is intended.
checkpoint_filepath = os.path.join(MODEL_DIR, 'checkpoint')
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

def get_callbacks(name):
    """Return the Keras callbacks for a training run logged under `name`.

    The list holds the shared module-level checkpoint callback, an early-stop
    callback that watches 'loss' with a patience of 10 epochs, and a
    TensorBoard callback writing to `LOGDIR/name`.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    tensorboard = tf.keras.callbacks.TensorBoard(LOGDIR/name)
    # tfdocs.modeling.EpochDots() can be added to the list for terse progress output.
    return [model_checkpoint_callback, early_stopping, tensorboard]

In [None]:
def compile_and_fit(model, name, optimizer=None, max_epochs=200):
    """Compile `model` for binary classification on logits and train it.

    Training data, validation data and the number of steps per epoch come from
    the module-level `train_dataset`, `validate_dataset` and `STEPS_PER_EPOCH`.

    Args:
        model: Keras model to compile and train. Its output is treated as a
            raw logit (the loss is built with `from_logits=True`).
        name: Run name; forwarded to `get_callbacks` to pick the TensorBoard
            log sub-directory.
        optimizer: Optimizer to compile with. Defaults to `get_optimizer()`.
        max_epochs: Upper bound on the number of epochs; the early-stopping
            callback may end training sooner.

    Returns:
        The `History` object returned by `model.fit`.
    """
    if optimizer is None:
        optimizer = get_optimizer()

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[
            # Use the metric class here, not the loss class: metrics are
            # stateful and averaged over the epoch. The explicit name keeps
            # the 'binary_crossentropy' key in the training history.
            tf.keras.metrics.BinaryCrossentropy(
                from_logits=True, name='binary_crossentropy'),
            # NOTE(review): the model emits logits; 'accuracy' presumably
            # thresholds predictions at 0.5 rather than 0 - confirm that the
            # reported accuracy is what is wanted.
            'accuracy',
        ]
    )

    model.summary()

    history = model.fit(
        train_dataset,
        steps_per_epoch=STEPS_PER_EPOCH,
        epochs=max_epochs,
        validation_data=validate_dataset,
        callbacks=get_callbacks(name),
        verbose=1)
    return history

In [None]:
# Fully connected classifier: ReLU hidden layers that narrow from 512 units
# down to 16, followed by one linear output unit (a raw logit).
hidden_units = (256, 64, 32, 16)
model = Sequential(
    [Dense(512, activation='relu', input_shape=(n_features,))]
    + [Dense(units, activation='relu') for units in hidden_units]
    + [Dense(1)]
)

In [None]:
# Train the model; TensorBoard logs for this run go under LOGDIR/sizes/test
history = compile_and_fit(model, name='sizes/test')

In [None]:
# Save the remaining metadata
history_dict = history.history

# The best epoch is the one with the lowest training loss, matching what the
# checkpoint callback monitors. The per-epoch values live in history.history;
# the History object itself is not subscriptable.
loss_per_epoch = history_dict['loss']
best_epoch = loss_per_epoch.index(min(loss_per_epoch))

metadata = {
    'ticker': TICKER,
    'expiries': EXPIRIES,
    'max_margin': MAX_MARGIN,
    'min_profit': MIN_PROFIT,
    # float() guards against numpy scalar types, which json cannot serialize
    'accuracy': float(history_dict['accuracy'][best_epoch]),
    'loss': float(loss_per_epoch[best_epoch]),
}
with open(os.path.join(MODEL_DIR, 'metadata'), 'w') as MF:
    json.dump(metadata, MF)

In [None]:
# Bundle this session's files into a uniquely named tarball in the ticker's
# model directory. Members are stored by bare filename: tar is pointed at
# MODEL_DIR with -C, so no directory prefix ends up in the archive.
files_to_tar = list(map(os.path.basename, glob.glob('{}/*'.format(MODEL_DIR))))
tarball_path = os.path.join(
    config.ML_MODELS_DIR,
    TICKER,
    '{}.tar'.format(uuid.uuid4()),
)
tar_command = ['tar', '-C', MODEL_DIR, '-cf', tarball_path]
tar_command.extend(files_to_tar)
subprocess.check_call(tar_command)

# The working directory is no longer needed once it has been archived
shutil.rmtree(MODEL_DIR, ignore_errors=True)

In [None]:
#docs_infra: no_execute

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Open an embedded TensorBoard viewer
# {LOGDIR} is interpolated from the Python namespace; the viewer is pointed at
# the "sizes" sub-directory, which is where the 'sizes/test' run was logged.
%tensorboard --logdir {LOGDIR}/sizes