In [None]:
import tensorflow as tf
from   tensorflow import keras
from   tensorflow.keras import regularizers
from   tensorflow.keras import Sequential
from   tensorflow.keras.layers import Dropout, Dense

In [None]:
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

In [None]:
from   IPython import display
from   matplotlib import pyplot as plt

In [None]:
import errno
import glob
import json
import numpy as np
import os
import pandas as pd
import pathlib
import shutil
import subprocess
import tempfile
import uuid

In [None]:
# Local modules
import config
import utils

In [None]:
# Globals
# Symbol whose spreads are loaded below and whose sub-directory of
# config.ML_MODELS_DIR receives the trained model.
TICKER     = 'SPY'
# Expiry dates (YYYY-MM-DD strings); each one is passed to utils.load_spreads
# in turn and the list is recorded in the saved model metadata.
EXPIRIES   = ['2020-08-28', '2020-09-18', '2020-11-20', '2021-02-19']

In [None]:
# For saving the model
# Name prefix of the per-session model directories; directories matching this
# prefix inside TICKER_MODEL_DIR are globbed and deleted before a new one is made.
PREFIX = 'model'
# Per-ticker directory that holds the session directories and the final tarballs.
TICKER_MODEL_DIR = os.path.join(config.ML_MODELS_DIR, TICKER)

In [None]:
# Set the values to be used for working with the data
# Stop pulling in DataFrames once at least this many viable rows were collected.
MAX_DATAPOINTS = 2*10**6
# Batch size for both the training and the validation dataset.
BATCH_SIZE = 512
# Shuffle-buffer size applied to the training dataset.
BUFFER_SIZE = 100
# Rows whose open_margin exceeds this value are discarded.
MAX_MARGIN = 5
# Profit threshold; MIN_PROFIT + FEES is what max_profit is compared against
# when the boolean labels are built.
MIN_PROFIT = 1
# Fee figure supplied by utils.calculate_fee()
# NOTE(review): units / per-trade vs. round-trip are not visible here -- confirm.
FEES = utils.calculate_fee()
# Fraction of the examples used for training; the remainder is used for validation.
DATA_SPLIT = 0.95
# Upper bound on the number of epochs passed to model.fit
# (EarlyStopping may end training sooner).
MAX_EPOCHS = 200

In [None]:
# For tensorboard
# TensorBoard logs go into a "tensorboard_logs" folder inside a freshly created
# temporary directory, so every kernel session gets its own log location.
LOGDIR = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
# Make sure no stale logs exist at that path; with a brand-new temp directory
# there is nothing to delete, and ignore_errors keeps the missing path quiet.
shutil.rmtree(LOGDIR, ignore_errors=True)

In [None]:
# Make sure the per-ticker model directory exists, then remove session
# directories left behind by earlier runs (anything in it whose name starts
# with PREFIX). A freshly created directory simply yields no matches.
# NOTE: this also deletes the session directory of any other run that is still
# in progress for the same ticker.
os.makedirs(TICKER_MODEL_DIR, exist_ok=True)
for tmpdir in glob.glob('{}/{}*'.format(TICKER_MODEL_DIR, PREFIX)):
    shutil.rmtree(tmpdir)

# Working directory for this session. It must be created with the same PREFIX
# that the cleanup above globs for, otherwise it would never be cleaned up.
MODEL_DIR = tempfile.mkdtemp(prefix=PREFIX, dir=TICKER_MODEL_DIR)

In [None]:
# Load the data: one utils.load_spreads call per expiry, keeping whatever each
# call returns (consumed later by the utils.spreads_*_to_generator helpers).
exp_paths = []
for expiry in EXPIRIES:
    # Blank lines around the expiry make it stand out in the verbose output
    banner = '\n\n' + expiry + '\n\n'
    print(banner)

    loaded = utils.load_spreads(
        TICKER,
        expiry,
        winning_profit=MIN_PROFIT + FEES,
        loss_win_ratio=3,
        verbose=True,
    )
    exp_paths.append(loaded)

In [None]:
# Collect the statistics
#
# Each DataFrame yielded by the generator contributes its own means, variances
# and row count; these are then pooled into a single set of means/variances:
#   pooled mean = sum(n_i * mean_i) / N
#   pooled var  = sum((n_i - 1) * var_i) / (N - k)
# with N the total row count and k the number of DataFrames.
# NOTE(review): this variance formula pools the within-DataFrame variances only;
# it does not account for differences between the per-DataFrame means.
# NOTE(review): a later cell iterates the same exp_paths with
# utils.spreads_tarball_to_generator instead of spreads_dirs_to_generator --
# confirm that both helpers exist and are interchangeable here.
means = []
variances = []
sample_sizes = []

for df in utils.spreads_dirs_to_generator(exp_paths, shuffle=True):
    df_means, df_vars = utils.collect_statistics(df)
    means.append(df_means)
    variances.append(df_vars)
    sample_sizes.append(df.shape[0])

# Fail with a clear message instead of an obscure TypeError further down
if not sample_sizes:
    raise ValueError(
        'No spread data was yielded for {}; cannot collect statistics'.format(TICKER))

pooled_means = None
pooled_vars = None
total_samples = sum(sample_sizes)
for i in range(len(sample_sizes)):
    next_mean = means[i] * sample_sizes[i]
    next_var = variances[i] * (sample_sizes[i] - 1)
    # Initialise the accumulators explicitly on the first pass. Relying on a
    # caught TypeError for this would also silently reset the running sums if
    # any later addition raised TypeError for an unrelated reason.
    if pooled_means is None:
        pooled_means = next_mean
        pooled_vars = next_var
    else:
        pooled_means += next_mean
        pooled_vars += next_var

pooled_means /= total_samples
pooled_vars /= (total_samples - len(sample_sizes))

# Finally, reset some of the values that should not be changed: giving a column
# mean 0 and variance 1 turns its normalisation into a no-op.
static_columns = ['open_margin', 'max_profit']
for i in range(1, 6):
    type_col = 'leg{}_type'.format(i)
    # Only touch leg-type columns that were actually seen in the data.
    # Assumes the pooled statistics are pandas Series indexed by column name
    # (they are indexed with examples_df.columns further down).
    if type_col in pooled_means.index:
        static_columns.append(type_col)

for c in static_columns:
    try:
        pooled_means[c] = 0
        pooled_vars[c] = 1
    except KeyError:
        pass

In [None]:
# Persist the normalisation statistics right away so they travel with the model
for stat_name, stat_values in (('means', pooled_means), ('vars', pooled_vars)):
    stat_values.to_pickle(os.path.join(MODEL_DIR, stat_name))

# Standard deviations are what the normalisation step divides by
pooled_stds = pooled_vars ** (1/2)

In [None]:
# Display the pooled means as a quick visual sanity check of the statistics
pooled_means

In [None]:
# Walk through the DataFrames in random order and keep the rows we can afford
# (open margin no larger than MAX_MARGIN) until enough datapoints are gathered.
data_to_use = []
total_datapoints = 0
for df in utils.spreads_tarball_to_generator(exp_paths, shuffle=True):
    affordable = df.open_margin <= MAX_MARGIN
    viable_trades_df = df[affordable]
    viable_count = len(viable_trades_df)

    # Frames without a single viable trade contribute nothing
    if viable_count > 0:
        data_to_use.append(viable_trades_df)
        total_datapoints += viable_count

        # The last frame is kept whole, so the total may exceed MAX_DATAPOINTS
        if total_datapoints >= MAX_DATAPOINTS:
            break

In [None]:
# Get the final dataframe
# Note: pd.concat raises ValueError when data_to_use is empty, i.e. when no
# viable trades were found in the previous cell.
examples_df = pd.concat(data_to_use)
# Drop the list's references to the per-chunk frames (presumably so that they
# can be garbage collected now that they have been concatenated).
data_to_use = []

In [None]:
# Get a 3-to-1 ratio of hard losers to winners.
# The winner threshold has to be the fee-adjusted one (MIN_PROFIT + FEES): that
# is what utils.load_spreads was given and what the labels are compared against
# below. With the bare MIN_PROFIT, trades whose profit falls between MIN_PROFIT
# and MIN_PROFIT + FEES would be selected as winners but labelled as losers.
# NOTE(review): the helper's internals are not visible here -- confirm that
# winning_profit is meant to match the label threshold.
examples_df = utils.collect_winners_and_hard_losers(
    examples_df, winning_profit=MIN_PROFIT + FEES)

In [None]:
# open_margin was only needed for filtering, so it is not used as a feature
examples_df = examples_df.drop(columns=['open_margin'])

# Take max_profit out of the features and turn it into the boolean target:
# True when the trade clears the minimum profit plus fees.
max_profit = examples_df.pop('max_profit')
labels = max_profit >= MIN_PROFIT + FEES

In [None]:
# Rows are training examples and columns are model inputs; n_features sizes the
# network's input layer and n_examples is used for the shuffle/split below.
n_examples, n_features = examples_df.shape
print('{} examples\n{} features'.format(n_examples, n_features))

In [None]:
# Remember the column order of the feature matrix; it is written to the model
# metadata so that inputs can be arranged the same way later on.
feature_order = examples_df.columns.tolist()
print(feature_order)

In [None]:
# Standardise every feature column with the pooled statistics:
# subtract the pooled mean, then divide by the pooled standard deviation.
feature_columns = examples_df.columns
centered_df = examples_df - pooled_means[feature_columns]
examples_df = centered_df / pooled_stds[feature_columns]

In [None]:
# Build the datasets for train and validation.
#
# The whole-dataset shuffle must NOT reshuffle on each iteration. take()/skip()
# are re-evaluated every time the pipeline is iterated (every epoch and every
# validation pass), so a reshuffling parent would produce a different
# train/validation partition each time and leak validation rows into training.
dataset = tf.data.Dataset.from_tensor_slices(
    (examples_df.values, labels.values)
).shuffle(n_examples, reshuffle_each_iteration=False)

# Split up the data: the first n_train shuffled rows train the model, the rest
# are held out for validation.
n_train = int(n_examples * DATA_SPLIT)
train_dataset = dataset.take(n_train)
test_dataset = dataset.skip(n_train)

# At least one step per epoch, so that a tiny dataset does not produce
# steps_per_epoch=0 (and a zero decay_steps in the learning-rate schedule).
STEPS_PER_EPOCH = max(1, n_train//BATCH_SIZE)

# Training data is locally reshuffled every epoch (buffer of BUFFER_SIZE rows),
# batched and repeated indefinitely; steps_per_epoch bounds each epoch.
train_dataset = train_dataset.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=True).batch(BATCH_SIZE).repeat()
validate_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
# Learning rate starts at 0.001 and decays hyperbolically with the step count:
# with decay_rate=1 it is halved after 5 epochs' worth of steps, a third after 10, ...
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=STEPS_PER_EPOCH*5,
    decay_rate=1,
    staircase=False,
)

# Keep only the best full model (not just the weights) seen so far, judged by
# the lowest training loss.
# NOTE(review): 'loss' is the training loss; validation data is supplied to
# fit() but 'val_loss' is not what is monitored here -- confirm this is intended.
checkpoint_filepath = os.path.join(MODEL_DIR, 'checkpoint')
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

class MetadataSaver(keras.callbacks.Callback):
    """Write a JSON 'metadata' file into MODEL_DIR whenever the loss improves.

    The file records the notebook-level settings (ticker, expiries, margin and
    profit limits, feature order, number of examples) together with the
    accuracy and loss of the best epoch seen so far. Epochs whose training
    loss is not strictly lower than the best one so far leave the file as is.
    """

    # Lowest training loss seen so far; shadowed per instance on first improvement
    _best_loss = np.inf

    def on_epoch_end(self, epoch, logs=None):
        """Save the metadata if this epoch's training loss is a new best.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric results for the epoch; 'loss' and 'accuracy' are read.
                May be None or lack 'loss', in which case nothing is written.
        """
        logs = logs or {}
        loss = logs.get('loss')
        if loss is None or loss >= self._best_loss:
            return

        self._best_loss = loss
        accuracy = logs.get('accuracy')

        # Build the payload before opening the file: opening with 'w' truncates
        # it, so a failure while assembling the values must not wipe the
        # metadata of the previous best epoch.
        metadata = {
            'ticker': TICKER,
            'expiries': EXPIRIES,
            'max_margin': MAX_MARGIN,
            'min_profit': MIN_PROFIT,
            'feature_order': feature_order,
            'n_examples': n_examples,
            'accuracy': None if accuracy is None else float(accuracy),
            'loss': float(self._best_loss),
        }
        with open(os.path.join(MODEL_DIR, 'metadata'), 'w') as MF:
            json.dump(metadata, MF)

def get_callbacks(name):
    """Return the list of Keras callbacks for a training run.

    Args:
        name: Sub-directory of LOGDIR that the TensorBoard callback logs to.

    Returns:
        A list with, in order: the shared module-level model checkpoint
        callback, a fresh MetadataSaver, early stopping on the training loss
        (patience of 10 epochs) and the TensorBoard logger.
    """
    metadata_saver = MetadataSaver()
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    tensorboard_logger = tf.keras.callbacks.TensorBoard(LOGDIR/name)

    callbacks = [model_checkpoint_callback, metadata_saver]
    callbacks.append(early_stopping)
    callbacks.append(tensorboard_logger)
    return callbacks

# Fully connected binary classifier. The last layer has no activation, so the
# model outputs raw logits rather than probabilities.
model = Sequential([
    Dense(512, activation='relu', input_shape=(n_features,)),
    Dense(256, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1)
])

model.compile(
    optimizer=keras.optimizers.Adam(lr_schedule),
    # from_logits=True because the output layer applies no sigmoid
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        tf.keras.losses.BinaryCrossentropy(
            from_logits=True, name='binary_crossentropy'),
        # The plain 'accuracy' string thresholds the predictions at 0.5, which
        # is only right for probabilities. For logits the decision boundary is
        # 0, so use an explicit threshold. The name stays 'accuracy' because
        # MetadataSaver reads logs['accuracy'].
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0),
    ]
)

model.summary()

In [None]:
# Train the model. train_dataset repeats indefinitely, so steps_per_epoch is
# what defines the length of an epoch; training runs for at most MAX_EPOCHS
# epochs unless the EarlyStopping callback ends it sooner. TensorBoard logs for
# this run are written to LOGDIR/'testing'.
history = model.fit(train_dataset,
                    steps_per_epoch = STEPS_PER_EPOCH,
                    epochs=MAX_EPOCHS,
                    validation_data=validate_dataset,
                    callbacks=get_callbacks('testing'),
                    verbose=1)

In [None]:
# Pack everything in this session's MODEL_DIR into a uniquely named tarball in
# the ticker's model directory. Members are stored by bare name (tar is run
# with -C MODEL_DIR), so the archive does not contain the session directory.
archive_members = []
for member_path in glob.glob('{}/*'.format(MODEL_DIR)):
    archive_members.append(os.path.basename(member_path))

tarball_name = '{}.tar'.format(uuid.uuid4())
tarball_path = os.path.join(config.ML_MODELS_DIR, TICKER, tarball_name)

tar_command = ['tar', '-C', MODEL_DIR, '-cf', tarball_path]
tar_command.extend(archive_members)
subprocess.check_call(tar_command)

# The session directory is no longer needed once the archive exists
shutil.rmtree(MODEL_DIR, ignore_errors=True)

In [None]:
#docs_infra: no_execute

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Open an embedded TensorBoard viewer
%tensorboard --logdir {LOGDIR}/sizes