In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

print(tf.__version__)

1.2.0


In [15]:
# Leading (batch) dimension of the input/target placeholders.
batch_size = 128
# Size of the last axis of the input (x) and target (y) placeholders.
d_x = 1
d_y = 1

# Ticker of interest; stock_list is the list of tickers the S3 keys are built from.
target_stock = 'CSCO'
stock_list = [target_stock]
# Only the first n_stocks entries of stock_list are used when building keys.
n_stocks = 1

# Calendar range (YYYYMMDD strings) scanned for available daily files.
start_date = '20150101'
end_date = '20151231'
# NOTE(review): not referenced by any cell shown here -- presumably a minimum
# number of daily observations required per stock; confirm or remove.
min_day_obs = 252

# pandas offset alias passed to resample() when regularising the trade series.
sampling_freq = '100ms'

# Downloading data from S3 (optional)

In [16]:
import boto3
import os

# S3 bucket the trade files are read from.
bucket = 'tradelake'
# Object-key template; it is filled in as key % (date, stock_name).
key = 'sp500_2015/%s_%s.csv.gz'
# Intended column names for the trade CSV files.
# NOTE(review): the only visible call to read_csv_from_s3 passes None instead
# of this list -- confirm whether the files carry their own header row.
colnames = ['date',
            'time',
            'ex',
            'symbol',
            'trade_cond',
            'size',
            'price',
            'stopinf',
            'corr',
            'seqnum',
            'source',
            'rf']

def check_key_in_bucket(bucket_name, key):
    """Return True if object `key` can be reached in S3 bucket `bucket_name`.

    Credentials are read from the first two lines of /root/start.sh (access
    key id, then secret key).

    This is a best-effort probe: any failure (credentials file missing,
    object missing, access denied, network error, ...) yields False instead
    of raising.
    """
    try:
        with open('/root/start.sh', 'rb') as input_file:
            # Strip only the line terminator; slicing off the last character
            # would corrupt a line that has no trailing newline.
            AWSAccessKeyId = input_file.readline().rstrip(b'\r\n')
            AWSSecretKey = input_file.readline().rstrip(b'\r\n')
        s3 = boto3.client('s3',
                          aws_access_key_id=AWSAccessKeyId,
                          aws_secret_access_key=AWSSecretKey)
        # HEAD only asks for metadata. get_object would open a download
        # stream for the whole body that is never read or closed.
        s3.head_object(Bucket=bucket_name, Key=key)
    except Exception:
        return False
    return True

def read_csv_from_s3(bucket, key, colnames, parse_dates, index_to_set):
    """Download a gzipped CSV from S3 and return its 'price' and 'size' columns.

    The object is downloaded to a temporary file in the current working
    directory (named after the last path component of `key`), parsed with
    pandas and then deleted again, even if parsing fails.

    Parameters
    ----------
    bucket : str
        S3 bucket name.
    key : str
        Object key inside the bucket.
    colnames : list of str or None
        Forwarded to ``pandas.read_csv`` as ``names``.
    parse_dates
        Forwarded to ``pandas.read_csv`` as ``parse_dates``.
    index_to_set
        Column (or columns) passed to ``DataFrame.set_index``.

    Returns
    -------
    pandas.DataFrame
        The parsed file restricted to the columns ``['price', 'size']``.
    """
    with open('/root/start.sh', 'rb') as input_file:
        # Strip only the line terminator; slicing off the last character
        # would corrupt a line that has no trailing newline.
        AWSAccessKeyId = input_file.readline().rstrip(b'\r\n')
        AWSSecretKey = input_file.readline().rstrip(b'\r\n')

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print('Reading csv from bucket %s key %s' % (bucket, key))

    local_file = key.split('/')[-1]

    s3 = boto3.client('s3',
                      aws_access_key_id=AWSAccessKeyId,
                      aws_secret_access_key=AWSSecretKey)

    # One transfer is enough: the file on disk is the only copy that is read.
    s3.download_file(bucket, key, local_file)

    try:
        df = pd.read_csv(
            local_file,
            names=colnames,
            parse_dates=parse_dates,
            compression='gzip',
            engine='c').set_index(index_to_set)
    finally:
        # Never leave the downloaded file behind, even when parsing raises.
        os.remove(local_file)

    print('Read csv from bucket %s key %s' % (bucket, key))

    return df[['price', 'size']]

In [17]:
import cPickle as pickle
    
# Every calendar day in the configured range as a YYYYMMDD string, which is
# the date format used in the S3 key template.
dates = pd.date_range(start_date, end_date).strftime('%Y%m%d')

# Keep only the days for which a file for the target stock exists in the bucket.
# `target_stock` is used because `stock_name` is not defined in any cell above
# this one; relying on it only works with state left over from a later cell.
dates = [date for date in dates
         if check_key_in_bucket(bucket, key % (date, target_stock))]

print(dates)

[]


In [5]:
def _within_rolling_quantiles(values, window, lower_q=0.01, upper_q=0.99):
    """Mask of entries lying strictly between their trailing rolling quantiles."""
    trailing = values.rolling(window=window, min_periods=1)
    inside = (values < trailing.quantile(upper_q)) & (values > trailing.quantile(lower_q))
    # A plain boolean array keeps the row selection positional, so duplicated
    # index labels cannot cause alignment problems.
    return inside.values


def remove_outliers_trades(time_series, window=1000):
    """Drop trades whose size or price looks like an outlier.

    Three filters are applied in sequence:

    1. keep rows whose 'size' lies strictly between the trailing 1% and 99%
       rolling quantiles of 'size' (window of `window` rows, min_periods=1);
    2. on the surviving rows, the same test on 'price';
    3. keep rows whose 'price' is strictly within +/-20% of the median price
       of the surviving rows.

    Because the bounds are strict, a row that equals its own rolling bound
    (for example the very first row, where both quantiles equal the value
    itself) is removed.

    Parameters
    ----------
    time_series : pandas.DataFrame
        Must contain 'size' and 'price' columns. It is not modified.
    window : int
        Number of trailing rows used for the rolling quantiles.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with the same columns as the input and no helper
        columns added.
    """
    # Bounds are kept as local Series rather than written into the frame, so
    # the caller's DataFrame is untouched and nothing needs dropping afterwards.
    time_series = time_series[_within_rolling_quantiles(time_series['size'], window)]
    time_series = time_series[_within_rolling_quantiles(time_series['price'], window)]

    price = time_series['price']
    price_median = price.median()
    near_median = (price > price_median * 0.80) & (price < price_median * 1.20)

    return time_series[near_median.values]

In [6]:
def resample_trades(key, time_series):
    """Turn one day's trades into fixed-length chunks of price changes.

    The trades are resampled onto a regular grid of `sampling_freq` (median
    per bin, empty bins forward-filled), restricted to 10:00-14:00 on the
    day encoded in `key`, differenced, and cut into consecutive chunks of
    `split_window` samples (the last chunk may be shorter).

    Parameters
    ----------
    key : str
        S3 key whose file name starts with the date followed by an
        underscore, e.g. '<prefix>/20150102_CSCO.csv.gz'.
    time_series : pandas.DataFrame
        Trades for that day with a 'price' column.
        # assumes a DatetimeIndex, which resample() needs -- TODO confirm

    Returns
    -------
    list of numpy.ndarray
        Consecutive chunks of the price differences.

    NOTE(review): `sampling_freq` and `split_window` are read from module
    scope; `split_window` is not defined in any cell shown here.
    """
    # Take the file name from the last path component rather than assuming
    # the prefix is exactly 11 characters long.
    key_date = key.split('/')[-1].split('_')[0]

    start_timestamp = pd.to_datetime(key_date + 'T10:00:00')
    end_timestamp = pd.to_datetime(key_date + 'T14:00:00')

    resampled = time_series.resample(sampling_freq).median().ffill()

    # The first difference is NaN by construction, so it is dropped.
    price_changes = (resampled.loc[start_timestamp:end_timestamp, 'price']
                     .diff()
                     .iloc[1:]
                     .values)

    return [price_changes[offset:offset + split_window]
            for offset in range(0, len(price_changes), split_window)]

In [7]:
# Tickers to process: the first n_stocks entries of stock_list minus exclusions.
# NOTE(review): exclusion_list, n_days and excluded_dates are not defined in
# any cell shown here -- they must exist in the kernel before this cell runs.
stock_names = [
    stock_name
    for stock_name in stock_list[:n_stocks]
    if stock_name not in exclusion_list
]

# One S3 key per (stock, day) pair, stocks outermost. Only the first n_days
# available dates are considered, and dates in excluded_dates are skipped.
key_list = [
    key % (date, stock_name)
    for stock_name in stock_names
    for date in dates[:n_days]
    if date not in excluded_dates
]

In [8]:
# Load one DataFrame per selected key, indexed by a 'datetime' column that
# read_csv builds by combining the 'date' and 'time' columns.
# NOTE(review): colnames is passed as None, so pandas takes the column names
# from each file's first row. If the files have no header row, the module-level
# `colnames` list should be passed here instead -- confirm against the data.
data_set = [read_csv_from_s3(bucket, k, None,
                             {'datetime': ['date', 'time']},
                             'datetime')
            for k in key_list]

In [11]:
# Inspect what was loaded; an empty list means key_list was empty.
print(data_set)

[]


# Building the network

In [None]:
# The middle axis is left as None so the graph accepts sequences of any
# length; only the batch size and the per-step widths are fixed.
x_seq_ph = tf.placeholder(tf.float32, shape=[batch_size, None, d_x])
y_seq_ph = tf.placeholder(tf.float32, shape=[batch_size, None, d_y])

In [None]:
def convolution_layer(input_seq, n_dims_in, n_dims_out, width,
                      dilation=1, causal=True):
    """Apply a 1-D, optionally dilated and optionally causal, convolution.

    A variable named "kernel" with shape [width, n_dims_in, n_dims_out] is
    obtained via tf.get_variable in the current variable scope and
    initialised from a truncated normal distribution.

    Parameters
    ----------
    input_seq : rank-3 tensor
        Sequence to convolve along axis 1.
        # assumes (batch, time, n_dims_in) layout -- TODO confirm
    n_dims_in, n_dims_out : int
        Input and output channel counts of the kernel.
    width : int
        Number of kernel taps.
    dilation : int
        Spacing between kernel taps.
    causal : bool
        If True, axis 1 is zero-padded at the front by dilation * (width - 1)
        and a "VALID" convolution is used, so no output position depends on
        later input positions. If False, "SAME" padding is used.

    Returns
    -------
    Tensor
        The result of tf.nn.convolution.
    """
    kernel_shape = [width, n_dims_in, n_dims_out]
    conv_kernel = tf.get_variable(
        name="kernel",
        shape=kernel_shape,
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer())

    if causal:
        # Front-pad axis 1 by the kernel's reach, then convolve without
        # implicit padding (similar to the causal padding approach in Keras).
        left_pad = dilation * (width - 1)
        input_seq = tf.pad(input_seq, [[0, 0], [left_pad, 0], [0, 0]])
        padding_mode = "VALID"
    else:
        padding_mode = "SAME"

    return tf.nn.convolution(
        input=input_seq,
        filter=conv_kernel,
        padding=padding_mode,
        strides=None,
        dilation_rate=[dilation])

In [None]:
# tf.nn.l2_loss already reduces its input to a scalar (sum of squares / 2),
# so no outer tf.reduce_sum is needed.
# NOTE(review): y_predicted is not defined in any cell shown here -- the
# network output has to be built before this cell can run.
loss = tf.nn.l2_loss(y_predicted - y_seq_ph)

In [None]:
# Step size handed to RMSProp.
learning_rate = 1e-3

# Despite its name, `optimizer` holds the result of minimize(loss), i.e. the
# training op that is run once per step, not the optimizer object itself.
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(loss)

In [None]:
# Number of optimisation steps to run.
n_steps = int(1e4)

# The session is not closed here so the trained variables stay available to
# any later cells; call session.close() once it is no longer needed.
session = tf.Session()
session.run(tf.global_variables_initializer())

# Loss value recorded at every step, for plotting below.
loss_evals = []

# NOTE(review): generate_samples and sequence_length are not defined in any
# cell shown here.
# range() works on both Python 2 and Python 3; xrange() exists only on 2.
for step in range(n_steps):
    x_seq = generate_samples(batch_size, sequence_length, 1)

    # Next-step prediction: the input is every element but the last along
    # axis 1, and the target is the same sequence shifted by one.
    inputs = x_seq[:, :-1]
    targets = x_seq[:, 1:]

    loss_eval, _ = session.run((loss, optimizer),
                               feed_dict={x_seq_ph: inputs,
                                          y_seq_ph: targets})

    loss_evals.append(loss_eval)

plt.plot(loss_evals)
plt.title("Squared l2 loss")
plt.xlabel("SGD step")
plt.show()