# SPY500 Synthetic Data Generation Pipeline

In [27]:
import os
import numpy as np
import pandas as pd

from utils.noise import generateData
from sklearn.preprocessing import MinMaxScaler

## Dataset preprocessing

In [28]:
data = pd.read_csv('data/spy-daily.csv', index_col='Date')
adj_close_data = data['Adj Close']
adj_close_data.head()

Date
1993-01-29    27.357281
1993-02-01    27.551851
1993-02-02    27.610189
1993-02-03    27.902090
1993-02-04    28.018848
Name: Adj Close, dtype: float64

In [29]:
# frame a sequence as a supervised learning problem
def timeseries_to_supervised(data, lag=1):
    df = pd.DataFrame(data)
    columns = [df.shift(i) for i in range(1, lag+1)]
    columns.append(df)
    df = pd.concat(columns, axis=1)
    df.fillna(0, inplace=True)
    df.columns = ['X', 'Y']
    return df

In [30]:
sup_data = timeseries_to_supervised(adj_close_data)
sup_data.head()

Unnamed: 0_level_0,X,Y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1993-01-29,0.0,27.357281
1993-02-01,27.357281,27.551851
1993-02-02,27.551851,27.610189
1993-02-03,27.610189,27.90209
1993-02-04,27.90209,28.018848


In [31]:
# create a differenced series
def difference(dataset, interval=1):
    assert interval >= 1
    assert isinstance(dataset, pd.DataFrame) or isinstance(dataset, pd.Series)
    return dataset.diff(interval).dropna()

In [32]:
# invert differenced value
def inverse_difference(history, dataset, interval=1):
    assert interval >= 1
    assert isinstance(dataset, pd.DataFrame) or isinstance(dataset, pd.Series)
    return (history - dataset).dropna()

In [33]:
X = adj_close_data.values
X = X.reshape(len(X), 1)
X.shape

(6383, 1)

In [34]:
# scaler example
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(X)
scaled_X = scaler.transform(X)
scaled_X

array([[-0.99743961],
       [-0.99593372],
       [-0.9954822 ],
       ...,
       [ 0.88780018],
       [ 0.90838763],
       [ 0.91844905]])

In [35]:
# invert transform example
inverted_X = scaler.inverse_transform(scaled_X)
inverted_X

array([[ 27.357281],
       [ 27.551851],
       [ 27.610189],
       ...,
       [270.940002],
       [273.600006],
       [274.899994]])

In [43]:
# Split training and test set
supervised_values = sup_data.values
train_lim = int((4/5) * supervised_values.shape[0])
train, test = supervised_values[:train_lim, :], supervised_values[train_lim:, :]
train.shape, test.shape

((5106, 2), (1277, 2))

In [59]:
# Scale training and test sets
train_scaled, test_scaled = scaler.transform(train), scaler.transform(test)

## Time series regresion LSTM pipeline

In [67]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

import os

In [65]:
def fit_lstm(train, batch_size, nb_epoch, neurons, logdir='logs'):
    X, y = train[:, 0:-1], train[:, -1]
    X = X.reshape(X.shape[0], 1, X.shape[1])
    model = Sequential()
    model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(nb_epoch):
        model.fit(X, y, epochs=1, batch_size=batch_size, shuffle=False)
        model.reset_states()
    filename = os.path.join(logdir, 'model.h5')
    print('Saving model as {}'.format(filename))
    model.save(filename)
    return model

In [76]:
temp = train_scaled[:, 0:-1]
temp = temp.reshape(temp.shape[0], 1, temp.shape[1])
temp.shape

(5106, 1, 1)

In [102]:
lstm_model = fit_lstm(train_scaled, 1, 300, 4)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
 400/5106 [=>............................] - ETA: 31s - loss: 0.0027

KeyboardInterrupt: 

In [110]:
train_scaled[:, 0:-1].reshape(train_scaled[:, 0:-1].shape[0], 1, train_scaled[:, 0:-1].shape[1]).shape

(5106, 1, 1)

## Predicting using the LSTM

In [85]:
from keras.models import load_model

import numpy as np

In [78]:
model = load_model('logs/model.h5')

In [80]:
# forecast the entire training dataset to build up state for forecasting
train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1)
model.predict(train_reshaped, batch_size=1)

array([[-0.9097316 ],
       [-0.9457605 ],
       [-1.0179694 ],
       ...,
       [ 0.96749943],
       [ 0.97240335],
       [ 0.9806607 ]], dtype=float32)

In [99]:
# inverse scaling for a forecasted value
def invert_scale(scaler, X, value):
    new_row = [x for x in X] + [value]
    array = np.array(new_row)
    array = array.reshape(1, len(array))
    inverted = scaler.inverse_transform(array)
    return inverted[0, -1]

# make a one-step forecast
def forecast_lstm(model, batch_size, X):
    X = X.reshape(1, 1, len(X))
    yhat = model.predict(X, batch_size=batch_size)
    return yhat[0,0]

# invert differenced value
def inverse_difference(history, yhat, interval=1):
    return yhat + history[-interval]

In [101]:
# walk-forward validation on the test data
predictions = list()
for i in range(len(test_scaled)):
    # make one-step forecast
    X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
    yhat = forecast_lstm(model, 1, X)
    # invert scaling
    yhat = invert_scale(scaler, X, yhat)
    # invert differencing
    yhat = inverse_difference(adj_close_data.values, yhat, len(test_scaled)+1-i)
    # store forecast
    predictions.append(yhat)
    expected = adj_close_data.values[len(train) + i + 1]
    print('Time_value=%d, Predicted=%f, Expected=%f' % (i+1, yhat, expected))

Time_value=1, Predicted=319.561291, Expected=147.884644
Time_value=2, Predicted=304.731228, Expected=148.002274
Time_value=3, Predicted=302.855367, Expected=149.531723
Time_value=4, Predicted=302.457978, Expected=150.337158
Time_value=5, Predicted=304.996275, Expected=149.631256
Time_value=6, Predicted=306.490859, Expected=151.079254
Time_value=7, Predicted=305.269981, Expected=151.070190
Time_value=8, Predicted=307.733638, Expected=151.287399
Time_value=9, Predicted=307.827245, Expected=150.165207
Time_value=10, Predicted=308.195120, Expected=149.730835
Time_value=11, Predicted=306.215920, Expected=149.604111
Time_value=12, Predicted=305.331609, Expected=150.500061
Time_value=13, Predicted=305.061427, Expected=149.522690
Time_value=14, Predicted=306.644747, Expected=150.074753
Time_value=15, Predicted=304.981187, Expected=147.920853
Time_value=16, Predicted=305.876593, Expected=148.735321
Time_value=17, Predicted=302.055455, Expected=148.020416
Time_value=18, Predicted=303.335874, Exp

Time_value=172, Predicted=339.806285, Expected=168.825577
Time_value=173, Predicted=336.418015, Expected=169.735596
Time_value=174, Predicted=338.955003, Expected=169.514984
Time_value=175, Predicted=340.506630, Expected=168.798050
Time_value=176, Predicted=340.302060, Expected=169.294373
Time_value=177, Predicted=339.212176, Expected=169.404663
Time_value=178, Predicted=339.884120, Expected=168.016678
Time_value=179, Predicted=340.096174, Expected=164.431885
Time_value=180, Predicted=338.003056, Expected=163.623032
Time_value=181, Predicted=332.278717, Expected=164.597397
Time_value=182, Predicted=330.558676, Expected=163.016373
Time_value=183, Predicted=331.940737, Expected=164.744446
Time_value=184, Predicted=329.514873, Expected=163.779282
Time_value=185, Predicted=332.080284, Expected=160.093399
Time_value=186, Predicted=330.725035, Expected=161.214783
Time_value=187, Predicted=324.695257, Expected=161.012573
Time_value=188, Predicted=326.090094, Expected=163.135895
Time_value=189

Time_value=342, Predicted=363.914634, Expected=186.227020
Time_value=343, Predicted=365.700913, Expected=187.219559
Time_value=344, Predicted=366.159674, Expected=187.051071
Time_value=345, Predicted=367.503167, Expected=185.606476
Time_value=346, Predicted=367.403352, Expected=184.543991
Time_value=347, Predicted=365.540671, Expected=185.988602
Time_value=348, Predicted=364.002299, Expected=182.987579
Time_value=349, Predicted=365.751892, Expected=184.441483
Time_value=350, Predicted=361.900881, Expected=184.105988
Time_value=351, Predicted=363.538988, Expected=183.621353
Time_value=352, Predicted=363.180446, Expected=181.132935
Time_value=353, Predicted=362.514891, Expected=181.160889
Time_value=354, Predicted=359.109814, Expected=183.155334
Time_value=355, Predicted=358.860685, Expected=182.940979
Time_value=356, Predicted=361.488206, Expected=180.117020
Time_value=357, Predicted=361.394680, Expected=183.267166
Time_value=358, Predicted=357.583539, Expected=179.632416
Time_value=359

Time_value=515, Predicted=385.574085, Expected=200.218582
Time_value=516, Predicted=383.051594, Expected=199.992661
Time_value=517, Predicted=385.071973, Expected=198.750137
Time_value=518, Predicted=384.901002, Expected=199.154892
Time_value=519, Predicted=383.464166, Expected=198.957230
Time_value=520, Predicted=383.845370, Expected=199.484360
Time_value=521, Predicted=383.618872, Expected=197.799393
Time_value=522, Predicted=384.215856, Expected=197.460526
Time_value=523, Predicted=382.282247, Expected=196.189728
Time_value=524, Predicted=381.761985, Expected=196.217987
Time_value=525, Predicted=380.193095, Expected=198.580704
Time_value=526, Predicted=380.100301, Expected=199.230209
Time_value=527, Predicted=382.855357, Expected=197.611115
Time_value=528, Predicted=383.785818, Expected=196.829865
Time_value=529, Predicted=381.988958, Expected=197.912338
Time_value=530, Predicted=380.973040, Expected=198.232391
Time_value=531, Predicted=382.163923, Expected=200.228012
Time_value=532

Time_value=697, Predicted=349.429420, Expected=181.520187
Time_value=698, Predicted=354.260990, Expected=184.485275
Time_value=699, Predicted=358.758612, Expected=183.729630
Time_value=700, Predicted=363.132568, Expected=183.643555
Time_value=701, Predicted=362.537179, Expected=186.302567
Time_value=702, Predicted=362.446295, Expected=183.949631
Time_value=703, Predicted=365.955273, Expected=184.791321
Time_value=704, Predicted=363.144839, Expected=187.029465
Time_value=705, Predicted=364.081712, Expected=186.599075
Time_value=706, Predicted=367.058447, Expected=185.135651
Time_value=707, Predicted=366.741924, Expected=189.487625
Time_value=708, Predicted=364.850334, Expected=190.338898
Time_value=709, Predicted=370.323547, Expected=191.084915
Time_value=710, Predicted=371.796527, Expected=191.706650
Time_value=711, Predicted=372.912966, Expected=191.859695
Time_value=712, Predicted=373.813943, Expected=189.764999
Time_value=713, Predicted=374.101829, Expected=190.702347
Time_value=714

Time_value=869, Predicted=392.734436, Expected=208.128418
Time_value=870, Predicted=394.106034, Expected=207.739929
Time_value=871, Predicted=394.767479, Expected=207.837051
Time_value=872, Predicted=394.390495, Expected=208.720932
Time_value=873, Predicted=394.492009, Expected=208.021591
Time_value=874, Predicted=395.459705, Expected=207.603958
Time_value=875, Predicted=394.748347, Expected=207.050293
Time_value=876, Predicted=394.271444, Expected=206.438370
Time_value=877, Predicted=393.637291, Expected=206.448105
Time_value=878, Predicted=392.924188, Expected=204.952316
Time_value=879, Predicted=392.885496, Expected=203.718765
Time_value=880, Predicted=391.213323, Expected=202.786346
Time_value=881, Predicted=389.737465, Expected=202.562943
Time_value=882, Predicted=388.576096, Expected=207.030869
Time_value=883, Predicted=388.222287, Expected=207.963318
Time_value=884, Predicted=393.161960, Expected=210.168152
Time_value=885, Predicted=394.435297, Expected=210.692657
Time_value=886

Time_value=1043, Predicted=426.055507, Expected=237.983704
Time_value=1044, Predicted=428.078384, Expected=238.427414
Time_value=1045, Predicted=426.078448, Expected=238.831711
Time_value=1046, Predicted=426.498952, Expected=239.383896
Time_value=1047, Predicted=426.887337, Expected=237.194855
Time_value=1048, Predicted=427.411666, Expected=238.733093
Time_value=1049, Predicted=425.330120, Expected=238.989456
Time_value=1050, Predicted=426.793147, Expected=238.811981
Time_value=1051, Predicted=427.040842, Expected=240.606583
Time_value=1052, Predicted=426.870336, Expected=241.010880
Time_value=1053, Predicted=428.570281, Expected=242.134964
Time_value=1054, Predicted=428.948809, Expected=242.105377
Time_value=1055, Predicted=430.002665, Expected=242.233566
Time_value=1056, Predicted=429.966339, Expected=243.545029
Time_value=1057, Predicted=430.080373, Expected=243.653503
Time_value=1058, Predicted=431.310390, Expected=243.436569
Time_value=1059, Predicted=431.402895, Expected=243.3774

Time_value=1222, Predicted=458.278747, Expected=274.200012
Time_value=1223, Predicted=458.037182, Expected=270.489990
Time_value=1224, Predicted=458.325706, Expected=270.950012
Time_value=1225, Predicted=454.968149, Expected=270.429993
Time_value=1226, Predicted=455.463897, Expected=263.670013
Time_value=1227, Predicted=455.027988, Expected=258.049988
Time_value=1228, Predicted=448.909781, Expected=265.109985
Time_value=1229, Predicted=443.917475, Expected=260.600006
Time_value=1230, Predicted=450.549911, Expected=259.829987
Time_value=1231, Predicted=446.411060, Expected=263.149994
Time_value=1232, Predicted=445.769560, Expected=257.470001
Time_value=1233, Predicted=448.862618, Expected=260.769989
Time_value=1234, Predicted=443.637740, Expected=263.559998
Time_value=1235, Predicted=446.741847, Expected=265.640015
Time_value=1236, Predicted=449.290514, Expected=259.720001
Time_value=1237, Predicted=451.136951, Expected=261.000000
Time_value=1238, Predicted=445.664196, Expected=265.1499

IndexError: index 6383 is out of bounds for axis 0 with size 6383

In [93]:
isinstance(adj_close_data, pd.Series)

True

## Initial random noise for GAN

In [None]:
# noise
# adj_close_data + 0.5(+-random.uniform(0.05*adj_close_data))

In [60]:
x, cols = generateData(nObs=520,size0=5,size1=5,mu0=0,sigma0=1e-2,sigma1F=.25,sLength=260)

In [62]:
x + np.random.uniform()

array([[-0.0135472 , -0.0007816 , -0.00953293, ..., -0.01239322,
        -0.00785447, -0.00047046],
       [ 0.00646519, -0.00362034,  0.00514823, ..., -0.00889116,
        -0.00894014, -0.00513731],
       [-0.0011967 ,  0.00353195, -0.0020002 , ..., -0.0185029 ,
        -0.00752751,  0.00506009],
       ...,
       [-0.00022572,  0.01555214,  0.00742646, ...,  0.01431497,
         0.01124302,  0.01753849],
       [-0.00799441,  0.00563538, -0.00464556, ..., -0.00962632,
        -0.00766688,  0.00182281],
       [ 0.00579761,  0.01643316, -0.00325538, ..., -0.00161548,
        -0.00185555,  0.01483365]])

## GAN for finance implementation

In [153]:
from keras.layers import Input, Reshape

In [146]:
noise = np.random.normal(0, 1, (32, 1, 1))
print(noise.dtype, noise.shape)
model.predict(noise, batch_size=1).shape

float64 (32, 1, 1)


(32, 1)

In [186]:
shitmodel = Sequential()

shitmodel.add(LSTM(4, batch_input_shape=(1, 1, 1), stateful=True))
shitmodel.add(Dense(1))
#shitmodel.add(Reshape((1, 1, 1)))
shitmodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_15 (LSTM)               (1, 4)                    96        
_________________________________________________________________
dense_15 (Dense)             (1, 1)                    5         
Total params: 101
Trainable params: 101
Non-trainable params: 0
_________________________________________________________________


In [187]:
z = Input(shape=(1, 1,))
noise_datum = shitmodel(z)
shitmodel.reset_states()

In [188]:
noise = np.random.normal(0, 1, (32, 1, 1,))
print(noise.dtype, noise.shape)
shitmodel.predict(noise, batch_size=1).shape

float64 (32, 1, 1)


InvalidArgumentError: You must feed a value for placeholder tensor 'input_14' with dtype float and shape [?,1,1]
	 [[Node: input_14 = Placeholder[dtype=DT_FLOAT, shape=[?,1,1], _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
	 [[Node: dense_15/BiasAdd/_847 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_396_dense_15/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]