In [1]:
from keras.models import Sequential
from keras.layers import Dropout, LSTM, Activation, Dense
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data = pd.read_csv('bitcoin.csv').drop(['time_period_start', 'time_period_end', 'time_open', 'time_close'], axis=1)
data

Unnamed: 0,price_open,price_high,price_low,price_close,volume_traded,trades_count
0,7165.72,7165.72,7165.71,7165.71,0.021841,2
1,7168.30,7168.30,7168.30,7168.30,1.000000,2
2,7170.50,7170.50,7170.50,7170.50,0.002000,1
3,7169.20,7169.20,7169.20,7169.20,0.004000,2
4,7169.20,7169.20,7169.20,7169.20,0.002000,1
...,...,...,...,...,...,...
9995,7179.50,7179.50,7179.50,7179.50,0.013325,1
9996,7181.67,7181.67,7181.67,7181.67,0.013364,1
9997,7179.50,7179.50,7179.50,7179.50,0.001526,1
9998,7182.12,7182.12,7182.12,7182.12,0.013437,1


## Lookback Window

In order to capture the past as features, we create the below function. It takes as input some number of seconds to look back, defaulted to 60 seconds.

Question: Is this a good feature set? Can you find a better one?

In [11]:
def lookback(dataset, timesteps = 60):
    # this uses the shift method of pandas dataframes to shift all of the columns down one row
    # and then append to the original dataset
    data = dataset
    for i in range(1, timesteps):
        step_back = dataset.shift(i).reset_index()
        step_back.columns = ['index'] + [f'{column}_-{i}' for column in dataset.columns if column != 'index']
        data = data.reset_index().merge(step_back, on='index', ).drop('index', axis=1)
        
    return data.dropna()
        
features = lookback(data)
features

Unnamed: 0,price_open,price_high,price_low,price_close,volume_traded,trades_count,price_open_-1,price_high_-1,price_low_-1,price_close_-1,...,price_low_-58,price_close_-58,volume_traded_-58,trades_count_-58,price_open_-59,price_high_-59,price_low_-59,price_close_-59,volume_traded_-59,trades_count_-59
59,7154.97,7154.97,7154.97,7154.97,2.000000,2,7163.30,7163.30,7163.30,7163.30,...,7168.30,7168.30,1.000000,2.0,7165.72,7165.72,7165.71,7165.71,0.021841,2.0
60,7161.20,7163.40,7161.20,7163.40,0.015800,2,7154.97,7154.97,7154.97,7154.97,...,7170.50,7170.50,0.002000,1.0,7168.30,7168.30,7168.30,7168.30,1.000000,2.0
61,7154.98,7154.98,7154.97,7154.98,0.038357,3,7161.20,7163.40,7161.20,7163.40,...,7169.20,7169.20,0.004000,2.0,7170.50,7170.50,7170.50,7170.50,0.002000,1.0
62,7154.98,7154.98,7154.98,7154.98,0.032201,1,7154.98,7154.98,7154.97,7154.98,...,7169.20,7169.20,0.002000,1.0,7169.20,7169.20,7169.20,7169.20,0.004000,2.0
63,7154.97,7154.97,7154.97,7154.97,2.000000,1,7154.98,7154.98,7154.98,7154.98,...,7165.72,7169.20,0.075433,3.0,7169.20,7169.20,7169.20,7169.20,0.002000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,7179.50,7179.50,7179.50,7179.50,0.013325,1,7179.04,7179.04,7178.23,7178.23,...,7178.64,7178.64,0.006793,1.0,7179.88,7179.88,7179.88,7179.88,0.009628,1.0
9996,7181.67,7181.67,7181.67,7181.67,0.013364,1,7179.50,7179.50,7179.50,7179.50,...,7178.64,7178.64,0.001249,1.0,7178.64,7178.64,7178.64,7178.64,0.006793,1.0
9997,7179.50,7179.50,7179.50,7179.50,0.001526,1,7181.67,7181.67,7181.67,7181.67,...,7178.64,7178.64,0.068287,1.0,7178.64,7178.64,7178.64,7178.64,0.001249,1.0
9998,7182.12,7182.12,7182.12,7182.12,0.013437,1,7179.50,7179.50,7179.50,7179.50,...,7173.77,7173.77,0.008064,1.0,7178.64,7178.64,7178.64,7178.64,0.068287,1.0


In [20]:
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data.values)
scaled_df = pd.DataFrame(scaled_values, index = data.index, columns = data.columns)
scaled_df

Unnamed: 0,price_open,price_high,price_low,price_close,volume_traded,trades_count
0,0.267495,0.264047,0.339554,0.267368,3.358218e-04,0.011494
1,0.300203,0.296333,0.369204,0.300203,1.538114e-02,0.011494
2,0.328093,0.323864,0.394390,0.328093,3.063947e-05,0.000000
3,0.311613,0.307596,0.379508,0.311613,6.140199e-05,0.011494
4,0.311613,0.307596,0.379508,0.311613,3.063947e-05,0.000000
...,...,...,...,...,...,...
9995,0.442191,0.436491,0.497424,0.442191,2.048259e-04,0.000000
9996,0.469701,0.463647,0.522267,0.469701,2.054366e-04,0.000000
9997,0.442191,0.436491,0.497424,0.442191,2.334721e-05,0.000000
9998,0.475406,0.469278,0.527418,0.475406,2.065548e-04,0.000000


In [15]:
num_in  = 60
num_out = 1
n_features = features.shape[1]

# Splitting the data into appropriate sequences
X, y = [], []
    
for i in range(len(s)):
    end = i + n_steps_in
    out_end = end + n_steps_out

    if out_end > len(seq):
        break

    seq_x, seq_y = seq[i:end], seq[end:out_end]

    X.append(seq_x)
    y.append(seq_y)

return np.array(X), np.array(y)
X, y = split_sequence(list(df.Close), n_per_in, n_per_out)

# Reshaping the X variable from 2D to 3D
X = X.reshape((X.shape[0], X.shape[1], n_features))

(9941, 359)

In [11]:
model = Sequential()
model.add(Dense(32, input_dim=features.shape[1]))
model.add(Dense(16))
model.add(Dense(1, activation='relu'))

model.compile(
    loss='mse',
    optimizer=Adam(lr=0.01), # is this the best optimizer/learning rate?
    metrics=['mean_squared_error', 'mean_absolute_error'] # does accuracy make sense in this context?
)

## callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    restore_best_weights=True
)


history = model.fit(
    features,
    target,
    validation_split=.3,
    epochs=20,
    verbose=1
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
act_fun = 'sigmoid'

# Initialize the RNN
model = Sequential()

# Adding the input layer and the LSTM layer
model.add(LSTM(units = 30, activation=act_fun, return_sequences=True, input_shape=(None, features.shape[1])))

# Adding the output layer
model.add(Dense(units = 1))
# Compiling the RNN
model.compile(optimizer=Adam(lr=0.01), loss='mse', metrics=['mean_squared_error', 'mean_absolute_error'])

# Using the training set to train the model
model.fit(features, target, epochs = 100, verbose=1)


Epoch 1/100


ValueError: in user code:

    /Users/cmarquis/Documents/penn/penn-ml-challenge/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /Users/cmarquis/Documents/penn/penn-ml-challenge/venv/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/cmarquis/Documents/penn/penn-ml-challenge/venv/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/cmarquis/Documents/penn/penn-ml-challenge/venv/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/cmarquis/Documents/penn/penn-ml-challenge/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /Users/cmarquis/Documents/penn/penn-ml-challenge/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:885 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    /Users/cmarquis/Documents/penn/penn-ml-challenge/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:176 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_2 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 359]


In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                11520     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 12,065
Trainable params: 12,065
Non-trainable params: 0
_________________________________________________________________


Below we calculate predictions and root mean square error. Can we easily improve this RMSE?

In [8]:
## prediction
predictions = model.predict(features)
predictions

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [9]:
rmse = np.sqrt(np.mean(np.square((target.reshape(-1, 1) - predictions))))

In [10]:
rmse

7191.088757578544

## Save Model

The last thing we'll do is save the model for use in the API.

In [73]:
model.save('model.h5')