In [233]:
import random 
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
import h5py

Declare constants

In [234]:
TRAIN_TEST_CUTOFF = '2020-01-31'
TRAIN_VALID_RATIO = 0.75

Define metric functions

In [235]:
def _recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
 
def _precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
 
def _f1(y_true, y_pred):
    precision = _precision(y_true, y_pred)
    recall = _recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
 
def f1macro(y_true, y_pred):
    f_pos = _f1(y_true, y_pred)
    # negative version of the data and prediction
    f_neg = _f1(1-y_true, 1-K.clip(y_pred,0,1))
    return (f_pos + f_neg)/2

In [236]:
def cnnpred_2d(seq_len=60, n_features=74, n_filters=(8,8,8), droprate=0.1):
    "2D-CNNpred model according to the paper"
    
    model = Sequential([
        Input(shape=(seq_len, n_features, 1)),
        Conv2D(n_filters[0], kernel_size=(1, n_features), activation="relu"),
        Conv2D(n_filters[1], kernel_size=(3,1), activation="relu"),
        MaxPool2D(pool_size=(2,1)),
        Conv2D(n_filters[2], kernel_size=(3,1), activation="relu"),
        MaxPool2D(pool_size=(2,1)),
        Flatten(),
        Dropout(droprate),
        Dense(1, activation="sigmoid")
    ])

    return model

cnnpred_2d().summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_63 (Conv2D)          (None, 60, 1, 8)          600       
                                                                 
 conv2d_64 (Conv2D)          (None, 58, 1, 8)          200       
                                                                 
 max_pooling2d_42 (MaxPoolin  (None, 29, 1, 8)         0         
 g2D)                                                            
                                                                 
 conv2d_65 (Conv2D)          (None, 27, 1, 8)          200       
                                                                 
 max_pooling2d_43 (MaxPoolin  (None, 13, 1, 8)         0         
 g2D)                                                            
                                                                 
 flatten_21 (Flatten)        (None, 104)             

In [237]:
def datagen(df, seq_len, batch_size, target_col, kind):
    """A generator to produce samples for Keras model"""

    batch = []

    while True:

        # Set up splitting parameters
        input_cols = [c for c in df.columns if c != target_col]
        index = df.index[df.index < TRAIN_TEST_CUTOFF]
        split = int(len(index) * TRAIN_VALID_RATIO)

        # Range for the training set
        if kind == 'train':
            index = index[:split]

        # Range for the validation set
        elif kind == 'valid':
            index = index[split:]   

        while True:
            "Pick one position, then clip a sequence length"

            # Pick one time step
            t = random.choice(index)

            # Find its position in the DataFrame      
            n = (df.index == t).argmax()

            # Start over if there isn't enough data for one sequence length  
            if (n - seq_len + 1) < 0:
                continue
            
            # Create the DataFrame of one sequence length
            frame = df.iloc[n - seq_len+1 : n+1]

            # Append X and y values as a sample in the CNN dataset
            batch.append([frame[input_cols].values, df.loc[t, target_col]])

            break

        # If we get enough for a batch, yield the instance
        if len(batch) == batch_size:

            # Unpack the `batch` list into features and target
            X, y = zip(*batch)

            # Expand dimensions of X
            X, y = np.expand_dims(np.array(X), 3), np.array(y)

            # Yield the sample
            yield X, y

            # Clear the batch list for next iteration
            batch = []

In [238]:
def testgen(df, seq_len, target_col):
    "Return array of all test samples"

    batch = []

    input_cols = [c for c in df.columns if c != target_col]

    # find the start of test sample
    t = df.index[df.index > TRAIN_TEST_CUTOFF][0]
    n = (df.index == t).argmax()

    for i in range(n+1, len(df)+1):

        frame = df.iloc[i-seq_len:i]
        batch.append([frame[input_cols].values, frame[target_col][-1]])

    X, y = zip(*batch)

    return np.expand_dims(np.array(X),3), np.array(y)

In [239]:
data = pd.read_csv('../csv/initial_variables.csv', index_col='date', parse_dates=True, infer_datetime_format=True)

In [240]:
cols = data.columns

# If the current price is higher than yesterday's price then target = 1, else 0
data['target'] = (data['close'].pct_change().shift(-1) > 0).astype(int)

data.dropna(inplace=True)

# Fit the standard scaler using the training dataset
index = data.index[data.index > TRAIN_TEST_CUTOFF]
index = index[:int(len(index) * TRAIN_VALID_RATIO)]
scaler = StandardScaler().fit(data.loc[index, cols])

# Save scale transformed dataframe
data[cols] = scaler.transform(data[cols])
data

Unnamed: 0_level_0,open,high,low,close,volume,trend,rsi,rsi_fast_k,rsi_fast_d,williams_r,...,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy,day_of_week,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,-3.927738,-3.992075,-3.849111,-3.929754,-0.000858,-4.067769,-0.125040,-0.487452,0.438703,0.116703,...,-6.675313,1.040767,-4.615527,-5.219167,-1.191103,-1.437705,-0.654210,-10.338144,-0.003038,0
2012-08-02,-3.934715,-4.008711,-3.879867,-3.946913,-0.180844,-4.064781,-0.465806,-1.451297,-0.592878,-0.453170,...,-6.644511,0.974987,-4.601028,-5.192207,-1.164388,-1.327174,-0.564953,-10.248450,0.712476,1
2012-08-03,-3.950836,-3.976733,-3.862104,-3.903700,-0.503106,-4.062349,0.208150,0.839803,-0.471148,0.854900,...,-6.608700,1.081638,-4.590233,-5.150877,-1.168968,-1.197698,-0.618212,-10.303646,1.427991,1
2012-08-06,-3.908220,-3.967541,-3.820016,-3.898313,-0.961206,-4.060179,0.284862,0.993284,0.164870,0.758658,...,-6.628040,1.075251,-4.686221,-5.265817,-1.154084,-1.715607,-0.615329,-10.169103,-1.434068,1
2012-08-07,-3.902525,-3.954921,-3.814391,-3.886474,-0.553116,-4.056620,0.453579,0.993284,1.214888,0.766904,...,-6.648812,1.105268,-4.690031,-5.265287,-1.144542,-1.636657,-0.564953,-10.310547,-0.718553,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-26,0.358625,0.323492,0.324125,0.303007,-0.989335,0.292450,-0.821548,-1.451297,-0.831555,-0.060913,...,-1.008615,4.189357,0.515773,-0.219405,0.288545,3.760341,6.209249,9.778225,-0.718553,1
2022-07-27,0.355644,0.468577,0.391047,0.473532,-0.624709,0.308015,-0.169930,0.993284,-0.541380,0.850320,...,-0.595164,4.377641,0.524133,-0.178608,0.337396,4.051191,6.218583,9.964172,-0.003038,1
2022-07-28,0.480043,0.534768,0.459368,0.554705,-0.407222,0.330827,0.106226,0.993284,0.230795,0.960463,...,-0.311756,4.481488,0.420663,-0.279003,0.318696,3.807711,6.207087,9.667828,0.712476,1
2022-07-29,0.581961,0.637608,0.601223,0.650908,-0.454632,0.357480,0.413294,0.993284,1.280813,0.942131,...,-0.105479,4.897848,0.407827,-0.296919,0.274424,3.859186,6.235878,9.076529,1.427991,0


In [241]:
seq_len = 60
batch_size = 64
n_epochs = 20
n_features = 74
 
model = cnnpred_2d(seq_len, n_features)
model.compile(optimizer='adam', loss='mae', metrics=['acc', f1macro])
model.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_66 (Conv2D)          (None, 60, 1, 8)          600       
                                                                 
 conv2d_67 (Conv2D)          (None, 58, 1, 8)          200       
                                                                 
 max_pooling2d_44 (MaxPoolin  (None, 29, 1, 8)         0         
 g2D)                                                            
                                                                 
 conv2d_68 (Conv2D)          (None, 27, 1, 8)          200       
                                                                 
 max_pooling2d_45 (MaxPoolin  (None, 13, 1, 8)         0         
 g2D)                                                            
                                                                 
 flatten_22 (Flatten)        (None, 104)             

In [242]:
checkpoint_path = './models/cp2d-{epoch}-{val_f1macro:.2f}.h5'
callbacks = [
    ModelCheckpoint(checkpoint_path,
                    monitor='val_f1macro', mode='max', verbose=0,
                    save_best_only=True, save_weights_only=False, save_freq='epoch')
]

In [243]:
training_gen = datagen(data, seq_len, batch_size, 'target', 'train')
validation_gen = datagen(data, seq_len, batch_size, 'target', 'valid')

model.fit(
    training_gen,
    validation_data=validation_gen,
    epochs=n_epochs, 
    steps_per_epoch=400, 
    validation_steps=10, 
    verbose=1,
    callbacks=callbacks
)

Epoch 1/20


2022-08-07 16:50:13.093621: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-08-07 16:50:21.078525: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x335745150>

In [246]:
# Prepare test data
test_data, test_target = testgen(data, seq_len, "target")
 
# Test the model
test_out = model.predict(test_data)
test_pred = (test_out > 0.5).astype(int)
print("accuracy:", accuracy_score(test_pred, test_target))
print("MAE:", mean_absolute_error(test_pred, test_target))
print("F1:", f1_score(test_pred, test_target))

accuracy: 0.5421303656597775
MAE: 0.4578696343402226
F1: 0.7030927835051547
