In [219]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from os.path import join
from sklearn.metrics import accuracy_score as accuracy, f1_score, mean_absolute_error as mae
import os
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D
from pathlib2 import Path
from tensorflow.keras import backend as K, callbacks
import tensorflow as tf
import tensorflow.keras as keras

In [220]:
def f1(y_true, y_pred):
    """Batch-wise macro F1 for 0/1 labels, built from Keras backend ops.

    An F-score is computed once for the positive class (inputs as given) and
    once for the negative class (labels and clipped predictions flipped with
    ``1 - x``); the function returns the mean of the two. Because it works on
    whatever tensors it is handed, the value is a per-batch figure.
    """
    def _rounded_count(values):
        # Clip into [0, 1], round to 0/1 and count the ones.
        return K.sum(K.round(K.clip(values, 0, 1)))

    def _class_f_score(labels, scores):
        # Samples where both the label and the rounded score are "on".
        hits = _rounded_count(labels * scores)
        precision = hits / (_rounded_count(scores) + K.epsilon())
        recall = hits / (_rounded_count(labels) + K.epsilon())
        # epsilon keeps every division defined when a count is zero.
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

    f_posit = _class_f_score(y_true, y_pred)

    flipped_true = K.ones_like(y_true) - y_true
    flipped_pred = K.ones_like(y_pred) - K.clip(y_pred, 0, 1)
    f_neg = _class_f_score(flipped_true, flipped_pred)

    return (f_posit + f_neg) / 2


In [221]:
# Root directory that the model checkpoint and results CSV paths are joined onto.
Base_dir = '../'
# Number of consecutive rows (days) stacked into one input window.
seq_len = 60
# Filter count for each of the three Conv2D layers.
n_filters = [8, 8, 8]
# How many rows ahead the 'close' value is compared against when building the target.
predict_period = 1
# Number of epochs passed to model.fit.
epochs = 200
# Rate given to the Dropout layer.
drop_rate = 0.1

In [222]:
def build_dataset():
    """Load the feature CSV and split it into train / validation / test parts.

    The target for a row is 1 when ``close`` ``predict_period`` rows later is
    greater than or equal to the current ``close`` and 0 otherwise. Rows dated
    before 2020-01-31 form the in-sample set: its first 75% is the training
    part, and the remainder (plus the ``seq_len`` rows preceding it, so the
    first validation window is complete) is the validation part. Rows dated
    on or after 2020-01-31 form the test part.

    Side effect: sets the module-level ``n_features`` to the number of feature
    columns.

    Returns
    -------
    tuple
        ``(dataset, y_test)`` where ``dataset`` is the list
        ``[X_train, y_train, X_test, y_test, X_valid, y_valid]`` (``X_*`` are
        arrays, ``y_train`` / ``y_valid`` are Series, the test pair are
        arrays) and ``y_test`` is the date-indexed test target Series.
    """
    split_date = '2020-01-31'

    data = pd.read_csv('../csv/initial_variables.csv', index_col='date', parse_dates=True)

    # Fill gaps before deriving the label so a missing close cannot turn into
    # a NaN ratio (which cannot be cast to an integer label).
    data = data.ffill()

    # Explicit comparison instead of truncating future/current to an int:
    # the truncation yields 2 for a ratio >= 2 and fails on NaN/inf.
    close = data['close']
    target = (close.shift(-predict_period) >= close).astype(int).rename('target')

    # The last `predict_period` rows have no future close to compare against.
    data = data.iloc[:-predict_period]
    target = target.iloc[:-predict_period]

    global n_features
    n_features = data.shape[1]

    in_sample = data.index < split_date

    # NOTE(review): the scaling below is kept exactly as before so results stay
    # comparable with an already-saved checkpoint, but it is inconsistent:
    #   * train and validation rows are standardised together,
    #   * the validation slice is then standardised a second time on its own,
    #   * the test rows are standardised with statistics of the whole data set
    #     (test period included).
    # Fitting one scaler on the training rows only and reusing it for the
    # other two splits would remove the leakage; that requires retraining.
    X_in_sample = scale(data[in_sample])
    y_in_sample = target[in_sample]

    cut = int(0.75 * X_in_sample.shape[0])

    X_train = X_in_sample[:cut]
    y_train = y_in_sample.iloc[:cut]

    # Start seq_len rows early so the first validation window is complete.
    X_valid = scale(X_in_sample[cut - seq_len:])
    y_valid = y_in_sample.iloc[cut - seq_len:]

    scaled_all = pd.DataFrame(scale(data.values), columns=data.columns, index=target.index)

    out_of_sample = scaled_all.index >= split_date
    X_test = scaled_all[out_of_sample]
    y_test = target[out_of_sample]

    dataset = [X_train, y_train, np.array(X_test), np.array(y_test), X_valid, y_valid]

    return dataset, y_test


Generate the images (2D matrices) of the data, one for each sliding window of sequence length (60 days)

In [223]:
def build_snapshots(data, target, seq_len):
    """Cut ``data`` into overlapping windows of ``seq_len`` rows with one label each.

    Window ``i`` covers rows ``i .. i + seq_len - 1`` of ``data`` and is paired
    with the entry of ``target`` at the position of the window's last row.

    Parameters
    ----------
    data : array-like with ``.shape`` that supports row slicing
    target : sequence, array or pandas Series of labels, one per row of ``data``
    seq_len : int
        Number of rows per window.

    Returns
    -------
    tuple of lists
        ``(X, y)``; both are empty when ``data`` has fewer than ``seq_len`` rows.
    """
    # Read labels by position. Indexing a pandas Series with an integer is
    # label-based for integer indexes and only a deprecated fallback for other
    # index types, so go through a plain array instead.
    labels = np.asarray(target)

    n_windows = data.shape[0] - seq_len + 1

    X = [data[start: start + seq_len] for start in range(n_windows)]
    y = [labels[start + seq_len - 1] for start in range(n_windows)]

    return X, y


Generate train, validation, and test data

In [224]:
def cnn_data_sequence(data, seq_len):
    """Turn the six-item dataset into windowed arrays ready for the CNN.

    ``data`` is read as ``[X_train, y_train, X_test, y_test, X_valid, y_valid]``.
    Each feature/label pair is windowed with ``build_snapshots``, converted to
    arrays, and the windows get a trailing axis of size 1.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_valid, y_valid)``.
    """
    def _windowed(features, labels):
        snapshots, snapshot_labels = build_snapshots(features, labels, seq_len)
        snapshots = np.array(snapshots)
        snapshot_labels = np.array(snapshot_labels)
        # (windows, rows, columns) -> (windows, rows, columns, 1)
        snapshots = snapshots.reshape(
            snapshots.shape[0], snapshots.shape[1], snapshots.shape[2], 1
        )
        return snapshots, snapshot_labels

    train_part = _windowed(data[0], data[1])
    test_part = _windowed(data[2], data[3])
    valid_part = _windowed(data[4], data[5])

    return train_part + test_part + valid_part


In [225]:
def sklearn_acc(model, test_data, test_target):
    """Score ``model`` on a test set with three scikit-learn metrics.

    Parameters
    ----------
    model : object exposing ``predict(test_data)``
    test_data : model input
    test_target : true 0/1 labels

    Returns
    -------
    list
        ``[mean absolute error of the raw outputs, accuracy, macro F1]``; the
        last two are computed on the outputs thresholded at 0.5.

    Side effect: the thresholded predictions are stored in the module-level
    ``test_pred``.
    """
    global test_pred

    overall_results = model.predict(test_data)
    test_pred = (overall_results > 0.5).astype(int)

    # Pass truth and prediction by keyword: the scikit-learn signature is
    # (y_true, y_pred) and the previous calls had the two swapped. These three
    # metrics are symmetric, so the numbers are unchanged, but the roles are
    # now unambiguous if a non-symmetric metric is ever added.
    acc_results = [
        mae(y_true=test_target, y_pred=overall_results),
        accuracy(y_true=test_target, y_pred=test_pred),
        f1_score(y_true=test_target, y_pred=test_pred, average='macro'),
    ]

    return acc_results

In [226]:
def train(data):
    """Return the CNN for the current settings, fitting it only if no checkpoint exists.

    The checkpoint path is derived from the module-level ``epochs``,
    ``seq_len`` and ``drop_rate``. When that file is already present nothing
    is fitted. Otherwise a network with three convolution layers is trained on
    the training windows, and the checkpoint callback keeps the epoch with the
    highest validation ``f1``. In both cases the returned model is the one
    read back from the checkpoint file.

    Parameters
    ----------
    data : sequence
        Six-item dataset forwarded to ``cnn_data_sequence``.

    Returns
    -------
    tuple
        ``(model, seq_len)``.
    """
    filepath = join(Base_dir, f'models/best-{epochs}-{seq_len}-{drop_rate}.h5')
    my_file = Path(filepath)

    if my_file.is_file():
        print('Loading model...')

    else:

        print(' Fitting model...')

        # Windowing is only needed when a model is actually fitted; the test
        # windows are not used here.
        X_train, y_train, _, _, X_valid, y_valid = cnn_data_sequence(
            data, seq_len
        )

        # The checkpoint callback cannot write into a directory that does not exist.
        my_file.parent.mkdir(parents=True, exist_ok=True)

        model = Sequential()
        # layer 1: one kernel row spanning every feature column
        model.add(Conv2D(n_filters[0], (1, n_features), activation='relu', input_shape=(seq_len, n_features, 1)))
        # layer 2
        model.add(Conv2D(n_filters[1], (3, 1), activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 1)))
        # layer 3
        model.add(Conv2D(n_filters[2], (3, 1), activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 1)))
        model.add(Flatten())
        model.add(Dropout(drop_rate))
        model.add(Dense(1, activation='sigmoid'))

        # NOTE(review): MAE is used as the loss for a sigmoid output on a 0/1
        # target; binary cross-entropy is the usual choice. Left unchanged so
        # newly trained checkpoints stay comparable with existing ones.
        model.compile(optimizer='adam', loss='mae', metrics=['acc', f1])

        # Keep only the epoch with the best validation f1.
        # `save_weights_only` was previously misspelled (`ave_weights_only`),
        # and the deprecated `period=1` is expressed as save_freq='epoch'.
        best_model = callbacks.ModelCheckpoint(
            filepath,
            monitor='val_f1',
            verbose=0,
            save_best_only=True,
            save_weights_only=False,
            mode='max',
            save_freq='epoch'
        )

        model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=128,
            verbose=1,
            validation_data=(X_valid, y_valid),
            callbacks=[best_model]
        )

    # The custom metric must be supplied by name for deserialisation.
    model = load_model(filepath, custom_objects={'f1': f1})

    return model, seq_len


In [227]:
def cnn_data_sequence_pre_train(data, target, seq_len):
    """Window one feature/label pair and shape it as CNN input.

    Window ``i`` holds rows ``i .. i + seq_len - 1`` of ``data`` and is
    labelled with the entry of ``target`` at the position of its last row.

    Parameters
    ----------
    data : 2-D array-like (rows x features)
    target : sequence, array or pandas Series with one label per row
    seq_len : int
        Number of rows per window.

    Returns
    -------
    tuple of numpy arrays
        ``(windows, labels)`` with ``windows`` shaped
        ``(n_windows, seq_len, n_features, 1)``. When ``data`` has fewer than
        ``seq_len`` rows both arrays are empty (zero windows).
    """
    # Plain arrays make every lookup below positional; integer indexing on a
    # pandas Series would otherwise be resolved against its index labels.
    features = np.asarray(data)
    labels = np.asarray(target)

    n_windows = features.shape[0] - seq_len + 1

    if n_windows <= 0:
        # Too few rows for a single window: return correctly shaped empty
        # arrays instead of failing inside reshape with an IndexError.
        no_windows = np.empty((0, seq_len, features.shape[1], 1), dtype=features.dtype)
        return no_windows, np.empty((0,), dtype=labels.dtype)

    new_data = np.array([features[start: start + seq_len] for start in range(n_windows)])
    new_target = np.array([labels[start + seq_len - 1] for start in range(n_windows)])

    # Append the single-channel axis.
    new_data = new_data.reshape(new_data.shape[0], new_data.shape[1], new_data.shape[2], 1)

    return new_data, new_target


In [228]:
def prediction(data, model, seq_len):
    """Evaluate ``model`` on the test part of ``data``.

    Items 2 and 3 of ``data`` (test features and test labels) are windowed
    with ``cnn_data_sequence_pre_train`` and scored with ``sklearn_acc``.

    Returns
    -------
    list
        One-element list holding the third score reported by ``sklearn_acc``.
    """
    test_features, test_labels = data[2], data[3]
    windows, window_labels = cnn_data_sequence_pre_train(test_features, test_labels, seq_len)

    scores = sklearn_acc(model, windows, window_labels)

    # Only the entry at index 2 is reported.
    return [scores[2]]


In [229]:
def run_cnn_ann(data):
    """Train (or load) the CNN, score it on the test part and save the score.

    The Keras session is cleared first. The score list returned by
    ``prediction`` is written as a single-column (``SP500``) CSV under
    ``Base_dir``.
    """
    K.clear_session()

    model, window_len = train(data)
    scores = np.array(prediction(data, model, window_len))

    score_frame = pd.DataFrame(scores, columns=['SP500'])
    out_path = join(Base_dir, 'models/new results.csv')
    score_frame.to_csv(out_path, index=False)

In [230]:
# Build the train/validation/test splits, then train (or load) the CNN and
# write its test score to CSV. `y_test` keeps the date-indexed test targets.
print('Loading train data ...')
dataset, y_test = build_dataset()
run_cnn_ann(dataset)

Loading train data ...
Loading model...
 1/18 [>.............................] - ETA: 0s

2022-08-08 19:48:35.797060: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [231]:
# Display the test-period target series returned by build_dataset().
y_test

date
2020-01-31    1
2020-02-03    1
2020-02-04    1
2020-02-05    1
2020-02-06    0
             ..
2022-07-25    0
2022-07-26    1
2022-07-27    1
2022-07-28    1
2022-07-29    0
Name: target, Length: 629, dtype: int64