In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from os.path import join
from sklearn.metrics import accuracy_score as accuracy, f1_score, mean_absolute_error as mae
import os
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D
from pathlib2 import Path
from tensorflow.keras import backend as K, callbacks
import tensorflow as tf
import tensorflow.keras as keras

In [5]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.
        Only computes a batch-wise average of recall.
        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.
        Only computes a batch-wise average of precision.
        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision_pos = precision(y_true, y_pred)
    recall_pos = recall(y_true, y_pred)
    precision_neg = precision((K.ones_like(y_true) - y_true), (K.ones_like(y_pred) - K.clip(y_pred, 0, 1)))
    recall_neg = recall((K.ones_like(y_true) - y_true), (K.ones_like(y_pred) - K.clip(y_pred, 0, 1)))
    f_posit = 2 * ((precision_pos * recall_pos) / (precision_pos + recall_pos + K.epsilon()))
    f_neg = 2 * ((precision_neg * recall_neg) / (precision_neg + recall_neg + K.epsilon()))

    return (f_posit + f_neg) / 2


In [None]:
# def load_data(file_fir):
#     print(file_fir)
#     try:
#         df_raw = pd.read_csv(file_fir, index_col='date', parse_dates=True, infer_datetime_format=True) # parse_dates=['Date'])
#     except IOError:
#         print("IO ERROR")
#     return df_raw


In [21]:
def construct_data_warehouse():
    global number_of_stocks
    # global samples_in_each_stock
    global n_features
    # global order_stocks


    data = pd.read_csv('../csv/initial_variables.csv', index_col='date', parse_dates=True, infer_datetime_format=True)


    # dynamically build target based on number of periods to predict
    target = (data['close'][days_to_predict:] / data['close'][:-days_to_predict].values).astype(int)


    data = data[:-days_to_predict]
    target.index = data.index

    data = data.ffill()
    data['target'] = target
    target = data['target']
    # data['Date'] = data['Date'].apply(lambda x: x.weekday())
    del data['target']

    n_features = data.shape[1]
    # samples_in_each_stock = data.shape[0]

    X_train = data[data.index < '2020-01-31']

    X_train_tmp = scale(X_train)
    y_train_tmp = target[target.index < '2020-01-31']

    X_train = X_train_tmp[:int(0.75 * X_train_tmp.shape[0])]
    y_train = y_train_tmp[:int(0.75 * y_train_tmp.shape[0])]

    X_valid = scale(X_train_tmp[int(0.75 * X_train_tmp.shape[0]) - seq_len:])
    y_valid = y_train_tmp[int(0.75 * y_train_tmp.shape[0]) - seq_len:]

    data = pd.DataFrame(scale(data.values), columns=data.columns)
    data.index = target.index
    
    X_test = np.array(data[data.index >= '2020-01-31'])
    y_test = np.array(target[target.index >= '2020-01-31'])

    data_warehouse = [X_train, y_train, X_test, y_test, X_valid, y_valid]
    # data_warehouse[df_name] = [X_train, y_train, np.array(X_test), np.array(y_test), X_valid, y_valid]

    return data_warehouse


In [22]:
dw = construct_data_warehouse()
dw



[array([[-1.78022919, -1.78339602, -1.77748429, ..., -0.77416684,
         -2.99349553, -0.01702592],
        [-1.78903669, -1.80414801, -1.81693303, ..., -0.74053201,
         -2.96712807,  0.69654917],
        [-1.80938473, -1.76425812, -1.79414989, ..., -0.7606017 ,
         -2.98335414,  1.41012425],
        ...,
        [ 1.18995834,  1.18658268,  1.14495812, ...,  1.55376196,
         -0.1396651 , -0.730601  ],
        [ 1.15159577,  1.13461943,  1.11479125, ...,  1.54833073,
         -0.11451395, -0.01702592],
        [ 1.10999596,  1.10510541,  1.1086526 , ...,  1.57002979,
         -0.15031406,  0.69654917]]),
 date
 2012-08-01    0
 2012-08-02    1
 2012-08-03    1
 2012-08-06    1
 2012-08-07    1
              ..
 2018-03-09    0
 2018-03-12    0
 2018-03-13    0
 2018-03-14    0
 2018-03-15    1
 Name: target, Length: 1414, dtype: int64,
 array([[ 0.73771624,  0.71718308,  0.68264964, ..., -0.3885846 ,
          0.01355631,  1.41291204],
        [ 0.68432008,  0.70139801, 

Generate the images (2D matrices) of data for each Sequence length (60 days)

In [None]:
def build_snapshots(data, target, seq_len):

    X = []
    y = []

    for i in range(data.shape[0] - seq_len + 1):
        
        X.append(data[i: i + seq_len])
        y.append(target[i + seq_len - 1])

    return X, y


Generate train, vaildation, and test data

In [15]:
def cnn_data_sequence(data, seq_len):

    # for key, value in data.items():

    X_train, y_train = build_snapshots(
        data[0], data[1], seq_len
    )

    X_test, y_test = build_snapshots(
        data[2], data[3], seq_len
    )

    X_valid, y_valid = build_snapshots(
        data[4], data[5], seq_len
    )

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    X_valid = np.array(X_valid)
    y_valid = np.array(y_valid)

    X_train = X_train.reshape(
        X_train.shape[0], X_train.shape[1], X_train.shape[2], 1
    )

    X_test = X_test.reshape(
        X_test.shape[0], X_test.shape[1], X_test.shape[2], 1
    )

    X_valid = X_valid.reshape(
        X_valid.shape[0], X_valid.shape[1], X_valid.shape[2], 1
    )

    return X_train, y_train, X_test, y_test, X_valid, y_valid


In [None]:
def sklearn_acc(model, test_data, test_target):
    overall_results = model.predict(test_data)
    test_pred = (overall_results > 0.5).astype(int)
    


    # print(f'test pred: {test_pred}')
    # print(f'test target: {test_target}')




    acc_results = [mae(overall_results, test_target), accuracy(test_pred, test_target),
                   f1_score(test_pred, test_target, average='macro')]

    return acc_results



In [None]:

def train(data_warehouse):
# def train(data_warehouse, i):
    seq_len = 60
    epochs = 200
    drop_rate = 0.1

    global cnn_train_data, cnn_train_target, cnn_test_data, cnn_test_target, cnn_valid_data, cnn_valid_target

    # if i == 1:
    #     print('sequencing ...')
    cnn_train_data, cnn_train_target, cnn_test_data, cnn_test_target, cnn_valid_data, cnn_valid_target = cnn_data_sequence(
        data_warehouse, seq_len)

    my_file = Path(join(Base_dir, f'2D-models/best-{epochs}-{seq_len}-{n_filters}-{drop_rate}.h5'))
    filepath = join(Base_dir, f'2D-models/best-{epochs}-{seq_len}-{n_filters}-{drop_rate}.h5')
    
    if my_file.is_file():
        print('loading model')

    else:

        print(' fitting model to target')
        model = Sequential()

        # layer 1
        model.add(
            Conv2D(n_filters[0], (1, n_features), activation='relu', input_shape=(seq_len, n_features, 1))
        )
        
        # layer 2
        model.add(Conv2D(n_filters[1], (3, 1), activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 1)))

        # layer 3
        model.add(Conv2D(n_filters[2], (3, 1), activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 1)))

        model.add(Flatten())
        model.add(Dropout(drop_rate))

        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer='adam', loss='mae', metrics=['acc', f1])

        best_model = callbacks.ModelCheckpoint(filepath, monitor='val_f1', verbose=0, save_best_only=True,
                                               save_weights_only=False, mode='max', period=1)


        model.fit(cnn_train_data, cnn_train_target, epochs=epochs, batch_size=128, verbose=1,
                        validation_data=(cnn_valid_data, cnn_valid_target), callbacks=[best_model])
    model = load_model(filepath, custom_objects={'f1': f1})

    return model, seq_len


In [None]:
def cnn_data_sequence_pre_train(data, target, seque_len):
    new_data = []
    new_target = []
    for index in range(data.shape[0] - seque_len + 1):
        new_data.append(data[index: index + seque_len])
        new_target.append(target[index + seque_len - 1])

    new_data = np.array(new_data)
    new_target = np.array(new_target)

    new_data = new_data.reshape(new_data.shape[0], new_data.shape[1], new_data.shape[2], 1)

    return new_data, new_target


In [None]:
def prediction(data_warehouse, model, seque_len, order_stocks, cnn_results):
    for name in order_stocks:
        value = data_warehouse[name]
        # train_data, train_target = cnn_data_sequence_pre_train(value[0], value[1], seque_len)
        test_data, test_target = cnn_data_sequence_pre_train(value[2], value[3], seque_len)
        # valid_data, valid_target = cnn_data_sequence_pre_train(value[4], value[5], seque_len)

        cnn_results.append(sklearn_acc(model, test_data, test_target)[2])

    return cnn_results


In [None]:
def run_cnn_ann(data_warehouse, order_stocks):
    # cnn_results = []
    # dnn_results = []
    # iterate_no = 4
    # for i in range(1, iterate_no):
    K.clear_session()
    # print(i)
    model, seq_len = train(data_warehouse)
    # model, seq_len = train(data_warehouse, i)
    # cnn_results, dnn_results = prediction(data_warehouse, model, seq_len, order_stocks, cnn_results)
    cnn_results = prediction(data_warehouse, model, seq_len, order_stocks, cnn_results)

    cnn_results = np.array(cnn_results)
    cnn_results = cnn_results.reshape(iterate_no - 1, len(order_stocks))
    cnn_results = pd.DataFrame(cnn_results, columns=order_stocks)
    cnn_results = cnn_results.append([cnn_results.mean(), cnn_results.max(), cnn_results.std()], ignore_index=True)
    cnn_results.to_csv(join(Base_dir, '2D-models/new results.csv'), index=False)
    

In [13]:
# Base_dir = '../'
# TRAIN_ROOT_PATH = join(Base_dir, 'Dataset')
# train_file_names = os.listdir(join(Base_dir, 'Dataset'))

# if moving average = 0 then we have no moving average
seq_len = 60
moving_average_day = 0
number_of_stocks = 0
n_features = 0
# samples_in_each_stock = 0
n_filters = [8, 8, 8]
days_to_predict = 1

cnn_train_data = []
cnn_train_target = []
cnn_test_data = []
cnn_test_target = []
cnn_valid_data = []
cnn_valid_target = []

print('Loading train data ...')
# order_stocks = []
data_warehouse = costruct_data_warehouse()
# data_warehouse = costruct_data_warehouse(TRAIN_ROOT_PATH, train_file_names)
# order_stocks = data_warehouse.keys()

# print('number of stocks = '), number_of_stocks

run_cnn_ann(data_warehouse, order_stocks)

Loading train data ...




NameError: name 'run_cnn_ann' is not defined