## Generate a Baseline

In [5]:
# Imports and constants used by the helper functions below.

import os
import math
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import confusion_matrix, classification_report

# --- Notebook-wide configuration -------------------------------------------
# Number of one-minute bars, counted from 09:30, that form one feature vector.
BARS = 15
# NOTE(review): presumably the output directory for results; it is not
# referenced in the cells visible here -- confirm against later cells.
RESULT_DIR = './result'
# Rows dated at or before this go to the training set, later rows to validation.
DATE_SPLIT = '2019-06-01'

In [6]:
def get_datasets():
    """Load the two price files and split them into train/validation/test.

    Both gzip-compressed CSV files are reduced to the ``date``, ``open`` and
    ``close`` columns and ``date`` is parsed to datetimes.

    Returns:
        tuple: ``(train, validation, test)`` DataFrames where
            * ``train`` holds rows of the first file dated at or before
              ``DATE_SPLIT``;
            * ``validation`` holds rows of the first file dated after
              ``DATE_SPLIT`` and strictly before the earliest ``test`` date;
            * ``test`` is the whole second file.
    """
    wanted_columns = ['date', 'open', 'close']

    def _read_prices(path):
        # Same treatment for both files: select columns, then parse timestamps.
        prices = pd.read_csv(path, compression='gzip')
        prices = prices[wanted_columns]
        prices['date'] = pd.to_datetime(prices['date'])
        return prices

    history = _read_prices('./data/spy.2008.2021.csv.gz')
    test = _read_prices('./data/spy.csv.gz')

    # Validation must end before the test period begins, so the two never overlap.
    first_test_stamp = test['date'].min()
    in_train = history['date'] <= DATE_SPLIT
    in_validation = (history['date'] > DATE_SPLIT) & (history['date'] < first_test_stamp)

    return history[in_train], history[in_validation], test

In [7]:
def get_features_targets(df, scale_obs=True, bars=None):
    """Build per-day feature vectors and up/down targets from minute bars.

    For every calendar day the feature vector is the ``close`` of the first
    ``bars`` one-minute bars starting at 09:30. Days with gaps inside that
    window are forward-filled; days that still do not yield ``bars`` values
    (for example when the 09:30 bar itself is missing) are skipped.

    The target of a day is 1 when the first ``open`` of the day is lower than
    the last ``close`` of the day, otherwise 0.

    Args:
        df: DataFrame with a datetime ``date`` column and numeric ``open`` and
            ``close`` columns. Duplicated timestamps are averaged. The frame
            passed in is not modified.
        scale_obs: When True, each feature vector is shifted so its minimum is
            0 and divided by its maximum absolute value; a flat window becomes
            all zeros.
        bars: Number of one-minute bars per feature vector. Defaults to the
            module-level ``BARS`` constant when omitted.

    Returns:
        tuple: ``(features, targets)`` as NumPy arrays. ``features`` has one
        row per retained day and ``targets`` holds the matching 0/1 trend, in
        the same (ascending date) order.

    Raises:
        ValueError: If ``bars`` is not a positive integer.
    """
    n_bars = BARS if bars is None else int(bars)
    if n_bars <= 0:
        raise ValueError(f'bars must be a positive integer, got {bars!r}')

    # Remove duplicated timestamps by averaging their prices.
    df = df.groupby('date')[['open', 'close']].mean().reset_index()

    # Select the window by minute-of-day so it may cross the 10:00 boundary
    # (building "09:MM" labels breaks as soon as 30 + bars exceeds 60).
    window_start = 9 * 60 + 30
    minute_of_day = df['date'].dt.hour * 60 + df['date'].dt.minute
    in_window = (minute_of_day >= window_start) & (minute_of_day < window_start + n_bars)
    window = df[in_window]

    feature_result = []
    dates = []

    for dt, day_bars in window.groupby(window['date'].dt.date):
        closes = day_bars.set_index('date')['close']

        if len(closes) != n_bars:
            # Lay the day onto the full minute grid and carry the last known
            # close forward; leading gaps stay NaN and are dropped, which
            # makes the length check below reject the day.
            expected = pd.date_range(
                pd.Timestamp(dt) + pd.Timedelta(minutes=window_start),
                periods=n_bars,
                freq='min',
            )
            closes = closes.reindex(expected).ffill().dropna()

        if len(closes) != n_bars:
            continue

        # Work on a private float copy: the array behind a pandas object can be
        # a view or read-only, so it must never be modified in place.
        values = closes.to_numpy(dtype=float, copy=True)

        if scale_obs:
            with np.errstate(divide='ignore', invalid='ignore'):
                values = values - values.min()
                values = values / np.abs(values).max()
            # A flat window gives 0/0; map those entries to 0.
            values = np.nan_to_num(values, nan=0.0, posinf=0.0, neginf=0.0)

        feature_result.append(values)
        dates.append(dt)

    # Get targets trend based on first and last value / day (0: DOWN - 1: UP).
    daily = df.set_index('date').resample('1D').agg({'open': 'first', 'close': 'last'})
    # Look days up with Timestamps (not datetime.date objects) and keep the
    # order of `dates`, so targets stay aligned with the feature rows.
    targets = daily.loc[[pd.Timestamp(day) for day in dates]]
    trend = np.where(targets['open'] < targets['close'], 1, 0)

    print(len(feature_result), len(targets))
    return np.array(feature_result), np.array(trend)

In [8]:
# Quick check of the loader: the cell's value is the (train, validation, test)
# tuple returned by get_datasets().
get_datasets()

(                       date    open   close
 0       2008-01-22 09:30:00  126.45  126.67
 1       2008-01-22 09:31:00  126.67  127.12
 2       2008-01-22 09:32:00  127.10  126.78
 3       2008-01-22 09:33:00  126.76  126.54
 4       2008-01-22 09:34:00  126.54  126.78
 ...                     ...     ...     ...
 1615277 2019-05-31 15:58:00  275.55  275.47
 1615278 2019-05-31 15:58:00  275.55  275.47
 1615279 2019-05-31 15:59:00  275.48  275.37
 1615280 2019-05-31 15:59:00  275.48  275.37
 1615281 2019-05-31 15:59:00  275.48  275.37
 
 [1615282 rows x 3 columns],
                        date    open   close
 1615282 2019-06-03 09:30:00  275.31  275.36
 1615283 2019-06-03 09:31:00  275.37  275.80
 1615284 2019-06-03 09:32:00  275.79  275.76
 1615285 2019-06-03 09:33:00  275.76  275.87
 1615286 2019-06-03 09:34:00  275.86  275.99
 ...                     ...     ...     ...
 2070829 2021-05-06 15:55:00  418.56  418.49
 2070830 2021-05-06 15:56:00  418.50  418.64
 2070831 2021-05-06 15:5