In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

In [2]:
from springstone.data import get_data, get_missing_dates, create_train_test
from springstone.utils import bollinger_bands, moving_average, daily_return
from springstone.params import MODEL_TYPE
import pandas as pd
import numpy as np

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout, Normalization, Masking
from tensorflow.keras.layers.experimental.preprocessing import Normalization

In [4]:
hist_aapl = get_data('AAPL').drop(columns=['High', 'Low'])

[*********************100%***********************]  1 of 1 completed


In [5]:
hist_aapl.shape

(2518, 3)

In [6]:
missing_dates_aapl = get_missing_dates(hist_aapl, True)

In [7]:
an_array = np.full((missing_dates_aapl.shape[0], hist_aapl.shape[1]), np.nan)
data = pd.concat([hist_aapl, pd.DataFrame(an_array, index=missing_dates_aapl, columns=hist_aapl.columns)])
data = data.sort_index()
data.rename_axis(index='Date', inplace=True)
hist_aapl = data.fillna(method='ffill')

In [8]:
hist_aapl

Unnamed: 0_level_0,Open,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-02-21,18.102858,18.387501,605595200.0
2012-02-22,18.324286,18.322857,483302400.0
2012-02-23,18.395714,18.442499,568027600.0
2012-02-24,18.559643,18.657499,415072000.0
2012-02-25,18.559643,18.657499,415072000.0
...,...,...,...
2022-02-14,167.369995,168.880005,86185500.0
2022-02-15,170.970001,172.789993,62527400.0
2022-02-16,171.850006,172.550003,61177400.0
2022-02-17,171.029999,168.880005,69589300.0


In [9]:
hist_aapl = bollinger_bands(hist_aapl, 'Close', 20, 2)
hist_aapl = bollinger_bands(hist_aapl, 'Close', 20, -2)
hist_aapl = moving_average(hist_aapl, 'Close', 7)
hist_aapl = daily_return(hist_aapl, 'Close')

In [10]:
hist_aapl

Unnamed: 0_level_0,Open,Close,Volume,Close_bb20_2,Close_bb20_-2,Close_ma7,percentage_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-02-21,18.102858,18.387501,605595200.0,,,,
2012-02-22,18.324286,18.322857,483302400.0,,,,-0.003516
2012-02-23,18.395714,18.442499,568027600.0,,,,0.006530
2012-02-24,18.559643,18.657499,415072000.0,,,,0.011658
2012-02-25,18.559643,18.657499,415072000.0,,,,0.000000
...,...,...,...,...,...,...,...
2022-02-14,167.369995,168.880005,86185500.0,179.832587,161.656413,171.147143,0.001423
2022-02-15,170.970001,172.789993,62527400.0,178.878874,163.920125,170.855713,0.023152
2022-02-16,171.850006,172.550003,61177400.0,176.874304,167.257696,170.322856,-0.001389
2022-02-17,171.029999,168.880005,69589300.0,176.953354,167.033645,169.860001,-0.021269


In [11]:
hist_aapl.dropna(subset=['Close_bb20_2'], inplace=True)

In [36]:
hist_aapl

Unnamed: 0_level_0,Open,Close,Volume,Close_bb20_2,Close_bb20_-2,Close_ma7,percentage_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-03-11,19.436071,19.470358,4.189192e+08,19.883297,18.212204,19.242909,0.000000
2012-03-12,19.606428,19.714287,4.072824e+08,19.939537,18.288642,19.339031,0.012528
2012-03-13,19.912144,20.289286,6.908552e+08,20.106623,18.318199,19.532093,0.029167
2012-03-14,20.644644,21.056429,1.418844e+09,20.491487,18.194728,19.832552,0.037810
2012-03-15,21.414642,20.912857,1.159718e+09,20.753963,18.157787,20.054847,-0.006818
...,...,...,...,...,...,...,...
2022-02-14,167.369995,168.880005,8.618550e+07,179.832587,161.656413,171.147143,0.001423
2022-02-15,170.970001,172.789993,6.252740e+07,178.878874,163.920125,170.855713,0.023152
2022-02-16,171.850006,172.550003,6.117740e+07,176.874304,167.257696,170.322856,-0.001389
2022-02-17,171.029999,168.880005,6.958930e+07,176.953354,167.033645,169.860001,-0.021269


In [39]:
def subsample_sequence(df, column, length, horizon=1):
    last_possible = df.shape[0] - length - horizon
    random_start = np.random.randint(0, last_possible)
    X = df[random_start: random_start + length].values
    y = df.iloc[random_start + length + horizon][column]
    return np.array(X), np.array(y)

X, y = subsample_sequence(hist_aapl, 'Close', length=10)

In [41]:
X

array([[ 3.25050011e+01,  3.32500000e+01,  2.83896400e+08,
         3.35371608e+01,  2.90675893e+01,  3.23750000e+01,
         2.70270270e-02],
       [ 3.32350006e+01,  3.30424995e+01,  2.76912400e+08,
         3.37199062e+01,  2.92000939e+01,  3.25299999e+01,
        -6.24061527e-03],
       [ 3.28899994e+01,  3.21974983e+01,  2.98846800e+08,
         3.37413686e+01,  2.93998812e+01,  3.25324996e+01,
        -2.55731628e-02],
       [ 3.21974983e+01,  3.26049995e+01,  3.65150000e+08,
         3.37487561e+01,  2.96797437e+01,  3.26028568e+01,
         1.26563007e-02],
       [ 3.25000000e+01,  3.21150017e+01,  2.48059200e+08,
         3.36462371e+01,  3.00205129e+01,  3.25657142e+01,
        -1.50283046e-02],
       [ 3.25000000e+01,  3.21150017e+01,  2.48059200e+08,
         3.34737565e+01,  3.04312436e+01,  3.25285715e+01,
         0.00000000e+00],
       [ 3.25000000e+01,  3.21150017e+01,  2.48059200e+08,
         3.32484963e+01,  3.08750040e+01,  3.24914289e+01,
         0.0000000

In [42]:
def get_X_y(df, column, length_of_observations, horizon=1):
    X, y = [], []
    for length in length_of_observations:
        xi, yi = subsample_sequence(df, column, length, horizon)
        X.append(xi)
        y.append(yi)
    return np.array(X), np.array(y

In [43]:
df_train, df_test = create_train_test(hist_aapl)

In [44]:
length_of_observations = np.random.randint(15, 30, 100)
X_train, y_train = get_X_y(df_train, 'Close', length_of_observations)

X_test, y_test = get_X_y(df_test, 'Close', length_of_observations)

In [45]:
X_test.shape

AttributeError: 'list' object has no attribute 'shape'