In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [52]:
# Load the price history from the CSV sitting next to the notebook and preview
# the first rows (Date, Close, High, Low, Open, Volume).
df_lstm = pd.read_csv('apple_data.csv')
df_lstm.head(n=5)

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2015-01-02,24.28858,24.757334,23.848706,24.746226,212818400
1,2015-01-05,23.604326,24.137507,23.417714,24.05753,257142000
2,2015-01-06,23.606552,23.866477,23.244433,23.668756,263188400
3,2015-01-07,23.937576,24.037547,23.704309,23.815388,160423600
4,2015-01-08,24.857306,24.915067,24.14862,24.266365,237458000


In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df_lstm.shape

(2515, 6)

In [54]:
# Use the trading date as a DatetimeIndex, then reindex to business-day ('B')
# frequency. asfreq inserts all-NaN rows for business days that are absent
# from the file.
#
# The guard keeps this cell re-runnable: once 'Date' has become the index it is
# no longer a column, so the previous in-place version raised KeyError on a
# second execution. Re-applying asfreq('B') to an index that is already on
# business-day frequency leaves the frame unchanged.
if 'Date' in df_lstm.columns:
    df_lstm = (
        df_lstm
        .assign(Date=pd.to_datetime(df_lstm['Date']))
        .set_index('Date')
    )
df_lstm = df_lstm.asfreq('B')

Handling Missing Values

In [55]:
# Fill the gaps in Close by straight-line interpolation between the
# neighbouring observed values. Only Close is filled here; High, Low, Open and
# Volume are left as they are.
# NOTE(review): a linearly interpolated value depends on the *next* observed
# Close, so filled rows carry future information into any feature or target
# built from them -- confirm this is acceptable for forecasting (forward fill
# would avoid it).
df_lstm = df_lstm.assign(
    Close=lambda frame: frame['Close'].interpolate(method='linear')
)


Lag Features

In [None]:
# Add lag_1, lag_2 and lag_3: the Close value from 1, 2 and 3 rows earlier.
# The first k rows of lag_k are NaN because there is nothing to shift in.
df_lstm = df_lstm.assign(
    **{f'lag_{k}': df_lstm['Close'].shift(periods=k) for k in (1, 2, 3)}
)


In [57]:
# Preview the first rows to check the lag columns line up with earlier Close values.
df_lstm.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,lag_1,lag_2,lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-02,24.28858,24.757334,23.848706,24.746226,212818400.0,,,
2015-01-05,23.604326,24.137507,23.417714,24.05753,257142000.0,24.28858,,
2015-01-06,23.606552,23.866477,23.244433,23.668756,263188400.0,23.604326,24.28858,
2015-01-07,23.937576,24.037547,23.704309,23.815388,160423600.0,23.606552,23.604326,24.28858
2015-01-08,24.857306,24.915067,24.14862,24.266365,237458000.0,23.937576,23.606552,23.604326


Rolling Statistics

In [58]:
# 7-row rolling mean and standard deviation of Close. Each window ends at (and
# includes) the current row, so the first 6 rows are NaN.
df_lstm = df_lstm.assign(
    rolling_mean_7=lambda frame: frame['Close'].rolling(window=7).mean(),
    rolling_std_7=lambda frame: frame['Close'].rolling(window=7).std(),
)


In [59]:
# Preview the first rows; the rolling columns are still NaN inside the warm-up window.
df_lstm.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,lag_1,lag_2,lag_3,rolling_mean_7,rolling_std_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-02,24.28858,24.757334,23.848706,24.746226,212818400.0,,,,,
2015-01-05,23.604326,24.137507,23.417714,24.05753,257142000.0,24.28858,,,,
2015-01-06,23.606552,23.866477,23.244433,23.668756,263188400.0,23.604326,24.28858,,,
2015-01-07,23.937576,24.037547,23.704309,23.815388,160423600.0,23.606552,23.604326,24.28858,,
2015-01-08,24.857306,24.915067,24.14862,24.266365,237458000.0,23.937576,23.606552,23.604326,,


In [60]:
# Count missing values per column (isna is the same check as isnull).
df_lstm.isna().sum()

Close              0
High              92
Low               92
Open              92
Volume            92
lag_1              1
lag_2              2
lag_3              3
rolling_mean_7     6
rolling_std_7      6
dtype: int64

Target Variable

In [61]:
# The prediction target is the Close of the following row; shifting by -1
# leaves NaN in the final row, which has no successor.
df_lstm = df_lstm.assign(target=df_lstm['Close'].shift(periods=-1))

In [62]:
# Preview the first rows; each target should equal the Close of the row below it.
df_lstm.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,lag_1,lag_2,lag_3,rolling_mean_7,rolling_std_7,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-02,24.28858,24.757334,23.848706,24.746226,212818400.0,,,,,,23.604326
2015-01-05,23.604326,24.137507,23.417714,24.05753,257142000.0,24.28858,,,,,23.606552
2015-01-06,23.606552,23.866477,23.244433,23.668756,263188400.0,23.604326,24.28858,,,,23.937576
2015-01-07,23.937576,24.037547,23.704309,23.815388,160423600.0,23.606552,23.604326,24.28858,,,24.857306
2015-01-08,24.857306,24.915067,24.14862,24.266365,237458000.0,23.937576,23.606552,23.604326,,,24.883959


Dropping NaN Rows

In [63]:
# Keep only the model columns and drop every row with a NaN in any of them:
# the warm-up rows of the lag/rolling features and the last row, whose target
# is NaN. High/Low/Open/Volume are excluded, so their gaps do not drop rows.
#
# .copy() makes df_lstm_final an independent frame. Without it pandas may treat
# the result as a slice of df_lstm and emit SettingWithCopyWarning when columns
# of df_lstm_final are assigned to later on.
df_lstm_final = (
    df_lstm[['Close', 'lag_1', 'lag_2', 'lag_3', 'rolling_mean_7', 'rolling_std_7', 'target']]
    .dropna()
    .copy()
)

In [64]:
# Sanity check: every column should report zero missing values after dropna.
df_lstm_final.isna().sum()

Close             0
lag_1             0
lag_2             0
lag_3             0
rolling_mean_7    0
rolling_std_7     0
target            0
dtype: int64

In [68]:
# NOTE(review): this cell's execution count (68) is higher than that of the
# "Feature Scaling" cell further down (66), and the saved output already shows
# the feature columns spanning 0-1. It was therefore run after scaling; on a
# fresh Restart & Run All it executes before scaling and will show unscaled
# statistics instead.
df_lstm_final.describe()

Unnamed: 0,Close,lag_1,lag_2,lag_3,rolling_mean_7,rolling_std_7,target
count,2600.0,2600.0,2600.0,2600.0,2600.0,2600.0,2600.0
mean,0.309103,0.308731,0.308884,0.309037,0.310861,0.163758,94.223827
std,0.275441,0.27518,0.275379,0.27558,0.278722,0.155284,65.544296
min,0.0,0.0,0.0,0.0,0.0,0.0,20.647446
25%,0.062668,0.062589,0.062632,0.062591,0.062083,0.041983,35.571258
50%,0.185869,0.185394,0.185413,0.185506,0.183924,0.111126,64.932724
75%,0.546588,0.546386,0.547265,0.54706,0.551083,0.242644,150.630825
max,1.0,1.0,1.0,1.0,1.0,1.0,258.396667


In [65]:
# (rows, columns) remaining after the NaN rows were dropped.
df_lstm_final.shape

(2600, 7)

Feature Scaling

In [66]:
# Scale the model inputs to the [0, 1] range. 'target' is not listed in
# `features`, so it is left in its original units.
scaler = MinMaxScaler()
features = ['Close', 'lag_1', 'lag_2', 'lag_3', 'rolling_mean_7', 'rolling_std_7']

# Always fit on the unscaled values held in df_lstm (restricted to the rows
# kept in df_lstm_final) instead of on df_lstm_final itself. This cell
# overwrites df_lstm_final[features], so fitting on its own output made a
# second run refit the scaler on already-scaled data: data_min_/data_max_
# became 0/1 and scaler.inverse_transform could no longer recover prices.
df_lstm_final[features] = scaler.fit_transform(
    df_lstm.loc[df_lstm_final.index, features]
)

# NOTE(review): the scaler is fitted on every row. If the data is split into
# train/test later, fit on the training rows only so the test-period min/max
# do not leak into the features -- confirm against the split used downstream.


In [None]:
# Summary statistics after scaling: each feature column should have min 0 and
# max 1, while target keeps its original scale.
df_lstm_final.describe()

Unnamed: 0,Close,lag_1,lag_2,lag_3,rolling_mean_7,rolling_std_7,target
count,2600.0,2600.0,2600.0,2600.0,2600.0,2600.0,2600.0
mean,0.309103,0.308731,0.308884,0.309037,0.310861,0.163758,94.223827
std,0.275441,0.27518,0.275379,0.27558,0.278722,0.155284,65.544296
min,0.0,0.0,0.0,0.0,0.0,0.0,20.647446
25%,0.062668,0.062589,0.062632,0.062591,0.062083,0.041983,35.571258
50%,0.185869,0.185394,0.185413,0.185506,0.183924,0.111126,64.932724
75%,0.546588,0.546386,0.547265,0.54706,0.551083,0.242644,150.630825
max,1.0,1.0,1.0,1.0,1.0,1.0,258.396667
