# Exploratory Data Analysis

In [20]:
import pandas_datareader as pdr
import numpy as np

## Import data

In [2]:
# Pin the download window so that re-running the notebook requests the same
# period that the outputs below were produced from. Without explicit dates
# the window is relative to the day the cell is executed.
# NOTE(review): the provider may still revise historical values (e.g. split
# adjustments), so exact numbers are not guaranteed to match.
START_DATE = '2017-08-07'
END_DATE = '2022-08-05'

df = pdr.get_data_yahoo('GOOG', start=START_DATE, end=END_DATE)
df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-08-07,46.584999,46.325001,46.452999,46.467999,20644000.0,46.467999
2017-08-08,46.790699,46.280499,46.3545,46.3395,21232000.0,46.3395
2017-08-09,46.299,45.862499,46.030499,46.145,23842000.0,46.145
2017-08-10,45.963001,45.306499,45.877499,45.362,36480000.0,45.362
2017-08-11,45.889,45.278999,45.398499,45.719501,24136000.0,45.719501


In [3]:
# Show the five most recent rows of the downloaded frame.
df.tail(n=5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-08-01,117.120003,114.690002,115.529999,115.480003,22856200.0,115.480003
2022-08-02,117.080002,114.260002,114.43,115.900002,17911000.0,115.900002
2022-08-03,119.419998,116.150002,116.339996,118.779999,25302800.0,118.779999
2022-08-04,119.5,117.709999,118.300003,118.870003,15740700.0,118.870003
2022-08-05,118.860001,116.709999,116.93,117.940002,4130463.0,117.940002


In [4]:
# Summary statistics per column, with the quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close
count,1259.0,1259.0,1259.0,1259.0,1259.0,1259.0
mean,82.60496,80.878393,81.716135,81.762361,31080710.0,81.762361
std,32.655928,31.961382,32.327053,32.299736,13549740.0,32.299736
min,45.650002,45.169998,45.255001,45.333,4130463.0,45.333
25%,56.58765,55.4175,56.033751,56.001999,22643000.0,56.001999
50%,67.498749,65.875,66.611,66.806999,27860000.0,66.806999
75%,113.625999,110.484001,111.8255,112.110497,35388000.0,112.110497
max,152.100006,149.887497,151.863495,150.709,124140000.0,150.709


The date is already set as the index of this dataset, which is convenient. The data is sorted chronologically by day, starting in 2017 and ending in 2022. Several columns are very similar to one another (in the output above, Close and Adj Close are identical), so I will use only the Close column in order to work with a single, non-redundant feature.

In [5]:
# Keep every row but only the Close column (still a one-column DataFrame).
df = df.loc[:, ['Close']]
df.head(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2017-08-07,46.467999
2017-08-08,46.3395
2017-08-09,46.145
2017-08-10,45.362
2017-08-11,45.719501


Split the data chronologically into training, validation, and test sets.
This is important to avoid look-ahead leakage: the model must never be trained on data from a later period than the data it is evaluated on.

In [6]:
# Fractions of the rows held out for evaluation; whatever is left is used
# for training.
VALIDATION_SIZE = 0.05
TEST_SIZE = 0.05
TRAIN_SIZE = 1 - TEST_SIZE - VALIDATION_SIZE

split_summary = f'Train size: {TRAIN_SIZE:.0%}\nValidation size: {VALIDATION_SIZE:.0%}\nTest size: {TEST_SIZE:.0%}'
print(split_summary)

Train size: 90%
Validation size: 5%
Test size: 5%


In [7]:
# Row positions at which the training and validation windows end. Both
# counts are truncated to whole rows, so any leftover rows end up in the
# test window, which simply takes everything that remains.
n_rows = len(df)
train_ending_index = int(n_rows * TRAIN_SIZE)
validation_ending_index = train_ending_index + int(n_rows * VALIDATION_SIZE)

# Positional slices over the rows, in their existing order.
train_df = df.iloc[:train_ending_index]
validation_df = df.iloc[train_ending_index:validation_ending_index]
test_df = df.iloc[validation_ending_index:]

print(f'Train df shape: {train_df.shape}')
print(f'Validation df shape: {validation_df.shape}')
print(f'Test df shape: {test_df.shape}')

Train df shape: (1133, 1)
Validation df shape: (62, 1)
Test df shape: (64, 1)


In [8]:
# First rows of the training window.
train_df.head(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2017-08-07,46.467999
2017-08-08,46.3395
2017-08-09,46.145
2017-08-10,45.362
2017-08-11,45.719501


In [9]:
# Last rows of the training window.
train_df.tail(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-01-28,133.289505
2022-01-31,135.698502
2022-02-01,137.878494
2022-02-02,148.036499
2022-02-03,142.650497


In [10]:
# First rows of the validation window.
validation_df.head(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-02-04,143.016006
2022-02-07,138.938004
2022-02-08,139.212997
2022-02-09,141.453003
2022-02-10,138.602493


In [11]:
# Last rows of the validation window.
validation_df.tail(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-04-28,119.411499
2022-04-29,114.966499
2022-05-02,117.156998
2022-05-03,118.129501
2022-05-04,122.574997


In [12]:
# First rows of the test window.
test_df.head(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-05-05,116.746498
2022-05-06,115.660004
2022-05-09,113.084
2022-05-10,114.584503
2022-05-11,113.960999


In [13]:
# Last rows of the test window.
test_df.tail(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-08-01,115.480003
2022-08-02,115.900002
2022-08-03,118.779999
2022-08-04,118.870003
2022-08-05,117.940002


Note how the data is split chronologically, so there is no time overlap between the training, validation, and test sets.

## Feature normalization
Apply min-max scaling using the minimum and maximum of the training data only, so that no information from the validation or test sets leaks into the scaling.

In [15]:
# The scaling statistics are taken from the training split only.
train_close = train_df['Close']
train_min = train_close.min()
train_max = train_close.max()
print(f'Train max: {train_max}, train min: {train_min}')

Train max: 150.70899963378906, train min: 45.33300018310547


In [16]:
def _scale_with_train_range(prices):
    """Min-max scale ``prices`` using the training minimum and maximum."""
    return (prices - train_min) / (train_max - train_min)


# Every split is scaled with the same training-derived range, so values in
# the validation and test splits are not forced to span exactly [0, 1].
train_df_scaled = _scale_with_train_range(train_df['Close'])
validation_df_scaled = _scale_with_train_range(validation_df['Close'])
test_df_scaled = _scale_with_train_range(test_df['Close'])

print(f'Train df scaled max: {train_df_scaled.max():.2f}, min: {train_df_scaled.min():.2f}')
print(f'Validation df scaled max: {validation_df_scaled.max():.2f}, min: {validation_df_scaled.min():.2f}')
print(f'Test df scaled max: {test_df_scaled.max():.2f}, min: {test_df_scaled.min():.2f}')

Train df scaled max: 1.00, min: 0.00
Validation df scaled max: 0.93, min: 0.66
Test df scaled max: 0.71, min: 0.57


## Create time series data set
The data needs to be reshaped into fixed-length windows of past values, each paired with the value that follows it, so that it is suitable for time series modeling.

In [18]:
def create_dataset(dataset, look_back=1):
    """Turn a sequence of values into supervised (window, next value) samples.

    Sample ``i`` pairs the ``look_back`` consecutive values starting at
    position ``i`` with the value that immediately follows them.

    Args:
        dataset: 1-D sequence of numbers (pandas Series, list or NumPy
            array). Values are read strictly by position; index labels such
            as dates are ignored.
        look_back: Number of past values in each input window (must be >= 1).

    Returns:
        Tuple ``(X, Y)`` of float32 arrays. For 1-D input ``X`` has shape
        ``(n_samples, look_back, 1)`` and ``Y`` has shape ``(n_samples,)``,
        where ``n_samples = max(len(dataset) - look_back, 0)``. Input that is
        too short therefore yields empty arrays instead of an error.

    Raises:
        ValueError: If ``look_back`` is smaller than 1.
    """
    if look_back < 1:
        raise ValueError(f'look_back must be >= 1, got {look_back}')

    # Work on a plain array so every lookup is positional; indexing a
    # date-indexed Series with an integer key would otherwise be treated as
    # a label lookup.
    values = np.asarray(dataset, dtype=np.float32)
    n_samples = max(len(values) - look_back, 0)

    # Row i holds the positions i, i + 1, ..., i + look_back - 1.
    window_index = np.arange(n_samples)[:, None] + np.arange(look_back)[None, :]
    X = values[window_index].reshape(n_samples, look_back, 1)

    # The target of window i sits at position i + look_back. Copy so the
    # result never aliases the caller's array.
    Y = values[look_back:].copy()
    return X, Y

In [25]:
# Number of past values in each input window.
LOOKBACK = 20

# Each split is windowed on its own, so no window mixes rows from two splits.
x_train, y_train = create_dataset(train_df_scaled, look_back=LOOKBACK)
x_validation, y_validation = create_dataset(validation_df_scaled, look_back=LOOKBACK)
x_test, y_test = create_dataset(test_df_scaled, look_back=LOOKBACK)

for split_array in (x_train, y_train, x_validation, y_validation, x_test, y_test):
    print(split_array.shape)

(1113, 20, 1)
(1113,)
(42, 20, 1)
(42,)
(44, 20, 1)
(44,)


## Build model
The plan is to use an LSTM neural network. I chose it because LSTMs have been shown to be quite good at modeling order-dependent interactions (e.g. time-series data).

The limitation of this model choice is that LSTMs typically need a lot of data to perform well, and we do not have much data available here. The model should perform better if we can acquire a larger volume of data.

The benefit of this model choice is that we don't have to spend as much time on feature engineering as we would with other types of models. We simply need to normalize the data and transform it into a properly shaped array, and can then begin training and hyperparameter tuning until we achieve the desired performance.

Ideas for tuning:
1. Number of hidden layers
2. Sizes of hidden layers
3. Magnitude of dropout
4. Optimizer / learning rate
5. Other types of RNN (e.g. GRU, bidirectional LSTM, etc.)
6. Amount of lookback
7. Training vs. validation vs. testing data set sizes

This model will be trained to minimize RMSE, as it is a regression model. Because the prices are min-max normalized, the RMSE is expressed in normalized price units rather than in dollars, which makes it easier to compare across data from different companies.