### Data preprocessing

#### Data preprocessing for a single ticker in nasdaq100 dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Read Data
#The pickle holds one table for the whole index: rows are dates and the columns are a
#two-level MultiIndex of (ticker, price field), as the preview below shows.
#NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
df = pd.read_pickle("../data/data_pkls/nasdaq100.pkl")
df.head()

Unnamed: 0_level_0,SPLK,SPLK,SPLK,SPLK,SPLK,SPLK,FTNT,FTNT,FTNT,FTNT,...,CSGP,CSGP,CSGP,CSGP,MU,MU,MU,MU,MU,MU
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1962-01-02,,,,,,,,,,,...,,,,,,,,,,
1962-01-03,,,,,,,,,,,...,,,,,,,,,,
1962-01-04,,,,,,,,,,,...,,,,,,,,,,
1962-01-05,,,,,,,,,,,...,,,,,,,,,,
1962-01-08,,,,,,,,,,,...,,,,,,,,,,


In [4]:
#List the distinct tickers, i.e. the unique labels in the top level of the column MultiIndex
df.columns.get_level_values(0).unique()

Index(['SPLK', 'FTNT', 'TMUS', 'CDW', 'NFLX', 'DXCM', 'KHC', 'ILMN', 'IDXX',
       'TEAM',
       ...
       'REGN', 'ADBE', 'SBUX', 'BIIB', 'MAR', 'BKNG', 'KLAC', 'CMCSA', 'CSGP',
       'MU'],
      dtype='object', length=101)

In [5]:
#Define the ticker that we want to use
ticker = 'FTNT'
#Filter dataframe by this specific ticker.
#Take an explicit copy so the following cells work on an independent frame instead of a
#slice of the full table (modifying such a slice can raise pandas' SettingWithCopyWarning).
#NOTE: this rebinds `df` to the single-ticker frame, so re-running this cell requires
#re-running the load cell first.
df = df[ticker].copy()
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,,,,,,
1962-01-03,,,,,,
1962-01-04,,,,,,
1962-01-05,,,,,,
1962-01-08,,,,,,


In [6]:
#Drop missing values since there is no good way to estimate stock values that are not tracked.
#Reassign the result instead of using inplace=True: `df` was sliced out of the full table,
#and modifying such a slice in place can raise pandas' SettingWithCopyWarning.
df = df.dropna()
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-11-18,1.7,1.75,1.633,1.662,1.662,136704000.0
2009-11-19,1.657,1.712,1.606,1.689,1.689,22173000.0
2009-11-20,1.688,1.696,1.606,1.694,1.694,11390000.0
2009-11-23,1.706,1.717,1.66,1.69,1.69,11858000.0
2009-11-24,1.68,1.705,1.673,1.701,1.701,4677000.0


In [7]:
#Drop the Close column: the Adj Close column is calculated from it,
#so keeping it would allow the model to cheat
df = df.drop(columns=['Close'])
df.head()

Unnamed: 0_level_0,Open,High,Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-11-18,1.7,1.75,1.633,1.662,136704000.0
2009-11-19,1.657,1.712,1.606,1.689,22173000.0
2009-11-20,1.688,1.696,1.606,1.694,11390000.0
2009-11-23,1.706,1.717,1.66,1.69,11858000.0
2009-11-24,1.68,1.705,1.673,1.701,4677000.0


In [8]:
#Reorder the columns so the four input features come first and the target (Adj Close) is last
df = df[['Open', 'High', 'Low', 'Volume', 'Adj Close']]
df.head()

Unnamed: 0_level_0,Open,High,Low,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-11-18,1.7,1.75,1.633,136704000.0,1.662
2009-11-19,1.657,1.712,1.606,22173000.0,1.689
2009-11-20,1.688,1.696,1.606,11390000.0,1.694
2009-11-23,1.706,1.717,1.66,11858000.0,1.69
2009-11-24,1.68,1.705,1.673,4677000.0,1.701


In [9]:
#Now let's scale our data
#First let's split data into X (input features) and y (target)
X = df.loc[:, ['Open', 'High', 'Low', 'Volume']]
y = df['Adj Close']

#MinMaxScaler is already imported in the imports cell at the top of the notebook.
#NOTE: the scaler is fit on the full history here; if the data is later split into
#train/test sets, fit it on the training part only so no future information leaks in.
scaler = MinMaxScaler() #create scaler object
scaled_data = scaler.fit_transform(X) #rescale every feature column to the [0, 1] range

#check scaled data
print(scaled_data.shape)
print(y.shape)

scaled_data


(3564, 4)
(3564,)


array([[0.00250798, 0.00287259, 0.00207239, 0.81847793],
       [0.00196332, 0.00239592, 0.00172485, 0.12890601],
       [0.00235598, 0.00219521, 0.00172485, 0.06398338],
       ...,
       [0.75477533, 0.76354446, 0.76637313, 0.03501234],
       [0.75616866, 0.75388553, 0.74989698, 0.02796496],
       [0.75363533, 0.75224225, 0.75710532, 0.02084694]])

In [9]:
#Peek at the raw (unscaled) target values as a NumPy array
y.to_numpy()

array([ 1.66199994,  1.68900001,  1.69400001, ..., 62.        ,
       60.61999893, 61.40999985])

In [10]:
#Now we have to organize the data so the LSTM can take it as input: (samples, timesteps, features)
#We have 3564 observations and 4 features, and we use blocks of `delay` = 15 days of information
#to predict the adjusted closing price of the stock on the following day

samples = []
delay = 15  # number of rows (days) in each window
# step over all observations in jumps of `delay`
for i in range(0, (y.shape[0]), delay):
	# grab rows i .. i + delay - 1 (the final window may be shorter)
	sample = scaled_data[i:i+delay]
	samples.append(sample)
print(len(samples))
print(len(samples[-1]))


238
9


In [11]:
#The same windowing written as a list comprehension, which is more concise than the loop above
samples_2 = [ scaled_data[i:i+delay] for i in range(0, (y.shape[0]), delay)]

In [12]:
#The last window is either shorter than `delay` or has no following day to use as a target,
#so we discard it and keep the other windows.
#Slicing to a computed length (instead of samples[:-1]) keeps this cell idempotent:
#re-running it does not drop one more window each time.
n_windows = (y.shape[0] - 1) // delay
samples = samples[:n_windows]
print(len(samples))
print(len(samples[-1]))

237
15


In [13]:
#Stack the list of equally sized windows into one array of shape
#(samples, timesteps (delay), features)
samples = np.array(samples)
samples.shape
#Now the feature data is in the right shape; finally we also need to build the matching targets

(237, 15, 4)

In [14]:
#Each block of `delay` days predicts the following day, so the target of the window that
#starts at row i is the value at row i + delay, i.e. rows delay, 2*delay, 3*delay, ...
new_y = np.array(y.iloc[delay::delay])
print(new_y.shape)


(237,)


In [15]:
#Sanity check: the last target value, i.e. the one paired with the last kept window
new_y[-1]

58.34000015258789

In [16]:
#Finally let's create a function that encapsulates everything
def single_preprocessing(ticker='SPLK', drop_col='Close', delay=15):
    """Load one ticker from the NASDAQ-100 pickle and cut it into LSTM windows.

    Rows with missing values are dropped, ``drop_col`` is removed, the four
    input features (Open, High, Low, Volume) are min-max scaled and split into
    consecutive, non-overlapping windows of ``delay`` rows. The target of each
    window is the price on the row right after that window.

    Parameters
    ----------
    ticker : str
        Top-level column label of the ticker to extract.
    drop_col : str
        Price column to discard: ``'Close'`` (the target is then 'Adj Close')
        or ``'Adj Close'`` (the target is then 'Close').
    delay : int
        Number of rows per window; must be at least 1.

    Returns
    -------
    samples : list of numpy.ndarray
        Scaled feature windows with up to ``delay`` rows and 4 columns each.
        Whenever the ticker has data, this list holds one more window than
        ``new_y`` has targets (the final window is short or has no following
        row), so pair them with ``samples[:len(new_y)]``.
    new_y : numpy.ndarray
        Unscaled target values taken at row positions ``delay, 2*delay, ...``.
    ticker : str
        The ticker that was processed, echoed back.

    Raises
    ------
    ValueError
        If ``delay`` is smaller than 1.
    KeyError
        If ``ticker`` or one of the required columns is not present.
    """
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler

    if delay < 1:
        raise ValueError("delay must be a positive integer")

    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle("../data/data_pkls/nasdaq100.pkl") #read data

    # Filter by ticker, drop untracked (missing) rows, then remove the price column
    # that would let the model cheat. Chained, non-inplace calls avoid modifying a
    # slice of the full table in place.
    df = df[ticker].dropna().drop(columns=[drop_col])

    # The price column that remains is the prediction target.
    organize_col = 'Adj Close' if drop_col == 'Close' else 'Close'
    feature_cols = ['Open', 'High', 'Low', 'Volume']

    #Now just organize the columns: features first, target last
    df = df.loc[:, feature_cols + [organize_col]]

    #Split data into X and y; y must follow organize_col, not a fixed column name,
    #otherwise drop_col='Adj Close' fails on the column that was just dropped
    X = df.loc[:, feature_cols]
    y = df[organize_col]

    # NOTE(review): the scaler is fit on the full history, so its min/max include
    # future rows; refit on the training part only if that matters downstream.
    scaler = MinMaxScaler() #create scaler object
    scaled_data = scaler.fit_transform(X) #fit transform data

    n_obs = y.shape[0]
    #Split the features into windows of `delay` rows
    samples = [scaled_data[i:i + delay] for i in range(0, n_obs, delay)]
    #Targets: the row right after each window. Use positional .iloc because y is
    #indexed by date, so y[int] would be treated as a label lookup.
    new_y = np.array([y.iloc[i + delay] for i in range(0, n_obs - delay, delay)])
    return samples, new_y, ticker

    
    

    



