### Data preprocessing

#### Data preprocessing for a single ticker in the NASDAQ-100 dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Read Data
#The pickle holds one wide frame whose columns are a two-level index
#(ticker on the first level, price/volume field on the second), indexed by Date.
#NOTE: unpickling can execute code embedded in the file, so only load pickles from a trusted source.
df = pd.read_pickle("./data/data_pkls/nasdaq100.pkl")
df.head()

Unnamed: 0_level_0,SPLK,SPLK,SPLK,SPLK,SPLK,SPLK,FTNT,FTNT,FTNT,FTNT,...,CSGP,CSGP,CSGP,CSGP,MU,MU,MU,MU,MU,MU
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1962-01-02,,,,,,,,,,,...,,,,,,,,,,
1962-01-03,,,,,,,,,,,...,,,,,,,,,,
1962-01-04,,,,,,,,,,,...,,,,,,,,,,
1962-01-05,,,,,,,,,,,...,,,,,,,,,,
1962-01-08,,,,,,,,,,,...,,,,,,,,,,


In [3]:
#Define the ticker that we want to use
ticker = 'SPLK'
#Filter dataframe by this specific ticker.
#Take an explicit copy so that any later in-place edit acts on an independent frame
#rather than on a slice of the full multi-ticker frame (avoids SettingWithCopyWarning).
df = df[ticker].copy()
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,,,,,,
1962-01-03,,,,,,
1962-01-04,,,,,,
1962-01-05,,,,,,
1962-01-08,,,,,,


In [4]:
#Drop missing values since there is no good way to estimate stock values that are not tracked.
#Reassign instead of using inplace=True: df was selected out of the multi-ticker frame,
#and mutating such a selection in place is what triggers pandas' chained-assignment warnings.
df = df.dropna()
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-04-19,32.0,35.849998,30.91,35.48,35.48,14124400.0
2012-04-20,35.25,36.639999,33.5,36.200001,36.200001,1796600.0
2012-04-23,36.779999,37.240002,35.759998,35.93,35.93,930700.0
2012-04-24,36.040001,37.34,34.650002,35.799999,35.799999,506200.0
2012-04-25,36.779999,37.150002,34.57,34.720001,34.720001,520500.0


In [5]:
#Drop the Close column: the Adj Close column is calculated from it,
#so keeping it as a feature would allow the model to cheat.
#Pass the labels as a list (not a set literal) and reassign rather than mutating in place.
df = df.drop(columns=['Close'])
df.head()

Unnamed: 0_level_0,Open,High,Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-04-19,32.0,35.849998,30.91,35.48,14124400.0
2012-04-20,35.25,36.639999,33.5,36.200001,1796600.0
2012-04-23,36.779999,37.240002,35.759998,35.93,930700.0
2012-04-24,36.040001,37.34,34.650002,35.799999,506200.0
2012-04-25,36.779999,37.150002,34.57,34.720001,520500.0


In [6]:
#Reorder the columns so that Adj Close ends up as the last column
df = df[['Open', 'High', 'Low', 'Volume', 'Adj Close']]
df.head()

Unnamed: 0_level_0,Open,High,Low,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-04-19,32.0,35.849998,30.91,14124400.0,35.48
2012-04-20,35.25,36.639999,33.5,1796600.0,36.200001
2012-04-23,36.779999,37.240002,35.759998,930700.0,35.93
2012-04-24,36.040001,37.34,34.650002,506200.0,35.799999
2012-04-25,36.779999,37.150002,34.57,520500.0,34.720001


In [10]:
#Now let's scale our data
#First let's split data into X (feature columns) and y (target column)
X = df[['Open', 'High', 'Low', 'Volume']]
y = df['Adj Close']

#MinMaxScaler is already imported in the imports cell at the top of the notebook,
#so it is not imported again here.
#NOTE(review): the scaler is fit on every row of X; if a train/test split is made later,
#fit it on the training rows only so test-set min/max do not leak into the features.
scaler = MinMaxScaler() #create scaler object
scaled_data = scaler.fit_transform(X) #learn per-column min/max and rescale X (returns a NumPy array)

#check scaled data
print(scaled_data.shape)

scaled_data


(2956, 4)


array([[0.02863182, 0.04593603, 0.02972597, 0.46230154],
       [0.04498566, 0.0499021 , 0.04309233, 0.05479656],
       [0.05268455, 0.05291431, 0.05475563, 0.02617356],
       ...,
       [0.63498216, 0.63291332, 0.65691285, 0.05839303],
       [0.63473055, 0.63286307, 0.65696443, 0.04633097],
       [0.63543502, 0.63389225, 0.65753212, 0.03586122]])

In [12]:
#Show the target values as a plain NumPy array
y.to_numpy()

array([ 35.47999954,  36.20000076,  35.93000031, ..., 152.63000488,
       152.69000244, 152.78999329])

In [None]:
#Finally let's create a function that encapsulates everything
def single_preprocessing(ticker='SPLK', drop_col='Close',
                         data_path="./data/data_pkls/nasdaq100.pkl"):
    """Load one ticker's price history and return scaled features plus the target.

    The pickle at ``data_path`` is read, the columns of ``ticker`` are selected,
    rows with missing values are dropped, ``drop_col`` is removed, and the
    remaining price column is used as the target.

    Parameters
    ----------
    ticker : str
        First-level column label of the ticker to select.
    drop_col : str
        Price column to discard. With ``'Close'`` the target is ``'Adj Close'``;
        with any other value the target is ``'Close'``.
    data_path : str
        Location of the pickled multi-ticker frame. Defaults to the path the
        notebook has always used.

    Returns
    -------
    scaled_data : numpy.ndarray
        The ``Open``, ``High``, ``Low`` and ``Volume`` columns after
        ``MinMaxScaler().fit_transform``.
    y : pandas.Series
        The target column, unscaled, indexed like the cleaned rows.

    Raises
    ------
    KeyError
        If ``ticker`` is not in the data, or if ``drop_col`` is absent or is one
        of the feature columns.
    """
    # Only what the function uses is imported locally; this keeps it runnable
    # when copied out of the notebook.
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler

    feature_cols = ['Open', 'High', 'Low', 'Volume']

    # NOTE: unpickling can execute code embedded in the file - only point
    # data_path at pickles from a trusted source.
    raw = pd.read_pickle(data_path) #read data

    # Filter by ticker and drop rows with missing values, since there is no good
    # way to estimate stock values that are not tracked. dropna() returns a new
    # frame, so the slice of the multi-ticker frame is never mutated in place.
    prices = raw[ticker].dropna()

    # Drop one of the two closing-price columns: Adj Close is calculated from
    # Close, so keeping both would let the model cheat.
    prices = prices.drop(columns=[drop_col])

    # The target is whichever closing-price column was kept.
    target_col = 'Adj Close' if drop_col == 'Close' else 'Close'

    X = prices[feature_cols]
    # Use the kept column here; hard-coding 'Adj Close' raised KeyError whenever
    # drop_col was 'Adj Close'.
    y = prices[target_col]

    scaled_data = MinMaxScaler().fit_transform(X) #fit transform data
    return scaled_data, y

    
    

    



