# Pipelines

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.pipeline import Pipeline 

In [4]:
from sklearn.model_selection import train_test_split 

In [5]:
from sklearn.preprocessing import FunctionTransformer 

In [6]:
import transformers

In [7]:
import types

In [8]:
import category_encoders as ce

In [9]:
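# One-off setup: if `import category_encoders` above fails, uncomment the next line,
# run it, then restart the kernel and re-run the notebook from the top.
# %pip install category_encoders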

In [10]:
# Let pandas print up to 1000 rows before it falls back to a truncated view.
pd.options.display.max_rows = 1000

In [11]:
# Let pandas print up to 1000 columns so wide feature frames are not elided.
pd.options.display.max_columns = 1000

In [12]:
# Load the Google stock price training CSV; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/Google_Stock_Price_Train.csv')

In [13]:
# Preview the freshly loaded frame. Its index runs past the display.max_rows
# limit configured above, so pandas renders a truncated head/tail view.
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1/3/2012,325.25,332.83,324.97,663.59,7380500
1,1/4/2012,331.27,333.87,329.08,666.45,5749400
2,1/5/2012,329.83,330.75,326.89,657.21,6590300
3,1/6/2012,328.34,328.77,323.68,648.24,5405900
4,1/9/2012,322.04,322.29,309.46,620.76,11688800
...,...,...,...,...,...,...
1253,12/23/2016,790.90,792.74,787.28,789.91,623400
1254,12/27/2016,790.68,797.86,787.66,791.55,789100
1255,12/28/2016,793.70,794.23,783.20,785.05,1153800
1256,12/29/2016,783.33,785.93,778.92,782.79,744300


In [14]:
# Assemble the feature-engineering pipeline. The steps come from the local
# `transformers` module (not shown in this notebook); presumably a list of
# (name, transformer) pairs, as sklearn's Pipeline requires -- verify there.
stock_pipeline = Pipeline(steps=transformers.get_transformer_func_list())

In [15]:
# Run the feature-engineering pipeline over the loaded prices.
# fit_transform is used instead of a bare transform: scikit-learn expects a
# Pipeline to be fitted before transform is called, and recent releases warn
# about (and plan to reject) transform on an unfitted Pipeline. For stateless
# steps the resulting frame is the same.
# NOTE(review): this assumes every step implements fit -- confirm in the
# local `transformers` module.
# NOTE: this rebinds `df`, so the cell is not idempotent -- re-run the
# read_csv cell before executing it again.
df = stock_pipeline.fit_transform(df)

In [16]:
# NOTE(review): disabled experiment, nothing in this cell executes.
# It was meant to add '<column>_in_<N>_days' look-ahead columns. Columns with that
# naming pattern (e.g. Close_in_1_days) show up in the frame summaries further down,
# so this is presumably superseded by the imported pipeline -- confirm, then delete.
# Problems to fix if it is ever revived:
#   * it writes to the global `df` rather than the `dataframe` argument;
#   * `future_cell_index - future_distance` is negative for the first iterations,
#     so `.loc` would create new rows labelled -1, -2, ... instead of filling existing ones;
#   * the leftover print() floods the output with one line per row.
# def set_future_day_value(dataframe, column: str, future_distance: int):
#     for future_cell_index in range(0, len(dataframe) - future_distance - 1):
#         print(future_cell_index - future_distance)
#         df.loc[future_cell_index - future_distance,f'{column}_in_{future_distance}_days'] = df.loc[future_cell_index, column]

In [17]:
# NOTE(review): disabled driver for the commented-out set_future_day_value helper.
# It would request every horizon from 1 to 29 days, whereas the frame summarised
# later only carries Close_in_{1,6,11,16,21,26}_days -- so it does not describe how
# those columns are actually produced. Candidate for deletion.
# for distance in range(1, 30):
#     set_future_day_value(df, 'Close', distance)

In [18]:
# df.head(100)

In [19]:
# df.tail(100)

In [20]:
# Categorical columns that get one-hot encoded below.
categories_features = [
    'Volatility_group',
    'Day_of_week',
]

In [21]:
# Single-step pipeline that one-hot encodes the columns in `categories_features`.
# use_cat_names=True names each indicator column after the category value
# (e.g. Volatility_group_Low) rather than a positional suffix.
# handle_unknown='return_nan': per the category_encoders docs, categories not
# seen during fit come out as NaN at transform time.
categories_pipe = Pipeline(
    steps=[
        (
            'onehot',
            ce.OneHotEncoder(
                cols=categories_features,
                use_cat_names=True,
                handle_unknown='return_nan',
            ),
        ),
    ]
)

In [22]:
# Fit the one-hot encoder on the engineered frame and replace the categorical
# columns with their indicator columns.
# NOTE: `df` is rebound to the encoded frame, which no longer contains the original
# 'Volatility_group' / 'Day_of_week' columns (see the summary below), so re-running
# this cell without re-running the cells above will likely fail -- TODO confirm.
df = categories_pipe.fit_transform(df)

In [24]:
# Column / dtype / non-null summary of the encoded frame.
# NOTE(review): the saved execution counter jumps from 22 to 24 here (and to 26 on
# the next cell), so cells were deleted or run out of order. Do a
# Restart Kernel -> Run All before trusting the saved outputs.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1258 entries, 0 to 1257
Data columns (total 25 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Open                                1258 non-null   float64
 1   High                                1258 non-null   float64
 2   Low                                 1258 non-null   float64
 3   Close                               1258 non-null   float64
 4   Volume                              1258 non-null   float64
 5   Volatility                          1258 non-null   float64
 6   Day_of_week_1.0                     1258 non-null   float64
 7   Day_of_week_2.0                     1258 non-null   float64
 8   Day_of_week_3.0                     1258 non-null   float64
 9   Day_of_week_4.0                     1258 non-null   float64
 10  Day_of_week_0.0                     1258 non-null   float64
 11  Volatility_group_Low                1258 no

In [26]:
# Missing values per column. In the saved output the only gaps are in the
# Close_in_N_days look-ahead columns (N missing values each) and in the 30-day
# moving average (29 missing values).
df.isna().sum()

Open                                   0
High                                   0
Low                                    0
Close                                  0
Volume                                 0
Volatility                             0
Day_of_week_1.0                        0
Day_of_week_2.0                        0
Day_of_week_3.0                        0
Day_of_week_4.0                        0
Day_of_week_0.0                        0
Volatility_group_Low                   0
Volatility_group_Medium                0
Volatility_group_High                  0
Close_in_1_days                        1
Close_in_6_days                        6
Close_in_11_days                      11
Close_in_16_days                      16
Close_in_21_days                      21
Close_in_26_days                      26
Close_30_moving_avarage               29
Is_close_below_30_moving_avarage       0
Is_close_above_30_moving_avarage       0
Days_close_below_30_moving_avarage     0
Days_close_above

In [27]:
# Summary statistics for every numeric column.
# NOTE(review): in the saved output, Close (min 491.2, max 1216.83) falls outside
# the Low/High range (max High 816.68), so Close looks to be on a different scale
# from Open/High/Low -- possibly a split adjustment; confirm before modelling on it.
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Volatility,Day_of_week_1.0,Day_of_week_2.0,Day_of_week_3.0,Day_of_week_4.0,Day_of_week_0.0,Volatility_group_Low,Volatility_group_Medium,Volatility_group_High,Close_in_1_days,Close_in_6_days,Close_in_11_days,Close_in_16_days,Close_in_21_days,Close_in_26_days,Close_30_moving_avarage,Is_close_below_30_moving_avarage,Is_close_above_30_moving_avarage,Days_close_below_30_moving_avarage,Days_close_above_30_moving_avarage
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1257.0,1252.0,1247.0,1242.0,1237.0,1232.0,1229.0,1258.0,1258.0,1258.0,1258.0
mean,533.709833,537.880223,529.007409,712.669666,3158107.0,8.872814,0.205087,0.205087,0.201113,0.200318,0.188394,0.334658,0.332273,0.333068,712.708711,712.987827,713.333817,713.827593,714.385667,714.850771,712.938298,0.385533,0.614467,3.683625,11.122417
std,151.904442,153.008811,150.552807,164.752591,2273726.0,5.4754,0.403926,0.403926,0.400992,0.400397,0.391182,0.472058,0.471216,0.471498,164.812341,165.077806,165.317888,165.459604,165.559995,165.733278,159.627851,0.486914,0.486914,6.87368,15.194762
min,279.12,281.21,277.22,491.2,7900.0,1.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,491.2,491.2,491.2,491.2,491.2,491.2,513.435333,0.0,0.0,0.0,0.0
25%,404.115,406.765,401.765,576.74,1621700.0,5.3525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,576.65,576.34,576.195,576.1525,576.28,575.8925,577.849333,0.0,0.0,0.0,0.0
50%,537.47,540.75,532.99,695.675,2566150.0,7.425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,695.7,696.2,697.46,697.77,698.21,699.03,706.950667,0.0,1.0,0.0,4.0
75%,654.9225,662.5875,644.8,782.105,4122500.0,10.67,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,782.22,782.46,782.655,782.955,783.07,783.22,779.089667,1.0,1.0,5.0,18.0
max,816.68,816.68,805.14,1216.83,24977900.0,54.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1216.83,1216.83,1216.83,1216.83,1216.83,1216.83,1199.119667,1.0,1.0,43.0,70.0
