## Using Scikit-Learn Pipelines for Data Preprocessing with Python

In [1]:
# Standard Scaler
# OneHotEncoder
# RandomForestRegressor
# Pickle -> Serialise

In [2]:
# Authenticate to Kaggle
# https://www.kaggle.com/discussions/general/74235
!mkdir ~/.kaggle

In [4]:
# Copy the Kaggle API token from the working directory into ~/.kaggle.
!cp kaggle.json ~/.kaggle/kaggle.json

In [5]:
# Restrict the token file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Show the packages installed in this environment and their versions.
!pip list

Package                       Version
----------------------------- ---------------------
absl-py                       1.4.0
alabaster                     0.7.13
albumentations                1.2.1
altair                        4.2.2
anyio                         3.7.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
array-record                  0.3.0
arviz                         0.15.1
astropy                       5.2.2
astunparse                    1.6.3
attrs                         23.1.0
audioread                     3.0.0
autograd                      1.5
Babel                         2.12.1
backcall                      0.2.0
beautifulsoup4                4.11.2
bleach                        6.0.0
blis                          0.7.9
blosc2                        2.0.0
bokeh                         2.4.3
branca                        0.6.0
build                         0.10.0
CacheControl                  0.13.1
cac

In [7]:
# Fetch the House Prices competition archive via the Kaggle CLI.
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /content
100% 199k/199k [00:00<00:00, 716kB/s]
100% 199k/199k [00:00<00:00, 714kB/s]


In [8]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


### Pipeline Practice

In [9]:
import pandas as pd
# Load the training split extracted from the competition archive.
df = pd.read_csv('train.csv')

In [10]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [18]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [19]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [22]:
# Keep a small subset of predictors plus the target, then drop every row
# that has a missing value in any of the kept columns.
selected_columns = [
    'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
    'LotShape', 'LandContour', 'Utilities', 'MiscVal', 'MoSold',
    'YrSold', 'SaleType', 'SalePrice',
]
select_df = df[selected_columns].dropna()

In [24]:
# Remove the target, then one-hot encode the remaining object columns.
predictors = select_df.drop(columns='SalePrice')
X = pd.get_dummies(predictors)

In [20]:
# target variable = SalePrice

In [25]:
# Target vector: the SalePrice column.
y = select_df['SalePrice']

In [28]:
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,MiscVal,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,...,Utilities_AllPub,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,60,65.0,8450,0,2,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,20,80.0,9600,0,5,2007,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,60,68.0,11250,0,9,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,70,60.0,9550,0,2,2006,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,60,84.0,14260,0,12,2008,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [29]:
# Scale the features, then fit a random forest regressor.
# make_pipeline names each step after its lowercased class name.
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor())

In [30]:
pipeline.fit(X, y)

In [31]:
# Predictions on the same rows the pipeline was fitted on (in-sample), not a held-out set.
pipeline.predict(X)

array([205227.5 , 166449.  , 216164.79, ..., 219136.75, 147505.  ,
       150808.  ])

### Save and Reload the pipeline
https://www.kaggle.com/code/harrygem/titanic-pipeline-and-save-load-model

In [32]:
import pickle

In [33]:
# Serialise the fitted pipeline (scaler + forest) to disk.
with open('pipelinemodel.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [34]:
# Read the pipeline back from disk. Unpickling can execute arbitrary code,
# so only load pickle files you created yourself.
with open('pipelinemodel.pkl', 'rb') as model_file:
    reloaded_model = pickle.load(model_file)

In [35]:
reloaded_model

In [36]:
reloaded_model.steps

[('standardscaler', StandardScaler()),
 ('randomforestregressor', RandomForestRegressor())]

In [38]:
reloaded_model.steps[1]

('randomforestregressor', RandomForestRegressor())

In [39]:
reloaded_model.steps[1][1]

In [40]:
# NOTE(review): this calls the forest directly and skips the 'standardscaler' step,
# so X reaches the model unscaled and the values below differ from pipeline.predict(X).
# Use reloaded_model.predict(X) to run the full pipeline.
reloaded_model.steps[1][1].predict(X)



array([344483., 344483., 369583., ..., 343943., 344483., 344483.])

### Using the pipeline class

In [45]:
# Pipeline class: every step is given an explicit name.
from sklearn.pipeline import Pipeline

custom_pipeline = Pipeline(
    steps=[
        ('scaling', StandardScaler()),
        ('rfmodel', RandomForestRegressor()),
    ]
)

In [47]:
# make_pipeline function: builds the same kind of Pipeline, but step names
# are generated automatically instead of being supplied by the caller.
make_pipeline_model = make_pipeline(StandardScaler(), RandomForestRegressor())

In [48]:
make_pipeline_model.steps

[('standardscaler', StandardScaler()),
 ('randomforestregressor', RandomForestRegressor())]

In [49]:
custom_pipeline.steps

[('scaling', StandardScaler()), ('rfmodel', RandomForestRegressor())]

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py

### Column Transformers

In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [69]:
# Numeric Features
numerical_features = select_df.drop('SalePrice', axis = 1).select_dtypes(exclude='object').columns
numerical_features

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')

In [70]:
# Preprocessing for the numeric columns: a single standardisation step.
numeric_pipeline = Pipeline([('scaler', StandardScaler())])

In [71]:
# Categorical Features
categorical_features = select_df.select_dtypes('object').columns
categorical_features

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object')

In [72]:
# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# instead of raising, so the fitted pipeline can score new data such as test.csv.
categorical_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [73]:
# IPython introspection: a trailing ?? displays the source of ColumnTransformer.
ColumnTransformer??

In [74]:
# Route each column group through its own preprocessing pipeline.
transformer = ColumnTransformer(
    transformers=[
        ('numeric_preprocessing', numeric_pipeline, numerical_features),
        ('categorical_preprocessing', categorical_pipeline, categorical_features),
    ]
)

In [75]:
transformer

In [76]:
# Full model: column-wise preprocessing followed by a random forest regressor.
# NOTE(review): the final step is labelled 'randforestclassifier' although it holds a
# regressor; the label is kept as-is because it is the key used to address the step.
ml_pipeline = Pipeline(
    steps=[
        ('all_column_preprocessing', transformer),
        ('randforestclassifier', RandomForestRegressor()),
    ]
)

In [77]:
ml_pipeline

In [78]:
# Rebind X to the raw predictors: encoding and scaling now happen inside ml_pipeline.
X = select_df.drop(columns='SalePrice')
# Target stays the SalePrice column.
y = select_df['SalePrice']

In [79]:
ml_pipeline.fit(X, y)

In [80]:
ml_pipeline.predict(X)

array([203805.  , 163262.25, 208804.22, ..., 231119.75, 141773.5 ,
       149621.25])

In [81]:
# Serialise the fitted preprocessing + model pipeline to disk.
with open('columnstransformermodel.pkl', 'wb') as model_file:
    pickle.dump(ml_pipeline, model_file)

In [83]:
# Read the pipeline back from disk. Unpickling can execute arbitrary code,
# so only load pickle files you created yourself.
with open('columnstransformermodel.pkl', 'rb') as model_file:
    reloaded_ml_pipeline = pickle.load(model_file)

In [84]:
reloaded_ml_pipeline

In [85]:
reloaded_ml_pipeline.steps

[('all_column_preprocessing',
  ColumnTransformer(transformers=[('numeric_preprocessing',
                                   Pipeline(steps=[('scaler', StandardScaler())]),
                                   Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                  ('categorical_preprocessing',
                                   Pipeline(steps=[('onehot', OneHotEncoder())]),
                                   Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
         'SaleType'],
        dtype='object'))])),
 ('randforestclassifier', RandomForestRegressor())]