# Creating Chains for Pipeline

In [44]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler # here i do the feature selection so i choose minmax rather than standard scaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('images/Titanic.csv')
df = df.iloc[:,1:]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.shape

(891, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Let's Plan

In [6]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'], inplace=True)

In [7]:
# Step 1 -> train test split
X_train,X_test,y_train,y_test=train_test_split(df.drop('Survived', axis=1),df['Survived'],
                                              test_size=0.2,random_state=True)

In [8]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
301,3,male,,2,0,23.25,Q
309,1,female,30.0,0,0,56.9292,C
516,2,female,34.0,0,0,10.5,S
120,2,male,21.0,2,0,73.5,S
570,2,male,62.0,0,0,10.5,S


In [9]:
y_train.sample(5)

602    0
695    0
176    0
224    1
91     0
Name: Survived, dtype: int64

In [10]:
# Imputation -> Age & Embarked
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(strategy='median'),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
], remainder='passthrough')

In [11]:
# one hot encoding -> Sex & Embarked
# here i will not use the drop first function because it is not necessary to me coz i use decision tree
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False, handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [12]:
# here i try to estimate that after encoding how much new features will create
print(df['Sex'].value_counts())
print()
print(df['Embarked'].value_counts())
# so total features before encoding are = 7 features
# if i convert every categirical feature according to there labels every new feature then
# sex will convert into                 = 2 features
# embarked will convert into            = 3 features
# so the total features will be 7-2 = 5+2+3 = 10

male      577
female    314
Name: Sex, dtype: int64

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [13]:
# Feature Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

### What is SelectKBest used for?
Feature selection is a technique where we choose those features in our data that contribute most to the target variable. In other words we choose the best predictors for the target variable

SelectKBest then simply retains the first k features of X with the highest scores. So, for example, if you pass chi2 as a score function, SelectKBest will compute the chi2 statistic between each feature of X and y (assumed to be class labels). A small value will mean the feature is independent of y.

### What is select k best?
Feature selection is a technique where we choose those features in our data that contribute most to the target variable. In other words we choose the best predictors for the target variable.

In [14]:
# Feature selection
trf4 = SelectKBest(score_func=chi2, k=8)

In [15]:
# train the model
trf5 = DecisionTreeClassifier()

# Create Pipeline

In [16]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

## Pipeline Vs make_pipeline

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)


In [17]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [18]:
# Alternate syntax
# pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [19]:
# train
pipe.fit(X_train,y_train)

# if i do just data preparation then i use fit_transform but when i also doing prediction
# and model training then i use fit then directly to prediction

# Explore the Pipeline

In [22]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(strategy='median'),
                                  [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x7fd917faba60>),
 'trf5': DecisionTreeClassifier()}

In [36]:
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.])

In [42]:
pipe.named_steps['trf1'].transformers_[1][1].statistics_ # what is the most frequent value which 
# my Simple imputer use for fill the missing value

array(['S'], dtype=object)

In [43]:
# predict
y_pred = pipe.predict(X_test)

In [45]:
accuracy_score(y_test, y_pred)

0.6201117318435754

# Cross validation using Pipeline

In [51]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.6404510981975771

# GridSearch Using Pipeline
hyper parameter tuning

In [52]:
# gridsearchcv
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [53]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [54]:
grid.best_score_

0.6404510981975771

In [55]:
grid.best_params_

{'trf5__max_depth': 4}

# Exporting The Pipeline

In [56]:
import pickle

In [58]:
pickle.dump(pipe, open('models/pipe.pkl','wb'))

# Predict Using Pipeline

In [60]:
import pickle
import numpy as np

In [61]:
pipe = pickle.load(open('models/pipe.pkl','rb'))

In [62]:
# Assume user input
test_input2 = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [63]:
pipe.predict(test_input2)

array([0])