# **PIPELINE**

In [19]:
# basic libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
# scikit-learn required libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
# Display Pipeline
from sklearn import set_config
set_config(display='diagram')

In [20]:
# load the dataset
df = pd.read_csv('./dataset/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Route**

In [21]:
df.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'], inplace=True)

In [22]:
df.head() # after droping irrelevent features

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [23]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']), df['Survived'], test_size=0.2, random_state=42)

In [24]:
X_train.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
325,1,female,36.0,0,0,135.6333,C
641,1,female,24.0,0,0,69.3,C
651,2,female,18.0,0,1,23.0,S
476,2,male,34.0,1,0,21.0,S
741,1,male,36.0,1,0,78.85,S


In [25]:
y_train.sample(5)

173    0
725    0
701    1
329    1
243    0
Name: Survived, dtype: int64

In [26]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [27]:
# columnTransformer for imputing age and emarked
trf1 = ColumnTransformer([
    ('age_imputer', SimpleImputer(), [2]),
    ('embarked_imputer', SimpleImputer(strategy='most_frequent'), [6])
    ], remainder='passthrough')

In [28]:
# columnTransformer to encode Sex and Embarked column with columnstransformer name trf2

In [29]:
trf2 = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [30]:
# Scaling
trf3 = ColumnTransformer([
    ('scaling', MinMaxScaler(), slice(0,10))
], remainder='passthrough')

In [31]:
# Feature Selection
trf4 = SelectKBest(score_func=chi2, k=8)

In [32]:
# Model 
trf5 = DecisionTreeClassifier()

**Create a Pipeline**

In [33]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

### Pipeline Vs make_pipeline

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [34]:
# alternate syntax
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [35]:
X_train, y_train 

(     Pclass     Sex   Age  SibSp  Parch      Fare Embarked
 331       1    male  45.5      0      0   28.5000        S
 733       2    male  23.0      0      0   13.0000        S
 382       3    male  32.0      0      0    7.9250        S
 704       3    male  26.0      1      0    7.8542        S
 813       3  female   6.0      4      2   31.2750        S
 ..      ...     ...   ...    ...    ...       ...      ...
 106       3  female  21.0      0      0    7.6500        S
 270       1    male   NaN      0      0   31.0000        S
 860       3    male  41.0      2      0   14.1083        S
 435       1  female  14.0      1      2  120.0000        S
 102       1    male  21.0      0      1   77.2875        S
 
 [712 rows x 7 columns],
 331    0
 733    0
 382    0
 704    0
 813    0
       ..
 106    1
 270    0
 860    0
 435    1
 102    0
 Name: Survived, Length: 712, dtype: int64)

In [36]:
pipe.fit(X_train,y_train)

# Explore the Pipeline

In [37]:
# Code here
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('age_imputer', SimpleImputer(), [2]),
                                 ('embarked_imputer',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('scaling', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x000001A94ED0B600>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [38]:
# predict 
y_pred = pipe.predict(X_test)

In [39]:
y_pred

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0], dtype=int64)

In [40]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.95      0.83       105
           1       0.88      0.51      0.65        74

    accuracy                           0.77       179
   macro avg       0.81      0.73      0.74       179
weighted avg       0.80      0.77      0.76       179



## **Cross Validation using Pipeline**

In [41]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train,y_train, cv=10, scoring='accuracy').mean()

0.789358372456964

## **GridSearch using Pipeline**

In [42]:
# make a gridsearch cv for pipeline
params = {
    'decisiontreeclassifier__max_depth':[1,2,3,4,5,None]
}   
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)


    grid search cv is used for hyperparameter tuning
as we have seen in the previous notebook, we can use grid search cv to find the best hyperparameters for the model.

## **Exporting the Pipeline**

In [43]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))

## **Predicting using Pipeline**

In [44]:
import pickle
import numpy as np 

In [45]:
pickle = pickle.load(open('pipe.pkl', 'rb'))

In [46]:
# Assume user input
test_input2 = np.array([1, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [47]:
pipe.predict(test_input2)



array([0], dtype=int64)