# **PIPELINE**

In [79]:
# basic libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
# scikit-learn required libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
# Display Pipeline
from sklearn import set_config
set_config(display='diagram')

In [80]:
# load the dataset
df = pd.read_csv('./dataset/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Route**

In [81]:
df.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'], inplace=True)

In [82]:
df.head() # after droping irrelevent features

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [83]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']), df['Survived'], test_size=0.2, random_state=42)

In [84]:
X_train.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
557,1,male,,0,0,227.525,C
75,3,male,25.0,0,0,7.65,S
667,3,male,,0,0,7.775,S
18,3,female,31.0,1,0,18.0,S
640,3,male,20.0,0,0,7.8542,S


In [85]:
y_train.sample(5)

225    0
71     0
689    1
823    1
639    0
Name: Survived, dtype: int64

In [86]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [87]:
# columnTransformer for imputing age and emarked
trf1 = ColumnTransformer([
    ('age_imputer', SimpleImputer(), [2]),
    ('embarked_imputer', SimpleImputer(strategy='most_frequent'), [6])
    ], remainder='passthrough')

In [88]:
# trf2 = ColumnTransformer([
#     ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6])
# ],remainder='passthrough') 

In [89]:
trf2 = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [90]:
# Scaling
trf3 = ColumnTransformer([
    ('scaling', MinMaxScaler(), slice(0,10))
], remainder='passthrough')

In [91]:
# Feature Selection
trf4 = SelectKBest(score_func=chi2, k=8)

In [92]:
# Model 
trf5 = DecisionTreeClassifier()

**Create a Pipeline**

In [93]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

### Pipeline Vs make_pipeline

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [94]:
# alternate syntax
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [95]:
pipe.fit(X_train,y_train)

# Explore the Pipeline

In [96]:
# Code here
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('age_imputer', SimpleImputer(), [2]),
                                 ('embarked_imputer',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('scaling', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x00000266DF76B600>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [97]:
# predict 
y_pred = pipe.predict(X_test)

In [98]:
y_pred

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0], dtype=int64)

In [99]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.95      0.83       105
           1       0.88      0.51      0.65        74

    accuracy                           0.77       179
   macro avg       0.81      0.73      0.74       179
weighted avg       0.80      0.77      0.76       179



## **Cross Validation using Pipeline**

In [100]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train,y_train, cv=10, scoring='accuracy').mean()

0.789358372456964

## **GridSearch using Pipeline**

In [101]:
# # GridSearchCV
# params = {
#     'trf5_max_depth':[1,2,3,4,5,None]
# }

In [102]:
# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
# grid.fit(X_train, y_train)

In [None]:
# make a gridsearch cv for pipeline


## **Exporting the Pipeline**

In [103]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))

## **Predicting using Pipeline**

In [104]:
import pickle
import numpy as np 

In [105]:
pickle = pickle.load(open('pipe.pkl', 'rb'))

In [108]:
# Assume user input
test_input2 = np.array([1, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [109]:
pipe.predict(test_input2)



array([0], dtype=int64)