In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import  SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [47]:
df = sns.load_dataset("titanic")

In [48]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [49]:
X=df[["pclass","sex","age","sibsp","parch","fare","embarked"]]
y = df[["survived"]]

In [50]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [51]:
X_train.head(1)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
57,3,male,28.5,0,0,7.2292,C


In [52]:
y_train.head()

Unnamed: 0,survived
57,0
717,1
431,1
633,0
163,0


In [53]:
#imputation transformer
trf1 = ColumnTransformer([
    ("impute_age",SimpleImputer(),[2]),
    ("impute_embarked",SimpleImputer(strategy="most_frequent"),[6])]
    ,remainder="passthrough")

In [54]:
#one hot encoding
trf2 = ColumnTransformer([("ohe_sex_embarked",OneHotEncoder(sparse_output=False,
                          handle_unknown="ignore"),[1,6])],
                         remainder="passthrough")

In [55]:
#scaling
trf3 = ColumnTransformer([("scale",MinMaxScaler(),slice(0,10))])

In [56]:
#feature selection
trf4 = SelectKBest(score_func=chi2,k=7)

In [57]:
#train the model
trf5 = DecisionTreeClassifier()

**Pipeline vs make_pipeline**

In scikit-learn, both Pipeline and make_pipeline are utility functions used to create machine learning pipelines, which are sequences of data transformations followed by an estimator. The main difference between the two lies in how you specify the steps of the pipeline.

**Pipeline**:
The Pipeline class in scikit-learn allows you to define a pipeline by explicitly specifying the steps as a list of tuples. Each tuple consists of a name for the step and an instance of a transformer or an estimator.

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

**make_pipeline**:
The make_pipeline function is a shorthand way of creating a pipeline without the need to explicitly name the steps. It automatically generates names for each step based on the lowercase class names of the transformers or estimators.

pipeline = make_pipeline(StandardScaler(), LogisticRegression())

In [73]:
#Pipeline
pipe = Pipeline([("trf1",trf1),
                 ("trf2",trf2),
                 ("trf3",trf3),
                 ("trf4",trf4),
                 ("trf5",trf5)])

In [72]:
#alternate Syntax #make pipeline
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [60]:
#if you are not training model in pipelines then use fit_transform else use fit
pipe.fit(X_train,y_train)

**Explore the Pipelines**

In [61]:
#Display Pipeline
from sklearn import set_config
set_config(display="diagram")

In [62]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=7, score_func=<function chi2 at 0x7f850af4bd90>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [74]:
#predict
y_pred = pipe.predict(X_test)

In [75]:
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1])

In [76]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred) #low accuracy because of feature selection

0.6759776536312849

**Cross Validation using Pipeline**

In [77]:
#cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring="accuracy").mean()

0.6251058800354574

**GridSearch using Pipeline**

In [78]:
#gridsearchcv
params = {"trf5__max_depth":[1,2,3,4,5,None]}

In [79]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv=5,scoring="accuracy")
grid.fit(X_train,y_train)

In [80]:
grid.best_score_

0.6251058800354574

In [81]:
grid.best_params_

{'trf5__max_depth': 4}

**Exporting the Pipeline**

In [85]:
import pickle
pickle.dump(pipe,open("/content/pipe.pkl","wb"))