In [1]:
import pandas as pd
import numpy as np

In [130]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [132]:
df = pd.read_csv('train.csv')

In [134]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [136]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace = True)

In [138]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [140]:
X_train, X_test, y_train, y_test=train_test_split(df.drop(columns=['Survived']),df['Survived'],
                                  test_size = 0.2,
                                    random_state=42)

In [142]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [144]:
y_test

709    1
439    0
840    0
720    1
39     1
      ..
433    0
773    0
25     1
84     1
10     1
Name: Survived, Length: 179, dtype: int64

In [29]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [148]:
#perform simple imputer on age and embarked to fill missing value
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])],remainder = 'passthrough')

In [152]:
#one hot encoding  is implemented to sex and embarked as they have categorical values.
trf2 = ColumnTransformer([('onehotencoding',OneHotEncoder(sparse_output=False, handle_unknown='ignore'),[1,6])
                         ],remainder = 'passthrough')

In [154]:
#scalling
trf3 = ColumnTransformer([
    ('ohe_sex_embarked',MinMaxScaler(), slice(0,10))])

In [161]:
trf4 = SelectKBest(score_func=chi2,k=8)

In machine learning, when using techniques like encoding categorical variables, the parameter `handle_unknown='ignore'` is used to specify how to deal with categories (values) that were not present in the training data when processing new data.

When set to `ignore`, it allows the model to skip any unknown categories without raising an error or causing a problem, effectively treating them as if they do not exist during encoding. This is useful for ensuring that the model can still make predictions even if new, unseen categories appear in the input data.

In [164]:
# Feature selection
trf5 = DecisionTreeClassifier()

# Create Pipeline

In [167]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

# Pipeline Vs make_pipeline
Pipeline requries naming of steps, male_pipeline does not.
(same applies to ColumnTransformer vs male_column_transformer)

In [170]:
# Alternate syntax
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [172]:
#train
pipe.fit(X_train,y_train)

# Explore the pipeline

In [175]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoding',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('ohe_sex_embarked', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x14bd17f60>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [185]:
# this gives the mean value that is filled in the missing value place.
pipe.named_steps['columntransformer-1'].transformers_[0][1].statistics_

array([29.49884615])

In [181]:
pipe.named_steps.keys()

dict_keys(['columntransformer-1', 'columntransformer-2', 'columntransformer-3', 'selectkbest', 'decisiontreeclassifier'])

In [187]:
# this will show the most_frequent value that is filled in the missing value place.
pipe.named_steps['columntransformer-1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [189]:
#display pipeline
from sklearn import set_config
set_config(display='diagram')

In [191]:
#predict
y_pred = pipe.predict(X_test)

In [193]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [195]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

## Cross Validation using Pipeline

In [198]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train, y_train, cv=5, scoring='accuracy').mean()

0.6391214419383433

## GridSearch using pipeline

In [211]:
#gridsearchcv
params = {  
    'columntransformer-1__impute_age__strategy': ['mean', 'median']  
}


In [213]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [215]:
grid.best_score_

0.6391214419383433

In [217]:
grid.best_params_

{'columntransformer-1__impute_age__strategy': 'mean'}

## Exporting the pipeline

In [220]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

In [None]:
#after saving the model next we can open a new notebook and load the model and perform predictions.