### Importing Packages

In [98]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn import set_config
import pickle

## Without Pipelines

### Load Data

In [99]:
data = pd.read_csv("./train.csv")

In [100]:
data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
427,428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louis...",female,19.0,0,0,250655,26.0,,S
231,232,0,3,"Larsson, Mr. Bengt Edvin",male,29.0,0,0,347067,7.775,,S
233,234,1,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S
267,268,1,3,"Persson, Mr. Ernst Ulrik",male,25.0,1,0,347083,7.775,,S


In [101]:
data.drop(["PassengerId", "Name", "Ticket","Cabin"], axis=1,inplace=True)

In [102]:
data.sample(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
805,0,3,male,31.0,0,0,7.775,S
341,1,1,female,24.0,3,2,263.0,S


### Train Test Split

In [103]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['Survived']), data['Survived'], test_size=0.2, random_state=42)

In [104]:
X_train.sample(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
500,3,male,17.0,0,0,8.6625,S
766,1,male,,0,0,39.6,C


In [105]:
y_train.sample(2)

850    0
574    0
Name: Survived, dtype: int64

### Applying Imputation

In [106]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [107]:
si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')

X_train_age = si_age.fit_transform(X_train[['Age']])
X_test_age = si_age.transform(X_test[['Age']])

X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

### Applying One Hot Encoding

In [108]:
ohe_sex = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)
X_test_embarked = ohe_embarked.transform(X_test_embarked)

In [109]:
X_train_rem = X_train.drop(columns=['Sex','Age','Embarked'])
X_test_rem = X_test.drop(columns=['Sex','Age','Embarked'])

In [110]:
X_train_transformed = np.concat([X_train_rem,X_train_age,X_train_sex,X_train_embarked],axis=1)
X_test_transformed = np.concat([X_test_rem,X_test_age,X_test_sex,X_test_embarked],axis=1)

### Model Training, Testing and Evaluation

In [111]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

In [112]:
y_pred = clf.predict(X_test_transformed)

In [113]:
accuracy_score(y_test,y_pred)

0.7821229050279329

In [114]:
pickle.dump(ohe_sex, open('models/ohe_sex.pkl', 'wb'))
pickle.dump(ohe_embarked, open('models/ohe_embarked.pkl', 'wb'))
pickle.dump(clf,open('models/clf.pkl','wb'))

### Loading Pickle Model

In [115]:
ohe_sex = pickle.load(open('models/ohe_sex.pkl', 'rb'))
ohe_embarked = pickle.load(open('models/ohe_embarked.pkl', 'rb'))
clf = pickle.load(open('models/clf.pkl','rb'))

In [116]:
test_input = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [117]:
test_input_sex = ohe_sex.transform(test_input[:,1].reshape(1,1))



In [118]:
test_input_embarked = ohe_embarked.transform(test_input[:, -1].reshape(-1, 1))

In [119]:
test_input_age = test_input[:,2].reshape(1,1)

In [120]:
test_input_transformed = np.concatenate((test_input[:,[0,3,4,5]],test_input_age,test_input_sex,test_input_embarked),axis=1)

In [121]:
clf.predict(test_input_transformed)

array([0])

## With Pipelines

### Load Data

In [122]:
data = pd.read_csv("./train.csv")

In [123]:
data.sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
462,463,0,1,"Gee, Mr. Arthur H",male,47.0,0,0,111320,38.5,E63,S
12,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,,S


In [124]:
data.drop(["PassengerId", "Name", "Ticket","Cabin"], axis=1,inplace=True)

### Train Test Split

In [125]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['Survived']), data['Survived'], test_size=0.2, random_state=42)

In [126]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,3,male,,1,1,15.2458,C
439,2,male,31.0,0,0,10.5000,S
840,3,male,20.0,0,0,7.9250,S
720,2,female,6.0,0,1,33.0000,S
39,3,female,14.0,1,0,11.2417,C
...,...,...,...,...,...,...,...
433,3,male,17.0,0,0,7.1250,S
773,3,male,,0,0,7.2250,C
25,3,female,38.0,1,5,31.3875,S
84,2,female,17.0,0,0,10.5000,S


### Column Transformer

In [127]:
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embark',SimpleImputer(strategy='most_frequent'),[6]),
],remainder='passthrough')

In [128]:
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [129]:
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [130]:
trf4 = SelectKBest(score_func=chi2,k=8)

In [131]:
trf5 = DecisionTreeClassifier()

### Pipeline Creation

In [132]:
pipe1 = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [133]:
pipe2 = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [134]:
pipe1.fit(X_train,y_train)

### Explore the Pipe

In [135]:
pipe1.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embark',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x000001477FA1A8E0>),
 'trf5': DecisionTreeClassifier()}

In [136]:
pipe1.named_steps['trf2'].transformers_

[('ohe_sex_embarked',
  OneHotEncoder(handle_unknown='ignore', sparse_output=False),
  [1, 6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 2, 3, 4, 5])]

In [137]:
set_config(display='diagram')

In [138]:
y_pred = pipe1.predict(X_test)

In [139]:
accuracy_score(y_test,y_pred)

0.6256983240223464

### Cross Validation using Pipe Line

In [140]:
cross_val_score(pipe1,X_train,y_train,cv=5,scoring='accuracy').mean()

np.float64(0.6391214419383433)

### Grid Search using Pipe Line

In [141]:
params = {
    'trf5__max_depth':[1,2,3,4,5,None],
}

In [144]:
grid = GridSearchCV(pipe1,param_grid=params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [145]:
grid.best_score_

np.float64(0.6391214419383433)

In [146]:
grid.best_params_

{'trf5__max_depth': 2}

### Exporting Model

In [147]:
pickle.dump(pipe1, open('models/pipe1.pkl', 'wb'))

### Importing Model

In [148]:
pipe = pickle.load(open('models/pipe1.pkl','rb'))

In [149]:
test_input2 = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [150]:
pipe.predict(test_input2)



array([0])