In [153]:
import pandas as pd
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day29-sklearn-pipelines/train.csv', usecols = ['Age', 'Survived', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked'])
df.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,1,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,0,male,35.0,0,0,8.05,S


In [122]:
df.columns

Index(['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [134]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import set_config
set_config(display = 'diagram')

In [154]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns = ['Survived']), df['Survived'], test_size = .2, random_state = 42)

In [161]:
x_train

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked
331,male,45.5,0,0,28.5000,S
733,male,23.0,0,0,13.0000,S
382,male,32.0,0,0,7.9250,S
704,male,26.0,1,0,7.8542,S
813,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...
106,female,21.0,0,0,7.6500,S
270,male,,0,0,31.0000,S
860,male,41.0,2,0,14.1083,S
435,female,14.0,1,2,120.0000,S


In [103]:
df.isna().sum()

Survived      0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [125]:
#impute the missing values
trf1 = ColumnTransformer([('imputed_age', SimpleImputer(), [1]), 
                          ('imputed_embarked', SimpleImputer(strategy = 'most_frequent'), [5])], remainder = 'passthrough')

In [126]:
#onehotencoding
one_hot = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
trf2 = ColumnTransformer([('one_hot', one_hot, [0,5])], remainder = 'passthrough')

In [127]:
trf2

ColumnTransformer(remainder='passthrough',
                  transformers=[('one_hot',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 [0, 5])])

In [128]:
#scaling
scaler = MinMaxScaler()
trf3 = ColumnTransformer([('scale', scaler, slice(0, 12))])

In [129]:
#featureselection
trf4 = SelectKBest(score_func = chi2, k = 6)

In [130]:
#Classifier
trf5 = DecisionTreeClassifier()

In [131]:
pipeline = Pipeline([('trf1', trf1),
                     ('trf2', trf2),
                     ('trf3', trf3),
                     ('trf4', trf4),
                     ('trf5', trf5)])

In [132]:
#using make_pipline
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [135]:
pipeline.fit(x_train, y_train)

In [136]:
y_pred = pipeline.predict(x_test)

In [138]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5977653631284916

In [142]:
#crossvlaidation using pipeline
from sklearn.model_selection import cross_val_score
cross_val_score(pipeline, x_train, y_train, cv = 6, scoring = 'accuracy').mean()

0.6404358353510896

In [148]:
#grid serach Using pipeline for CV
params = {'trf5__max_depth': [0,1,2,3,4,5,6,None]}
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipeline, params, cv = 6, scoring = 'accuracy')
grid.fit(x_train, y_train)


6 fits failed out of a total of 48.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Softwareinstall\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Softwareinstall\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "D:\Softwareinstall\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "D:\Softwareinstall\lib\site-packages\sklearn\tree\_classes.py", line 306, in fit
    raise ValueError("max_depth must be greater than zero. ")
ValueError: max_depth 

In [149]:
grid.best_score_

0.6404358353510896

In [147]:
grid.best_params_ 

{'trf5__max_depth': 6}

In [151]:
import pickle
pickle.dump(pipeline, open('model.pkl', 'wb'))

In [152]:
model = pickle.load(open('model.pkl', 'rb'))

In [162]:
test_input = np.array(['male', 62, 3, 1, 38, 'S'], dtype = 'object').reshape(1, 6)
model.predict(test_input)



array([0], dtype=int64)