In [1457]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFECV
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelBinarizer, LabelEncoder
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import GridSearchCV

In [1458]:
dataset = sns.load_dataset('titanic')

In [1459]:
print('Shape: ', dataset.shape)
dataset.head()

Shape:  (891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [1460]:
for feature in dataset.columns:
    if feature not in ['age', 'fare']:
        dataset[feature] = dataset[feature].astype('category')

In [1461]:
y = dataset['survived']
X = dataset.drop('survived', axis=1)

In [1462]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0
                                                    )

In [1463]:
numerical_features = make_column_selector(dtype_exclude='category')
categorical_features  = make_column_selector(dtype_include='category')

In [1464]:
numerical_pipeline = make_pipeline(KNNImputer(),
                                   StandardScaler()
                                   )

categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                                     OrdinalEncoder()
                                     )

In [1465]:
preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                       (categorical_pipeline, categorical_features)
                                       )

In [1466]:
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('knnimputer', KNNImputer()),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fca87edd1f0>),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fca87edd940>)])

In [1467]:
feature_selector = RFECV(SGDClassifier(random_state=0),
                         step=1,
                         min_features_to_select=6,
                         cv=5
                         )

In [1468]:
feature_selector.fit(preprocessor.transform(X_train), y_train)

RFECV(cv=5, estimator=SGDClassifier(random_state=0), min_features_to_select=6)

In [1469]:
model = make_pipeline(preprocessor,
                      feature_selector,
                      KNeighborsClassifier()
                      )

In [1470]:
parameters = {'columntransformer__pipeline-1__knnimputer__weights': ['uniform', 'distance'],
              'columntransformer__pipeline-1__knnimputer__n_neighbors': np.arange(1, 6),
              'kneighborsclassifier__n_neighbors': np.arange(1, 6)#,
            #   'kneighborsclassifier__metric' : ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
            #   'kneighborsclassifier__weights': ['uniform', 'distance'],
              }

In [1471]:
grid = GridSearchCV(estimator=model,
                    param_grid=parameters,
                    cv=5
                    )

In [None]:
grid.fit(X_train, y_train)

In [1473]:
grid.best_params_

{'columntransformer__pipeline-1__knnimputer__n_neighbors': 1,
 'columntransformer__pipeline-1__knnimputer__weights': 'uniform',
 'kneighborsclassifier__n_neighbors': 1}

In [1474]:
grid.best_score_

nan

In [1475]:
grid.score(X_test, y_test)

0.9441340782122905