In [2]:
import numpy as np

#### avoid data leakage

## train-val-test split

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [22]:
# Read the Titanic passenger table and discard the columns that are not used
# for modelling, leaving the target plus the feature columns.
df = (
    pd.read_csv("../../datasets/titanic3.csv")
    .drop(columns=['ticket', 'body', 'home.dest', 'cabin', 'boat', 'embarked', 'name'])
)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,female,29.0,0,0,211.3375
1,1,1,male,0.9167,1,2,151.55
2,1,0,female,2.0,1,2,151.55
3,1,0,male,30.0,1,2,151.55
4,1,0,female,25.0,1,2,151.55


In [23]:
# Separate the target column from the feature columns.
y = df['survived']
X = df.drop(columns=['survived'])

# Hold out 20% of the rows as a test set; stratifying on the target keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1047, 6), (262, 6), (1047,), (262,))

In [26]:
# Preview the first rows of the training features (note the missing `age` values).
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
999,3,female,,0,0,7.75
392,2,female,24.0,1,0,27.7208
628,3,female,11.0,4,2,31.275
1165,3,male,25.0,0,0,7.225
604,3,female,16.0,0,0,7.65


## preprocessor

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [24]:
# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler. Because these steps live inside a Pipeline, their
# statistics are learned only from the data passed to fit().
# NOTE(review): the step name "inputer" looks like a typo for "imputer"; it is
# kept as-is because step names form the parameter keys (e.g. "inputer__strategy").
numerical_pipeline = Pipeline(steps=[
    ("inputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ("inputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder()),
])


In [20]:
from sklearn.compose import ColumnTransformer

In [28]:
# Route each group of columns through its own sub-pipeline and concatenate
# the transformed outputs into a single feature matrix.
preprocessor = ColumnTransformer(transformers=[
    ("numeric", numerical_pipeline, ['sibsp', 'parch', 'fare', 'age']),
    ("categoric", categorical_pipeline, ['pclass', 'sex']),
])

## pipeline

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [32]:
# End-to-end estimator: preprocessing followed by a k-nearest-neighbours
# classifier with default hyperparameters. The step names ("prep", "algo")
# are the prefixes used when addressing parameters, e.g. "algo__n_neighbors".
pipeline = Pipeline(steps=[
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier()),
])

In [34]:
# Fit the preprocessing steps and the classifier on the training split only,
# so no statistics from the test split are used during fitting.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('inputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['sibsp', 'parch', 'fare',
                                                   'age']),
                                                 ('categoric',
                                                  Pipeline(steps=[('inputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder())]),
                                     

In [36]:
# Baseline: score of the default-parameter pipeline on the held-out test split.
pipeline.score(X_test, y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.8282442748091603

## grid search cv

In [38]:
from sklearn.model_selection import GridSearchCV

In [43]:
# pipeline.get_params()

In [44]:
# Search space for the classifier step; the "algo__" prefix routes each entry
# to the pipeline step named "algo".
parameter = {
    "algo__n_neighbors": range(1, 51, 2),       # odd neighbour counts 1..49
    "algo__weights": ['uniform', 'distance'],
    "algo__p": [1, 2],
}

# Exhaustive search over all 25 * 2 * 2 = 100 combinations with 4-fold
# cross-validation on the training split. The whole pipeline is refit inside
# every fold, so the preprocessing never sees that fold's validation rows.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=4,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('inputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['sibsp',
                                                                          'parch',
                                                                          'fare',
                                                                          'age']),
                                                                        ('categoric',
              

In [48]:
# Tabulate the cross-validation results of every candidate, best rank first.
pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__n_neighbors,param_algo__p,param_algo__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
32,0.047997,0.010843,0.070798,0.013121,17,1,uniform,"{'algo__n_neighbors': 17, 'algo__p': 1, 'algo_...",0.801527,0.763359,0.809160,0.762452,0.784124,0.021392,1
34,0.041753,0.003276,0.067998,0.012209,17,2,uniform,"{'algo__n_neighbors': 17, 'algo__p': 2, 'algo_...",0.797710,0.755725,0.809160,0.770115,0.783178,0.021275,2
10,0.050785,0.001946,0.087249,0.017613,5,2,uniform,"{'algo__n_neighbors': 5, 'algo__p': 2, 'algo__...",0.774809,0.801527,0.797710,0.754789,0.782209,0.018842,3
82,0.047000,0.006164,0.077749,0.007154,41,2,uniform,"{'algo__n_neighbors': 41, 'algo__p': 2, 'algo_...",0.782443,0.786260,0.797710,0.754789,0.780300,0.015764,4
6,0.047250,0.005214,0.068086,0.005576,3,2,uniform,"{'algo__n_neighbors': 3, 'algo__p': 2, 'algo__...",0.770992,0.774809,0.832061,0.739464,0.779332,0.033391,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,0.039999,0.002345,0.027002,0.002345,5,1,distance,"{'algo__n_neighbors': 5, 'algo__p': 1, 'algo__...",0.736641,0.797710,0.767176,0.704981,0.751627,0.034518,96
1,0.063002,0.011291,0.028249,0.004026,1,1,distance,"{'algo__n_neighbors': 1, 'algo__p': 1, 'algo__...",0.717557,0.721374,0.759542,0.697318,0.723948,0.022492,97
0,0.040249,0.007122,0.113500,0.026195,1,1,uniform,"{'algo__n_neighbors': 1, 'algo__p': 1, 'algo__...",0.717557,0.721374,0.759542,0.697318,0.723948,0.022492,97
3,0.044248,0.003963,0.024999,0.001224,1,2,distance,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.698473,0.713740,0.748092,0.697318,0.714406,0.020500,99


In [50]:
# Hyperparameter combination ranked first by the cross-validated search.
model.best_params_

{'algo__n_neighbors': 17, 'algo__p': 1, 'algo__weights': 'uniform'}

In [51]:
# Compare the tuned model's score on the training split with its score on the
# held-out test split; a large gap between the two would suggest overfitting.
model.score(X_train, y_train), model.score(X_test, y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


(0.8137535816618912, 0.8244274809160306)

## Prediction

In [55]:
# Show a single feature row as a reminder of the column order the model expects.
X.head(1)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
0,1,female,29.0,0,0,211.3375


In [60]:
# Two hypothetical passengers. Each row lists its values in the column order
# of X: pclass, sex, age, sibsp, parch, fare.
data = [
    [1, 'female', 29, 1, 1, 180],
    [3, 'male', 27, 0, 0, 110],
]
X_pred = pd.DataFrame(
    data=data,
    columns=X.columns,
    index=['Rose', 'Jack'],
)
X_pred

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
Rose,1,female,29,1,1,180
Jack,3,male,27,0,0,110


In [62]:
# Predicted `survived` labels for the hypothetical passengers, in row order.
model.predict(X_pred)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array([1, 0], dtype=int64)

In [64]:
# Attach the predictions as a new 'survived' column (this mutates X_pred in
# place) and display the result.
# NOTE(review): if this cell is re-run, X_pred already contains 'survived'
# when it is passed to predict — confirm the installed scikit-learn version
# tolerates the extra column.
X_pred['survived'] = model.predict(X_pred)
X_pred

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,pclass,sex,age,sibsp,parch,fare,survived
Rose,1,female,29,1,1,180,1
Jack,3,male,27,0,0,110,0


## save model

In [66]:
import pickle

In [68]:
# Persist the fitted GridSearchCV object to disk. The context manager
# guarantees the file handle is flushed and closed even if pickling raises;
# a bare open(...) passed straight to pickle.dump is never explicitly closed.
with open("../models/knn_titanic.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

## load model

In [73]:
# Restore the fitted model from disk, closing the file handle as soon as the
# load finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# you created yourself or otherwise trust.
with open("../models/knn_titanic.pkl", "rb") as model_file:
    model = pickle.load(model_file)
# model