# Strategy 0.5

Notebook ini merupakan salinan dari Strategy 0.3. Perbedaannya hanya pada binning kolom Fare, yang pengelompokannya diubah.

In [1]:
import sys
sys.path.append('..')
from src.func import custom_info

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the interim dataset prepared for this strategy, indexed by passenger id.
df = pd.read_csv(
    "../data/interim/for-strategy5.csv",
    index_col="PassengerId",
)

In [3]:
# Separate the target column from the predictor columns.
y = df["Survived"]
X = df.drop(columns="Survived")

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# class proportions similar in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
# Numeric features: fill missing values with the column median, then rescale
# with MinMaxScaler.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode.
# handle_unknown="ignore" makes a category that was absent when the encoder was
# fitted (e.g. a rare level missing from one cross-validation training fold, or
# a new level at prediction time) encode as all zeros instead of raising an
# error during transform.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

In [5]:
# Inspect the training features with the project helper `custom_info`
# (imported from src.func). Judging by the output below, it reports each
# column's dtype, null count/percentage, unique count and a few sample values
# -- NOTE(review): confirm against the helper's implementation.
custom_info(X_train)

Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,Pclass,int64,0,0.0,3,"[3, 3, 3]"
1,Sex,object,0,0.0,2,"[male, male, male]"
2,Age,object,137,19.24,5,"[lansia, balita, nan]"
3,SibSp,int64,0,0.0,7,"[0, 0, 0]"
4,Parch,int64,0,0.0,7,"[0, 0, 0]"
5,Fare,object,14,1.97,4,"[cheap, expensive, medium]"
6,Embarked,object,2,0.28,3,"[S, S, S]"
7,isAlone,bool,0,0.0,2,"[False, True, True]"


In [6]:
# Column groups: SibSp and Parch go through the numeric pipeline; every other
# feature, including Pclass and the isAlone flag, goes through the categorical one.
numerical_columns = ["SibSp", "Parch"]
categorical_columns = ["Pclass", "Sex", "Age", "Fare", "Embarked", "isAlone"]

# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, numerical_columns),
        ("categoric", categorical_pipeline, categorical_columns),
    ]
)

# Full model: preprocessing followed by a k-nearest-neighbours classifier.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("knn", KNeighborsClassifier()),
    ]
)

In [7]:
# Search space for the "knn" step of the pipeline: odd neighbour counts from
# 1 to 29, both weighting schemes, and three values of the `p` exponent.
parameter = dict(
    knn__n_neighbors=range(1, 31, 2),
    knn__weights=["uniform", "distance"],
    knn__p=[1, 1.5, 2],
)

# Exhaustive grid search with 3-fold cross-validation, parallelised with n_jobs=-1.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
# Keep the fit call as the last expression so the fitted search object is displayed.
model.fit(X_train, y_train)

Fitting 3 folds for each of 90 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:    6.9s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch']),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('imputer',
                                                           

In [8]:
# Show the hyper-parameter combination selected by the grid search.
model.best_params_

{'knn__n_neighbors': 9, 'knn__p': 1.5, 'knn__weights': 'distance'}

In [9]:
# Score the fitted search object on the training rows and on the held-out rows;
# the cell displays the pair as (train, test).
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
train_score, test_score

(0.8778089887640449, 0.8324022346368715)