# Strategy 0.2

Sama seperti Strategy 0.1, namun ditambahkan GridSearchCV untuk mencari secara menyeluruh (brute force) kombinasi hyperparameter KNN dengan cross-validation, sehingga didapatkan parameter dan skor terbaik.

In [1]:
import sys
sys.path.append('..')
from src.func import custom_info

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the raw training data, using the passenger id as the row index.
df = pd.read_csv(
    "../data/raw/train.csv",
    index_col="PassengerId",
)

In [3]:
# Show only the columns that contain missing values (helper imported from src.func).
custom_info(df, mode='missing_values_only')

Unnamed: 0,dataFeatures,dataType,null,nullPct
0,Age,float64,177,19.87
1,Cabin,object,687,77.1
2,Embarked,object,2,0.22


In [4]:
# List the available columns before deciding which features to drop.
df.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Drop the features this strategy does not use.
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=["Name", "Age", "Cabin", "Ticket"], errors="ignore")

Pada strategi ini, saya men-drop feature berikut:
1. Name : Karena goal awal saya adalah membuat model sesederhana mungkin
2. Age : Karena persentase missing values mencapai hampir 20% (19,87%)
3. Cabin : Karena persentase missing values mencapai 77%
4. Ticket : Karena untuk saat ini dianggap tidak terlalu berguna

In [6]:
# Separate the target from the features.
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Numeric features: fill missing values with the column mean, then rescale
# each column with MinMaxScaler.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising an error, so a cross-validation fold or new
# prediction data containing an unseen category cannot crash the pipeline.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [8]:
# Inspect dtype, null count and unique values of each training feature.
custom_info(X_train)

Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,Pclass,int64,0,0.0,3,"[2, 3, 3]"
1,Sex,object,0,0.0,2,"[female, male, male]"
2,SibSp,int64,0,0.0,7,"[3, 0, 0]"
3,Parch,int64,0,0.0,7,"[0, 0, 0]"
4,Fare,float64,0,0.0,226,"[27.9, 120.0, 7.75]"
5,Embarked,object,2,0.28,3,"[S, S, S]"


In [9]:
# Features routed through the numeric (impute + scale) pipeline.
numerical_columns = [
    "SibSp",
    "Parch",
    "Fare",
]

# Features routed through the categorical (impute + one-hot) pipeline.
# Pclass is stored as int64 but is treated as a category here.
categorical_columns = [
    "Pclass",
    "Sex",
    "Embarked",
]

Kolom dipisahkan menjadi numerical dan categorical. Pemisahan ini dilakukan berdasarkan informasi datatype ataupun makna kolom pada dataset; misalnya `Pclass` bertipe int64 tetapi diperlakukan sebagai categorical.

In [10]:
# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, numerical_columns),
        ("categoric", categorical_pipeline, categorical_columns),
    ]
)

In [11]:
# Full model: preprocessing followed by a KNN classifier with default
# hyperparameters. The step names ("prep", "knn") are the prefixes used when
# addressing parameters, e.g. "knn__n_neighbors".
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("knn", KNeighborsClassifier()),
    ]
)

In [12]:
# Fit the preprocessing + KNN pipeline on the training split; the fitted
# pipeline's repr is shown as the cell output.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['SibSp', 'Parch', 'Fare']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder())]),
                                                  ['Pclass', 'Sex',
                           

In [13]:
# Hyperparameter grid for the "knn" step of the pipeline:
# 25 odd neighbour counts x 2 weightings x 2 values of p = 100 candidates.
parameter = {
    "knn__n_neighbors": range(1, 51, 2),
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2],
}

# Exhaustive search over the grid with 3-fold cross-validation, using all
# available CPU cores and printing progress.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [14]:
# Run the grid search on the training split (100 candidates x 3 folds = 300 fits).
model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    7.3s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch',
                                                                          'Fare']),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('impu

In [15]:
# Hyperparameter combination selected by the grid search.
model.best_params_

{'knn__n_neighbors': 19, 'knn__p': 1, 'knn__weights': 'uniform'}

In [16]:
# Score the tuned model on the training split and on the held-out test split;
# a large gap between the two would indicate overfitting.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
train_score, test_score

(0.8188202247191011, 0.7877094972067039)