# Strategy 0.3 

Strategi ini adalah ekstensi dari strategi sebelumnya yang menggunakan Feature Engineering.
Feature Engineering ini dilakukan dengan dasar temuan-temuan yang didapatkan saat melakukan EDA sederhana.
Adapun EDA-nya dapat dilihat pada step `0.3.0`, dan kode untuk melakukan feature engineering ini bisa dilihat pada `src/features/features_engineering.ipynb`.

In [1]:
import sys
sys.path.append('..')
from src.func import custom_info

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the interim dataset prepared for strategy 3, using PassengerId as the row index.
df = pd.read_csv(
    "../data/interim/for-strategy3.csv",
    index_col="PassengerId",
)

In [3]:
# Separate the target column from the predictor columns.
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
# Numeric features: fill missing values with the column mean, then rescale with
# MinMaxScaler so every numeric feature lies on a comparable range before it is
# fed to the distance-based KNN classifier.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising. Without it, a rare category that is absent
# from one cross-validation training fold (or first appears at prediction time)
# makes transform() fail, and GridSearchCV records that fit as NaN.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [5]:
# Per-column overview of the training features from the project helper
# `custom_info`; the displayed table lists dtype, null count/percentage and
# unique values for each column.
custom_info(X_train)

Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,Pclass,int64,0,0.0,3,"[1, 3, 2]"
1,Sex,object,0,0.0,2,"[female, female, female]"
2,Age,object,137,19.24,5,"[nan, anak-anak, lansia]"
3,SibSp,int64,0,0.0,7,"[0, 0, 0]"
4,Parch,int64,0,0.0,7,"[0, 0, 0]"
5,Fare,object,14,1.97,3,"[cheap, cheap, cheap]"
6,Embarked,object,2,0.28,3,"[S, S, S]"
7,isAlone,bool,0,0.0,2,"[True, True, True]"


In [6]:
# Column routing: SibSp and Parch go through the numeric pipeline; every other
# feature (including the integer-coded Pclass and the boolean isAlone) goes
# through the categorical pipeline.
categorical_columns = ["Pclass", "Sex", "Age", "Fare", "Embarked", "isAlone"]
numerical_columns = ["SibSp", "Parch"]

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, numerical_columns),
        ("categoric", categorical_pipeline, categorical_columns),
    ]
)

# Full model: preprocessing followed by a KNN classifier with default settings
# (its hyperparameters are tuned later through the "knn__" prefix).
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("knn", KNeighborsClassifier()),
    ]
)

In [7]:
# Hyperparameter grid for the "knn" pipeline step:
#   - n_neighbors: odd values 1, 3, ..., 29
#   - weights: uniform vs. distance-weighted votes
#   - p: Minkowski power parameter (1 or 2)
# 15 * 2 * 2 = 60 candidate combinations.
parameter = {
    "knn__n_neighbors": range(1, 31, 2),
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2],
}

# Exhaustive search with 3-fold cross-validation, using all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 173 out of 180 | elapsed:    5.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.0s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch']),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('imputer',
                                                                            

In [8]:
# Hyperparameter combination that achieved the best cross-validation score.
model.best_params_

{'knn__n_neighbors': 25, 'knn__p': 2, 'knn__weights': 'distance'}

In [9]:
# Score of the fitted search on the training split vs. the held-out test split
# (displayed as a (train, test) tuple); a large gap between the two hints at overfitting.
model.score(X_train, y_train), model.score(X_test, y_test)

(0.8792134831460674, 0.7988826815642458)

# Baseline 

In [10]:
import numpy as np 
from sklearn.metrics import accuracy_score

### Random Guessing 

In [11]:
# Random-guess baseline: predict 0 or 1 uniformly at random for every row.
# A seeded generator is used so the baseline accuracies are reproducible on
# every run, consistent with the fixed random_state used for the data split.
rng = np.random.default_rng(42)
random_pred_train = rng.integers(0, 2, size=len(y_train))  # upper bound is exclusive -> {0, 1}
random_pred_test = rng.integers(0, 2, size=len(y_test))

train_acc = accuracy_score(y_train, random_pred_train)
test_acc = accuracy_score(y_test, random_pred_test)

print(train_acc, test_acc)

0.46348314606741575 0.5083798882681564


### Threshold 

In [12]:
# Single-feature baseline: predict "survived" (1) for every female passenger
# and "did not survive" (0) for every male passenger.
threshold_pred_train = X_train.Sex.map(
    {"male": 0, "female": 1}
)
threshold_pred_test = X_test.Sex.map(
    {"male": 0, "female": 1}
)

# Accuracy of the rule on each split.
train_acc = accuracy_score(y_train, threshold_pred_train)
test_acc = accuracy_score(y_test, threshold_pred_test)

print(train_acc, test_acc)

0.7893258426966292 0.776536312849162
