# Library

In [5]:
import numpy as np # numerical python
import pandas as pd # pandas
from sklearn.model_selection import train_test_split # spliting data
from sklearn.pipeline import Pipeline # make pipeline
from sklearn.compose import ColumnTransformer # transform pipeline
from sklearn.neighbors import KNeighborsClassifier # algorithm
from sklearn.impute import SimpleImputer # imputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler # encoder and scaler
from sklearn.model_selection import GridSearchCV

# Preprocessing

In [6]:
### load the dataset and discard unused columns
# Read the Titanic CSV (indexed by PassengerId) and drop the columns that are not
# used as features, in a single chained expression.
df = (
    pd.read_csv("drive/MyDrive/Colab Notebooks/data/titanic.csv", index_col="PassengerId")
    .drop(columns=["Name", "Ticket", "Age", "Cabin"])
)

### separate features from the target
y = df["Survived"]
X = df.drop(columns="Survived")

### stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# show the shape of every split as the cell output
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 6), (179, 6), (712,), (179,))

In [7]:
### preprocessor
# Numeric features: fill missing values with the column mean, then rescale with
# MinMaxScaler so that every numeric feature contributes on a comparable scale to
# the distances computed by KNN.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', MinMaxScaler())
])

# Categorical features: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" makes a category that was not seen during
# fit encode as all zeros instead of raising, so prediction on new data (or a CV
# fold whose training part lacks a rare category) does not fail.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])

# column transformer: route each column group to its pipeline (columns are selected
# by name; columns not listed here are not passed on to the model)
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, ["SibSp", "Parch", "Fare"]),
    ('categoric', categorical_pipeline, ["Pclass", "Sex", "Embarked"])
])

# main pipeline: preprocessing followed by the KNN classifier. The step names
# 'prep' and 'algo' are the prefixes used when addressing parameters during tuning
# (e.g. "algo__n_neighbors").
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', KNeighborsClassifier())
])

# Tuning and Cross Validation

In [8]:
### hyper-parameter grid
# Keys follow the "<pipeline step>__<parameter>" convention, so every entry targets
# the 'algo' (KNeighborsClassifier) step of the pipeline.
parameter = {
    "algo__n_neighbors": range(1, 51, 2),    # odd k from 1 to 49
    "algo__weights": ['uniform', 'distance'],  # neighbor weighting scheme
    "algo__p": [1, 2],                         # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

### exhaustive grid search with 3-fold cross validation
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch',
                                                                          'Fare']),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('im

# Model Evaluation

In [9]:
### evaluation
# Score the tuned model on both splits first, then report everything together.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

# best_params_ / best_score_ describe the winning candidate of the grid search
print("Best Params: ", model.best_params_)
print("Best Score: ", model.best_score_)
print("Train : ", train_score, "\ntest: ", test_score)

Best Params:  {'algo__n_neighbors': 19, 'algo__p': 1, 'algo__weights': 'uniform'}
Best Score:  0.8146060111808436
Train :  0.8188202247191011 
test:  0.7877094972067039


# Prediction

In [10]:
# data to predict on
# (in practice this would usually be loaded from a csv)
passenger_names = ["Rose", "Jack"]

# one row per passenger, values given in the same order as X.columns
data = [
    [1, "female", 1, 1, 80, "S"],
    [3, "male", 0, 0, 5, "S"],
]

X_pred = pd.DataFrame(data=data, index=passenger_names, columns=X.columns)
X_pred

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
Rose,1,female,1,1,80,S
Jack,3,male,0,0,5,S


In [11]:
### predict
# Predict the target for the example passengers with the tuned model
# (the fitted grid search forwards predict() to its refitted best estimator).
model.predict(X_pred)

array([1, 0])

In [12]:
### show the predictions next to the inputs
# Build a separate labelled frame instead of writing the prediction into X_pred:
# X_pred stays a pure feature frame, so it can be passed to predict() again later
# without carrying a predicted label among its columns, and re-running this cell
# never changes its own input.
X_pred_labeled = X_pred.assign(Survived=model.predict(X_pred))
X_pred_labeled

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Survived
Rose,1,female,1,1,80,S,1
Jack,3,male,0,0,5,S,0


# **Save Model**

In [None]:
### install jcopml dan import library save model

!pip install jcopml
from jcopml.utils import save_model

In [16]:
### save model (option 1, disabled)
# Persist the whole GridSearchCV object, i.e. including the CV and tuning results:
# save_model(model, "knn_titanic.pkl", 'drive/MyDrive/Colab Notebooks/model')

### save model (option 2, used here)
# Persist only the best pipeline found by the search (preprocessing + KNN).
# The cell output reports the path of the pickle file that was written.
save_model(model.best_estimator_, "knn_titanic_model.pkl", 'drive/MyDrive/Colab Notebooks/model')

Model is pickled as drive/MyDrive/Colab Notebooks/model/knn_titanic_model.pkl


# **Load model**

In [18]:
### from here on, treat the code as if it lived in a new file / fresh session
# import the helper that loads a saved model
from jcopml.utils import load_model

In [19]:
### load model
# Read back the pipeline written by the save cell.
# NOTE(review): the save cell reports the file as pickled; unpickling can execute
# arbitrary code, so only load model files from a trusted location.
model_knn = load_model('drive/MyDrive/Colab Notebooks/model/knn_titanic_model.pkl')

In [20]:
### use model
# Predict with the reloaded pipeline on the same example passengers, as a check
# that the saved model behaves like the in-memory one.
model_knn.predict(X_pred)

array([1, 0])