# Rangkuman

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Load the Titanic passengers (indexed by PassengerId) and discard the columns
# that this notebook does not use as features. The drop is chained instead of
# done in place, so `df` is built in a single expression.
df = (
    pd.read_csv(
        'https://raw.githubusercontent.com/amongnikol/belajaraDataScienceJCOp/refs/heads/main/course2-regresiDanKlasifikasiPemula-supervisedLearning/data/titanic.csv',
        index_col='PassengerId',
    )
    .drop(columns=['Name', 'Ticket', 'Age', 'Cabin'])
)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S
4,1,1,female,1,0,53.1,S
5,0,3,male,0,0,8.05,S


In [2]:
# Dataset splitting: separate the target column from the features, then hold
# out 20% of the rows as a test set, stratified on the target and seeded for
# reproducibility.
y = df['Survived']
x = df.drop(columns=['Survived'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Show the shapes of the four resulting pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((712, 6), (179, 6), (712,), (179,))

In [3]:
# Preprocessor

# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes a category that was not
# seen during fit encode as all zeros instead of raising at predict time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its pipeline. Pclass is listed with the
# categorical columns, so it is one-hot encoded rather than scaled.
preprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, ['SibSp',  'Parch', 'Fare']),
    ('categoric', categorical_pipeline, ['Pclass', 'Sex', 'Embarked'])
])

In [4]:
# Pipeline: preprocessing ('prep') followed by the k-nearest-neighbours
# classifier ('algo'). The step names are the prefixes used when addressing
# hyper-parameters, e.g. 'algo__n_neighbors'.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', KNeighborsClassifier()),
])

In [5]:
# Parameter tuning

# Search grid for the 'algo' step: 25 odd neighbour counts (1..49) x 2
# weighting schemes x 2 values of the Minkowski power parameter = 100
# candidates.
parameter = dict(
    algo__n_neighbors=range(1, 51, 2),
    algo__weights=['uniform', 'distance'],
    algo__p=[1, 2],
)

# Exhaustive search with 3-fold cross-validation, using all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [6]:
# Report the winning hyper-parameters, then the score on the training set
# and on the held-out test set (printed on one line, in that order).
print(model.best_params_)
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
print(train_score, test_score)

{'algo__n_neighbors': 21, 'algo__p': 1, 'algo__weights': 'uniform'}
0.8174157303370787 0.7821229050279329


# Prediction

In [7]:
# Incoming data must have the same structure as the DataFrame we built for
# training; show the first feature row as a template.

x.iloc[:1]

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,male,1,0,7.25,S


In [8]:
# Two hand-written passengers, one row each, with values given in the same
# order as the columns of `x`.
data = [
    [1, 'female', 1, 1, 80, 'S'],  # Rose
    [3, 'male', 0, 0, 5, 'S'],     # Jack
]

x_pred = pd.DataFrame(data=data, columns=x.columns, index=['Rose', 'Jack'])
x_pred

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
Rose,1,female,1,1,80,S
Jack,3,male,0,0,5,S


In [9]:
# `model` here already uses the best estimator found by the search.

model.predict(X=x_pred)

array([1, 0], dtype=int64)

In [10]:
# Attach the predictions as a 'Survived' column. Predict on the feature
# columns only, so that re-running this cell never feeds a previously added
# 'Survived' column back into the model.
x_pred['Survived'] = model.predict(x_pred[x.columns])
x_pred

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Survived
Rose,1,female,1,1,80,S,1
Jack,3,male,0,0,5,S,0


# Save Model

In [12]:
# Using joblib: persist the whole fitted GridSearchCV object.
import joblib

joblib.dump(value=model, filename='knn_titanic.pkl')


['knn_titanic.pkl']

In [13]:
# Persist only the best pipeline found by the search, without the rest of
# the GridSearchCV object.
joblib.dump(value=model.best_estimator_, filename='knn_titanic_small.pkl')

['knn_titanic_small.pkl']

In [17]:
# Reload the saved best pipeline and check that it still predicts.
# `x_pred` may already carry the 'Survived' column added earlier, so pass
# only the feature columns to the model.
model_loaded = joblib.load('knn_titanic_small.pkl')
model_loaded.predict(x_pred[x.columns])

array([1, 0], dtype=int64)

In [20]:
# Using pickle: the same round trip with the standard-library module.
# NOTE: only unpickle files you trust; loading a pickle can execute code.

import pickle

# Serialise the whole fitted search object (file opened for binary writing).
with open('knn_titanic_pickle.pkl', mode='wb') as sink:
    pickle.dump(model, sink)

# Read it back (file opened for binary reading).
with open('knn_titanic_pickle.pkl', mode='rb') as source:
    model_loaded = pickle.load(source)
