# 0.0 Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics as mt
from sklearn.neighbors import KNeighborsClassifier

# 0.1 Loading Datasets

In [2]:
# Read the pre-split feature and target CSVs from the working directory,
# one features/target pair per split (train, validation, test).
X_train, y_train = pd.read_csv("X_training.csv"), pd.read_csv("y_training.csv")
X_val, y_val = pd.read_csv("X_validation.csv"), pd.read_csv("y_validation.csv")
X_test, y_test = pd.read_csv("X_test.csv"), pd.read_csv("y_test.csv")

In [3]:
# Show the first three training rows as loaded (the "id" column is still present here).
X_train.head(n=3)

Unnamed: 0,id,customer_type,age,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,gender_Female,gender_Male,type_of_travel_business_travel,type_of_travel_personal_travel
0,13508,1,0.5,0.0,0.03958,0.6,0.6,0.6,0.6,1.0,...,0.5,1.0,0.6,0.4,0.0,0.013848,1.0,0.0,1.0,0.0
1,28874,1,0.24359,0.0,0.205775,0.6,0.4,0.4,0.4,0.6,...,0.5,0.5,0.2,0.6,0.0,0.0,0.0,1.0,1.0,0.0
2,21484,0,0.435897,1.0,0.026858,0.6,0.6,0.6,0.2,1.0,...,0.0,1.0,0.6,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [4]:
# data preparation
# Flatten every target into a 1-D array (the targets are presumably single-column
# frames as read from CSV -- verify against the files).
# np.ravel accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely; `.values.ravel()` fails with AttributeError once the name already
# holds an ndarray. y_test is flattened as well so all three splits share one form.
y_train = np.ravel(y_train)
y_val = np.ravel(y_val)
y_test = np.ravel(y_test)

In [5]:
# Remove the "id" column from every split so it is not used as a model feature.
# drop() returns a new frame each time; the three names are rebound together.
X_train, X_val, X_test = (
    split.drop(columns=["id"]) for split in (X_train, X_val, X_test)
)

In [6]:
# Confirm the "id" column is gone: show the first three training rows again.
X_train.head(n=3)

Unnamed: 0,customer_type,age,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,gender_Female,gender_Male,type_of_travel_business_travel,type_of_travel_personal_travel
0,1,0.5,0.0,0.03958,0.6,0.6,0.6,0.6,1.0,1.0,...,0.5,1.0,0.6,0.4,0.0,0.013848,1.0,0.0,1.0,0.0
1,1,0.24359,0.0,0.205775,0.6,0.4,0.4,0.4,0.6,0.8,...,0.5,0.5,0.2,0.6,0.0,0.0,0.0,1.0,1.0,0.0
2,0,0.435897,1.0,0.026858,0.6,0.6,0.6,0.2,1.0,0.6,...,0.0,1.0,0.6,1.0,0.0,0.0,1.0,0.0,1.0,0.0


# 1.0 KNN Classifier

## 1.1 Training 

In [7]:
# Sweep the odd neighbour counts 3, 5, ..., 17 and record each model's scores.
# NOTE(review): every score below is computed on X_train, the same rows the model
# was fitted on, so the numbers are in-sample. acc_list is later used to pick
# best_k -- consider scoring on X_val instead if k is meant to be tuned on
# held-out data.
k = np.arange(3, 19, 2)
acc_list, precision_list, recall_list, f1_score_list = [], [], [], []

# Each metric function is paired with the list that collects its rounded result.
metric_sinks = (
    (mt.accuracy_score, acc_list),
    (mt.precision_score, precision_list),
    (mt.recall_score, recall_list),
    (mt.f1_score, f1_score_list),
)

for n_neighbors in k:
    # fit() returns the estimator itself, so define + fit can be chained
    model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    yhat_train = model.predict(X_train)

    # round to 3 decimals before storing, one value per metric per k
    for score_fn, sink in metric_sinks:
        sink.append(np.round(score_fn(y_train, yhat_train), 3))

In [8]:
# Results: one row per candidate k, one column per training metric.
# NOTE(review): 'Precisoin' is a typo for 'Precision'; the label is kept unchanged
# here so the table matches the recorded output -- rename it on the next re-run.
df_results = pd.DataFrame(
    {
        'K': k,
        'Accuracy': acc_list,
        'Precisoin': precision_list,
        'Recall': recall_list,
        'F1-Score': f1_score_list,
    }
)
df_results

Unnamed: 0,K,Accuracy,Precisoin,Recall,F1-Score
0,3,0.957,0.973,0.926,0.949
1,5,0.948,0.97,0.907,0.937
2,7,0.943,0.969,0.898,0.932
3,9,0.94,0.968,0.89,0.927
4,11,0.937,0.966,0.886,0.924
5,13,0.935,0.965,0.882,0.921
6,15,0.934,0.965,0.88,0.92
7,17,0.933,0.963,0.878,0.919


In [None]:
# Position of the highest accuracy in acc_list; on a tie the first (smallest k)
# position wins, because max() keeps the first maximal element it encounters.
best_k_index = max(range(len(acc_list)), key=lambda pos: acc_list[pos])
# Neighbour count stored at that position
best_k = k[best_k_index]
best_k


3

In [None]:
# Training-set metrics of the selected k (the row of df_results at best_k_index)
df_results.loc[best_k_index]

K            3.000
Accuracy     0.957
Precisoin    0.973
Recall       0.926
F1-Score     0.949
Name: 0, dtype: float64

## 1.2 Validation

In [19]:
# Refit with the selected k on the training split, then predict the validation split.
model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
yhat_val = model.predict(X_val)

# Validation metrics, each rounded to 3 decimals
acc, precision, recall, f1_score = (
    np.round(score_fn(y_val, yhat_val), 3)
    for score_fn in (
        mt.accuracy_score,
        mt.precision_score,
        mt.recall_score,
        mt.f1_score,
    )
)

print(f"Accuracy: {acc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")

Accuracy: 0.924
Precision: 0.943
Recall: 0.877
F1-Score: 0.909


## 1.3 Test

In [21]:
# Final model: refit on train + validation together, then score on the test split.
# pd.concat keeps the column names and aligns columns by name. np.concatenate
# would turn the frames into a bare ndarray, so the model would be fitted without
# feature names and scikit-learn would warn when predict() receives the X_test
# DataFrame.
X_train_val = pd.concat([X_train, X_val], ignore_index=True)
y_train_val = np.concatenate((y_train, y_val))

# define
model = KNeighborsClassifier(n_neighbors=best_k)
# fit
model.fit(X_train_val, y_train_val)
# predict
yhat_test = model.predict(X_test)

# metrics (rounded to 3 decimals)
acc = np.round(mt.accuracy_score(y_test, yhat_test), 3)
print(f"Accuracy: {acc}")
precision = np.round(mt.precision_score(y_test, yhat_test), 3)
print(f"Precision: {precision}")
recall = np.round(mt.recall_score(y_test, yhat_test), 3)
print(f"Recall: {recall}")
f1_score = np.round(mt.f1_score(y_test, yhat_test), 3)
print(f"F1-Score: {f1_score}")



Accuracy: 0.928
Precision: 0.945
Recall: 0.887
F1-Score: 0.915
