Capstone Project Module 03
Muhammad Arfan

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             classification_report, confusion_matrix, roc_auc_score, roc_curve)
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.impute import SimpleImputer


# Import models
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

In [2]:
# 1. Data Loading, Data Cleaning, dan Feature Engineering
df = pd.read_csv('data_hotel_booking_demand.csv')
print("Shape dataset:", df.shape)
print("\nMissing values per column:")
print(df.isnull().sum())
df['country'].fillna('Unknown')
print("\nDistribusi target variable (is_canceled) dalam proporsi:")
print(df['is_canceled'].value_counts(normalize=True))
print("\nStatistik deskriptif kolom numerik:")
print(df.describe())

# Encoding variabel kategorikal
le = LabelEncoder()
categorical_columns = ['country', 'market_segment', 'deposit_type', 'customer_type', 'reserved_room_type']
for col in categorical_columns:
    df[col] = le.fit_transform(df[col].astype(str))

# Feature Engineering
df['lead_time_category'] = pd.cut(df['days_in_waiting_list'], bins=[0, 30, 90, 180, 365, 1000], labels=[1, 2, 3, 4, 5])
df['cancellation_ratio'] = df['previous_cancellations'] / (df['previous_cancellations'] + df['booking_changes'] + 1)
df['booking_change_flag'] = (df['booking_changes'] > 0).astype(int)
df['is_repeat_guest'] = (df['customer_type'] == 1).astype(int)


# Fitur yang dipakai adalah seluruh kolom kecuali target
X = df.drop('is_canceled', axis=1)
y = df['is_canceled']





Shape dataset: (83573, 11)

Missing values per column:
country                        351
market_segment                   0
previous_cancellations           0
booking_changes                  0
deposit_type                     0
days_in_waiting_list             0
customer_type                    0
reserved_room_type               0
required_car_parking_spaces      0
total_of_special_requests        0
is_canceled                      0
dtype: int64

Distribusi target variable (is_canceled) dalam proporsi:
is_canceled
0    0.631723
1    0.368277
Name: proportion, dtype: float64

Statistik deskriptif kolom numerik:
       previous_cancellations  booking_changes  days_in_waiting_list  \
count            83573.000000     83573.000000          83573.000000   
mean                 0.086798         0.220897              2.330561   
std                  0.841011         0.648635             17.673051   
min                  0.000000         0.000000              0.000000   
25%                

In [None]:
# 2. Split Data, Scaling dan Oversampling dengan SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Impute missing values terlebih dahulu sebelum scaling
imputer = SimpleImputer(strategy='median')  # Bisa pakai 'median' jika data memiliki outlier
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

#Pastikan tidak ada NaN setelah imputasi
print("Apakah ada NaN setelah imputasi?", np.isnan(X_train_imputed).sum(), np.isnan(X_test_imputed).sum())

# Gunakan data yang sudah diimputasi untuk scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

#Pastikan tidak ada NaN setelah scaling
print("Apakah ada NaN setelah scaling?", np.isnan(X_train_scaled).sum(), np.isnan(X_test_scaled).sum())

# Gunakan SMOTE setelah data bebas dari NaN
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)




Apakah ada NaN setelah imputasi? 0 0
Apakah ada NaN setelah scaling? 0 0


In [4]:
# 3. Hyperparameter Tuning dan Cross Validation untuk 3 Model

models = {}

#RANDOM FOREST
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1)
grid_rf.fit(X_train_res, y_train_res)
print("\nRandom Forest best parameters:", grid_rf.best_params_)
print("Random Forest best CV ROC AUC:", grid_rf.best_score_)
models['RandomForest'] = grid_rf.best_estimator_

#XGBOOST
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.01]
}
grid_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=5, scoring='roc_auc', n_jobs=-1)
grid_xgb.fit(X_train_res, y_train_res)
print("\nXGBoost best parameters:", grid_xgb.best_params_)
print("XGBoost best CV ROC AUC:", grid_xgb.best_score_)
models['XGBoost'] = grid_xgb.best_estimator_

#LIGHTGBM
lgb_model = lgb.LGBMClassifier(random_state=42)
param_grid_lgb = {
    'n_estimators': [100, 200],
    'max_depth': [ -1, 10, 20],
    'learning_rate': [0.1, 0.01]
}
grid_lgb = GridSearchCV(lgb_model, param_grid_lgb, cv=5, scoring='roc_auc', n_jobs=-1)
grid_lgb.fit(X_train_res, y_train_res)
print("\nLightGBM best parameters:", grid_lgb.best_params_)
print("LightGBM best CV ROC AUC:", grid_lgb.best_score_)
models['LightGBM'] = grid_lgb.best_estimator_



Random Forest best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest best CV ROC AUC: 0.900654965412374


Parameters: { "use_label_encoder" } are not used.




XGBoost best parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}
XGBoost best CV ROC AUC: 0.8983099749251255
[LightGBM] [Info] Number of positive: 42194, number of negative: 42194
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 781
[LightGBM] [Info] Number of data points in the train set: 84388, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

LightGBM best parameters: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 200}
LightGBM best CV ROC AUC: 0.9009549251145478


In [5]:
# 4. Evaluasi dan Pemilihan Model Terbaik
cv_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train_res, y_train_res, cv=5, scoring='roc_auc')
    cv_scores[name] = scores.mean()
    print(name, "Cross Validation ROC AUC:", scores.mean())

best_model_name = max(cv_scores, key=cv_scores.get)
print("\nModel terbaik berdasarkan cross validation ROC AUC:", best_model_name)
best_model = models[best_model_name]

# Latih model terbaik menggunakan seluruh data training yang sudah di-resample
best_model.fit(X_train_res, y_train_res)

#Simpan Model ke File Menggunakan Pickle
joblib.dump(best_model, "best_hotel_booking_model.pkl")




RandomForest Cross Validation ROC AUC: 0.900654965412374


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost Cross Validation ROC AUC: 0.8983099749251255
[LightGBM] [Info] Number of positive: 33755, number of negative: 33755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 792
[LightGBM] [Info] Number of data points in the train set: 67510, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 33755, number of negative: 33755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003860 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 793
[LightGBM] [Info] Number of data points in the train set: 67510, number of used features: 14
[LightGBM] [Info] [bin

['best_hotel_booking_model.pkl']

In [6]:
# 5. Evaluasi Model Terbaik pada Test Set & Pencarian Threshold Optimal
# Pertama, evaluasi dengan threshold default 0.5
y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_default = (y_proba >= 0.5).astype(int)

print("\nEvaluasi dengan threshold default (0.5):")
print("Accuracy:", accuracy_score(y_test, y_pred_default))
print("Classification Report:\n", classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# Cari threshold terbaik untuk mencapai precision dan recall minimal 80% untuk kedua kelas (0 dan 1), jika memungkinkan.
best_threshold = 0.5
found = False
for thresh in np.arange(0.0, 1.0, 0.01):
    y_pred_thresh = (y_proba >= thresh).astype(int)
    precision0 = precision_score(y_test, y_pred_thresh, pos_label=0)
    recall0 = recall_score(y_test, y_pred_thresh, pos_label=0)
    precision1 = precision_score(y_test, y_pred_thresh, pos_label=1)
    recall1 = recall_score(y_test, y_pred_thresh, pos_label=1)
    if precision0 >= 0.80 and recall0 >= 0.80 and precision1 >= 0.80 and recall1 >= 0.80:
        best_threshold = thresh
        found = True
        print("\nDitemukan threshold:", thresh)
        print("Precision kelas 0: {:.2f}, Recall kelas 0: {:.2f}".format(precision0, recall0))
        print("Precision kelas 1: {:.2f}, Recall kelas 1: {:.2f}".format(precision1, recall1))
        break

if not found:
    print("\nTidak ditemukan threshold yang memenuhi seluruh syarat minimal precision dan recall 80%.")
    print("Threshold default (0.5) akan digunakan.")

# Evaluasi akhir dengan threshold terbaik atau threshold default jika tidak ditemukan threshold optimal
y_pred_final = (y_proba >= best_threshold).astype(int)

print("\nEvaluasi Model Terbaik pada Test Set (Threshold = {}):".format(best_threshold))
print("Accuracy:", accuracy_score(y_test, y_pred_final))
print("Classification Report:\n", classification_report(y_test, y_pred_final))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_final))
print("ROC AUC:", roc_auc_score(y_test, y_proba))





Evaluasi dengan threshold default (0.5):
Accuracy: 0.7949147472330242
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.79      0.83     10601
           1       0.69      0.80      0.74      6114

    accuracy                           0.79     16715
   macro avg       0.78      0.80      0.79     16715
weighted avg       0.81      0.79      0.80     16715

Confusion Matrix:
 [[8394 2207]
 [1221 4893]]
ROC AUC: 0.8943265546973013


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Tidak ditemukan threshold yang memenuhi seluruh syarat minimal precision dan recall 80%.
Threshold default (0.5) akan digunakan.

Evaluasi Model Terbaik pada Test Set (Threshold = 0.5):
Accuracy: 0.7949147472330242
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.79      0.83     10601
           1       0.69      0.80      0.74      6114

    accuracy                           0.79     16715
   macro avg       0.78      0.80      0.79     16715
weighted avg       0.81      0.79      0.80     16715

Confusion Matrix:
 [[8394 2207]
 [1221 4893]]
ROC AUC: 0.8943265546973013
