In [36]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw bookings and normalise column names to lower snake_case.
df = pd.read_csv('../dataset/booking_train.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same lower snake_case normalisation to the text-valued columns.
for col in ['type_of_meal', 'room_type', 'market_segment_type']:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Drop rows whose reservation date contains '-': they cannot be parsed with the
# '%m/%d/%Y' format used below. The explicit .copy() makes the filtered frame
# independent of the original, so the column assignments that follow do not
# raise pandas' SettingWithCopyWarning.
df = df[~df['date_of_reservation'].str.contains('-')].copy()

# Derive the abbreviated month name (e.g. 'Jan') of the reservation date.
df['month_of_reservation'] = (
    pd.to_datetime(df['date_of_reservation'], format='%m/%d/%Y').dt.strftime('%b')
)

# Binary target: 1 when the booking was canceled, 0 otherwise.
df['booking_status'] = (df['booking_status'] == 'Canceled').astype(int)

In [4]:
# Train/validation/test split (60% / 20% / 20%) with scikit-learn.
# `train_test_split` is imported in the notebook's import cell.
# Step 1 holds out 20% of the rows for testing; step 2 takes 25% of the
# remaining 80% (i.e. 20% of the total) for validation. random_state is fixed
# so the split is reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Discard the shuffled original index so every frame is indexed 0..n-1.
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# df_full_train keeps its target column; only the label array is extracted.
y_full_train = df_full_train['booking_status'].values

# pop() returns the target column and removes it from the frame in one step,
# so the label cannot be left behind among the features.
y_train = df_train.pop('booking_status').values
y_val = df_val.pop('booking_status').values
y_test = df_test.pop('booking_status').values


In [10]:
# Feature columns fed to the models, grouped by type.
numerical = [
    'number_of_adults',
    'number_of_children',
    'number_of_weekend_nights',
    'number_of_week_nights',
    'lead_time',
    'p-c',
    'p-not-c',
    'average_price',
    'special_requests',
]
categorical = [
    'type_of_meal',
    'room_type',
    'market_segment_type',
    'car_parking_space',
    'repeated',
    'month_of_reservation',
]

# One dict per test row (column name -> value), restricted to the feature
# columns. No vectorisation happens here: each loaded model is paired with its
# own DictVectorizer, which transforms these records when the model is scored.
feature_columns = categorical + numerical
test_dict = df_test[feature_columns].to_dict(orient='records')

In [12]:
def load_model(file_path):
    """Load a pickled ``(dv, model)`` pair from ``file_path``.

    The file is opened in binary mode and must unpickle to an iterable of
    exactly two items, which are returned as the tuple ``(dv, model)``.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only call this on files from a trusted source.
    """
    with open(file_path, 'rb') as model_file:
        bundle = pickle.load(model_file)
    dv, model = bundle
    return dv, model

In [13]:
# Restore the four persisted (vectorizer, model) pairs. The loads are
# independent of one another; each .bin file must unpickle to a two-item pair.
# Judging by the file names these are XGBoost, random forest, decision tree and
# logistic regression (the last is bound to `model_lg`).
dv_xgb, model_xgb = load_model(file_path='../model/cancellation-pred-model-xgb.bin')
dv_rf, model_rf = load_model(file_path='../model/cancellation-pred-model-random-forest.bin')
dv_dt, model_dt = load_model(file_path='../model/cancellation-pred-model-decision-tree.bin')
dv_lr, model_lg = load_model(file_path='../model/cancellation-pred-model-lr.bin')


In [None]:

# Quick sanity check of the logistic-regression model on the test set.
# The feature records must first be vectorised with the DictVectorizer that was
# saved alongside the model; the labels (y_test) are only used for scoring.
X_test_lr = dv_lr.transform(test_dict)

# ROC AUC needs a continuous score, so use the positive-class probability
# rather than hard 0/1 predictions.
y_pred_lg = model_lg.predict_proba(X_test_lr)[:, 1]
roc_auc_score(y_test, y_pred_lg)

In [42]:
# Evaluate every saved model on the held-out test set and tabulate the metrics.
model_names = [
    'cancellation-pred-model-lr.bin',
    'cancellation-pred-model-decision-tree.bin',
    'cancellation-pred-model-random-forest.bin',
    'cancellation-pred-model-xgb.bin',
]
# The XGBoost model is scored through a DMatrix instead of predict_proba.
XGB_MODEL_NAME = 'cancellation-pred-model-xgb.bin'
# Score above which a booking is classified as canceled.
THRESHOLD = 0.5

# Load each (DictVectorizer, model) pair, keeping the order of model_names.
dvs = []
models = []
for name in model_names:
    dv, model = load_model(f'../model/{name}')
    dvs.append(dv)
    models.append(model)

scores = []
# Iterate over (name, vectorizer, model) together so every prediction uses the
# model that belongs to the current row, not a variable left over from the
# loading loop above.
for name, dv, model in zip(model_names, dvs, models):
    X = dv.transform(test_dict)
    if name == XGB_MODEL_NAME:
        features = list(dv.get_feature_names_out())
        dtest = xgb.DMatrix(X, label=y_test, feature_names=features)
        # NOTE(review): assumes the booster outputs positive-class
        # probabilities (e.g. a binary:logistic objective) - confirm.
        y_pred = model.predict(dtest)
    else:
        y_pred = model.predict_proba(X)[:, 1]

    # Hard 0/1 decisions for the threshold-based metrics; AUC uses raw scores.
    y_label = y_pred > THRESHOLD
    scores.append((
        name,
        roc_auc_score(y_test, y_pred),
        accuracy_score(y_test, y_label),
        precision_score(y_test, y_label),
        recall_score(y_test, y_label),
        # Call the imported sklearn function instead of rebinding the name
        # `f1_score` to a float, which would shadow it for every later cell.
        f1_score(y_test, y_label),
    ))

pd.DataFrame(
    scores,
    columns=['model_name', 'auc_score', 'accuracy', 'precision', 'recall', 'f1_score'],
)



Unnamed: 0,model_name,auc_score,accuracy,precision,recall,f1_score
0,cancellation-pred-model-lr.bin,0.862699,0.799862,0.715898,0.642465,0.677197
1,cancellation-pred-model-decision-tree.bin,0.922478,0.859034,0.81311,0.738286,0.773894
2,cancellation-pred-model-random-forest.bin,0.934336,0.872552,0.852611,0.737442,0.790856
3,cancellation-pred-model-xgb.bin,0.9524,0.889793,0.853285,0.800338,0.825964
