# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
import plotly.express as px

In [2]:
def checkVar(X,y):
    """Return the column labels of ``X`` and ``y`` as one flat list.

    ``X`` and ``y`` are concatenated column-wise, and the labels of the
    combined frame are returned in order: the columns of ``X`` first,
    followed by the column(s) contributed by ``y``.
    """
    combined = pd.concat([X, y], axis=1)
    return list(combined.columns)

In [3]:
def doDecision(X,y, test_size = 0.30, random_state = None, model_path = 'dtc.joblib'):
    """Train a decision tree on a hold-out split, report its metrics and persist it.

    Parameters
    ----------
    X, y : feature matrix and target passed straight to ``train_test_split``.
    test_size : fraction of rows held out for evaluation (default 0.30).
    random_state : seed forwarded to both the split and the tree. The default
        ``None`` leaves both unseeded; pass an integer to make the reported
        metrics reproducible across runs.
    model_path : file the fitted model is dumped to with joblib. Pass ``None``
        to skip writing the file.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    # Local import so the function keeps working without a notebook-level joblib import.
    from joblib import dump

    # splitting data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state)

    dtc = DecisionTreeClassifier(random_state = random_state)
    dtc.fit(X_train, y_train)

    y_pred_dtc = dtc.predict(X_test)

    print(y_pred_dtc)

    acc_dtc = accuracy_score(y_test, y_pred_dtc)
    conf = confusion_matrix(y_test, y_pred_dtc)
    clf_report = classification_report(y_test, y_pred_dtc)

    print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
    print(f"Confusion Matrix : \n{conf}")
    print(f"Classification Report : \n{clf_report}")

    # Persist the model for later use. The in-memory estimator is returned
    # directly: reloading the file that was just written adds nothing.
    if model_path is not None:
        dump(dtc, model_path)
    return dtc

    

In [4]:
def FillNA(df):
    """Return a cleaned copy of ``df``: nulls set to 0 and guest-less bookings removed.

    Every missing value is replaced with 0. Rows where ``children``,
    ``adults`` and ``babies`` are all 0 are then dropped, because a booking
    cannot have all three counts at zero. The input frame is left untouched,
    and the original index labels of the surviving rows are preserved.
    """
    # Fill null values with zero on a new frame so the caller's data is not mutated.
    filled = df.fillna(0)

    # The three guest-count columns must not all be zero at once.
    no_guests = (filled.children == 0) & (filled.adults == 0) & (filled.babies == 0)

    # Remove the records where all three columns are 0. The explicit copy lets
    # callers assign new columns without chained-assignment warnings.
    return filled[~no_guests].copy()

In [15]:
def doPreProcess(df, LessCorr):
    """Turn the raw bookings frame into model-ready features and target.

    Steps, in order:
      1. set the ``name`` column aside and clean the remaining data with ``FillNA``;
      2. encode the object-typed columns as integers and split
         ``reservation_status_date`` into ``year`` / ``month`` / ``day``;
      3. log-scale the skewed numeric columns;
      4. drop a fixed list of unused columns plus the ``LessCorr`` numeric
         columns with the weakest absolute Pearson correlation to ``is_canceled``.

    Parameters
    ----------
    df : raw bookings DataFrame. It is not modified.
    LessCorr : number of least-correlated numeric columns to drop (0 keeps all).
        The target column itself is never dropped.

    Returns
    -------
    (X, y, dfname) : the feature frame, the ``is_canceled`` series, and the
        ``name`` column exactly as received. ``dfname`` is not filtered by
        ``FillNA``; align it through the index of ``X`` if row-wise names are
        needed.
    """
    target = 'is_canceled'

    # Work on a new frame: dropping 'name' in place would change the caller's
    # DataFrame and make a second call fail with KeyError on df["name"].
    dfname = df["name"]
    df = FillNA(df.drop(columns = ['name']))

    # creating categorical dataframes (explicit copy: columns are assigned below)
    cat_cols = [col for col in df.columns if df[col].dtype == 'O']
    cat_df = df[cat_cols].copy()

    # NOTE(review): year/month/day are derived from reservation_status_date;
    # confirm this date is known at prediction time and does not leak the outcome.
    status_date = pd.to_datetime(cat_df['reservation_status_date'])
    cat_df['year'] = status_date.dt.year
    cat_df['month'] = status_date.dt.month
    cat_df['day'] = status_date.dt.day
    cat_df = cat_df.drop(columns = ['reservation_status_date', 'arrival_date_month'])

    # Integer codes per category; values missing from a mapping become NaN.
    encodings = {
        'hotel': {'Resort Hotel' : 0, 'City Hotel' : 1},
        'meal': {'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4},
        'market_segment': {'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
                           'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7},
        'distribution_channel': {'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3,
                                 'GDS': 4},
        'reserved_room_type': {'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6,
                               'L': 7, 'B': 8},
        'deposit_type': {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3},
        'customer_type': {'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3},
        'year': {2015: 0, 2014: 1, 2016: 2, 2017: 3},
    }
    for col, mapping in encodings.items():
        cat_df[col] = cat_df[col].map(mapping)

    num_df = df.drop(columns = cat_cols + [target])

    # normalizing numerical variables with log(x + 1)
    log_cols = ['lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month',
                'agent', 'company', 'adr']
    for col in log_cols:
        num_df[col] = np.log(num_df[col] + 1)
    # log(x + 1) is NaN where x < -1; such 'adr' values are replaced by the column mean.
    num_df['adr'] = num_df['adr'].fillna(value = num_df['adr'].mean())

    df = pd.concat([cat_df, num_df, df[target]], axis = 1)

    # Rank numeric columns by |Pearson correlation| with the target. The frame
    # still holds string columns (e.g. 'country'), so restrict to numeric
    # dtypes explicitly: pandas >= 2.0 no longer skips them silently.
    corrIndex = (df.select_dtypes(include = ['number', 'bool'])
                   .corr(method = 'pearson')[target]
                   .abs()
                   .sort_values(ascending = False)
                   .index.tolist())

    # Keep the strongest columns; clamp so an oversized LessCorr cannot wrap around.
    n_keep = max(0, len(corrIndex) - LessCorr)
    weakest = [col for col in corrIndex[n_keep:] if col != target]

    # dropping columns that are not useful
    useless_col = ['days_in_waiting_list', 'arrival_date_year', 'assigned_room_type',
                   'booking_changes', 'reservation_status', 'country']

    # dict.fromkeys removes duplicates while keeping order.
    df = df.drop(columns = list(dict.fromkeys(useless_col + weakest)))

    y = df[target]
    X = df.drop(columns = target)

    return X,y,dfname
    

In [16]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn import metrics
from IPython.display import SVG
from graphviz import Source
from IPython.display import display



df = pd.read_csv(r'./hotel_bookings.csv')
# LessCorr = len(df.corr(method ='pearson')['is_canceled'].abs().sort_values(ascending = False))

# Number of least-correlated features to drop before training (0 keeps them all).
# It is passed to doPreProcess below so that changing it here takes effect.
LessCorr = 0
X,y,dfname = doPreProcess(df, LessCorr)
clf = doDecision(X,y)

# List every column that went into the model, plus the target.
var = checkVar(X,y)
print("variable x : ",len(var))
for i in var:
    print(i)



# for LessCorr in range(0,26):
#     X,y,dfname = doPreProcess(df.copy(), LessCorr)
#     doDecision(X,y)

#     var = checkVar(X,y)
#     print("variable x : ",len(var))
# #     for i in var:
# #         print(i)
        
#     print("------------------")

[0 0 0 ... 1 0 1]
Accuracy Score of Decision Tree is : 0.9267043847241867
Confusion Matrix : 
[[20737  1294]
 [ 1297 12022]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     22031
           1       0.90      0.90      0.90     13319

    accuracy                           0.93     35350
   macro avg       0.92      0.92      0.92     35350
weighted avg       0.93      0.93      0.93     35350

variable x :  27
hotel
meal
market_segment
distribution_channel
reserved_room_type
deposit_type
customer_type
year
month
day
lead_time
arrival_date_week_number
arrival_date_day_of_month
stays_in_weekend_nights
stays_in_week_nights
adults
children
babies
is_repeated_guest
previous_cancellations
previous_bookings_not_canceled
agent
company
adr
required_car_parking_spaces
total_of_special_requests
is_canceled


### List ตัวแปรที่ตัด
- `assigned_room_type`: เป็นตัวแปรที่ขึ้นกับ Operation ของโรงแรมโดยเฉพาะ
- `reservation_status`: ในเมื่อเราทำนายว่าจะยกเลิกหรือไม่ สถานะสุดท้ายของการจองจึงไม่จำเป็น

In [None]:
def checkVar(df, LessCorr):
    """Select columns by correlation first, then encode and scale what is left.

    NOTE(review): this definition reuses the name of the two-argument
    ``checkVar(X, y)`` helper defined earlier in the notebook. Once this cell
    is executed the earlier helper is shadowed; consider renaming one of them.

    Steps, in order:
      1. clean the data with ``FillNA`` (a ``name`` column, if present, is dropped);
      2. rank numeric columns by absolute Pearson correlation with
         ``is_canceled``, print the ones that are kept, and drop the
         ``LessCorr`` weakest together with a fixed list of unused columns;
      3. print the remaining column labels;
      4. encode the object-typed columns as integers and log-scale the skewed
         numeric columns that survived step 2.

    Parameters
    ----------
    df : raw bookings DataFrame.
    LessCorr : number of least-correlated numeric columns to drop (0 keeps all).
        The target column itself is never dropped.

    Returns
    -------
    (X, y) : the feature frame and the ``is_canceled`` series.
    """
    target = 'is_canceled'

    # A free-text 'name' column would otherwise end up in X as raw strings.
    df = FillNA(df).drop(columns = ['name'], errors = 'ignore')

    # Restrict to numeric dtypes explicitly: the frame still holds string
    # columns and pandas >= 2.0 no longer skips them silently in corr().
    corrIndex = (df.select_dtypes(include = ['number', 'bool'])
                   .corr(method = 'pearson')[target]
                   .abs()
                   .sort_values(ascending = False)
                   .index.tolist())

    # Keep the strongest columns; clamp so an oversized LessCorr cannot wrap around.
    n_keep = max(0, len(corrIndex) - LessCorr)
    useCorr = corrIndex[:n_keep]

    useless_col = ['days_in_waiting_list', 'arrival_date_year', 'assigned_room_type',
                   'booking_changes', 'reservation_status', 'country']

    print(useCorr)

    weakest = [col for col in corrIndex[n_keep:] if col != target]

    # dict.fromkeys removes duplicates while keeping order.
    df = df.drop(columns = list(dict.fromkeys(useless_col + weakest)))

    for c in df:
        print(c)

    # creating categorical dataframes (explicit copy: columns are assigned below)
    cat_cols = [col for col in df.columns if df[col].dtype == 'O']
    cat_df = df[cat_cols].copy()

    status_date = pd.to_datetime(cat_df['reservation_status_date'])
    cat_df['year'] = status_date.dt.year
    cat_df['month'] = status_date.dt.month
    cat_df['day'] = status_date.dt.day
    cat_df = cat_df.drop(columns = ['reservation_status_date', 'arrival_date_month'])

    # Integer codes per category; values missing from a mapping become NaN.
    encodings = {
        'hotel': {'Resort Hotel' : 0, 'City Hotel' : 1},
        'meal': {'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4},
        'market_segment': {'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
                           'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7},
        'distribution_channel': {'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3,
                                 'GDS': 4},
        'reserved_room_type': {'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6,
                               'L': 7, 'B': 8},
        'deposit_type': {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3},
        'customer_type': {'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3},
        'year': {2015: 0, 2014: 1, 2016: 2, 2017: 3},
    }
    for col, mapping in encodings.items():
        cat_df[col] = cat_df[col].map(mapping)

    num_df = df.drop(columns = cat_cols + [target])

    # normalizing numerical variables with log(x + 1); columns already removed
    # by the correlation cut above are skipped instead of raising KeyError
    log_cols = ['lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month',
                'agent', 'company', 'adr']
    for col in log_cols:
        if col in num_df.columns:
            num_df[col] = np.log(num_df[col] + 1)
    if 'adr' in num_df.columns:
        # log(x + 1) is NaN where x < -1; those values are replaced by the column mean.
        num_df['adr'] = num_df['adr'].fillna(value = num_df['adr'].mean())

    X = pd.concat([cat_df, num_df], axis = 1)
    y = df[target]
    return X,y
    