In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier


In [2]:
def FillNA(df):
    """Replace missing values with 0 and drop bookings that have no guests.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``adults``, ``children`` and
        ``babies`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every NaN has been replaced by 0 and every row
        whose ``adults``, ``children`` and ``babies`` are all 0 has been
        removed. Surviving rows keep their original index labels (the index
        is not renumbered).
    """
    # Fill on a new frame so the caller's data is left untouched.
    filled = df.fillna(0)

    # The three guest-count columns cannot all be 0 for a real booking.
    no_guests = (filled.children == 0) & (filled.adults == 0) & (filled.babies == 0)

    # Remove the records where all three columns are 0.
    return filled[~no_guests]

In [3]:
def FilterColumn(df):
    """Keep only the known input columns and check that none is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings frame. It is not modified.

    Returns
    -------
    pandas.DataFrame or int
        A new frame holding only the columns listed in ``FeatureList``, in
        the order they appear in ``df``. If a required column is absent, an
        error line plus one ``<column> not found!`` line per missing column
        is printed and ``0`` is returned instead.
    """
    FeatureList = [
        'hotel',
        'lead_time',
        'arrival_date_month',
        'arrival_date_week_number',
        'arrival_date_day_of_month',
        'stays_in_weekend_nights',
        'stays_in_week_nights',
        'adults',
        'children',
        'babies',
        'meal',
        'market_segment',
        'distribution_channel',
        'is_repeated_guest',
        'previous_cancellations',
        'previous_bookings_not_canceled',
        'reserved_room_type',
        'deposit_type',
        'agent',
        'company',
        'customer_type',
        'adr',
        'required_car_parking_spaces',
        'total_of_special_requests',
        'reservation_status_date',
        'name',
    ]

    # 'name' is kept when present but not required: PreProcessingData removes
    # it from the frame before calling this function.
    optional = {'name'}

    ###   https://www.sciencedirect.com/science/article/pii/S2352340918315191

    # list.sort() sorts in place and returns None, so comparing the results
    # of two .sort() calls can never detect a difference. Compare the
    # column sets directly instead.
    present = set(df.columns)
    missing = [c for c in FeatureList if c not in present and c not in optional]
    if missing:
        print("Error : Data doesn't met requiment")
        for c in missing:
            print(c+' not found!')
        return 0

    unwanted = [c for c in df.columns if c not in FeatureList]
    return df.drop(columns=unwanted)

In [4]:
def PreProcessingData(df):
    """Turn a raw bookings frame into the feature matrix fed to the model.

    The guest ``name`` column is split off, the remaining columns are
    validated with ``FilterColumn`` and cleaned with ``FillNA``, text
    columns are replaced by integer codes, the status date is expanded into
    ``year`` / ``month`` / ``day``, and selected numeric columns are
    log-transformed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings including a ``name`` column. It is not modified.

    Returns
    -------
    tuple (X, dfname) or int
        ``X`` is the feature frame (encoded text columns first, numeric
        columns after) and ``dfname`` the matching guest names. Both are
        renumbered 0..n-1 so that row ``i`` of ``X`` belongs to
        ``dfname[i]``. ``0`` is returned when ``FilterColumn`` does not
        return a DataFrame.
    """
    dfname = df["name"]
    # Drop on a new frame rather than in place so the caller's data survives.
    df = df.drop(columns=['name'])

    df = FilterColumn(df)
    if not isinstance(df, pd.DataFrame):
        return 0
    df = FillNA(df)

    # FillNA can remove rows. Keep only the names of the surviving rows and
    # renumber both pieces, otherwise names and features drift apart.
    dfname = dfname.loc[df.index].reset_index(drop=True)
    df = df.reset_index(drop=True)

    # Object-dtype columns are handled as categorical features. Work on a
    # copy so the assignments below never write into a slice of df.
    cat_cols = [col for col in df.columns if df[col].dtype == 'O']
    cat_df = df[cat_cols].copy()

    status_date = pd.to_datetime(cat_df['reservation_status_date'])
    cat_df['year'] = status_date.dt.year
    cat_df['month'] = status_date.dt.month
    cat_df['day'] = status_date.dt.day
    cat_df = cat_df.drop(columns=['reservation_status_date', 'arrival_date_month'])

    # Integer codes per column. Values absent from a mapping become NaN.
    # NOTE(review): the codes (including the jump to 3 in deposit_type and
    # the year ordering) are kept exactly as they were; presumably they
    # match what the saved model was trained on - confirm before changing.
    encodings = {
        'hotel': {'Resort Hotel': 0, 'City Hotel': 1},
        'meal': {'BB': 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4},
        'market_segment': {'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
                           'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7},
        'distribution_channel': {'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3,
                                 'GDS': 4},
        'reserved_room_type': {'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6,
                               'L': 7, 'B': 8},
        'deposit_type': {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3},
        'customer_type': {'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3},
        'year': {2015: 0, 2014: 1, 2016: 2, 2017: 3},
    }
    for col, codes in encodings.items():
        cat_df[col] = cat_df[col].map(codes)

    num_df = df.drop(columns=cat_cols)

    # Normalise numeric variables with log(x + 1). The expression is kept
    # as written so the transformed values stay identical to before.
    log_cols = ['lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month',
                'agent', 'company', 'adr']
    for col in log_cols:
        num_df[col] = np.log(num_df[col] + 1)
    # adr + 1 can be negative, which makes the log NaN; use the mean there.
    num_df['adr'] = num_df['adr'].fillna(value=num_df['adr'].mean())

    X = pd.concat([cat_df, num_df], axis=1)

    return X, dfname
    

In [5]:
def loadModel(path):
    """Deserialize and return the object stored at ``path`` using joblib.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code - only load model files from a trusted source.
    """
    import joblib

    return joblib.load(path)

In [6]:
# PreProcessingData returns 0 instead of a tuple when the input is rejected;
# fail with a clear message rather than an unpacking TypeError.
prepared = PreProcessingData(pd.read_csv(r'./input.csv'))
if not isinstance(prepared, tuple):
    raise ValueError("input.csv does not contain the columns required by the model")
df, dfName = prepared

dtc = loadModel(r'./dtc.joblib')
pred = dtc.predict(df)
# Give the predictions the same index as the feature rows: pd.concat with
# axis=1 aligns on index labels, so a fresh 0..n-1 index would pair
# predictions with the wrong names whenever preprocessing dropped a row.
pred = pd.DataFrame(pred, columns = ['is_possible_to_cancel'], index = df.index)


# Restrict names to the rows that were actually scored.
df = pd.concat([dfName.loc[df.index], pred], axis = 1)

display(df)

Unnamed: 0,name,is_possible_to_cancel
0,Ronald Rodriguez,0
1,Barbara Petty,0
2,Tammie Watson,1
3,Katrina Finley,1
4,Alexis Arroyo,1
...,...,...
1374,Luis Daniel MD,1
1375,Brandon Robles,0
1376,Rebecca Allen,0
1377,Michael Hood,0
