In [19]:
import pandas as pd
import numpy as np
import sklearn
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import neighbors


# Load the raw booking table. index_col=0 turns the first CSV column into the
# row index, and drop_duplicates() then removes rows whose column values are
# all identical (the index itself is not part of that comparison).
original_data = (
    pd.read_csv("../datasets/agoda_cancellation_train.csv", index_col=0)
    .drop_duplicates()
)

In [15]:
def filtered_data_by_date(base_date_t, data=None, days_after_date_to_cancel=7,
                          min_days_after_booking=8, max_days_after_booking=40,
                          min_days_to_checkin=9):
    """Select the bookings that are "open" at a reference date and label them.

    Three helper columns are added to a copy of the input frame, all measured
    in whole days relative to ``base_date_t``:

    * ``days_before_checkin`` - check-in date minus the reference date
    * ``days_after_booking``  - reference date minus the booking time
    * ``days_to_cancel``      - cancellation time minus the reference date
      (NaN when the cancellation timestamp is missing)

    Rows are kept only when the check-in is strictly more than
    ``min_days_to_checkin`` days away and the booking age lies in the closed
    range ``[min_days_after_booking, max_days_after_booking]``.

    Parameters
    ----------
    base_date_t : datetime-like
        Reference date the day offsets are measured from.
    data : pandas.DataFrame, optional
        Frame with ``checkin_date``, ``booking_datetime`` and
        ``cancellation_datetime`` columns.  When omitted, the module-level
        ``original_data`` frame is used (the previous behaviour).
    days_after_date_to_cancel : int, default 7
        Upper bound (inclusive) of the cancellation window used for the label.
    min_days_after_booking, max_days_after_booking : int, default 8 and 40
        Inclusive bounds on the booking age at the reference date.
    min_days_to_checkin : int, default 9
        Exclusive lower bound on the days left until check-in.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The filtered copy with the three helper columns, and a boolean Series
        on the same index that is True when ``days_to_cancel`` falls in
        ``[0, days_after_date_to_cancel]``.
    """
    if data is None:
        # Fall back to the notebook-level frame so existing one-argument calls
        # keep working unchanged.
        data = original_data
    processed_data = data.copy()

    # Parse each date column once; the input frame itself is never modified.
    checkin = pd.to_datetime(processed_data['checkin_date'])
    booked = pd.to_datetime(processed_data['booking_datetime'])
    cancelled = pd.to_datetime(processed_data['cancellation_datetime'])

    processed_data["days_before_checkin"] = (checkin - base_date_t).dt.days
    processed_data["days_after_booking"] = (base_date_t - booked).dt.days
    processed_data["days_to_cancel"] = (cancelled - base_date_t).dt.days

    far_enough_from_checkin = processed_data["days_before_checkin"] > min_days_to_checkin
    booking_age_in_range = processed_data["days_after_booking"].between(
        min_days_after_booking, max_days_after_booking, inclusive="both")
    processed_data = processed_data[far_enough_from_checkin & booking_age_in_range]

    # NOTE(review): rows with a negative days_to_cancel (cancelled before the
    # reference date) are kept and labelled False - confirm this is intended.
    y = processed_data["days_to_cancel"].between(0, days_after_date_to_cancel)

    return processed_data, y

# Build the training snapshot (reference date 2018-06-21) and the evaluation
# snapshot (reference date 2018-08-02) with the same filtering rules.
train_x, train_y = filtered_data_by_date(datetime.datetime(2018, 6, 21))
test_x, test_y = filtered_data_by_date(datetime.datetime(2018, 8, 2))

# Sample count and positive count for each snapshot.
print(len(train_y), train_y.sum(), len(test_y), test_y.sum())

# Show the training rows labelled as cancelling within the window.
train_x[train_y]

4579 135 3418 110


Unnamed: 0_level_0,booking_datetime,checkin_date,checkout_date,hotel_id,hotel_country_code,hotel_live_date,hotel_star_rating,accommadation_type_name,charge_option,h_customer_id,...,request_airport,request_earlycheckin,cancellation_datetime,hotel_area_code,hotel_brand_code,hotel_chain_code,hotel_city_code,days_before_checkin,days_after_booking,days_to_cancel
h_booking_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9171271712595457018,2018-05-27 08:34:00,2018-07-10 00:00:00,2018-07-11 00:00:00,73986,FR,2014-02-13 20:24:00,2.0,Hotel,Pay Later,2052977273658780037,...,0.0,0.0,2018-06-23,4546,,,2293,19,24,2.0
-9075056764097075616,2018-06-06 21:37:00,2018-07-15 00:00:00,2018-07-16 00:00:00,50079,IN,2013-03-09 10:04:00,4.0,Hotel,Pay Later,7181517274385780098,...,,,2018-06-21,2789,,,884,24,14,0.0
-8868089978323625320,2018-06-04 23:51:00,2018-08-22 00:00:00,2018-08-23 00:00:00,4467586,TW,2018-03-02 11:21:00,4.0,Hotel,Pay Later,8768564096113230012,...,,,2018-06-25,1493,,,2547,62,16,4.0
-8846810676707372489,2018-05-16 00:36:00,2018-09-08 00:00:00,2018-09-12 00:00:00,1619918,TH,2016-12-09 11:27:00,2.0,Hostel,Pay Later,2251257549329840030,...,,,2018-06-23,5891,,,2477,79,35,2.0
-8844616778021746422,2018-05-23 22:57:00,2018-07-17 00:00:00,2018-07-18 00:00:00,4791,JP,2009-06-28 02:02:00,4.0,Hotel,Pay Later,4967204829005020014,...,,,2018-06-26,5241,293.0,595.0,1403,26,28,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8566790924155905545,2018-05-25 14:44:00,2018-07-01 00:00:00,2018-07-07 00:00:00,42851,GB,2009-06-28 02:02:00,2.0,Hotel,Pay Later,859608846887250043,...,,,2018-06-22,264,,,2538,10,26,1.0
8725717988606198953,2018-06-06 12:14:00,2018-07-26 00:00:00,2018-07-27 00:00:00,4573201,ES,2018-02-26 06:57:00,0.0,Guest House / Bed & Breakfast,Pay Now,1682850554599270058,...,,,2018-06-27,2274,,,768,35,14,6.0
8750681748234550826,2018-06-07 08:29:00,2018-07-01 00:00:00,2018-07-05 00:00:00,45318,AU,2012-08-24 10:03:00,4.0,Hotel,Pay Later,4382372132757530092,...,,,2018-06-21,3920,496.0,24.0,336,10,13,0.0
8844355114341842716,2018-06-12 22:29:00,2018-07-06 00:00:00,2018-07-07 00:00:00,869902,JP,2015-03-24 15:36:00,5.0,Ryokan,Pay Later,5169372136215020013,...,,,2018-06-21,1856,,,2114,15,8,0.0


In [40]:
def get_features(basic_data):
    """Build the model feature matrix from a filtered bookings frame.

    All derived values are computed from ``basic_data`` itself, so the result
    does not depend on any notebook-level frame.  The input is copied and
    never modified.

    Parameters
    ----------
    basic_data : pandas.DataFrame
        Must contain ``booking_datetime``, ``checkin_date``, ``checkout_date``,
        ``hotel_live_date``, ``is_first_booking``, ``charge_option``,
        ``cancellation_policy_code``, ``days_before_checkin`` and
        ``days_after_booking``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``basic_data`` with, in order: ``days_before_checkin``,
        ``days_after_booking``, ``booking_length``, ``booking_checkin_diff``,
        ``hotel_time_in_system``, ``is_first_booking`` (as 0/1), followed by
        one-hot columns prefixed ``charge_`` and ``cancelp_``.  The
        ``is_user_logged_in`` column is not part of the feature set.
    """
    processed_data = basic_data.copy()

    # Parse every date column once, from the frame that was passed in.
    booked = pd.to_datetime(processed_data['booking_datetime'])
    checkin = pd.to_datetime(processed_data['checkin_date'])
    checkout = pd.to_datetime(processed_data['checkout_date'])
    hotel_live = pd.to_datetime(processed_data['hotel_live_date'])

    processed_data["booking_length"] = (checkout - checkin).dt.days
    processed_data["booking_checkin_diff"] = (checkin - booked).dt.days
    # NOTE(review): this is live date minus booking time, so it is negative
    # whenever the hotel went live before the booking - confirm the intended
    # sign.  Kept as-is so the feature values do not change.
    processed_data["hotel_time_in_system"] = (hotel_live - booked).dt.days
    # Truthy values become 1, falsy values become 0.
    processed_data["is_first_booking"] = processed_data['is_first_booking'].apply(lambda x: 1 if x else 0)

    charge_option_dummies = pd.get_dummies(processed_data["charge_option"], prefix="charge")
    cancel_policy_dummies = pd.get_dummies(processed_data["cancellation_policy_code"], prefix="cancelp")

    base_features = processed_data[["days_before_checkin", "days_after_booking", "booking_length",
                                    "booking_checkin_diff", "hotel_time_in_system", "is_first_booking"]]

    return pd.concat([base_features, charge_option_dummies, cancel_policy_dummies], axis=1)

# Feature matrices for both snapshots. The evaluation matrix is re-indexed to
# the training columns so one-hot categories that are missing on one side
# become all-zero columns (or are dropped) and the column order matches.
train_x_features = get_features(train_x)
test_x_features = get_features(test_x).reindex(columns=train_x_features.columns, fill_value=0)

# Shallow random forest with a fixed seed for repeatable results.
# Alternative tried earlier: clf = neighbors.KNeighborsClassifier(1)
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
clf.fit(train_x_features, train_y)

prediction = clf.predict(test_x_features)

# Confusion counts, computed element-wise on arrays instead of Python loops.
predicted_positive = prediction == 1
actual_positive = test_y.to_numpy() == 1
true_positive = int((predicted_positive & actual_positive).sum())
true_negative = int((~predicted_positive & ~actual_positive).sum())
real_positive = int(actual_positive.sum())
real_negative = int((~actual_positive).sum())

print(f"True Positive: {true_positive}/{real_positive}")
print(f"True Negative: {true_negative}/{real_negative}")
print(f"Total True: {true_positive+true_negative}/{real_positive+real_negative} = {(true_positive+true_negative)/(real_positive+real_negative)}")

# Column 1 of predict_proba is the probability of the positive class.
pred_prob1 = clf.predict_proba(test_x_features)
positive_proba = pred_prob1[:, 1]
print(f"AUC Score: {roc_auc_score(test_y, positive_proba)}")

# Flag every booking whose positive-class probability is at or above the
# 96th percentile and count how many of those really cancelled.
cutoff = np.percentile(positive_proba, 96)
force_true = positive_proba >= cutoff
print(f"If forced true: change {cutoff} {int(force_true.sum())}, {int(test_y[force_true].sum())} real positive")
pred_prob1


True Positive: 0/110
True Negative: 3308/3308
Total True: 3308/3418 = 0.967817437097718
AUC Score: 0.6955534791689568
If forced true: change 0.044766392824223115 137, 7 real positive


array([[0.98427626, 0.01572374],
       [0.97984513, 0.02015487],
       [0.97946969, 0.02053031],
       ...,
       [0.97102329, 0.02897671],
       [0.97862851, 0.02137149],
       [0.9684837 , 0.0315163 ]])