In [144]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [88]:
# Import data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_label_df = pd.read_csv('train_label.csv')

In [90]:
# Analyze invalid data
invalid_data = train_df[train_df.adr < 0]
invalid_data['arrival_date'] = invalid_data.apply(getArrivalDate, axis=1)
invalid_data['stays'] = invalid_data.apply(lambda row: row.stays_in_weekend_nights + row.stays_in_week_nights, axis=1)
invalid_data['expected_cost'] = invalid_data.apply(lambda row: row.adr * row.stays, axis=1)

print(train_df['arrival_date'].value_counts())
print(invalid_data['arrival_date'].value_counts())
print(invalid_data['is_canceled'].value_counts())

In [99]:
# clean invalid data
train_df.drop(train_df[train_df.adr < 0].index, inplace=True)
train_df.reset_index(drop=True, inplace=True)

In [111]:
train_df.shape

(89424, 37)

In [112]:
# feature transform
monthMap = {'January':'1', 'February':'2', 'March':'3',\
            'April':'4', 'May':'5', 'June':'6',\
            'July':'7', 'August':'8', 'September':'9',\
            'October':'10', 'November':'11', 'December':'12'}
def getArrivalDate(row):
    return pd.to_datetime(str(row.arrival_date_year) + '-' + monthMap[row.arrival_date_month] + '-' + str(row.arrival_date_day_of_month))

train_df['arrival_date'] = train_df.apply(getArrivalDate, axis=1)
train_df['stays'] = train_df.apply(lambda row: row.stays_in_weekend_nights + row.stays_in_week_nights, axis=1)
train_df['expected_cost'] = train_df.apply(lambda row: row.adr * row.stays, axis=1)

In [113]:
# Get the target variable from the training set
label = train_df['is_canceled']

In [114]:
numericCols = ['lead_time', 'stays', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'adr', 'expected_cost', 'required_car_parking_spaces', 'total_of_special_requests'
              ]
categoryCols = ['hotel', 'arrival_date_year', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type'
               ]
featureCols = numericCols + categoryCols

In [115]:
# ColumnTransformer

# Transforming the categoric columns
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

cat_transformers = [('cat', cat_pipe, categoryCols)]
cat_ct = ColumnTransformer(transformers=cat_transformers)

train_cat_transformed = cat_ct.fit_transform(train_df)

In [116]:
# Transforming the numeric columns
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

num_transformers = [('num', num_pipe, numericCols)]
num_ct = ColumnTransformer(transformers=num_transformers)

train_num_transformed = num_ct.fit_transform(train_df)

In [117]:
# Retrieving the feature names
cat_pl = cat_ct.named_transformers_['cat']
ohe = cat_pl.named_steps['ohe']
transformed_feature_names = list(ohe.get_feature_names()) + numericCols
print("Total number of features = ", len(transformed_feature_names))

Total number of features =  329


In [118]:
# Combining both categorical and numerical column transformations
ct = ColumnTransformer(transformers=[('cat', cat_pipe, categoryCols), ('num', num_pipe, numericCols)])
train_transformed = ct.fit_transform(train_df)

In [119]:
# feature selection
sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
#rfe = RFE(estimator=lr, n_features_to_select=5, step=1)

In [120]:
# SVD
svd = TruncatedSVD(n_components=5) # best 60

In [12]:
# Self-defined loss function
def weighted_zero_one_loss(weights):
    def custom_loss(y_true, y_pred):
        #print(y_true)
        return np.sum(np.abs(y_true - y_pred) * weights[list(y_true.index)])
    return custom_loss

my_loss = make_scorer(weighted_zero_one_loss(weights=train_df['expected_cost'].to_numpy()), greater_is_better=False)

In [122]:
# Logistic regression
lr = LogisticRegression(penalty='l2', C=75.0, max_iter=4000)

In [123]:
# Execute pipeline
lr_pipe = Pipeline([('transform', ct), ('lr', lr)])
lr_pipe.fit(train_df, label)
lr_pipe.score(train_df, label)

0.8305935766684559

In [41]:
# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1126)
#cross_val_score(ml_pipe, train_df, label, cv=kf).mean()

# Selecting parameters when Grid Searching
param_grid = {
    'transform__num__si__strategy': ['median'],
    #'svd__n_components': [60, 70],
    'lr__penalty': ['l2'],
    'lr__C': [10.0, 20.0, 30.0, 40.0, 50.0]
}

gs = GridSearchCV(lr_pipe, param_grid, cv=kf)
gs.fit(train_df, label)

print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

{'lr__C': 10.0, 'lr__penalty': 'l2', 'transform__num__si__strategy': 'median'}
-986044.5916580657
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0      10.169332      0.445680         0.099154        0.001317          10   
1      11.554647      0.372301         0.098569        0.001424          20   
2      13.417581      1.059835         0.099304        0.001266          30   
3      14.146114      1.277192         0.098617        0.000393          40   
4      14.946802      0.991918         0.098880        0.001743          50   

  param_lr__penalty param_transform__num__si__strategy  \
0                l2                             median   
1                l2                             median   
2                l2                             median   
3                l2                             median   
4                l2                             median   

                                              params  split0_test_score  \
0  

In [17]:
# SVM
svc = SVC(C=1.0, kernel='linear', gamma='scale')
svc_pipe = Pipeline([('transform', ct), ('svd', svd), ('svc', svc)])
svc_pipe.fit(train_df, label)
svc_pipe.score(train_df, label)

0.7430220075147611

In [124]:
will_be_canceled = lr_pipe.predict(train_df)
will_be_canceled = pd.Series(will_be_canceled)

train_df['will_be_canceled'] = will_be_canceled
print(train_df['will_be_canceled'].value_counts())

0    63593
1    25831
Name: will_be_canceled, dtype: int64


In [145]:
# Stage 1 analysis
print(confusion_matrix(train_df['will_be_canceled'], train_df['is_canceled']))

[[52993 10600]
 [ 4549 21282]]


In [125]:
def getRevenue(row):
    if row.is_canceled:
        return 0
    return row.expected_cost

def predictRevenue(row):
    return (1.0 - row.will_be_canceled) * row.expected_cost

In [126]:
# Post process
train_df['revenue'] = train_df.apply(getRevenue, axis=1)
train_df['predicted_revenue'] = train_df.apply(predictRevenue, axis=1)

In [127]:
# Aggregate by date
daily_revenue_df = train_df.groupby(['arrival_date']).agg({'revenue':'sum', 'predicted_revenue':'sum'})

In [128]:
bins = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]
labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
daily_revenue_df['calculated_label'] = pd.cut(daily_revenue_df.revenue, bins=bins,labels=labels)
daily_revenue_df['predicted_label'] = pd.cut(daily_revenue_df.predicted_revenue, bins=bins,labels=labels)

In [129]:
daily_revenue_df

Unnamed: 0_level_0,revenue,predicted_revenue,calculated_label,predicted_label
arrival_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-07-01,20317.720355,21151.889054,2,2
2015-07-02,16530.645277,19040.399816,1,1
2015-07-03,12989.951853,17136.097402,1,1
2015-07-04,17488.551606,15065.486549,1,1
2015-07-05,19591.458478,20241.138995,1,2
...,...,...,...,...
2017-03-27,26217.381380,29834.381779,2,2
2017-03-28,16185.177703,16927.454674,1,1
2017-03-29,24002.255525,27352.011096,2,2
2017-03-30,33327.810920,39426.908779,3,3


In [130]:
labels = daily_revenue_df.join(train_label_df.set_index('arrival_date'), how='inner')

In [131]:
labels['calculated_err'] = labels.apply(lambda row: abs(row.calculated_label - row.label), axis=1)
labels['err'] = labels.apply(lambda row: abs(row.predicted_label - row.label), axis=1)

In [132]:
labels

Unnamed: 0_level_0,revenue,predicted_revenue,calculated_label,predicted_label,label,calculated_err,err
arrival_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-07-01,20317.720355,21151.889054,2,2,2.0,0.0,0.0
2015-07-02,16530.645277,19040.399816,1,1,1.0,0.0,0.0
2015-07-03,12989.951853,17136.097402,1,1,1.0,0.0,0.0
2015-07-04,17488.551606,15065.486549,1,1,1.0,0.0,0.0
2015-07-05,19591.458478,20241.138995,1,2,1.0,0.0,1.0
...,...,...,...,...,...,...,...
2017-03-27,26217.381380,29834.381779,2,2,2.0,0.0,0.0
2017-03-28,16185.177703,16927.454674,1,1,1.0,0.0,0.0
2017-03-29,24002.255525,27352.011096,2,2,2.0,0.0,0.0
2017-03-30,33327.810920,39426.908779,3,3,3.0,0.0,0.0


In [133]:
total_calculated_error = labels['calculated_err'].sum(axis = 0, skipna = True)
total_predicted_error = labels['err'].sum(axis = 0, skipna = True)
print(total_calculated_error)
print(total_predicted_error)

13.0
196.0


In [134]:
labels['calculated_err'].value_counts()

0.0    627
1.0     13
Name: calculated_err, dtype: int64

In [135]:
labels['err'].value_counts()

0.0    453
1.0    180
2.0      6
4.0      1
Name: err, dtype: int64

In [146]:
# Stage 2 analysis
bad_predictions = labels[labels.err > 0.0]
false_positive = bad_predictions[bad_predictions.predicted_revenue>bad_predictions.revenue].shape[0]
print("predicted > true: ", false_positive)
print("predicted < true: ", bad_predictions.shape[0] - false_positive)

predicted > true:  167
predicted < true:  20
