In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [40]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [41]:
train_df.shape

(91531, 33)

In [42]:
# clean data
train_df.drop(train_df[train_df.adr < 0].index, inplace=True)
train_df.reset_index(drop=True, inplace=True)

In [43]:
train_df.shape

(89424, 33)

In [44]:
# feature transform
train_df['stays'] = train_df.apply(lambda row: row.stays_in_weekend_nights + row.stays_in_week_nights, axis=1)
train_df['expected_cost'] = train_df.apply(lambda row: row.adr * row.stays, axis=1)

In [45]:
# Remove the target variable from the training set
label = train_df['is_canceled']

In [46]:
numericCols = ['lead_time', 'stays', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'adr', 'expected_cost', 'required_car_parking_spaces', 'total_of_special_requests'
              ]
categoryCols = ['hotel', 'arrival_date_year', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type'
               ]
featureCols = numericCols + categoryCols

In [47]:
# ColumnTransformer

# Transforming the categoric columns
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

cat_transformers = [('cat', cat_pipe, categoryCols)]
ct = ColumnTransformer(transformers=cat_transformers)

train_cat_transformed = ct.fit_transform(train_df)

In [24]:
# Retrieving the feature names
pl = ct.named_transformers_['cat']
ohe = pl.named_steps['ohe']
ohe.get_feature_names()

array(['x0_City Hotel', 'x0_Resort Hotel', 'x1_2015', 'x1_2016',
       'x1_2017', 'x2_April', 'x2_August', 'x2_December', 'x2_February',
       'x2_January', 'x2_July', 'x2_June', 'x2_March', 'x2_May',
       'x2_November', 'x2_October', 'x2_September', 'x3_1', 'x3_2',
       'x3_3', 'x3_4', 'x3_5', 'x3_6', 'x3_7', 'x3_8', 'x3_9', 'x3_10',
       'x3_11', 'x3_12', 'x3_13', 'x3_14', 'x3_15', 'x3_16', 'x3_17',
       'x3_18', 'x3_19', 'x3_20', 'x3_21', 'x3_22', 'x3_23', 'x3_24',
       'x3_25', 'x3_26', 'x3_27', 'x3_28', 'x3_29', 'x3_30', 'x3_31',
       'x3_32', 'x3_33', 'x3_34', 'x3_35', 'x3_36', 'x3_37', 'x3_38',
       'x3_39', 'x3_40', 'x3_41', 'x3_42', 'x3_43', 'x3_44', 'x3_45',
       'x3_46', 'x3_47', 'x3_48', 'x3_49', 'x3_50', 'x3_51', 'x3_52',
       'x3_53', 'x4_1', 'x4_2', 'x4_3', 'x4_4', 'x4_5', 'x4_6', 'x4_7',
       'x4_8', 'x4_9', 'x4_10', 'x4_11', 'x4_12', 'x4_13', 'x4_14',
       'x4_15', 'x4_16', 'x4_17', 'x4_18', 'x4_19', 'x4_20', 'x4_21',
       'x4_22', 'x4_23', 'x

In [48]:
# Transforming the numeric columns
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

num_transformers = [('num', num_pipe, numericCols)]
ct = ColumnTransformer(transformers=num_transformers)

train_num_transformed = ct.fit_transform(train_df)

In [49]:
# Combining both categorical and numerical column transformations
ct = ColumnTransformer(transformers=[('cat', cat_pipe, categoryCols), ('num', num_pipe, numericCols)])
train_transformed = ct.fit_transform(train_df)

In [10]:
# SVD
svd = TruncatedSVD(n_components=60) # best 60

In [50]:
# Logistic regression
lr = LogisticRegression(penalty='l2', C=75.0, max_iter=4000)

In [17]:
# feature selection
#sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
rfe = RFE(estimator=lr, n_features_to_select=5, step=1)

In [51]:
# Execute pipeline
lr_pipe = Pipeline([('transform', ct), ('lr', lr)])
lr_pipe.fit(train_df, label)
lr_pipe.score(train_df, label)

0.8305935766684559

In [19]:
# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1126)
cross_val_score(ml_pipe, train_df, label, cv=kf).mean()


# Selecting parameters when Grid Searching
param_grid = {
    'transform__num__si__strategy': ['median'],
    #'svd__n_components': [60, 70],
    'lr__penalty': ['l2'],
    'lr__C': [75.0]
}

gs = GridSearchCV(lr_pipe, param_grid, cv=kf)
gs.fit(train_df, label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=KFold(n_splits=5, random_state=1126, shuffle=True),
             estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('si',
                                                                                          SimpleImputer(fill_value='MISSING',
                                                                                                        strategy='constant')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['hotel',
                                                                          'arrival_date_year',
                                          

In [20]:
print(gs.best_params_)

print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

{'lr__C': 75.0, 'lr__penalty': 'l2', 'svd__n_components': 60, 'transform__num__si__strategy': 'median'}
0.8225081930052852
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0       4.165994      0.046248         0.111351        0.001417          75   
1       4.957987      0.173773         0.132769        0.025136          75   

  param_lr__penalty param_svd__n_components  \
0                l2                      50   
1                l2                      60   

  param_transform__num__si__strategy  \
0                             median   
1                             median   

                                              params  split0_test_score  \
0  {'lr__C': 75.0, 'lr__penalty': 'l2', 'svd__n_c...           0.823237   
1  {'lr__C': 75.0, 'lr__penalty': 'l2', 'svd__n_c...           0.825094   

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0           0.815962           0.821097           0.824648           

In [None]:
# SVM
'''
svc = SVC(C=1.0, kernel='linear', gamma='scale')
svc_pipe = Pipeline([('transform', ct), ('svc', svc)])
svc_pipe.fit(train_df, label)
svc_pipe.score(train_df, label)
'''

In [52]:
will_be_canceled = lr_pipe.predict(train_df)

In [53]:
will_be_canceled = pd.Series(will_be_canceled)

train_df['will_be_canceled'] = will_be_canceled

train_df['will_be_canceled'].value_counts()

0    63593
1    25831
Name: will_be_canceled, dtype: int64

In [55]:
monthMap = {'January':'1', 'February':'2', 'March':'3',\
            'April':'4', 'May':'5', 'June':'6',\
            'July':'7', 'August':'8', 'September':'9',\
            'October':'10', 'November':'11', 'December':'12'}
def getArrivalDate(row):
    return pd.to_datetime(str(row.arrival_date_year) + '-' + monthMap[row.arrival_date_month] + '-' + str(row.arrival_date_day_of_month))

def getRevenue(row):
    if row.is_canceled:
        return 0
    return row.expected_cost

def predictRevenue(row):
    if row.will_be_canceled:
        return 0
    return row.expected_cost

In [56]:
# Post process
train_df['arrival_date'] = train_df.apply(getArrivalDate, axis=1)
train_df['predicted_revenue'] = train_df.apply(predictRevenue, axis=1)

In [58]:
# Aggregate by date
daily_revenue_df = train_df.groupby(['arrival_date']).agg({'revenue':'sum', 'predicted_revenue':'sum'})

In [110]:
bins = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]
labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
daily_revenue_df['calculated_label'] = pd.cut(daily_revenue_df.revenue, bins=bins,labels=labels)
daily_revenue_df['predicted_label'] = pd.cut(daily_revenue_df.predicted_revenue, bins=bins,labels=labels)

In [111]:
daily_revenue_df

Unnamed: 0_level_0,revenue,predicted_revenue,predicted_label,calculated_label
arrival_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-07-01,20317.720355,21151.889054,2,2
2015-07-02,16530.645277,19040.399816,1,1
2015-07-03,12989.951853,17136.097402,1,1
2015-07-04,17488.551606,15065.486549,1,1
2015-07-05,19591.458478,20241.138995,2,1
...,...,...,...,...
2017-03-27,26217.381380,29834.381779,2,2
2017-03-28,16185.177703,16927.454674,1,1
2017-03-29,24002.255525,27352.011096,2,2
2017-03-30,33327.810920,39426.908779,3,3


In [112]:
train_label_df = pd.read_csv('train_label.csv')

In [113]:
labels = daily_revenue_df.join(train_label_df.set_index('arrival_date'), how='inner')

In [114]:
labels['calculated_err'] = labels.apply(lambda row: abs(row.calculated_label - row.label), axis=1)
labels['err'] = labels.apply(lambda row: abs(row.predicted_label - row.label), axis=1)

In [115]:
labels

Unnamed: 0_level_0,revenue,predicted_revenue,predicted_label,calculated_label,label,calculated_err,err
arrival_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-07-01,20317.720355,21151.889054,2,2,2.0,0.0,0.0
2015-07-02,16530.645277,19040.399816,1,1,1.0,0.0,0.0
2015-07-03,12989.951853,17136.097402,1,1,1.0,0.0,0.0
2015-07-04,17488.551606,15065.486549,1,1,1.0,0.0,0.0
2015-07-05,19591.458478,20241.138995,2,1,1.0,0.0,1.0
...,...,...,...,...,...,...,...
2017-03-27,26217.381380,29834.381779,2,2,2.0,0.0,0.0
2017-03-28,16185.177703,16927.454674,1,1,1.0,0.0,0.0
2017-03-29,24002.255525,27352.011096,2,2,2.0,0.0,0.0
2017-03-30,33327.810920,39426.908779,3,3,3.0,0.0,0.0


In [116]:
total_calculated_error = labels['calculated_err'].sum(axis = 0, skipna = True)
total_predicted_error = labels['err'].sum(axis = 0, skipna = True)
print(total_calculated_error)
print(total_predicted_error)

13.0
196.0


In [117]:
labels['calculated_err'].value_counts()

0.0    627
1.0     13
Name: calculated_err, dtype: int64

In [118]:
labels['err'].value_counts()

0.0    453
1.0    180
2.0      6
4.0      1
Name: err, dtype: int64