In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import make_scorer, confusion_matrix, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [3]:
# Import data
raw_train_df = pd.read_csv("train.csv")
raw_train_label_df = pd.read_csv('train_label.csv')

In [4]:
# feature transform
monthMap = {'January':'1', 'February':'2', 'March':'3',\
            'April':'4', 'May':'5', 'June':'6',\
            'July':'7', 'August':'8', 'September':'9',\
            'October':'10', 'November':'11', 'December':'12'}
def getArrivalDate(row):
    return pd.to_datetime(str(row.arrival_date_year) + '-' + monthMap[row.arrival_date_month] + '-' + str(row.arrival_date_day_of_month))

raw_train_df['arrival_date'] = raw_train_df.apply(getArrivalDate, axis=1)
raw_train_df['stays'] = raw_train_df.apply(lambda row: row.stays_in_weekend_nights + row.stays_in_week_nights, axis=1)
raw_train_df['expected_cost'] = raw_train_df.apply(lambda row: row.adr * row.stays, axis=1)

In [97]:
# Analyze invalid data
invalid_data = raw_train_df[raw_train_df.adr < 0]
invalid_data['arrival_date'] = invalid_data.apply(getArrivalDate, axis=1)
invalid_data['stays'] = invalid_data.apply(lambda row: row.stays_in_weekend_nights + row.stays_in_week_nights, axis=1)
invalid_data['expected_cost'] = invalid_data.apply(lambda row: row.adr * row.stays, axis=1)

print(raw_train_df['arrival_date'].value_counts())
print(invalid_data['arrival_date'].value_counts())
print(invalid_data['is_canceled'].value_counts())

2015-12-05    448
2016-11-07    366
2015-10-16    356
2016-10-13    344
2015-09-18    340
             ... 
2015-12-15     29
2015-11-15     28
2015-12-07     27
2015-11-29     20
2015-12-13     19
Name: arrival_date, Length: 640, dtype: int64
2015-09-17    68
2016-10-06    47
2016-05-12    41
2015-09-02    40
2016-10-20    37
              ..
2015-11-10     1
2017-03-20     1
2016-01-28     1
2016-10-02     1
2016-05-07     1
Name: arrival_date, Length: 490, dtype: int64
0    1229
1     878
Name: is_canceled, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  invalid_data['arrival_date'] = invalid_data.apply(getArrivalDate, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  invalid_data['stays'] = invalid_data.apply(lambda row: row.stays_in_weekend_nights + row.stays_in_week_nights, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  invalid_data

In [5]:
# clean invalid data
raw_train_df.drop(raw_train_df[raw_train_df.adr < 0].index, inplace=True)
raw_train_df.reset_index(drop=True, inplace=True)

In [6]:
raw_train_df.shape

(89424, 36)

In [7]:
# Split validation set and training set
dates = raw_train_df.arrival_date.unique()
train_indices, val_indices = train_test_split(dates, test_size=0.3)
val_df = raw_train_df.set_index('arrival_date').loc[val_indices, :]
train_df = raw_train_df.set_index('arrival_date').loc[train_indices, :]

In [8]:
# Get the target variable from the training set
raw_train_label = raw_train_df['is_canceled']
train_label = train_df['is_canceled']
val_label = val_df['is_canceled']

In [9]:
train_df.reset_index(drop=False, inplace=True)
val_df.reset_index(drop=False, inplace=True)

In [12]:
numericCols = ['lead_time', 'stays', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'required_car_parking_spaces', 'total_of_special_requests'
              ]
categoryCols = ['hotel', 'arrival_date_year', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type'
               ]
featureCols = numericCols + categoryCols

In [13]:
# ColumnTransformer

# Transforming the categoric columns
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

cat_transformers = [('cat', cat_pipe, categoryCols)]
cat_ct = ColumnTransformer(transformers=cat_transformers)

train_cat_transformed = cat_ct.fit_transform(train_df)

In [14]:
# Transforming the numeric columns
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

num_transformers = [('num', num_pipe, numericCols)]
num_ct = ColumnTransformer(transformers=num_transformers)

train_num_transformed = num_ct.fit_transform(train_df)

In [15]:
# Retrieving the feature names
cat_pl = cat_ct.named_transformers_['cat']
ohe = cat_pl.named_steps['ohe']
transformed_feature_names = list(ohe.get_feature_names()) + numericCols
print("Total number of features = ", len(transformed_feature_names))

Total number of features =  316


In [16]:
# Combining both categorical and numerical column transformations
ct = ColumnTransformer(transformers=[('cat', cat_pipe, categoryCols), ('num', num_pipe, numericCols)])
train_transformed = ct.fit_transform(train_df)

In [17]:
# feature selection
sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
#rfe = RFE(estimator=lr, n_features_to_select=5, step=1)

In [18]:
# SVD
svd = TruncatedSVD(n_components=50) # best 50

In [19]:
# Logistic regression
lr = LogisticRegression(penalty='l2', C=90.0, max_iter=10000)

In [20]:
# Build pipeline
lr_pipe = Pipeline([('transform', ct), ('svd', svd), ('lr', lr)])

In [104]:
# Decide sample weight
sample_weights = train_df.apply(lambda row: pow(row.expected_cost, 1), axis=1)

In [106]:
# Execute pipeline
lr_pipe.fit(train_df, train_label, lr__sample_weight=sample_weights)
lr_pipe.score(train_df, train_label)

ValueError: y should be a 1d array, got an array of shape (640, 2) instead.

In [None]:
# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1126)

# Selecting parameters when Grid Searching
param_grid = {
    'transform__num__si__strategy': ['median'],
    'svd__n_components': [40, 45, 50],
    'lr__penalty': ['l2'],
    'lr__C': [80, 90, 100]
}

gs = GridSearchCV(lr_pipe, param_grid, cv=kf, scoring=make_scorer(log_loss))
gs.fit(train_df, train_label, lr__sample_weight=sample_weights)

print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

In [None]:
'''
# Too slow !!!
# SVC
svc = SVC(C=80.0, kernel='rbf', gamma='scale', probability=True, cache_size=200)
svc_pipe = Pipeline([('transform', ct), ('svd', svd), ('svc', svc)])
svc_pipe.fit(train_df, train_label)
svc_pipe.score(train_df, train_label)
'''

In [36]:
# Use hard classification
will_be_canceled = gs.predict(val_df)
val_df['will_be_canceled'] = will_be_canceled



In [70]:
# Use soft classification
will_be_canceled_prob = lr_pipe.predict_proba(val_df)
will_be_canceled = np.empty(shape=will_be_canceled_prob.shape[0])
for i in range(will_be_canceled.shape[0]):
    will_be_canceled[i] = will_be_canceled_prob[i][1]
will_be_canceled = pd.Series(will_be_canceled)

val_df['will_be_canceled'] = will_be_canceled
print(val_df['will_be_canceled'].value_counts())

1.000000e+00    34
1.725301e-01     2
3.040120e-02     2
7.220556e-02     2
9.918505e-01     1
                ..
3.812331e-16     1
1.772105e-01     1
5.078780e-01     1
7.587975e-02     1
5.089265e-01     1
Name: will_be_canceled, Length: 26955, dtype: int64


In [71]:
def getRevenue(row):
    if row.is_canceled:
        return 0
    return row.expected_cost

def predictRevenue(row):
    return (1.0 - row.will_be_canceled) * row.expected_cost

In [72]:
# Post process
val_df['revenue'] = val_df.apply(getRevenue, axis=1)
val_df['predicted_revenue'] = val_df.apply(predictRevenue, axis=1)

In [73]:
# Aggregate by date
daily_revenue_df = val_df.groupby(['arrival_date']).agg({'revenue':'sum', 'predicted_revenue':'sum'})

In [74]:
thresholds = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]
ranks = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
daily_revenue_df['calculated_label'] = pd.cut(daily_revenue_df.revenue, bins=thresholds,labels=ranks)
daily_revenue_df['predicted_label'] = pd.cut(daily_revenue_df.predicted_revenue, bins=thresholds,labels=ranks)

In [75]:
daily_revenue_df

Unnamed: 0_level_0,revenue,predicted_revenue,calculated_label,predicted_label
arrival_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-07-02,16530.645277,18204.872724,1,1
2015-07-03,12989.951853,15449.986202,1,1
2015-07-06,21441.591688,18522.035301,2,1
2015-07-11,27216.100986,26869.471823,2,2
2015-07-21,15595.747583,14166.861166,1,1
...,...,...,...,...
2017-03-14,19654.761144,23140.751446,1,2
2017-03-18,22949.160166,23598.928514,2,2
2017-03-24,26057.764825,29995.586348,2,2
2017-03-25,24464.132874,22835.898639,2,2


In [92]:
result_df = daily_revenue_df.join(raw_train_label.set_index('arrival_date'), how='inner')

In [80]:
result_df['calculated_err'] = result_df.apply(lambda row: abs(row.calculated_label - row.label), axis=1)
result_df['err'] = result_df.apply(lambda row: abs(row.predicted_label - row.label), axis=1)

In [81]:
result_df

Unnamed: 0_level_0,revenue,predicted_revenue,calculated_label,predicted_label,label,calculated_err,err
arrival_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-07-02,16530.645277,18204.872724,1,1,1.0,0.0,0.0
2015-07-03,12989.951853,15449.986202,1,1,1.0,0.0,0.0
2015-07-06,21441.591688,18522.035301,2,1,2.0,0.0,1.0
2015-07-11,27216.100986,26869.471823,2,2,2.0,0.0,0.0
2015-07-21,15595.747583,14166.861166,1,1,1.0,0.0,0.0
...,...,...,...,...,...,...,...
2017-03-14,19654.761144,23140.751446,1,2,1.0,0.0,1.0
2017-03-18,22949.160166,23598.928514,2,2,2.0,0.0,0.0
2017-03-24,26057.764825,29995.586348,2,2,2.0,0.0,0.0
2017-03-25,24464.132874,22835.898639,2,2,2.0,0.0,0.0


In [82]:
total_calculated_error = result_df['calculated_err'].sum(axis = 0, skipna = True)
total_predicted_error = result_df['err'].sum(axis = 0, skipna = True)
print(total_calculated_error)
print(total_predicted_error)

4.0
51.0


In [83]:
result_df['calculated_err'].value_counts()

0.0    188
1.0      4
Name: calculated_err, dtype: int64

In [84]:
result_df['err'].value_counts()

0.0    144
1.0     45
2.0      3
Name: err, dtype: int64

In [85]:
# Stage 2 analysis
bad_predictions_df = result_df[result_df.err > 0.0]
false_positive = bad_predictions_df[bad_predictions_df.predicted_revenue>bad_predictions_df.revenue].shape[0]
print("predicted > true: ", false_positive)
print("predicted < true: ", bad_predictions_df.shape[0] - false_positive)

predicted > true:  28
predicted < true:  20


In [86]:
bad_predictions_df

Unnamed: 0_level_0,revenue,predicted_revenue,calculated_label,predicted_label,label,calculated_err,err
arrival_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-07-06,21441.591688,18522.035301,2,1,2.0,0.0,1.0
2015-08-15,57467.688304,64891.695907,5,6,5.0,0.0,1.0
2015-09-20,32611.793329,29801.45883,3,2,3.0,0.0,1.0
2015-09-25,95449.120935,84839.765348,9,8,9.0,0.0,1.0
2015-10-14,11114.843383,9184.226266,1,0,1.0,0.0,1.0
2015-10-25,42398.765577,35375.455956,4,3,4.0,0.0,1.0
2015-11-23,28219.132152,36128.580978,2,3,2.0,0.0,1.0
2016-01-29,10139.320964,9809.751199,1,0,1.0,0.0,1.0
2016-02-19,21661.402154,19247.954355,2,1,2.0,0.0,1.0
2016-02-23,10694.432756,9111.355927,1,0,1.0,0.0,1.0


In [23]:
# Train with all training data
# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1126)

# Decide sample weight
sample_weights = raw_train_df.apply(lambda row: pow(row.expected_cost, 1), axis=1)

# Selecting parameters when Grid Searching
param_grid = {
    'transform__num__si__strategy': ['median'],
    'svd__n_components': [40, 45, 50, 55],
    'lr__penalty': ['l2'],
    'lr__C': [80, 90, 100]
}

gs = GridSearchCV(lr_pipe, param_grid, cv=kf, scoring=make_scorer(log_loss))
gs.fit(raw_train_df, raw_train_label, lr__sample_weight=sample_weights)

print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

{'lr__C': 100, 'lr__penalty': 'l2', 'svd__n_components': 50, 'transform__num__si__strategy': 'median'}
6.168234519787664
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0       6.608865      0.774375         0.116733        0.001261          80   
1       7.703421      0.955715         0.117696        0.000803          80   
2       7.749588      1.141011         0.118780        0.001637          80   
3       5.818701      0.870916         0.115441        0.002192          90   
4       7.891132      1.091097         0.117737        0.001571          90   
5       7.681377      0.321138         0.120124        0.002071          90   
6       6.906748      0.386835         0.117435        0.003943         100   
7       7.643118      0.796039         0.119949        0.002208         100   
8       7.793081      0.438887         0.119108        0.001209         100   

  param_lr__penalty param_svd__n_components  \
0                l2                      

In [24]:
# Make prediction on test data
test_df = pd.read_csv("test.csv")
test_nolabel_df = pd.read_csv("test_nolabel.csv")

In [28]:
test_df['arrival_date'] = test_df.apply(getArrivalDate, axis=1)
test_df['stays'] = test_df.apply(lambda row: row.stays_in_weekend_nights + row.stays_in_week_nights, axis=1)
test_df['expected_cost'] = test_df.apply(lambda row: row.adr * row.stays, axis=1)

AttributeError: 'Series' object has no attribute 'adr'

In [26]:
# Use soft classification
will_be_canceled_prob = gs.predict_proba(test_df)
will_be_canceled = np.empty(shape=will_be_canceled_prob.shape[0])
for i in range(will_be_canceled.shape[0]):
    will_be_canceled[i] = will_be_canceled_prob[i][1]
will_be_canceled = pd.Series(will_be_canceled)

test_df['will_be_canceled'] = will_be_canceled
print(test_df['will_be_canceled'].value_counts())

ValueError: Number of features of the input must be equal to or greater than that of the fitted transformer. Transformer n_features is 36 and input n_features is 29.

In [None]:
# Post process
test_df['predicted_revenue'] = val_df.apply(predictRevenue, axis=1)

In [None]:
# Aggregate by date
daily_revenue_df = test_df.groupby(['arrival_date']).agg({'predicted_revenue':'sum'})

In [None]:
daily_revenue_df['predicted_label'] = pd.cut(daily_revenue_df.predicted_revenue, bins=thresholds,labels=ranks)