In [81]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, log_loss
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression, SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor, RandomForestClassifier
from xgboost.sklearn import XGBRegressor, XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.kernel_approximation import Nystroem
from joblib import dump, load
from matplotlib import pyplot

In [3]:
# Import data
raw_train_df = pd.read_csv("train.csv")
raw_train_label_df = pd.read_csv('train_label.csv')
test_df = pd.read_csv("test.csv")

In [4]:
# Split into validation set and training set
dates = raw_train_df.arrival_date.unique()
train_indices, val_indices = train_test_split(dates, test_size=0.3)
train_df = raw_train_df.set_index('arrival_date').loc[train_indices, :]
val_df = raw_train_df.set_index('arrival_date').loc[val_indices, :]

In [5]:
train_df.reset_index(drop=False, inplace=True)
val_df.reset_index(drop=False, inplace=True)

In [6]:
# ColumnTransformer

# Transforming the categoric columns
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

# Transforming the numeric columns
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

In [7]:
# Stage 1: Predict ADR by regression
# Determine features
adr_numericCols = ['lead_time', 'arrival_date_year', 'stays', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'persons', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'required_car_parking_spaces', 'total_of_special_requests'
              ]
adr_categoricCols = ['hotel', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type', 'agent', 'company'
               ]

adr_featureCols = adr_numericCols + adr_categoricCols

In [7]:
'''
will_be_canceled = predict_canceled_proba(gbc_pipe, train_df)
train_df['will_be_canceled'] = will_be_canceled
'''

"\nwill_be_canceled = predict_canceled_proba(gbc_pipe, train_df)\ntrain_df['will_be_canceled'] = will_be_canceled\n"

In [8]:
adr_train = train_df[adr_featureCols]
adr_val = val_df[adr_featureCols]

In [8]:
# Combining both categorical and numerical column transformations
adr_ct = ColumnTransformer(transformers=[('cat', cat_pipe, adr_categoricCols), ('num', num_pipe, adr_numericCols)])
adr_train_transformed = adr_ct.fit_transform(adr_train)

In [9]:
# Retrieving the feature names
cat_transformers = [('cat', cat_pipe, adr_categoricCols)]
cat_ct = ColumnTransformer(transformers=cat_transformers)
train_cat_transformed = cat_ct.fit_transform(train_df)

num_transformers = [('num', num_pipe, adr_numericCols)]
num_ct = ColumnTransformer(transformers=num_transformers)
train_num_transformed = num_ct.fit_transform(train_df)

cat_pl = cat_ct.named_transformers_['cat']
ohe = cat_pl.named_steps['ohe']
transformed_feature_names = list(ohe.get_feature_names()) + adr_numericCols
print("Total number of features = ", len(transformed_feature_names))

Total number of features =  870


In [132]:
# feature selection
adr_sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
adr_svd = TruncatedSVD(n_components=30) # best 35

In [133]:
# Linear regression
ridge = Ridge(alpha=1.0/(2*100), max_iter=10000) # alpha = 1 / 2C

In [134]:
# feature transformation
poly = PolynomialFeatures(2)

In [135]:
# Build ridge pipeline
ridge_pipe = Pipeline([('adr_transform', adr_ct), ('adr_svd', adr_svd), ('poly', poly), ('ridge', ridge)])

In [136]:
# Execute ridge pipeline
ridge_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(ridge_pipe.score(adr_train, train_df['adr']))
print(ridge_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(ridge_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(ridge_pipe.predict(adr_val), val_df['adr']))

0.5396044840978833
0.6039522121224565
1224.477257650087
794.8081163227173


In [26]:
# SVR
rbf_feature = Nystroem(gamma=10**(-1.5), n_components=600, random_state=1126)
sgdreg = SGDRegressor(loss='epsilon_insensitive', alpha=10**(-11), max_iter=10000)

sgd_pipe = Pipeline([('adr_transform', adr_ct), ('rbf', rbf_feature), ('sgdreg', sgdreg)])

In [28]:
# Execute svr pipeline
sgd_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(sgd_pipe.score(adr_train, train_df['adr']))
print(sgd_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(sgd_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(sgd_pipe.predict(adr_val), val_df['adr']))

0.38272605380374847
0.45698674243912296
1591.1612936139725
1179.9846833209801


In [25]:
# Gradient boosting regressor
gbreg = GradientBoostingRegressor(loss='huber',
                                  alpha=0.9,
                                  validation_fraction=0.2,
                                  n_iter_no_change=5,
                                  tol=0.005,
                                  max_depth=4,
                                  n_estimators=10000,
                                  random_state=1126,
                                  warm_start=False,
                                  verbose=True)
gbreg_pipe = Pipeline([('adr_transform', adr_ct), ('gbreg', gbreg)])

In [26]:
# Execute gradient boosting pipeline
gbreg_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(gbreg_pipe.score(adr_train, train_df['adr']))
print(gbreg_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(gbreg_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(gbreg_pipe.predict(adr_val), val_df['adr']))

      Iter       Train Loss   Remaining Time 
         1         973.2872           28.69m
         2         927.4869           26.89m
         3         876.2162           26.52m
         4         825.3887           25.87m
         5         789.1283           25.80m
         6         759.0266           25.55m
         7         737.2141           25.37m
         8         707.6285           25.20m
         9         679.5077           25.36m
        10         658.5888           25.59m
        20         524.2881           25.62m
        30         446.7734           25.32m
        40         399.5543           24.76m
        50         368.1456           24.48m
        60         338.6124           24.24m
        70         314.8602           24.20m
        80         295.5711           24.11m
        90         279.7047           23.94m
       100         269.0907           23.73m
       200         204.2607           22.61m
       300         181.2113           21.80m
       40

In [156]:
# XGBoost regressor
xgbreg = XGBRegressor(booster='gbtree',
                      objective='reg:pseudohubererror',
                      learning_rate=0.01,
                      n_estimators=20000,
                      min_child_weight=1,
                      subsample=0.6,
                      colsample_bytree=0.8, # best
                      max_depth=6,
                      gamma=0.0,
                      random_state=1126)
xgbreg_pipe = Pipeline([('adr_transform', adr_ct), ('xgbreg', xgbreg)])

In [155]:
# Execute gradient boosting pipeline
xgbreg_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(xgbreg_pipe.score(adr_train, train_df['adr']))
print(xgbreg_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(xgbreg_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(xgbreg_pipe.predict(adr_val), val_df['adr']))

0.7169984567844712
0.8172273522999641
752.6766477477331
366.79710954148254


In [77]:
# Cross-Validation for regression
kf = KFold(n_splits=3, shuffle=True, random_state=1126)

# Selecting parameters when Grid Searching
param_grid = {
    'xgbreg__n_estimators': [1000, 3000, 5000],
    #'xgbreg__objective': ['reg:squarederror', 'reg:pseudohubererror']
    #'xgbreg__max_depth': [3, 4],
    #'xgbreg__min_child_weight': [1, 3, 5, 7]
}

raw_adr_train = raw_train_df[adr_featureCols]
gs = GridSearchCV(xgbreg_pipe, param_grid, cv=kf, n_jobs=4)
gs.fit(raw_adr_train, raw_train_df['adr'])

print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

KeyboardInterrupt: 

In [44]:
raw_train_df['adr'].mean()

87.0436366032985

In [10]:
# Stage 2: Predict is_canceled by soft logistic regression
# Determine features
isc_numericCols = ['lead_time', 'arrival_date_year', 'stays', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'persons', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'required_car_parking_spaces', 'total_of_special_requests'
              ]
isc_categoricCols = ['hotel', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type', 'company', 'agent'
               ]
isc_featureCols = isc_numericCols + isc_categoricCols

In [11]:
isc_train = train_df[isc_featureCols]
isc_val = val_df[isc_featureCols]

In [12]:
# Combining both categorical and numerical column transformations
isc_ct = ColumnTransformer(transformers=[('cat', cat_pipe, isc_categoricCols), ('num', num_pipe, isc_numericCols)])
isc_train_transformed = isc_ct.fit_transform(isc_train)

In [13]:
# feature selection
isc_sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
isc_svd = TruncatedSVD(n_components=50)

In [17]:
# Logostic regression
logistic = LogisticRegression(penalty='l2', C=90.0, max_iter=10000)

# Build pipeline
logistic_pipe = Pipeline([('isc_transform', isc_ct), ('isc_svd', isc_svd), ('logistic', logistic)])

# Decide sample weight
sample_weights = train_df.apply(lambda row: pow(row.expected_cost, 1), axis=1)

In [19]:
# Execute pipeline
logistic_pipe.fit(isc_train, train_df['is_canceled'], logistic__sample_weight=sample_weights)

# Correlation coefficients
print(logistic_pipe.score(isc_train, train_df['is_canceled']))
print(logistic_pipe.score(isc_val, val_df['is_canceled']))

0.8288394089920808
0.8304562737642586


In [41]:
# Gradient boosting classifier
gbc = GradientBoostingClassifier(validation_fraction=0.2, n_iter_no_change=10, tol=0.001, n_estimators=10000, random_state=1126)

# Build pipeline
gbc_pipe = Pipeline([('isc_transform', isc_ct), ('gbc', gbc)])

In [42]:
# Execute pipeline
gbc_pipe.fit(isc_train, train_df['is_canceled'])

# Correlation coefficients
print(gbc_pipe.score(isc_train, train_df['is_canceled']))
print(gbc_pipe.score(isc_val, val_df['is_canceled']))

0.8799809417930596
0.8639641691338343


In [16]:
# Random forest classifier
rfc = RandomForestClassifier(n_estimators=500, max_depth=None, max_features='sqrt', random_state=1126, n_jobs=4)
rfc_pipe = Pipeline([('isc_transform', isc_ct), ('rfc', rfc)])

In [17]:
# Execute pipeline
rfc_pipe.fit(isc_train, train_df['is_canceled'])

# Correlation coefficients
print(rfc_pipe.score(isc_train, train_df['is_canceled']))
print(rfc_pipe.score(isc_val, val_df['is_canceled']))

0.9944447067340226
0.8718177584601056


In [156]:
# XGBoost classifier
xgbc = XGBClassifier(booster='gbtree',
                     objective='binary:logistic',
                     learning_rate=0.01,
                     n_estimators=2500,
                     min_child_weight=1,
                     subsample=0.8,
                     colsample_bytree=0.8, # best
                     max_depth=6,
                     gamma=3.5,
                     random_state=1126)
xgbc_pipe = Pipeline([('isc_transform', isc_ct), ('xgbc', xgbc)])

In [157]:
# Execute gradient boosting pipeline
xgbc_pipe.fit(isc_train, train_df['is_canceled'])

# Correlation coefficients
print(xgbc_pipe.score(isc_train, train_df['is_canceled']))
print(xgbc_pipe.score(isc_val, val_df['is_canceled']))

# Cross entropy loss
print(log_loss(xgbc_pipe.predict(isc_train), train_df['is_canceled']))
print(log_loss(xgbc_pipe.predict(isc_val), val_df['is_canceled']))



0.8996903041372191
0.8712518029302361
3.46462577739712
4.446872354082216


In [133]:
# Cross-Validation for classification
kf = KFold(n_splits=3, shuffle=True, random_state=1126)

# Selecting parameters when Grid Searching
param_grid = {
    'xgbc__n_estimators': [1000, 1500, 2000, 2500],
    'xgbc__gamma': [2.5, 3, 3.5, 4]
    #'xgbc__max_depth': [6, 7, 8],
    #'xgbc__min_child_weight': [1, 2]
}

raw_isc_train = raw_train_df[isc_featureCols]
gs = GridSearchCV(xgbc_pipe, param_grid, cv=kf, scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True))
gs.fit(raw_isc_train, raw_train_df['is_canceled'])

print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))



































































































































































































{'xgbc__gamma': 2.5, 'xgbc__n_estimators': 2500}
-0.2541087412398761
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       24.870938      0.199448         0.712743        0.039753   
1       36.259700      0.279511         0.806287        0.057031   
2       47.677677      0.167785         0.884678        0.078864   
3       58.615890      0.553827         0.928959        0.064682   
4       25.211329      0.077153         0.701996        0.017052   
5       36.425261      0.370019         0.761870        0.007250   
6       47.338361      0.403701         0.816493        0.004352   
7       58.698398      0.665474         0.872329        0.012453   
8       24.786041      0.361252         0.676111        0.001238   
9       36.171772      0.648569         0.752358        0.002150   
10      47.364672      0.238556         0.814148        0.004893   
11      58.459388      0.239972         0.892255        0.043124   
12      24.915874      0.365532         0.71409

In [17]:
def predict_canceled_proba(isc_model, X_df):
    predicted_prob = isc_model.predict_proba(X_df)
    will_be_canceled = np.empty(shape=predicted_prob.shape[0])
    for i in range(will_be_canceled.shape[0]):
        will_be_canceled[i] = predicted_prob[i][1]
    return will_be_canceled

In [18]:
# Make prediction
def predict_daily_rank(isc_model, adr_model, X_df):
    # Predict "is_canceled"
    will_be_canceled = predict_canceled_proba(isc_model, X_df)
    
    # Predict "adr"
    predicted_adr = adr_model.predict(X_df)
       
    # Predict revenue
    predicted_revenue = predicted_adr * (1.0 - will_be_canceled) * (np.array(X_df['stays']))
    X_df['predicted_revenue'] = pd.Series(predicted_revenue)
    
    # Aggregate by date
    daily_df = X_df.groupby(['arrival_date']).agg({'predicted_revenue':'sum'})
    
    thresholds = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]
    ranks = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    daily_df['label'] = pd.cut(daily_df.predicted_revenue, bins=thresholds,labels=ranks)
    daily_df.reset_index(drop=False, inplace=True)
    
    return daily_df[['arrival_date', 'label']]
    

In [134]:
# Make prediction on validation data
daily_df = predict_daily_rank(rfc_pipe, xgbreg_pipe, val_df)

NameError: name 'rfc_pipe' is not defined

In [21]:
result_df = daily_df.set_index('arrival_date').join(raw_train_label_df.set_index('arrival_date'), lsuffix="_predicted", rsuffix="_true", how='inner')

In [63]:
result_df['err'] = result_df.apply(lambda row: abs(row.label_predicted - row.label_true), axis=1)

In [64]:
total_error = result_df['err'].sum(axis = 0, skipna = True) / result_df.shape[0]
print(total_error)

0.20833333333333334


In [65]:
print(result_df['err'].value_counts())

0.0    152
1.0     40
Name: err, dtype: int64


In [93]:
daily_df

Unnamed: 0,arrival_date,label
0,2015-07-03,1
1,2015-07-05,2
2,2015-07-06,2
3,2015-07-07,1
4,2015-07-10,2
...,...,...
187,2017-03-17,2
188,2017-03-20,3
189,2017-03-27,2
190,2017-03-30,2


In [140]:
# Train model with all training data
# Train is_canceled
isc_raw_train = raw_train_df[isc_featureCols]
#sample_weights = raw_train_df.apply(lambda row: pow(row.expected_cost, 1), axis=1)
xgbc_pipe.fit(isc_raw_train, raw_train_df['is_canceled'])





Pipeline(steps=[('isc_transform',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('si',
                                                                   SimpleImputer(fill_value='MISSING',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['hotel',
                                                   'arrival_date_month',
                                                   'arrival_date_week_number',
                                                   'arrival_date_day_of_month',
                                                   'meal', 'country',
                                                   'market_segment',
              

In [150]:
# Train adr
#will_be_canceled = pd.Series(predict_canceled_proba(gbc_pipe, raw_train_df))
#raw_train_df['will_be_canceled'] = will_be_canceled
adr_raw_train = raw_train_df[adr_featureCols]
xgbreg_pipe.fit(adr_raw_train, raw_train_df['adr'])

KeyboardInterrupt: 

In [158]:
# Make prediction on test data
#will_be_canceled = pd.Series(predict_canceled_proba(gbc_pipe, test_df))
#test_df['will_be_canceled'] = will_be_canceled
prediction_df = predict_daily_rank(xgbc_pipe, xgbreg_pipe, test_df)



In [159]:
prediction_df

Unnamed: 0,arrival_date,label
0,2017-04-01,3
1,2017-04-02,2
2,2017-04-03,4
3,2017-04-04,1
4,2017-04-05,3
...,...,...
148,2017-08-27,5
149,2017-08-28,7
150,2017-08-29,3
151,2017-08-30,3


In [160]:
# 人肉7
prediction_df['label'].value_counts()

4    49
3    38
2    22
5    18
6    15
7     7
1     3
8     1
9     0
0     0
Name: label, dtype: int64

In [147]:
prediction_df.to_csv("prediction.csv", index=False)

In [83]:
# Save regression model
dump(xgbreg_pipe, 'regression_models/xgbreg_pipe_20000_huber_lr-0.01_td-6_mcw-1_ss-0.8_cs-0.8_gamma-0.01.joblib')

['xgbreg_pipe_20000_huber_lr-0.01_td-6_mcw-1_ss-0.8_cs-0.8_gamma-0.01.joblib']

In [155]:
# Save classification medel
dump(xgbc_pipe, 'classification_models/xgbc_pipe_2000_log_lr-0.01_td-6_mcw-1_ss-0.8_cs-0.8_gamma-0.0.joblib')

['classification_models/xgbc_pipe_2000_log_lr-0.01_td-6_mcw-1_ss-0.8_cs-0.8_gamma-0.0.joblib']

In [27]:
# Load model
xgbc_pipe = load('classification_models/xgbc_pipe_20000_log_lr-0.01_td-6_mcw-1_ss-0.8_cs-0.8_gamma-0.01.joblib')

In [16]:
xgbreg_pipe = load('regression_models/xgbreg_pipe_20000_huber_lr-0.01_td-6_mcw-1_ss-0.8_cs-0.8_gamma-0.01.joblib')