In [75]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, log_loss
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression, SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor, RandomForestClassifier
from xgboost.sklearn import XGBRegressor
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.kernel_approximation import Nystroem
from joblib import dump, load

In [60]:
# Import data
raw_train_df = pd.read_csv("train.csv")
raw_train_label_df = pd.read_csv('train_label.csv')
test_df = pd.read_csv("test.csv")

In [61]:
# Split into validation set and training set
dates = raw_train_df.arrival_date.unique()
train_indices, val_indices = train_test_split(dates, test_size=0.3)
train_df = raw_train_df.set_index('arrival_date').loc[train_indices, :]
val_df = raw_train_df.set_index('arrival_date').loc[val_indices, :]

In [62]:
train_df.reset_index(drop=False, inplace=True)
val_df.reset_index(drop=False, inplace=True)

In [63]:
# ColumnTransformer

# Transforming the categoric columns
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

# Transforming the numeric columns
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

In [64]:
# Stage 1: Predict ADR by regression
# Determine features
adr_numericCols = ['lead_time', 'arrival_date_year', 'stays', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'persons', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'required_car_parking_spaces', 'total_of_special_requests'
              ]
adr_categoricCols = ['hotel', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type', 'agent'
               ]

adr_featureCols = adr_numericCols + adr_categoricCols

In [45]:
'''
will_be_canceled = predict_canceled_proba(gbc_pipe, train_df)
train_df['will_be_canceled'] = will_be_canceled
'''

"\nwill_be_canceled = predict_canceled_proba(gbc_pipe, train_df)\ntrain_df['will_be_canceled'] = will_be_canceled\n"

In [65]:
adr_train = train_df[adr_featureCols]
adr_val = val_df[adr_featureCols]

In [66]:
# Combining both categorical and numerical column transformations
adr_ct = ColumnTransformer(transformers=[('cat', cat_pipe, adr_categoricCols), ('num', num_pipe, adr_numericCols)])
adr_train_transformed = adr_ct.fit_transform(adr_train)

In [38]:
# Retrieving the feature names
cat_transformers = [('cat', cat_pipe, adr_categoricCols)]
cat_ct = ColumnTransformer(transformers=cat_transformers)
train_cat_transformed = cat_ct.fit_transform(train_df)

num_transformers = [('num', num_pipe, adr_numericCols)]
num_ct = ColumnTransformer(transformers=num_transformers)
train_num_transformed = num_ct.fit_transform(train_df)

cat_pl = cat_ct.named_transformers_['cat']
ohe = cat_pl.named_steps['ohe']
transformed_feature_names = list(ohe.get_feature_names()) + adr_numericCols
print("Total number of features = ", len(transformed_feature_names))

Total number of features =  580


In [134]:
# feature selection
adr_sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
adr_svd = TruncatedSVD(n_components=30) # best 35

In [11]:
# Linear regression
ridge = Ridge(alpha=1.0/(2*100), max_iter=10000) # alpha = 1 / 2C

In [12]:
# feature transformation
poly = PolynomialFeatures(2)

In [13]:
# Build ridge pipeline
ridge_pipe = Pipeline([('adr_transform', adr_ct), ('adr_svd', adr_svd), ('poly', poly), ('ridge', ridge)])

In [15]:
# Execute ridge pipeline
ridge_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(ridge_pipe.score(adr_train, train_df['adr']))
print(ridge_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(ridge_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(ridge_pipe.predict(adr_val), val_df['adr']))

0.606102430361219
0.6599906576605303
1034.7022239340704
698.932785860866


In [26]:
# SVR
rbf_feature = Nystroem(gamma=10**(-1.5), n_components=600, random_state=1126)
sgdreg = SGDRegressor(loss='epsilon_insensitive', alpha=10**(-11), max_iter=10000)

sgd_pipe = Pipeline([('adr_transform', adr_ct), ('rbf', rbf_feature), ('sgdreg', sgdreg)])

In [28]:
# Execute svr pipeline
sgd_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(sgd_pipe.score(adr_train, train_df['adr']))
print(sgd_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(sgd_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(sgd_pipe.predict(adr_val), val_df['adr']))

0.38272605380374847
0.45698674243912296
1591.1612936139725
1179.9846833209801


In [67]:
# Gradient boosting regressor
gbreg = GradientBoostingRegressor(loss='huber',
                                  alpha=0.9,
                                  validation_fraction=0.2,
                                  n_iter_no_change=5,
                                  tol=0.005,
                                  max_depth=4,
                                  n_estimators=10000,
                                  random_state=1126,
                                  warm_start=True)
gbreg_pipe = Pipeline([('adr_transform', adr_ct), ('gbreg', gbreg)])

In [68]:
# Execute gradient boosting pipeline
gbreg_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(gbreg_pipe.score(adr_train, train_df['adr']))
print(gbreg_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(gbreg_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(gbreg_pipe.predict(adr_val), val_df['adr']))

0.7161300645462938
0.8125832790070624
744.780177956058
397.0577868712073


In [85]:
# XGBoost regressor
xgbreg = XGBRegressor(booster='gbtree',
                      learning_rate=0.01,
                      n_estimators=2000,
                      min_child_weight=5,
                      subsample=0.8,
                      colsample_bytree:0.8,
                      max_depth=3,
                      gamma=0.05,
                      random_state=1126,
                      n_jobs=4)
xgbreg_pipe = Pipeline([('adr_transform', adr_ct), ('xgbreg', xgbreg)])

In [86]:
# Execute gradient boosting pipeline
xgbreg_pipe.fit(adr_train, train_df['adr'])

# Correlation coefficients
print(xgbreg_pipe.score(adr_train, train_df['adr']))
print(xgbreg_pipe.score(adr_val, val_df['adr']))

# Mean squared errors
print(mean_squared_error(xgbreg_pipe.predict(adr_train), train_df['adr']))
print(mean_squared_error(xgbreg_pipe.predict(adr_val), val_df['adr']))

0.6607253676806282
0.7438344215068012
890.1436519895433
542.7079138415021


In [None]:
# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1126)

# Selecting parameters when Grid Searching
param_grid = {
    'xgbreg__n_estimators': [1000, 2000, 5000],
    'xgbreg__objective': ['reg:squarederror', 'reg:pseudohubererror']
    #'xgbreg__max_depth': [3, 4],
    #'xgbreg__min_child_weight': [1, 3, 5, 7]
}

raw_adr_train = raw_train_df[adr_featureCols]
gs = GridSearchCV(xgbreg_pipe, param_grid, cv=kf, n_jobs=4)
gs.fit(raw_adr_train, raw_train_df['adr'])

print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

In [44]:
raw_train_df['adr'].mean()

87.0436366032985

In [45]:
# Stage 2: Predict is_canceled by soft logistic regression
# Determine features
isc_numericCols = ['lead_time', 'arrival_date_year', 'stays', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'persons', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'required_car_parking_spaces', 'total_of_special_requests'
              ]
isc_categoricCols = ['hotel', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type', 'company', 'agent'
               ]
isc_featureCols = isc_numericCols + isc_categoricCols

In [46]:
isc_train = train_df[isc_featureCols]
isc_val = val_df[isc_featureCols]

In [47]:
# Combining both categorical and numerical column transformations
isc_ct = ColumnTransformer(transformers=[('cat', cat_pipe, isc_categoricCols), ('num', num_pipe, isc_numericCols)])
isc_train_transformed = isc_ct.fit_transform(isc_train)

In [48]:
# feature selection
isc_sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
isc_svd = TruncatedSVD(n_components=50)

In [17]:
# Logostic regression
logistic = LogisticRegression(penalty='l2', C=90.0, max_iter=10000)

# Build pipeline
logistic_pipe = Pipeline([('isc_transform', isc_ct), ('isc_svd', isc_svd), ('logistic', logistic)])

# Decide sample weight
sample_weights = train_df.apply(lambda row: pow(row.expected_cost, 1), axis=1)

In [19]:
# Execute pipeline
logistic_pipe.fit(isc_train, train_df['is_canceled'], logistic__sample_weight=sample_weights)

# Correlation coefficients
print(logistic_pipe.score(isc_train, train_df['is_canceled']))
print(logistic_pipe.score(isc_val, val_df['is_canceled']))

0.8288394089920808
0.8304562737642586


In [219]:
# Gradient boosting classifier
gbc = GradientBoostingClassifier(validation_fraction=0.2, n_iter_no_change=10, tol=0.001, n_estimators=10000, random_state=1126)

# Build pipeline
gbc_pipe = Pipeline([('isc_transform', isc_ct), ('gbc', gbc)])

In [153]:
# Execute pipeline
gbc_pipe.fit(isc_train, train_df['is_canceled'])

# Correlation coefficients
print(gbc_pipe.score(isc_train, train_df['is_canceled']))
print(gbc_pipe.score(isc_val, val_df['is_canceled']))

0.8783696287288979
0.8785208557795906


In [236]:
# Random forest classifier
rfc = RandomForestClassifier(n_estimators=500, max_depth=None, max_features='sqrt', random_state=1126, n_jobs=4)
rfc_pipe = Pipeline([('isc_transform', isc_ct), ('rfc', rfc)])

In [223]:
# Execute pipeline
rfc_pipe.fit(isc_train, train_df['is_canceled'])

# Correlation coefficients
print(rfc_pipe.score(isc_train, train_df['is_canceled']))
print(rfc_pipe.score(isc_val, val_df['is_canceled']))

0.9947412472561312
0.8740572571956288


In [119]:
def predict_canceled_proba(isc_model, X_df):
    predicted_prob = isc_model.predict_proba(X_df)
    will_be_canceled = np.empty(shape=predicted_prob.shape[0])
    for i in range(will_be_canceled.shape[0]):
        will_be_canceled[i] = predicted_prob[i][1]
    return will_be_canceled

In [158]:
# Make prediction
def predict_daily_rank(isc_model, adr_model, X_df):
    # Predict "is_canceled"
    will_be_canceled = predict_canceled_proba(isc_model, X_df)
    
    # Predict "adr"
    predicted_adr = adr_model.predict(X_df)
       
    # Predict revenue
    predicted_revenue = predicted_adr * (1.0 - will_be_canceled) * (np.array(X_df['stays']))
    X_df['predicted_revenue'] = pd.Series(predicted_revenue)
    
    # Aggregate by date
    daily_df = X_df.groupby(['arrival_date']).agg({'predicted_revenue':'sum'})
    
    thresholds = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]
    ranks = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    daily_df['label'] = pd.cut(daily_df.predicted_revenue, bins=thresholds,labels=ranks)
    daily_df.reset_index(drop=False, inplace=True)
    
    return daily_df[['arrival_date', 'label']]
    

In [226]:
# Make prediction on validation data
daily_df = predict_daily_rank(rfc_pipe, gbreg_pipe, val_df)



In [227]:
result_df = daily_df.set_index('arrival_date').join(raw_train_label_df.set_index('arrival_date'), lsuffix="_predicted", rsuffix="_true", how='inner')

In [228]:
result_df['err'] = result_df.apply(lambda row: abs(row.label_predicted - row.label_true), axis=1)

In [229]:
total_error = result_df['err'].sum(axis = 0, skipna = True) / result_df.shape[0]
print(total_error)

0.15625


In [163]:
print(result_df['err'].value_counts())

0.0    153
1.0     36
3.0      2
2.0      1
Name: err, dtype: int64


In [93]:
daily_df

Unnamed: 0,arrival_date,label
0,2015-07-03,1
1,2015-07-05,2
2,2015-07-06,2
3,2015-07-07,1
4,2015-07-10,2
...,...,...
187,2017-03-17,2
188,2017-03-20,3
189,2017-03-27,2
190,2017-03-30,2


In [237]:
# Train model with all training data
# Train is_canceled
isc_raw_train = raw_train_df[isc_featureCols]
#sample_weights = raw_train_df.apply(lambda row: pow(row.expected_cost, 1), axis=1)
rfc_pipe.fit(isc_raw_train, raw_train_df['is_canceled'])

Pipeline(steps=[('isc_transform',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('si',
                                                                   SimpleImputer(fill_value='MISSING',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['hotel',
                                                   'arrival_date_month',
                                                   'arrival_date_week_number',
                                                   'arrival_date_day_of_month',
                                                   'meal', 'country',
                                                   'market_segment',
              

In [169]:
# Train adr
#will_be_canceled = pd.Series(predict_canceled_proba(gbc_pipe, raw_train_df))
#raw_train_df['will_be_canceled'] = will_be_canceled
adr_raw_train = raw_train_df[adr_featureCols]
gbreg_pipe.fit(adr_raw_train, raw_train_df['adr'])

Pipeline(steps=[('adr_transform',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('si',
                                                                   SimpleImputer(fill_value='MISSING',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['hotel',
                                                   'arrival_date_month',
                                                   'arrival_date_week_number',
                                                   'arrival_date_day_of_month',
                                                   'meal', 'country',
                                                   'market_segment',
              

In [239]:
# Make prediction on test data
#will_be_canceled = pd.Series(predict_canceled_proba(gbc_pipe, test_df))
#test_df['will_be_canceled'] = will_be_canceled
prediction_df = predict_daily_rank(rfc_pipe, gbreg_pipe, test_df)



In [240]:
prediction_df

Unnamed: 0,arrival_date,label
0,2017-04-01,3
1,2017-04-02,2
2,2017-04-03,3
3,2017-04-04,1
4,2017-04-05,3
...,...,...
148,2017-08-27,5
149,2017-08-28,8
150,2017-08-29,3
151,2017-08-30,3


In [241]:
# 人肉7
prediction_df['label'].value_counts()

3    44
4    41
2    24
5    18
6    14
7     7
1     3
9     1
8     1
0     0
Name: label, dtype: int64

In [242]:
prediction_df.to_csv("prediction.csv", index=False)

In [170]:
# Save regression model
dump(gbreg_pipe, 'gbreg_pipe_10000_huber_oob-0.2_es-0.005-5_td-4.joblib')

['gbreg_pipe_10000_huber_oob-0.2_es-0.005-5_td-4.joblib']

In [238]:
# Save classification medel
dump(rfc_pipe, 'rfc_pipe_500_sqrt.joblib')

['rfc_pipe_500_sqrt.joblib']

In [164]:
# Load model
gbc_pipe = load('gbc_pipe_5000_oob-0.2_es-0.001-10.joblib')

In [224]:
gbreg_pipe = load('gbreg_pipe_10000_huber_oob-0.2_es-0.005-5_td-4.joblib')