In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, log_loss
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression, SGDRegressor
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.kernel_approximation import Nystroem

In [2]:
# Import training data: read the training table and the label table from CSV
raw_train_df, raw_train_label_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", 'train_label.csv')
)

In [3]:
# Split into validation set and training set.
# The split is made over the unique arrival dates, so every row that shares an
# arrival_date ends up in the same partition.
dates = raw_train_df.arrival_date.unique()
# Fixed seed (same value used by the other seeded estimators in this notebook)
# so that Restart & Run All reproduces the same partition.
train_indices, val_indices = train_test_split(dates, test_size=0.3, random_state=1126)
# Index by date once, then pick the rows belonging to each group of dates.
raw_by_date = raw_train_df.set_index('arrival_date')
train_df = raw_by_date.loc[train_indices, :]
val_df = raw_by_date.loc[val_indices, :]

In [4]:
# Move the index back into a regular column and restore a default integer index
train_df = train_df.reset_index(drop=False)
val_df = val_df.reset_index(drop=False)

In [5]:
# ColumnTransformer building blocks

# Transforming the categoric columns: fill missing values with the constant
# 'MISSING', then one-hot encode. Categories not seen during fit are ignored
# at transform time instead of raising.
# OneHotEncoder produces sparse output by default, so the explicit `sparse=True`
# (deprecated in scikit-learn 1.2 in favour of `sparse_output`, later removed)
# is omitted; this works on both old and new releases.
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

# Transforming the numeric columns: median imputation, then standardisation
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

In [6]:
# Stage 1: Predict ADR by regression
# Determine features: columns treated as numeric ...
adr_numericCols = [
    'lead_time',
    'arrival_date_year',
    'stays',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'persons',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'required_car_parking_spaces',
    'total_of_special_requests',
]
# ... and columns treated as categorical (one-hot encoded downstream)
adr_categoricCols = [
    'hotel',
    'arrival_date_month',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
]

# Numeric columns first, categorical columns after
adr_featureCols = [*adr_numericCols, *adr_categoricCols]

In [7]:
# Restrict both partitions to the stage-1 feature columns
adr_train, adr_val = train_df[adr_featureCols], val_df[adr_featureCols]

In [8]:
# Combining both categorical and numerical column transformations:
# categorical pipeline output comes first, numeric pipeline output second
adr_ct = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, adr_categoricCols),
        ('num', num_pipe, adr_numericCols),
    ]
)
# Fit on the training features and keep the transformed matrix
adr_train_transformed = adr_ct.fit_transform(adr_train)

In [9]:
# Retrieving the feature names
# The categorical and numeric pipelines are fitted separately here so that the
# fitted one-hot encoder can be inspected for its generated column names.
cat_transformers = [('cat', cat_pipe, adr_categoricCols)]
cat_ct = ColumnTransformer(transformers=cat_transformers)
train_cat_transformed = cat_ct.fit_transform(train_df)

num_transformers = [('num', num_pipe, adr_numericCols)]
num_ct = ColumnTransformer(transformers=num_transformers)
train_num_transformed = num_ct.fit_transform(train_df)

cat_pl = cat_ct.named_transformers_['cat']
ohe = cat_pl.named_steps['ohe']
# `get_feature_names` was superseded by `get_feature_names_out` and is absent
# from recent scikit-learn releases; use whichever the installed version has.
# Passing the source column names yields readable names ('hotel_...') rather
# than positional ones ('x0_...').
if hasattr(ohe, 'get_feature_names_out'):
    ohe_feature_names = ohe.get_feature_names_out(adr_categoricCols)
else:
    ohe_feature_names = ohe.get_feature_names(adr_categoricCols)
# One-hot columns first, numeric columns after (same order as the 'cat'/'num' transformers)
transformed_feature_names = list(ohe_feature_names) + adr_numericCols
print("Total number of features = ", len(transformed_feature_names))

Total number of features =  311


In [10]:
# feature selection
# NOTE(review): adr_sel is defined here but is not a step of the pipelines
# visible in this notebook.
adr_sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
# Seeded (same value as the other seeded estimators in this notebook) so the
# randomized SVD solver returns the same components on every run.
adr_svd = TruncatedSVD(n_components=10, random_state=1126) # best 35

In [11]:
# Linear regression with L2 penalty
ridge = Ridge(
    alpha=0.005,  # alpha = 1 / 2C, here with C = 100
    max_iter=10000,
)

In [12]:
# feature transformation: polynomial expansion up to degree 2
poly = PolynomialFeatures(degree=2)

In [13]:
# Build ridge pipeline: column transforms -> SVD reduction -> polynomial expansion -> ridge
ridge_pipe = Pipeline(
    [
        ('adr_transform', adr_ct),
        ('adr_svd', adr_svd),
        ('poly', poly),
        ('ridge', ridge),
    ]
)

In [15]:
# Execute ridge pipeline
ridge_pipe.fit(adr_train, train_df['adr'])

# Coefficient of determination (R^2) on the training and validation sets;
# this is what a regressor's `score` reports, not a correlation coefficient.
print(ridge_pipe.score(adr_train, train_df['adr']))
print(ridge_pipe.score(adr_val, val_df['adr']))

# Mean squared errors, with arguments in scikit-learn's (y_true, y_pred) order
print(mean_squared_error(train_df['adr'], ridge_pipe.predict(adr_train)))
print(mean_squared_error(val_df['adr'], ridge_pipe.predict(adr_val)))

0.606102430361219
0.6599906576605303
1034.7022239340704
698.932785860866


In [26]:
# SVR (approximate): a Nystroem kernel feature map followed by a linear model
# trained with SGD on the epsilon-insensitive loss
rbf_feature = Nystroem(gamma=10**(-1.5), n_components=600, random_state=1126)
# Seeded like the feature map so that SGD's data shuffling is reproducible too
sgdreg = SGDRegressor(loss='epsilon_insensitive', alpha=10**(-11), max_iter=10000, random_state=1126)

sgd_pipe = Pipeline([('adr_transform', adr_ct), ('rbf', rbf_feature), ('sgdreg', sgdreg)])

In [28]:
# Execute svr pipeline
sgd_pipe.fit(adr_train, train_df['adr'])

# Coefficient of determination (R^2) on the training and validation sets;
# this is what a regressor's `score` reports, not a correlation coefficient.
print(sgd_pipe.score(adr_train, train_df['adr']))
print(sgd_pipe.score(adr_val, val_df['adr']))

# Mean squared errors, with arguments in scikit-learn's (y_true, y_pred) order
print(mean_squared_error(train_df['adr'], sgd_pipe.predict(adr_train)))
print(mean_squared_error(val_df['adr'], sgd_pipe.predict(adr_val)))

0.38272605380374847
0.45698674243912296
1591.1612936139725
1179.9846833209801


In [None]:
# Cross-Validation: five shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=1126)

# Selecting parameters when Grid Searching.
# Keys address nested pipeline steps via the '<step>__<param>' convention;
# the ridge alphas are 1 / (2C) for each candidate C.
param_grid = {
    'adr_transform__num__si__strategy': ['median'],
    'adr_svd__n_components': [35],
    'ridge__alpha': [1.0/(2 * C) for C in (10, 40, 70, 100)],
}

gs = GridSearchCV(estimator=ridge_pipe, param_grid=param_grid, cv=kf)
gs.fit(adr_train, train_df['adr'])

# Best parameter combination and its mean cross-validated score
print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame.from_dict(gs.cv_results_))

In [16]:
# Mean adr over the full (unsplit) training data, as a scale reference
raw_train_df.adr.mean()

87.0436366032985

In [26]:
# Stage 2: Predict is_canceled by soft logistic regression
# Determine features: columns treated as numeric ...
isc_numericCols = [
    'lead_time',
    'arrival_date_year',
    'stays',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'persons',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'required_car_parking_spaces',
    'total_of_special_requests',
]
# ... and columns treated as categorical (one-hot encoded downstream)
isc_categoricCols = [
    'hotel',
    'arrival_date_month',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
]
# Numeric columns first, categorical columns after
isc_featureCols = [*isc_numericCols, *isc_categoricCols]

In [27]:
# Restrict both partitions to the stage-2 feature columns
isc_train, isc_val = train_df[isc_featureCols], val_df[isc_featureCols]

In [28]:
# Combining both categorical and numerical column transformations:
# categorical pipeline output comes first, numeric pipeline output second
isc_ct = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, isc_categoricCols),
        ('num', num_pipe, isc_numericCols),
    ]
)
# Fit on the training features and keep the transformed matrix
isc_train_transformed = isc_ct.fit_transform(isc_train)

In [29]:
# feature selection
# NOTE(review): isc_sel is defined here but is not a step of the pipeline
# visible in this notebook.
isc_sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
# Seeded (same value as the other seeded estimators in this notebook) so the
# randomized SVD solver returns the same components on every run.
isc_svd = TruncatedSVD(n_components=50, random_state=1126)

In [30]:
# Logistic regression with an L2 penalty (C is the inverse regularisation strength)
logistic = LogisticRegression(
    C=90.0,
    penalty='l2',
    max_iter=10000,
)

In [31]:
# Build pipeline: column transforms -> SVD reduction -> logistic regression
logistic_pipe = Pipeline(
    [
        ('isc_transform', isc_ct),
        ('isc_svd', isc_svd),
        ('logistic', logistic),
    ]
)

In [32]:
# Decide sample weight
# Each training row is weighted by its expected_cost raised to a power
# (an exponent of 1 keeps the raw value). Computed on the whole column at once
# instead of calling a Python lambda for every row.
sample_weights = train_df['expected_cost'].pow(1)

In [34]:
# Execute pipeline
# The 'logistic__' prefix routes sample_weight to the fit() of the 'logistic' step only.
logistic_pipe.fit(isc_train, train_df['is_canceled'], logistic__sample_weight=sample_weights)

# Mean accuracy on the training and validation sets (a classifier's `score`
# is accuracy, not a correlation coefficient; no sample weights are applied here)
print(logistic_pipe.score(isc_train, train_df['is_canceled']))
print(logistic_pipe.score(isc_val, val_df['is_canceled']))

0.8194602093580527
0.8196220874067777




ValueError: Unknown label type: (0        9.886840e-01
1        2.034994e-01
2        1.358755e-01
3        1.485171e-01
4        3.427470e-12
             ...     
63427    4.975677e-02
63428    3.630723e-02
63429    2.985079e-02
63430    9.819158e-02
63431    4.094753e-01
Length: 63432, dtype: float64,)

In [51]:
def predict_canceled_proba(logistic_pipe, X):
    """Return the second column of ``logistic_pipe.predict_proba(X)``.

    Parameters
    ----------
    logistic_pipe : object
        Fitted estimator or pipeline exposing ``predict_proba``.
    X : array-like or DataFrame
        Samples forwarded unchanged to ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per sample: column index 1 of the
        predicted probability matrix. Presumably this is the probability of
        ``is_canceled == 1`` -- confirm against the fitted classes order.
    """
    predicted_prob = np.asarray(logistic_pipe.predict_proba(X), dtype=float)
    # Slice the column in one step instead of looping over rows in Python;
    # copy so the caller gets an independent array rather than a view.
    return predicted_prob[:, 1].copy()

In [57]:
# Make prediction

# Predict "adr" for every validation row
predicted_adr = ridge_pipe.predict(val_df)

# Predict "is_canceled" as a probability per validation row
will_be_canceled = predict_canceled_proba(logistic_pipe, X=val_df)

# Predict revenue: predicted adr scaled by the probability of NOT being canceled
# NOTE(review): stay length is not factored in here -- confirm this is intended.
predicted_revenue = (1.0 - will_be_canceled) * predicted_adr



In [58]:
# Store the per-row prediction on the validation frame.
# The array is assigned positionally; wrapping it in pd.Series would align on
# index labels and silently produce NaN if val_df did not have a 0..n-1 index.
val_df['predicted_revenue'] = predicted_revenue

In [62]:
# Aggregate by date
daily_df = val_df.groupby(['arrival_date']).agg({'predicted_revenue':'sum'})

In [63]:
# Bin edges for the daily revenue and the rank assigned to each bin
thresholds = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]
ranks = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# Clamp the predictions into the covered range and include the lowest edge, so a
# daily revenue of exactly 0, below 0, or above the top threshold gets the
# nearest rank instead of NaN (which would silently drop out of the error sum).
clipped_revenue = daily_df.predicted_revenue.clip(lower=thresholds[0], upper=thresholds[-1])
daily_df['predicted_label'] = pd.cut(clipped_revenue, bins=thresholds, labels=ranks, include_lowest=True)

In [65]:
# Attach the label table to the per-date predictions, matching on arrival_date.
# The inner join keeps only dates present in both frames.
result_df = daily_df.join(raw_train_label_df.set_index('arrival_date'), how='inner')

In [67]:
# Absolute difference between predicted and true rank for each date.
# predicted_label is categorical, so it is cast to float before subtracting;
# a missing predicted label gives NaN. Done column-wise instead of row by row.
result_df['err'] = (result_df['predicted_label'].astype(float) - result_df['label']).abs()

In [72]:
# Average absolute rank error over all joined dates
# (NaN errors add nothing to the sum but still count in the denominator)
total_error = result_df['err'].sum() / len(result_df)
print(total_error)

1.6354166666666667


In [73]:
# Distribution of the absolute rank errors (how many dates are off by 0, 1, 2, ...)
print(result_df.err.value_counts())

2.0    63
1.0    57
0.0    36
3.0    25
5.0     5
4.0     4
7.0     1
8.0     1
Name: err, dtype: int64
