In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import make_scorer, confusion_matrix, log_loss
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [2]:
# Import training data
raw_train_df = pd.read_csv("train.csv")
raw_train_label_df = pd.read_csv('train_label.csv')

In [3]:
# Split into validation set and training set
dates = raw_train_df.arrival_date.unique()
train_indices, val_indices = train_test_split(dates, test_size=0.3)
train_df = raw_train_df.set_index('arrival_date').loc[train_indices, :]
val_df = raw_train_df.set_index('arrival_date').loc[val_indices, :]

In [4]:
train_df.reset_index(drop=False, inplace=True)
val_df.reset_index(drop=False, inplace=True)

In [5]:
# ColumnTransformer

# Transforming the categoric columns
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

# Transforming the numeric columns
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

In [6]:
# Stage 1: Predict ADR by regression
# Determine features
adr_numericCols = ['arrival_date_year', 'lead_time', 'stays', 'stays_in_weekend_nights',\
               'stays_in_week_nights', 'adults', 'children', 'babies',\
               'booking_changes', 'days_in_waiting_list',\
               'required_car_parking_spaces', 'total_of_special_requests'
              ]
adr_categoricCols = ['hotel', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type'
               ]

adr_featureCols = adr_numericCols + adr_categoricCols

In [7]:
adr_train = train_df[adr_featureCols]
adr_val = val_df[adr_featureCols]

In [8]:
# Combining both categorical and numerical column transformations
adr_ct = ColumnTransformer(transformers=[('cat', cat_pipe, adr_categoricCols), ('num', num_pipe, adr_numericCols)])
adr_train_transformed = adr_ct.fit_transform(adr_train)

In [9]:
# Retrieving the feature names
cat_transformers = [('cat', cat_pipe, adr_categoricCols)]
cat_ct = ColumnTransformer(transformers=cat_transformers)
train_cat_transformed = cat_ct.fit_transform(train_df)

num_transformers = [('num', num_pipe, adr_numericCols)]
num_ct = ColumnTransformer(transformers=num_transformers)
train_num_transformed = num_ct.fit_transform(train_df)

cat_pl = cat_ct.named_transformers_['cat']
ohe = cat_pl.named_steps['ohe']
transformed_feature_names = list(ohe.get_feature_names()) + adr_numericCols
print("Total number of features = ", len(transformed_feature_names))

Total number of features =  304


In [10]:
# feature selection
adr_sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
adr_svd = TruncatedSVD(n_components=10)

In [11]:
# Linear regression
ridge = Ridge(alpha=1.0/(2*20), max_iter=10000) # alpha = 1 / 2C

In [56]:
# feature transformation
poly = PolynomialFeatures(2)

In [13]:
# Build pipeline
ridge_pipe = Pipeline([('adr_transform', adr_ct), ('ridge', ridge)])

In [18]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61670 entries, 0 to 61669
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   arrival_date                    61670 non-null  object 
 1   Unnamed: 0                      61670 non-null  int64  
 2   Unnamed: 0.1                    61670 non-null  int64  
 3   ID                              61670 non-null  int64  
 4   hotel                           61670 non-null  object 
 5   is_canceled                     61670 non-null  int64  
 6   lead_time                       61670 non-null  int64  
 7   arrival_date_year               61670 non-null  int64  
 8   arrival_date_month              61670 non-null  object 
 9   arrival_date_week_number        61670 non-null  int64  
 10  arrival_date_day_of_month       61670 non-null  int64  
 11  stays_in_weekend_nights         61670 non-null  int64  
 12  stays_in_week_nights            

In [14]:
# Execute pipeline
ridge_pipe.fit(adr_train, train_df['adr_pp'])
ridge_pipe.score(adr_train, train_df['adr_pp'])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [14]:
# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1126)

# Selecting parameters when Grid Searching
param_grid = {
    'adr_transform__num__si__strategy': ['median', 'mean'],
    'adr_svd__n_components': [20, 40, 60],
    'ridge__alpha': [1.0/(2 * 10), 1.0/(2 * 30), 1.0/(2 * 50), 1.0/(2 * 70)]
}

gs = GridSearchCV(ridge_pipe, param_grid, cv=kf)
gs.fit(adr_train, train_df['adr'])

print(gs.best_params_)
print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

ValueError: Invalid parameter transform for estimator Pipeline(steps=[('adr_transform',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('si',
                                                                   SimpleImputer(fill_value='MISSING',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['hotel',
                                                   'arrival_date_month',
                                                   'arrival_date_week_number',
                                                   'arrival_date_day_of_month',
                                                   'meal', 'country',
                                                   'market_segment',
                                                   'distribution_channel',
                                                   'is_repe...
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['lead_time', 'stays',
                                                   'stays_in_weekend_nights',
                                                   'stays_in_week_nights',
                                                   'adults', 'children',
                                                   'babies', 'booking_changes',
                                                   'days_in_waiting_list',
                                                   'required_car_parking_spaces',
                                                   'total_of_special_requests'])])),
                ('adr_svd', TruncatedSVD(n_components=50)),
                ('ridge', Ridge(alpha=0.025, max_iter=10000))]). Check the list of available parameters with `estimator.get_params().keys()`.