In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [4]:
# Remove the target variable from the training set
label = train_df.pop('is_canceled').values

In [5]:
# transfer to a 2d array
country_train = train_df[['country']].copy()

In [7]:
# set pipeline
si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
steps = [si_step, ohe_step]
pipe = Pipeline(steps)

country_train_transformed = pipe.fit_transform(country_train)
country_test_transformed = pipe.transform(country_train)

In [8]:
numericCols = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',\
               'adults', 'children', 'babies', 'previous_cancellations',\
               'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',\
               'adr', 'required_car_parking_spaces', 'total_of_special_requests'
              ]
categoryCols = ['hotel', 'arrival_date_year', 'arrival_date_month',\
                'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',\
                'country', 'market_segment', 'distribution_channel',\
                'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',\
                'deposit_type', 'customer_type'
               ]
featureCols = numericCols + categoryCols

In [9]:
category_train = train_df[categoryCols]
category_train_transformed = pipe.fit_transform(category_train)

In [10]:
# Get individual pieces of pipeline
ohe = pipe.named_steps['ohe']

ohe.get_feature_names()

array(['x0_City Hotel', 'x0_Resort Hotel', 'x1_2015', 'x1_2016',
       'x1_2017', 'x2_April', 'x2_August', 'x2_December', 'x2_February',
       'x2_January', 'x2_July', 'x2_June', 'x2_March', 'x2_May',
       'x2_November', 'x2_October', 'x2_September', 'x3_1', 'x3_2',
       'x3_3', 'x3_4', 'x3_5', 'x3_6', 'x3_7', 'x3_8', 'x3_9', 'x3_10',
       'x3_11', 'x3_12', 'x3_13', 'x3_14', 'x3_15', 'x3_16', 'x3_17',
       'x3_18', 'x3_19', 'x3_20', 'x3_21', 'x3_22', 'x3_23', 'x3_24',
       'x3_25', 'x3_26', 'x3_27', 'x3_28', 'x3_29', 'x3_30', 'x3_31',
       'x3_32', 'x3_33', 'x3_34', 'x3_35', 'x3_36', 'x3_37', 'x3_38',
       'x3_39', 'x3_40', 'x3_41', 'x3_42', 'x3_43', 'x3_44', 'x3_45',
       'x3_46', 'x3_47', 'x3_48', 'x3_49', 'x3_50', 'x3_51', 'x3_52',
       'x3_53', 'x4_1', 'x4_2', 'x4_3', 'x4_4', 'x4_5', 'x4_6', 'x4_7',
       'x4_8', 'x4_9', 'x4_10', 'x4_11', 'x4_12', 'x4_13', 'x4_14',
       'x4_15', 'x4_16', 'x4_17', 'x4_18', 'x4_19', 'x4_20', 'x4_21',
       'x4_22', 'x4_23', 'x

In [11]:
# ColumnTransformer

# Transforming the categoric columns
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

cat_transformers = [('cat', cat_pipe, categoryCols)]
ct = ColumnTransformer(transformers=cat_transformers)

train_cat_transformed = ct.fit_transform(train_df)

In [12]:
# Retrieving the feature names
pl = ct.named_transformers_['cat']
ohe = pl.named_steps['ohe']
ohe.get_feature_names()

array(['x0_City Hotel', 'x0_Resort Hotel', 'x1_2015', 'x1_2016',
       'x1_2017', 'x2_April', 'x2_August', 'x2_December', 'x2_February',
       'x2_January', 'x2_July', 'x2_June', 'x2_March', 'x2_May',
       'x2_November', 'x2_October', 'x2_September', 'x3_1', 'x3_2',
       'x3_3', 'x3_4', 'x3_5', 'x3_6', 'x3_7', 'x3_8', 'x3_9', 'x3_10',
       'x3_11', 'x3_12', 'x3_13', 'x3_14', 'x3_15', 'x3_16', 'x3_17',
       'x3_18', 'x3_19', 'x3_20', 'x3_21', 'x3_22', 'x3_23', 'x3_24',
       'x3_25', 'x3_26', 'x3_27', 'x3_28', 'x3_29', 'x3_30', 'x3_31',
       'x3_32', 'x3_33', 'x3_34', 'x3_35', 'x3_36', 'x3_37', 'x3_38',
       'x3_39', 'x3_40', 'x3_41', 'x3_42', 'x3_43', 'x3_44', 'x3_45',
       'x3_46', 'x3_47', 'x3_48', 'x3_49', 'x3_50', 'x3_51', 'x3_52',
       'x3_53', 'x4_1', 'x4_2', 'x4_3', 'x4_4', 'x4_5', 'x4_6', 'x4_7',
       'x4_8', 'x4_9', 'x4_10', 'x4_11', 'x4_12', 'x4_13', 'x4_14',
       'x4_15', 'x4_16', 'x4_17', 'x4_18', 'x4_19', 'x4_20', 'x4_21',
       'x4_22', 'x4_23', 'x

In [13]:
# Transforming the numeric columns
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)

num_transformers = [('num', num_pipe, numericCols)]
ct = ColumnTransformer(transformers=num_transformers)

train_num_transformed = ct.fit_transform(train_df)

In [14]:
# Combining both categorical and numerical column transformations
ct = ColumnTransformer(transformers=[('cat', cat_pipe, categoryCols), ('num', num_pipe, numericCols)])
train_transformed = ct.fit_transform(train_df)

In [15]:
# SVD
svd = TruncatedSVD(n_components=3)

In [16]:
# Logistic regression
lr = LogisticRegression(penalty='l2', C=1.0)

ml_pipe = Pipeline([('transform', ct), ('svd', svd), ('lr', lr)])

ml_pipe.fit(train_df, label)
ml_pipe.score(train_df, label)

0.7171668615004753

In [19]:
# Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1126)
cross_val_score(ml_pipe, train_df, label, cv=kf).mean()


# Selecting parameters when Grid Searching
param_grid = {
    'transform__num__si__strategy': ['median'],
    'svd__n_components': [60, 70],
    'lr__penalty': ['l2'],
    'lr__C': [75.0]
}

gs = GridSearchCV(ml_pipe, param_grid, cv=kf)
gs.fit(train_df, label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=KFold(n_splits=5, random_state=1126, shuffle=True),
             estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('si',
                                                                                          SimpleImputer(fill_value='MISSING',
                                                                                                        strategy='constant')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['hotel',
                                                                          'arrival_date_year',
                                          

In [20]:
print(gs.best_params_)

print(gs.best_score_)

# Getting all the grid search results in a Pandas DataFrame
print(pd.DataFrame(gs.cv_results_))

{'lr__C': 75.0, 'lr__penalty': 'l2', 'svd__n_components': 60, 'transform__num__si__strategy': 'median'}
0.8225081930052852
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0       4.165994      0.046248         0.111351        0.001417          75   
1       4.957987      0.173773         0.132769        0.025136          75   

  param_lr__penalty param_svd__n_components  \
0                l2                      50   
1                l2                      60   

  param_transform__num__si__strategy  \
0                             median   
1                             median   

                                              params  split0_test_score  \
0  {'lr__C': 75.0, 'lr__penalty': 'l2', 'svd__n_c...           0.823237   
1  {'lr__C': 75.0, 'lr__penalty': 'l2', 'svd__n_c...           0.825094   

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0           0.815962           0.821097           0.824648           