In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd

In [4]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Load the city-hotel bookings from a local pickle file.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
df_city = pd.read_pickle("cleaned_city_bookings.pkl")

# Preview the first 15 rows.
df_city.head(n=15)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,country_grouped,special_requests_clipped,deposit_paid,got_requested_room_type
40060,City Hotel,0,6,2015,July,27,1,0,2,1,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,6.0,,0,Transient,0.0,0,0,Check-Out,2015-07-03,2015-07-01,PRT,0,0,1
40061,City Hotel,1,88,2015,July,27,1,0,4,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,76.5,0,1,Canceled,2015-07-01,2015-07-01,PRT,1,0,1
40062,City Hotel,1,65,2015,July,27,1,0,4,1,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,68.0,0,1,Canceled,2015-04-30,2015-07-01,PRT,1,0,1
40063,City Hotel,1,92,2015,July,27,1,2,4,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,76.5,0,2,Canceled,2015-06-23,2015-07-01,PRT,2,0,1
40064,City Hotel,1,100,2015,July,27,2,0,2,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,76.5,0,1,Canceled,2015-04-02,2015-07-02,PRT,1,0,1
40065,City Hotel,1,79,2015,July,27,2,0,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,76.5,0,1,Canceled,2015-06-25,2015-07-02,PRT,1,0,1
40066,City Hotel,0,3,2015,July,27,2,0,3,1,0.0,0,HB,PRT,Groups,TA/TO,0,0,0,A,A,1,No Deposit,1.0,,0,Transient-Party,58.67,0,0,Check-Out,2015-07-05,2015-07-02,PRT,0,0,1
40067,City Hotel,1,63,2015,July,27,2,1,3,1,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,68.0,0,0,Canceled,2015-06-25,2015-07-02,PRT,0,0,1
40068,City Hotel,1,62,2015,July,27,2,2,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,8.0,,0,Transient,76.5,0,1,No-Show,2015-07-02,2015-07-02,PRT,1,0,1
40069,City Hotel,1,62,2015,July,27,2,2,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,8.0,,0,Transient,76.5,0,1,No-Show,2015-07-02,2015-07-02,PRT,1,0,1


In [5]:
# Separate the modelling inputs (X) from the label (y).
target = 'is_canceled'

# Columns used as predictors; the order here is the column order of X.
features = [
    'lead_time', 'required_car_parking_spaces', 'special_requests_clipped',
    'market_segment', 'distribution_channel', 'deposit_paid',
    'got_requested_room_type', 'customer_type', 'country_grouped',
]

X = df_city.loc[:, features]
y = df_city.loc[:, target]

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# Columns routed to the one-hot encoder.
ohe_list = [
    'market_segment',
    'distribution_channel',
    'deposit_paid',
    'got_requested_room_type',
    'customer_type',
    'country_grouped',
]

# Columns routed to the standard scaler.
num_list = [
    'lead_time',
    'required_car_parking_spaces',
    'special_requests_clipped',
]

In [8]:
# Categorical branch: one-hot encode, with handle_unknown='ignore' set so that
# categories not seen during fit do not raise at transform time.
cat_transformer = Pipeline(steps=[("ohe", OneHotEncoder(handle_unknown="ignore"))])

# Numerical branch: standardise with StandardScaler.
num_transformer = Pipeline(steps=[("scaling", StandardScaler())])

# Route each column group to its branch; any column not listed in either
# group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_transformer, ohe_list),
        ("num", num_transformer, num_list),
    ],
    remainder="passthrough",
)

# Full pipeline: preprocessing followed by a class-weighted logistic regression.
pipeline_lr = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "model",
            LogisticRegression(
                max_iter=1000,
                random_state=42,
                class_weight="balanced",
            ),
        ),
    ]
)



In [9]:
# Fit the preprocessing + model pipeline on the training split.
pipeline_lr.fit(X_train, y_train)

# Positive-class probability (column 1 of predict_proba) for the test split.
y_test_pred_proba = pipeline_lr.predict_proba(X_test)[:, 1]

# Positive-class probability (column 1 of predict_proba) for the training split.
y_train_pred_proba = pipeline_lr.predict_proba(X_train)[:, 1]

# ROC AUC on the held-out test split.
print("AUC:", roc_auc_score(y_true=y_test, y_score=y_test_pred_proba))

AUC: 0.8782766005860867
