In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd

In [7]:
# Show every column when a DataFrame is rendered (the bookings frame is wide).
pd.set_option("display.max_columns", None)

# Read the bookings frame from the pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df_resort = pd.read_pickle("cleaned_resort_bookings.pkl")

# Preview the first rows as the cell's output.
df_resort.head(15)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,country_grouped,lead_time_scaled,required_car_parking_spaces_scaled,special_requests_clipped,special_requests_scaled,deposit_paid,got_requested_room_type
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-07-01,PRT,2.280972,-0.256841,0,-0.728703,0,1
1,Resort Hotel,0,444,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-07-01,PRT,3.256111,-0.256841,0,-0.728703,0,1
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,GBR,-0.92169,-0.256841,0,-0.728703,0,0
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,GBR,-0.864328,-0.256841,0,-0.728703,0,1
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,GBR,-0.854768,-0.256841,1,0.554576,0,1
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,GBR,-0.854768,-0.256841,1,0.554576,0,1
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,0,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03,2015-07-01,PRT,-0.988611,-0.256841,0,-0.728703,0,1
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,0.0,0,FB,PRT,Direct,Direct,0,0,0,C,C,0,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03,2015-07-01,PRT,-0.902569,-0.256841,1,0.554576,0,1
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06,2015-07-01,PRT,-0.175995,-0.256841,1,0.554576,0,1
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22,2015-07-01,PRT,-0.271597,-0.256841,0,-0.728703,0,1


In [8]:
# Select the model inputs (X) and the label (y) from the loaded frame.
# NOTE(review): the "*_scaled" columns arrive already scaled in the loaded
# frame; if that scaler was fitted on all rows, the later test split is not
# fully unseen -- confirm against the upstream cleaning step.
features = [
    "lead_time_scaled",
    "required_car_parking_spaces_scaled",
    "special_requests_scaled",
    "market_segment",
    "distribution_channel",
    "deposit_paid",
    "got_requested_room_type",
    "customer_type",
    "country_grouped",
]
target = "is_canceled"

# X keeps the columns in the order given by `features`; y is the label column.
X, y = df_resort[features], df_resort[target]

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Columns handed to the OneHotEncoder inside the ColumnTransformer.
ohe_list = ['market_segment', 'distribution_channel', 'deposit_paid',
            'got_requested_room_type', 'customer_type', 'country_grouped']

# Unscaled numeric columns, only needed if scaling is moved into the pipeline
# (in that case `features` must list these raw columns instead of the
# "*_scaled" ones). Currently not referenced by any active code.
# The loaded frame has no 'special_requests' column, so the previous entry
# would make the ColumnTransformer fail; 'special_requests_clipped' is
# presumably the source of 'special_requests_scaled' -- TODO confirm.
num_list = ['lead_time', 'required_car_parking_spaces', 'special_requests_clipped']

In [13]:
# One-hot encode the categorical columns. No category is dropped, and
# categories that were not seen during fit are ignored instead of raising.
cat_transformer = Pipeline(
    steps=[("ohe", OneHotEncoder(handle_unknown="ignore"))]
)

# Optional in-pipeline scaling (currently disabled):
# num_transformer = Pipeline(steps=[("scaling", StandardScaler())])
# To scale here, add ("num", num_transformer, num_list) to `transformers` below.

# Encode the columns in ohe_list; all other columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("cat", cat_transformer, ohe_list)],
    remainder="passthrough",
)

# Final pipeline: preprocessing followed by a class-weighted logistic regression.
pipeline_lr = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "model",
            LogisticRegression(
                max_iter=1000,
                random_state=42,
                class_weight="balanced",
            ),
        ),
    ]
)


In [14]:
# Fit the whole pipeline (preprocessing + logistic regression) on the training split.
pipeline_lr.fit(X_train, y_train)

# Predicted probability of the second class (column 1) for the test split.
y_test_pred_proba = pipeline_lr.predict_proba(X_test)[:, 1]

# Same probability for the training split (not used further in this cell).
y_train_pred_proba = pipeline_lr.predict_proba(X_train)[:, 1]

# ROC AUC on the held-out test split.
print("AUC:", roc_auc_score(y_test, y_test_pred_proba))

AUC: 0.8909115000223127
