### In this notebook, we detail all of the features that we engineered, with explanations of why we believed each feature could be useful. Not all of the features shown in this notebook were used in our final models, as some lowered our AUC score, which was the metric we aimed to optimize.

In [51]:
import pandas as pd
import datetime as dt
import numpy as np
import holidays as holiday_lib
#conda install country_converter
import country_converter as cc

In [52]:
# Load the raw bookings and remove the `is_canceled` column from the working frame.
df = pd.read_csv('hotel_bookings.csv').drop(columns=['is_canceled'])

# Print, for every remaining column, the fraction of rows whose value is missing.
# The report must read from `df` (the frame loaded above) so the cell runs on a fresh kernel.
for x in df.columns:
    print(x, df[x].isna().sum() / df.shape[0])

hotel 0.0
lead_time 0.0
arrival_date_year 0.0
arrival_date_month 0.0
arrival_date_week_number 0.0
arrival_date_day_of_month 0.0
stays_in_weekend_nights 0.0
stays_in_week_nights 0.0
adults 0.0
children 3.350364352123293e-05
babies 0.0
meal 0.0
country 0.004087444509590418
market_segment 0.0
distribution_channel 0.0
is_repeated_guest 0.0
previous_cancellations 0.0
previous_bookings_not_canceled 0.0
reserved_room_type 0.0
assigned_room_type 0.0
booking_changes 0.0
deposit_type 0.0
agent 0.13686238378423654
company 0.943068933746545
days_in_waiting_list 0.0
customer_type 0.0
adr 0.0
required_car_parking_spaces 0.0
total_of_special_requests 0.0
reservation_status 0.0
reservation_status_date 0.0


In [53]:
# Quick look at the `hotel` column (displayed as the cell's output).
df['hotel']

0         Resort Hotel
1         Resort Hotel
2         Resort Hotel
3         Resort Hotel
4         Resort Hotel
              ...     
119385      City Hotel
119386      City Hotel
119387      City Hotel
119388      City Hotel
119389      City Hotel
Name: hotel, Length: 119390, dtype: object

In [54]:
# Cleaning

# A missing `children` count is read as "no children on the booking", so fill it with 0.
df.loc[df['children'].isna(), 'children'] = 0

# The dataset description states that 'Undefined' means the same as 'SC',
# so both labels are folded into 'SC'.
df['meal'] = df['meal'].replace('Undefined', 'SC')

# Bookings without a recorded country are discarded (about 0.4% of the rows).
df.dropna(subset=['country'], inplace=True)

In [55]:
# Build one `arrival_date` timestamp out of the separate year, month-name and
# day-of-month columns. The three pieces are glued together without separators
# (e.g. "2015July1") and parsed with the matching '%Y%B%d' format.
year_text = df['arrival_date_year'].astype(str)
month_text = df['arrival_date_month'].astype(str)
day_text = df['arrival_date_day_of_month'].astype(str)
df['arrival_date'] = pd.to_datetime(year_text + month_text + day_text, format='%Y%B%d')

In [56]:
# 'season' feature: cancellations may cluster in particular parts of the year
# (possibly weather driven). Months 3-5 -> Spring, 6-8 -> Summer, 9-11 -> Fall;
# anything else falls back to Winter.
season_by_month = {
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}
df['season'] = df['arrival_date'].dt.month.map(lambda month: season_by_month.get(month, 'Winter'))

In [57]:
# Total length of stay = weekend nights + week nights.
df['stay_length'] = df['stays_in_weekend_nights'].add(df['stays_in_week_nights'])

In [58]:
# 'last_minute' flag: 1 when the booking's lead_time is below 5 (i.e. 0-4), otherwise 0.
df['last_minute'] = np.where(df['lead_time'] < 5, 1, 0)

In [59]:
# Name of the weekday on which the guest was scheduled to arrive. Arrival on
# particular weekdays vs. weekends might separate business from leisure
# travellers, who may cancel at different rates.
arrival_dates = df['arrival_date']
df['day_of_week_of_arrival'] = arrival_dates.dt.day_name()

In [60]:
# Replace the month name with its number (1-12).
# The number is taken from the already-parsed `arrival_date` column instead of
# re-parsing `arrival_date_month` with '%B': the values are the same, but this
# form can be re-run safely, whereas parsing would fail once the column has
# already been overwritten with numbers.
df['arrival_date_month'] = df['arrival_date'].dt.month

In [61]:
# 1 when the assigned room type equals the reserved (requested) room type,
# 0 when the guest was assigned a different type.
got_requested_room = df['reserved_room_type'].eq(df['assigned_room_type'])
df['room_type_requested_and_received'] = got_requested_room.astype(int)

In [62]:
# Window around each arrival: one week before and one week after the arrival date.
one_week = pd.Timedelta(weeks=1)
df['range_before'] = df['arrival_date'] - one_week
df['range_after'] = df['arrival_date'] + one_week

# Public holidays in Portugal for the years passed below, via the holidays package.
# `holidates` keeps the plain dates; `holiday_stamps` holds the same dates as
# pandas Timestamps, because ordering comparisons between datetime.date objects
# and Timestamps are rejected by recent pandas versions.
holidays = holiday_lib.PT(years=[2015, 2016, 2017])
holidates = list(holidays.keys())
holiday_stamps = pd.to_datetime(holidates)

# A booking is flagged when any holiday lies strictly inside its window
# (range_before < holiday < range_after): guests may arrive a few days ahead of
# a holiday, or travel using time off right after one.
near_holiday = np.zeros(len(df), dtype=bool)
for stamp in holiday_stamps:
    inside_window = (df['range_before'] < stamp) & (df['range_after'] > stamp)
    near_holiday |= inside_window.to_numpy()

# Stored as 1/0 rather than True/False.
df['is_holiday'] = near_holiday.astype(int)

In [63]:
# Map each booking's country code to a continent with the country_converter package.
converter = cc.CountryConverter()

# Convert every distinct code once and look the result up per row, instead of
# calling the converter for each booking. Codes the converter does not
# recognise become 'missing'.
country_codes = df['country'].astype(str)
continent_by_code = {
    code: converter.convert(names=code, to='continent', not_found='missing')
    for code in country_codes.unique()
}
df['visitor_continent'] = country_codes.map(continent_by_code)

# A couple of bookings resolve to Antarctica; they look like data errors, so
# they are treated as 'missing' too.
df['visitor_continent'] = df['visitor_continent'].map(lambda x: "missing" if x == 'Antarctica' else x)

TMP not found in ISO3
TMP not found in ISO3
TMP not found in ISO3


In [64]:
# Indicator bins for time spent on the waiting list. Quantile bins were not
# suitable: the median is 0.0 while the maximum is 391.
# The bins are mutually exclusive and cover every non-negative value:
#   no_wait             -> 0 days
#   one_week_wait       -> 1-7 days
#   week_to_month_wait  -> 8-30 days
#   month_plus_wait     -> 31 days or more (a 31-day wait must land here, not in no bin)
wait_days = df['days_in_waiting_list']
df['no_wait'] = np.where(wait_days == 0, 1, 0)
df['one_week_wait'] = np.where((wait_days > 0) & (wait_days < 8), 1, 0)
df['week_to_month_wait'] = np.where((wait_days > 7) & (wait_days < 31), 1, 0)
df['month_plus_wait'] = np.where(wait_days > 30, 1, 0)

In [65]:
# Two count columns with similar value distributions get the same three
# indicator bins: "<col>_none" (exactly 0), "<col>_max5" (above 0 and below 6),
# "<col>_many" (above 5).
binn = ['booking_changes', 'previous_cancellations']

for column in binn:
    counts = df[column]
    df[column + "_none"] = np.where(counts == 0, 1, 0)
    df[column + '_max5'] = np.where((counts > 0) & (counts < 6), 1, 0)
    df[column + '_many'] = np.where(counts > 5, 1, 0)

In [66]:
# Categorical columns to one-hot encode. Each listed column is replaced by its
# indicator columns, named "<column>_<value>" and appended after the remaining
# columns in the order given here.
encode = ['hotel', 'market_segment', 'deposit_type', 'customer_type', 'day_of_week_of_arrival',
          'distribution_channel', 'meal', 'season', 'visitor_continent']

df = pd.get_dummies(df, columns=encode, prefix=encode)