# Machine Learning in Python - Project 1

Due Friday, April 9th by 5 pm UK local time.

*include contributors names here*

## 0. Setup

In [123]:
# Install required packages
!pip install -q -r requirements.txt

In [124]:
# Add any additional libraries or submodules below

# Display plots inline
%matplotlib inline

# Data libraries
import pandas as pd
import numpy as np

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting defaults
plt.rcParams['figure.figsize'] = (8,5)
plt.rcParams['figure.dpi'] = 80

# sklearn modules
import sklearn

In [125]:
# Load data
d = pd.read_csv("hotel.csv")



In [126]:
d.head()

Unnamed: 0,is_canceled,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,0,Resort Hotel,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,,,0,Transient,0.0,0,0
1,0,Resort Hotel,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,,,0,Transient,0.0,0,0
2,0,Resort Hotel,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,,,0,Transient,75.0,0,0
3,0,Resort Hotel,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,304.0,,0,Transient,75.0,0,0
4,0,Resort Hotel,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,240.0,,0,Transient,98.0,0,1


In [127]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 30 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   is_canceled                     119390 non-null  int64  
 1   hotel                           119390 non-null  object 
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [128]:
perc_missing_data = pd.DataFrame([d.isnull().sum(),d.isnull().sum()*100.0/d.shape[0]]).T
perc_missing_data.columns = ['No. of Missing Data', '% Missing Data']
perc_missing_data

Unnamed: 0,No. of Missing Data,% Missing Data
is_canceled,0.0,0.0
hotel,0.0,0.0
lead_time,0.0,0.0
arrival_date_year,0.0,0.0
arrival_date_month,0.0,0.0
arrival_date_week_number,0.0,0.0
arrival_date_day_of_month,0.0,0.0
stays_in_weekend_nights,0.0,0.0
stays_in_week_nights,0.0,0.0
adults,0.0,0.0


In [129]:
d['deposit_type'].value_counts()

No Deposit    104641
Non Refund     14587
Refundable       162
Name: deposit_type, dtype: int64

In [130]:


d['children'].value_counts()

0.0     110796
1.0       4861
2.0       3652
3.0         76
10.0         1
Name: children, dtype: int64

In [131]:
d['children'].fillna(0,inplace=True)

In [132]:
perc_country_data = pd.DataFrame([d['country'].value_counts(),d['country'].value_counts()*100/d.shape[0]]).T
perc_country_data.columns = ['Count', '% Distribution']
perc_country_data

Unnamed: 0,Count,% Distribution
PRT,48590.0,40.698551
GBR,12129.0,10.159142
FRA,10415.0,8.723511
ESP,8568.0,7.176480
DEU,7287.0,6.103526
...,...,...
AIA,1.0,0.000838
NIC,1.0,0.000838
ATF,1.0,0.000838
DMA,1.0,0.000838


In [133]:
d['country'].fillna('PRT',inplace=True)

In [134]:
d.drop(['agent','company'],axis=1,inplace=True)

In [135]:
perc_missing_data = pd.DataFrame([d.isnull().sum(),d.isnull().sum()*100.0/d.shape[0]]).T
perc_missing_data.columns = ['No. of Missing Data', '% Missing Data']
perc_missing_data

Unnamed: 0,No. of Missing Data,% Missing Data
is_canceled,0.0,0.0
hotel,0.0,0.0
lead_time,0.0,0.0
arrival_date_year,0.0,0.0
arrival_date_month,0.0,0.0
arrival_date_week_number,0.0,0.0
arrival_date_day_of_month,0.0,0.0
stays_in_weekend_nights,0.0,0.0
stays_in_week_nights,0.0,0.0
adults,0.0,0.0


In [136]:
d = d.drop(['meal', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'country', 'reserved_room_type', 'assigned_room_type','children','babies','required_car_parking_spaces'],axis=1)



In [137]:
d = pd.concat([d, 
                 pd.get_dummies(d['hotel'], drop_first=True), 
                 pd.get_dummies(d['arrival_date_month'], drop_first=True), 
                 pd.get_dummies(d['market_segment'], drop_first=True),
                 pd.get_dummies(d['distribution_channel'], drop_first=True),
                 pd.get_dummies(d['customer_type'], drop_first=True),
                 pd.get_dummies(d['deposit_type'], drop_first=True)
                 ], axis=1)
d = d.drop(['hotel','arrival_date_month','market_segment','distribution_channel','customer_type','deposit_type'], axis=1)

In [138]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 40 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   is_canceled                     119390 non-null  int64  
 1   lead_time                       119390 non-null  int64  
 2   stays_in_weekend_nights         119390 non-null  int64  
 3   stays_in_week_nights            119390 non-null  int64  
 4   adults                          119390 non-null  int64  
 5   is_repeated_guest               119390 non-null  int64  
 6   previous_cancellations          119390 non-null  int64  
 7   previous_bookings_not_canceled  119390 non-null  int64  
 8   booking_changes                 119390 non-null  int64  
 9   days_in_waiting_list            119390 non-null  int64  
 10  adr                             119390 non-null  float64
 11  total_of_special_requests       119390 non-null  int64  
 12  Resort Hotel    

## MODELS

In [139]:
X = d.iloc[:, 1:].values
y = d.iloc[:, 0].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [140]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [141]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
classifier = LogisticRegression(random_state = 0, max_iter=250)
classifier.fit(X_train, y_train)

# Predict
y_pred = classifier.predict(X_test)

# Computing accuracy

sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_pred)

0.7559358598824693

In [142]:
from sklearn import tree


clf = tree.DecisionTreeClassifier(max_depth=25)
clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_pred)

0.8008256303884976

In [144]:

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

base_models = [("DT_model", tree.DecisionTreeClassifier(random_state=0)),
               ("RF_model", RandomForestClassifier(random_state=0,n_jobs=-1)),
               ("LR_model", LogisticRegression(random_state=0,n_jobs=-1))]

# split data into 'kfolds' parts for cross validation,
# use shuffle to ensure random distribution of data:
kfolds = 5 
split = KFold(n_splits=kfolds, shuffle=True, random_state=0)

# Preprocessing, fitting, making predictions and scoring for every model:
for name, model in base_models:
    # pack preprocessing of data and the model in a pipeline:
    model_steps = Pipeline(steps=[
                              ('model', model)])
    
    # get cross validation score for each model:
    cv_results = cross_val_score(model_steps, 
                                 X_train, y_train, 
                                 cv=split,
                                 scoring="roc_auc",
                                 n_jobs=-1)
    # output:
    min_score = round(min(cv_results), 4)
    max_score = round(max(cv_results), 4)
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    print(f"{name} cross validation accuarcy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

DT_model cross validation accuarcy score: 0.7905 +/- 0.0015 (std) min: 0.7883, max: 0.7926
RF_model cross validation accuarcy score: 0.9105 +/- 0.0015 (std) min: 0.9091, max: 0.9127
LR_model cross validation accuarcy score: 0.8365 +/- 0.0029 (std) min: 0.8334, max: 0.8408


## 1. Introduction

*This section should include a brief introduction to the task and the data (assume this is a report you are delivering to a client). If you use any additional data sources, you should introduce them here and discuss why they were included.*

*Briefly outline the approaches being used and the conclusions that you are able to draw.*

## 2. Exploratory Data Analysis and Feature Engineering

*Include a detailed discussion of the data with a particular emphasis on the features of the data that are relevant for the subsequent modeling. Including visualizations of the data is strongly encouraged - all code and plots must also be described in the write up. Think carefully about whether each plot needs to be included in your final draft - your report should include figures but they should be as focused and impactful as possible.*

*Additionally, this section should also implement and describe any preprocessing / feature engineering of the data. Specifically, this should be any code that you use to generate new columns in the data frame `d`. All of this processing is explicitly meant to occur before we split the data in to training and testing subsets. Processing that will be performed as part of an sklearn pipeline can be mentioned here but should be implemented in the following section.*

*All code and figures should be accompanied by text that provides an overview / context to what is being done or presented.*

## 3. Model Fitting and Tuning

*In this section you should detail your choice of model and describe the process used to refine and fit that model. You are strongly encouraged to explore many different modeling methods (e.g. logistic regression, classification trees, SVC, etc.) but you should not include a detailed narrative of all of these attempts. At most this section should mention the methods explored and why they were rejected - most of your effort should go into describing the model you are using and your process for tuning and validatin it.*

*This section should also include the full implementation of your final model, including all necessary validation. As with figures, any included code must also be addressed in the text of the document.*

## 4. Discussion & Conclusions


*In this section you should provide a general overview of your final model, its performance, and reliability. You should discuss what the implications of your model are in terms of the included features, predictive performance, and anything else you think is relevant.*

*This should be written with a target audience of the client who is with the hotel data and university level mathematics but not necessarily someone who has taken a postgraduate statistical modeling course. Your goal should be to convince this audience that your model is both accurate and useful.*

*Keep in mind that a negative result, i.e. a model that does not work well predictively, that is well explained and justified in terms of why it failed will likely receive higher marks than a model with strong predictive performance but with poor or incorrect explinations / justifications.*

## 5. Convert Document

In [None]:
# Run the following to render to PDF
!jupyter nbconvert --to pdf proj2.ipynb