# Data Preparation Part 1

###  Import Libraries and Alias

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from dataframe_column_identifier import DataFrameColumnIdentifier
from sklearn.metrics import accuracy_score 
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import time

### Data Initiation

#### Data Cleansing and preprocessing

#### New Features

In [4]:
# Data Init
# Read the raw bookings table from the project repository (requires network access).
url="https://raw.githubusercontent.com/bensonnd/msds/master/ds7331/data/hotel_bookings.csv"
hotel = pd.read_csv(url)

# Data Cleaning Feature Engineering
# converting arrival y/m/d columns to a singular arrival date column.
# The "<year>-<month>-<day>" strings are assembled column-wise instead of with a
# per-row Python loop; the text handed to pd.to_datetime is the same as before.
arrival_parts = hotel[['arrival_date_year',
                       'arrival_date_month',
                       'arrival_date_day_of_month']].astype(str)
hotel['arrival_date'] = pd.to_datetime(
    arrival_parts['arrival_date_year'] + '-'
    + arrival_parts['arrival_date_month'] + '-'
    + arrival_parts['arrival_date_day_of_month']
)

# source:
# https://stackoverflow.com/questions/54487059/pandas-how-to-create-a-single-date-column-from-columns-containing-year-month

# add `departure_date` and `length_of_stay`
# length_of_stay = weekend nights + week nights
hotel['length_of_stay'] = hotel["stays_in_weekend_nights"] + hotel["stays_in_week_nights"]

# set `length_of_stay` as a pandas time delta
# np.ceil rounds up to whole days (as before); pd.to_timedelta converts the whole
# column in one call instead of building one pd.Timedelta object per row.
length = pd.to_timedelta(np.ceil(hotel['length_of_stay']), unit='D')

# departure = arrival + number of nights stayed
hotel['departure_date'] = hotel['arrival_date'] + length

# source:
# https://stackoverflow.com/questions/42768649/add-days-to-date-in-pandas


# Adding total_revenue column to the data frame
# (|adr| * nights, so a negative adr cannot yield negative revenue)
hotel["total_revenue"] = hotel["adr"].abs() * hotel["length_of_stay"]

# Adding the cancelation rate by country
# NOTE(review): this rate is derived from is_canceled over every row of `hotel`,
# i.e. before any train/test split — confirm that is acceptable when is_canceled
# is used as the prediction target.
hotel['is_canceled_int'] = pd.to_numeric(hotel['is_canceled'])

# per-country number of bookings and number of cancelations
cancels_by_country = hotel.groupby(['country'])['is_canceled_int']
contry_cancellation_rate_df = cancels_by_country.count().to_frame(name='country_count')
contry_cancellation_rate_df['cancelations'] = cancels_by_country.sum()

# share of a country's bookings that were canceled
contry_cancellation_rate_df['country_cancelation_rate'] = (
    contry_cancellation_rate_df['cancelations']
    .div(contry_cancellation_rate_df['country_count'])
)

# attach the per-country columns to each booking through its country value
hotel = hotel.join(contry_cancellation_rate_df, on='country')

total_cancelations = hotel['is_canceled_int'].sum()

# drop the helper columns; only country_cancelation_rate stays on `hotel`
hotel = hotel.drop(columns=['country_count','cancelations','is_canceled_int'])

# Changing Stays in Weeknights and Weekend nights to Boolean
# <column>_cat is 1 when the booking has more than 0 nights of that kind, else 0
for nights_col in ('stays_in_week_nights', 'stays_in_weekend_nights'):
    hotel[nights_col + '_cat'] = (hotel[nights_col] > 0).astype(int)

# Dropping redundant columns
# (the three arrival date-part columns)
hotel = hotel.drop(columns=['arrival_date_year','arrival_date_month','arrival_date_day_of_month'])

# Finding missing values
# (bare expression in the middle of the cell: it is evaluated but not displayed)
hotel.columns[hotel.isna().any()].tolist()

# Columns whose missing values become the label "Unknown" (categorical) ...
cat_cols = ["country","agent","company"]
# ... and columns whose missing values become 0 (continuous)
con_cols = ["children", "country_cancelation_rate"]

# Replacing missing values for categorical to "Unknown"
hotel[cat_cols] = hotel[cat_cols].fillna("Unknown")

# Replacing missing values for continuous to 0
hotel[con_cols] = hotel[con_cols].fillna(0)

# Source:
# https://stackoverflow.com/questions/45416684/python-pandas-replace-multiple-columns-zero-to-nan

# converting these columns to string type
hotel[cat_cols] = hotel[cat_cols].astype(str)

#make all adr values positive. (only one is actually negative.)
hotel['adr'] = np.abs(hotel['adr'])

# list of continuous attributes
hotel_continuous = [
    'lead_time', 'arrival_date_week_number', 'stays_in_weekend_nights', 'stays_in_week_nights', 'length_of_stay',
    'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr',
    'required_car_parking_spaces', 'total_of_special_requests', 'adults', 'children', 'babies', 'total_revenue',
    'country_cancelation_rate',
]

# hotel df of continuous variables in the data set
hotelCont = hotel.loc[:, hotel_continuous]

# Mean Normalization of the Continuous Variables: (value - column mean) / column std
# - still contains large outliers pictured in graphs below
cont_center = hotelCont.mean()
cont_spread = hotelCont.std()
hotelCont_mean_normed = hotelCont.sub(cont_center).div(cont_spread)

# list of categorical attributes
hotel_categoricals = [
    'hotel', 'is_canceled', 'deposit_type', 'agent', 'company', 'customer_type',
    'reservation_status', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest',
    'reserved_room_type', 'assigned_room_type', 'stays_in_week_nights_cat', 'stays_in_weekend_nights_cat',
]

# setting categoricals as that type (one astype call with a per-column mapping).
hotel = hotel.astype({cat_name: 'category' for cat_name in hotel_categoricals})

# hotel df of categorical variables
hotelCats = hotel.loc[:, hotel_categoricals]


# converting reservation_status_date to datetime
hotel['reservation_status_date'] = pd.to_datetime(hotel['reservation_status_date'])


# hotel df of datetime variables (every datetime64 column currently on `hotel`)
hotelDates = hotel.select_dtypes(include=['datetime64'])

# Removing outliers greater than 5
# A row is kept only if every normalized continuous value lies strictly within +/-5.
within_limits = (hotelCont_mean_normed.abs() < 5).all(axis=1)
hotel_nol = hotelCont_mean_normed[within_limits]


# Grabbing indices of the non-outlier rows
# (kept as a one-column frame named "indices", as before)
no_outlier_indices = pd.DataFrame({"indices": hotel_nol.index})
# no_outlier_indices

# This data set has removed the outliers and un-normed the data so that we can use it without snooping on our test data
hotel_no_outliers = pd.concat([hotelCont, hotelCats, hotelDates], axis = 1, join = 'inner')
# The kept rows are identified by index *labels* (hotel_nol.index), so they are
# selected with label-based .loc. Positional .iloc returned the right rows only
# while labels and positions happened to coincide.
# .copy() makes the result an independent frame, so adding columns to it later
# does not operate on a slice of the concatenated frame.
hotel_no_outliers = hotel_no_outliers.loc[hotel_nol.index].copy()


# Changing Company to be a boolean variable. Running into issues when we split train/test that some of these are too rare to be populated
# Work on the plain string values rather than calling replace(..., inplace=True) on a
# categorical column (replace on categorical data is deprecated in recent pandas).
# 'Unknown' is mapped to '0', the ids are parsed as floats, and any non-zero id becomes 1.
company_ids = hotel_no_outliers['company'].astype(str)
company_adjust = (
    company_ids
    .replace('Unknown', '0')
    .astype('float64')
    # https://stackoverflow.com/questions/45790889/replace-non-zero-values-in-a-pandas-dataframe-with-1
    .astype(bool)
    .astype(int)
    .to_frame()   # stays a one-column DataFrame named 'company'
)
#company_adjust.head(20)

# Changing Agent to be a boolean variable. Running into issues when we split train/test that some of these are too rare to be populated
# 1 for every agent value other than 'Unknown', 0 for 'Unknown'.
agent_ids = hotel_no_outliers['agent'].astype(str)
agent_adjust = agent_ids.ne('Unknown').astype(int).to_frame()   # one-column DataFrame named 'agent'
#agent_adjust.head(10)

# Create Right Room
# trial_df is a second name for hotel_no_outliers (same object, not a copy), so the
# right_room column added below is visible through both names.
trial_df = hotel_no_outliers
# dtype inspection only; evaluated mid-cell, nothing is displayed
trial_df['assigned_room_type'].dtype,trial_df['reserved_room_type'].dtype

# right_room = 1 when the reserved and assigned room types match (compared as strings), else 0
reserved_room = trial_df['reserved_room_type'].astype(str)
assigned_room = trial_df['assigned_room_type'].astype(str)
trial_df['right_room'] = reserved_room.eq(assigned_room).astype(int)

# Previous Cancel to Category
# any non-zero previous_cancellations count becomes 1, zero stays 0
PrevCancel_adjust = (
    hotel_no_outliers[['previous_cancellations']]
    .astype(bool)
    .astype(int)
    .rename(columns={'previous_cancellations': 'prev_canceled'})
)


# Lead time categories 0 days to 1 week, 1 week to 1 month, 1 month to 6 months, greater than 6 months
# pd.cut bins are open on the left by default, so without include_lowest=True a
# lead_time of exactly 0 falls outside every bin and becomes NaN. With it, the
# first bin also contains 0 and those bookings are labelled 'booked_week'.
lt_cat = pd.cut(hotel_no_outliers['lead_time'],
                bins=[0,7,31,180,737],
                labels=['booked_week', 'booked_month', 'booked_6_months', 'booked_long'],
                include_lowest=True)

# Country Categorizing
# countries changed to top 10 and others
top_ten_countries = ['PRT','GBR','BEL', 'NLD', 'DEU','ESP', 'ITA', 'IRL', 'BRA', 'FRA']

country_cat = hotel_no_outliers[['country']].copy()
# 'top_ten' when the country code is in the list above, otherwise 'other_country'
country_cat['c_group'] = np.where(country_cat['country'].isin(top_ten_countries),
                                  'top_ten',
                                  'other_country')

# trial_df is now a new frame: hotel_no_outliers plus the c_group column
trial_df = hotel_no_outliers.join(country_cat['c_group'])

# Dropping variables and adding in the new ones.
# Also creates the dummy data set which contains the one hot encoded cats
columns_to_replace = ['company', 'agent', 'is_canceled', 'reservation_status',
                      'lead_time', 'country', 'previous_cancellations']

# Drop the listed columns, then add the engineered versions (0/1 company and agent
# flags, right_room, the previous-cancelation flag and the binned lead time).
dummy_trial = (
    trial_df
    .drop(columns=columns_to_replace)
    .assign(
        company=company_adjust,
        agent=agent_adjust,
        right_room=trial_df['right_room'],
        previous_canceled=PrevCancel_adjust['prev_canceled'],
        lead_time=lt_cat,
    )
)

# one-hot encode; drop_first=True leaves out the first level of each encoded column
dummy_trial = pd.get_dummies(dummy_trial, drop_first=True)


#((115232, 37), (115232, 67))
# Quality check on merged data
trial_df.shape, dummy_trial.shape

((115232, 38), (115232, 70))

### Define and prepare class variables

> We are going to predict cancelations (is_canceled): a reservation was either canceled or not canceled. This is a binary, categorical response, so this is a classification problem.

> is_canceled, our target for the classification task, was not balanced and we did not do any adjustment of the class variable because we have data directly from the hotels' databases for a 3 year period. Also, this is a large enough sample that we feel it is representative of the population of the hotels' records.

### Use proper variable representations

### Use Pre-processing methods for dimensionality reduction, scaling, etc.

### Remove variables that are not needed/useful for analysis 

# Data Preparation Part 2

### Describe the final dataset that is used for classification/regression (descriptions of variables included)

# Modeling and Evaluation 1

#### Choose and explain your evaluation metrics you will use. Why are the measure(s) appropriate for analyzing the results of your modeling? Give a detailed explanation backing up any assertions

> For our classification task, the target variable of cancelations versus non-cancelations is unbalanced (~30/70) so it will be a better choice to use F1 score - which is the harmonic mean of Recall and Precision - versus the single metrics themselves. We will also use accuracy to compare our model runs. 

> For our regression task we are going to predict length of stay (length_of_stay). We chose regression because the number of nights guests stayed at the hotel, length_of_stay, is a continuous variable. We considered RMSE and mean absolute error as metrics. We will use RMSE because this metric penalizes large errors in the model (due to squaring first), and having a large error on a predicted length of stay may be catastrophic to the business's use of the model.

# Modeling and Evaluation Part 2

#### Choose the method you will use for dividing your data into training and testing splits (i.e., are you using Stratified 10-fold cross validation? Why?) Explain why your chosen method is appropriate or use more than one method as appropriate.

> We will measure the effectiveness of this classification model by using Stratified 10-fold cross validation. The count of cancelations versus non-cancelations is unbalanced (~30/70) and we want to make sure that each fold maintains the same balance as the source data.

> We will measure the effectiveness of this regression model by using 10-fold cross validation.

# Modeling and Evaluation 3

#### Create three different classification/regression models for each task (e.g., random forest, KNN, and SVM for task one and the same or different algorithms for task two). 

> Classification Task: KNN, Logistic, NB


> Regression Task: KNN, OLS, Random Forest

#### Adjust parameters as appropriate to increase generalization performance using your chosen metric. You must investigate different parameters of the algorithms! (Hyperparameter tuning and gridsearchcv)

# Modeling and Evaluation Part 4

#### Analyze the results using your chosen method of evaluation. Use visualizations of the results to bolster the analysis. Explain any visuals and analyze why they are interesting to someone that might use this model

> List of Visualizations:
>    - ROC (Classification) 

>    - KNN Viz https://towardsdatascience.com/knn-visualization-in-just-13-lines-of-code-32820d72c6b6

>    - SE Link for Regression Viz https://stats.stackexchange.com/questions/89747/how-to-describe-or-visualize-a-multiple-linear-regression-model

>    - Confusion Matrices as heat maps (Classification) https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea

>    - Correlation Matrix heat maps (Regression)
    

# Modeling and Evaluation Part 5

### Discuss the advantages of each model for each classification task, if any. If there are not advantages, explain why. Is any model better than another?

> Better on Metrics?

> Better on Time?

> Sparse vs Non

> Model Specific https://www.quora.com/What-are-the-advantages-of-different-classification-algorithms, https://discuss.analyticsvidhya.com/t/which-one-to-use-randomforest-vs-svm-vs-knn/2897/3, https://towardsdatascience.com/comparative-study-on-classic-machine-learning-algorithms-24f9ff6ab222

####  Is the difference significant with 95% confidence? proper statistical comparison methods. You must use statistical comparison techniques—be sure they are appropriate for your chosen method of validation as discussed in unit 7 of the course.

> Comparing on Statistics: https://machinelearningmastery.com/statistical-significance-tests-for-comparing-machine-learning-algorithms/

# Modeling and Evaluation Part 6

### Which attributes from your analysis are most important? Use proper methods discussed in class to evaluate the importance of different attributes. Discuss the results and hypothesize about why certain attributes are more important than others for a given classification task

### Do we need to do this for every model iteration? ASK HIM ON THURSDAY

# Deployment NEIL GOT THIS

### How useful is your model for interested parties (i.e., the companies or organizations that might want to use it for prediction)?

> Have a discussion about scope of use

### How would you measure the model's value if it was used by these parties?

###  How would you deploy your model for interested parties? What other data should be collected? How often would the model need to be updated, etc.?

# Exceptional Work