In [38]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew 
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
import squarify

import plotly.offline as plty
from plotly import tools
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot, plot 
import plotly.graph_objs as go 

from sklearn.preprocessing import LabelEncoder

from typing import List
import itertools

%matplotlib inline
plt.style.use('fivethirtyeight')
init_notebook_mode(connected=True)
sns.set_style("whitegrid")
sns.set_context("paper")

pd.set_option("display.max_columns", None)

In [39]:
!pip install -q feature-engine

### Helper Functions

In [40]:
def DataDesc(df: pd.DataFrame) -> pd.DataFrame:
    """Print the shape of ``df`` and return a one-row-per-column overview.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to describe.

    Returns
    -------
    pd.DataFrame
        Columns: ``Name``, ``dtypes``, ``Missing`` (null count), ``Uniques``
        (number of distinct non-null values), ``First Value`` /
        ``Second Value`` / ``Third Value`` (the values found in the first
        three rows) and ``Entropy`` (entropy in bits of the column's value
        distribution, rounded to 2 decimals).
    """
    print(f"Dataset Shape: {df.shape}")

    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    # Positional access (iloc) so the example rows are found whatever the
    # index labels are; rows that do not exist are reported as NaN.
    for position, label in enumerate(['First Value', 'Second Value', 'Third Value']):
        summary[label] = df.iloc[position].values if position < len(df) else np.nan

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary


def Null_Count(df : pd.DataFrame) -> pd.DataFrame:
    """Summarise the columns of ``df`` that contain missing values.

    Returns a frame with the columns ``Column``, ``Null_Count`` and
    ``Null_Percent`` (null count as a percentage of the number of rows),
    ordered from the most to the fewest nulls. Columns without any null are
    left out.
    """
    missing = df.isnull().sum().sort_values(ascending=False)

    report = pd.DataFrame({
        'Column': missing.index,
        'Null_Count': missing.to_numpy(),
    })
    report['Null_Percent'] = (report['Null_Count'] * 100) / (df.shape[0])

    has_nulls = report['Null_Count'] != 0
    return report.loc[has_nulls]


def plot_feature_vs_target(df : pd.DataFrame, feature: str):
    """Plot booking counts of ``feature`` split by ``is_canceled`` and show
    the cancellation rate of every feature value.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``feature`` and the ``is_canceled`` column.
    feature : str
        Name of the column to compare against ``is_canceled``.

    Displays a grouped plotly histogram, then a table with the columns
    ``values`` and ``Rate of Cancellation`` (cancelled / (cancelled +
    confirmed)), sorted from the highest to the lowest rate. Returns nothing.
    """
    group = df.groupby([feature,'is_canceled']).size().rename('Count').reset_index()
    fig = px.histogram(group,
                       x=feature,
                       y='Count',
                       color='is_canceled',
                       color_discrete_sequence=["#457b9d","#fca311"],
                       template='plotly_white')

    fig.update_layout(width=900, height=400,
                      barmode='group',
                      title={'text': f"{feature} vs Cancellation",
                             'y':0.95,'x':0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},
                      showlegend=True,
                      margin=dict(l=25, r=10, t=50, b=10))

    fig.show()

    # Calculate rate of cancellations.
    # Counts are looked up by value rather than through string-built
    # `query` expressions, so numeric features, feature names containing
    # spaces and values containing quotes are all handled, and no exception
    # has to be swallowed.
    values = group[feature].unique()

    def counts_for(flag):
        # Count per feature value for one is_canceled flag; 0 where absent.
        rows = group.loc[group['is_canceled'] == flag]
        return rows.set_index(feature)['Count'].reindex(values, fill_value=0)

    cancelled = counts_for(1)
    confirmed = counts_for(0)
    # A value with neither cancelled nor confirmed bookings yields NaN.
    rate = cancelled / (cancelled + confirmed)

    df_ratio = pd.DataFrame({
        'values': values,
        'Rate of Cancellation': rate.to_numpy(),
    })

    print('')
    print('Cancellation Rate')
    display(df_ratio.sort_values(by='Rate of Cancellation', ascending=False))
    
    
    
    
#correlation heatmap of dataset
def correlation_heatmap(df: pd.DataFrame):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns are used. The correlation is computed
    with ``DataFrame.corr()`` using its default method. The figure is drawn
    on a new 20x30 matplotlib figure; nothing is returned.
    """
    # Select the usable columns explicitly: current pandas raises in
    # DataFrame.corr() when non-numeric columns are present.
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    _ , ax = plt.subplots(figsize =(20, 30))
    colormap = sns.diverging_palette(220, 10, as_cmap = True)

    _ = sns.heatmap(
        numeric_df.corr(),
        cmap = colormap,
        square=True,
        cbar_kws={'shrink':0.5 },
        ax=ax,
        annot=True,
        linewidths=0.1,vmax=1.0, linecolor='white',
        annot_kws={'fontsize':15 }
    )

    ax.set_title('Pearson Correlation of Features', y=1.05, size=30)
    

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix; rows are drawn as true labels, columns as
        predicted labels.
    classes : sequence
        Tick labels, one per class.
    normalize : bool, default False
        When True every row is divided by its total. A row whose total is 0
        is reported as all zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Skip rows with no samples instead of dividing by zero.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as integers; anything else (normalised or
    # already-float matrices) with two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are taken into account by the layout.
    plt.tight_layout()

### 1. Reading Data

In [41]:
PATH = '../input/hotel-bookings/hotel_bookings (Assignment).csv'

df_hotel = pd.read_csv(PATH)

df_hotel.info()

In [42]:
df_hotel.sample(7, random_state=10)

## 2. Exploring different Features

### i. Columns : Unique, Missing values

In [43]:
DataDesc(df_hotel)

### ii. Columns with null values

In [44]:
Null_Count(df_hotel)

### Exploring - Company

In [45]:
sns.distplot(df_hotel[~df_hotel.company.isnull()].company, bins=50)

The company values peak at ranges 50, 230

### Distribution of company vs cancellation

In [46]:
# Distribution of `company` (rows where it is known) for cancelled vs.
# confirmed bookings, one panel each. Both panels are produced by the same
# loop so their styling and labels stay consistent.
company_known = df_hotel[df_hotel['company'].notnull()]
company_panels = [(1, 'Cancelled', '#EC2700'), (0, 'Confirmed', '#0064EC')]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, (flag, panel_title, panel_color) in zip(axes, company_panels):
    sns.distplot(company_known.loc[company_known['is_canceled'] == flag, 'company'],
                 bins=50, color=panel_color, ax=ax)
    ax.set_title(panel_title, fontsize=20)
    ax.set_xlabel("Company", fontsize=15)
    ax.set_ylabel("")

From the above, we see that the distributions are somewhat similar in both the cases.<br>
Company values for ranges 40,70,210,350,380 have very high cancellations as compared to confirmation.<br>
Values for range 50 show very high confirmations as compared to cancellations

### Exploring Agent

In [47]:
sns.distplot(df_hotel[~df_hotel.agent.isnull()].agent)

In [48]:
# Distribution of `agent` (rows where it is known): cancelled bookings on
# the left panel, confirmed bookings on the right panel.
agent_known = df_hotel[~df_hotel.agent.isnull()]

fig = plt.figure(figsize=(15,5))
for position, (flag, heading, shade) in enumerate(
        [(1, 'Cancelled', '#EC2700'), (0, 'Confirmed', '#0064EC')], start=1):
    panel = fig.add_subplot(1, 2, position)
    sns.distplot(agent_known[agent_known['is_canceled'] == flag]["agent"],
                 bins=50, color=shade, ax=panel)
    panel.set_title(heading, fontsize=20)
    panel.set_xlabel("Agent", fontsize=15)
    panel.set_ylabel("")

Distributions for cancelled & confirmations are very similar for feature Agent

### Analyzing Reservation Status, Arrival Date & Cancellations

Stratified sampling based on reservation status

In [49]:
# Show up to 4 example rows per reservation status. The sample is seeded so
# the displayed rows are the same on every run, and capped at the group size
# so a status with fewer than 4 bookings does not raise.
status_columns = ['arrival_date_month', 'arrival_date_day_of_month',
                  'reservation_status_date', 'reservation_status']
(
    df_hotel[status_columns]
    .groupby('reservation_status', group_keys=False)
    .apply(lambda rows: rows.sample(n=min(len(rows), 4), random_state=10))
)

In [50]:
display(df_hotel.groupby(['is_canceled','reservation_status']).size().rename('Count').reset_index())

We can clearly see that reservation_status (Canceled & No-Show) corresponds directly to the target variable is_canceled, <br>hence we need to drop the features reservation_status & reservation_status_date.

### Comparison of Categorical features with target.
#### Cancellation Rate
#### Hotel

In [51]:
plot_feature_vs_target(df_hotel, 'hotel')

We can see that the cancellation rate is higher for City Hotel (40%) as compared to Resort Hotel (20%).

In [52]:
plot_feature_vs_target(df_hotel, 'customer_type')

In [53]:
plot_feature_vs_target(df_hotel, 'deposit_type')

Deposit Type: Non Refund, has extremely high cancellation rates more than 99%

In [54]:
plot_feature_vs_target(df_hotel, 'distribution_channel')

In [55]:
plot_feature_vs_target(df_hotel, 'market_segment')

In [56]:
plot_feature_vs_target(df_hotel, 'assigned_room_type')

In [57]:
plot_feature_vs_target(df_hotel, 'arrival_date_month')

The arrival month does not really impact cancellation, as <br>all the months have similar cancellation rates.

### Class Distribution

In [58]:
# Pass the column as `x` explicitly: in current seaborn the first positional
# argument of countplot is `data`, not `x`.
_ = sns.countplot(x=df_hotel['is_canceled'])

Our focus will be on cancellations (the minority class) while training the models.

### 3. Feature Selection

In [59]:
# Creating a copy of original dataframe for feature selection analysis
df = df_hotel.copy()

# Dropping company as 94% data is missing; the two reservation_status
# columns are dropped because they mirror the target (see the
# is_canceled / reservation_status table above).
df = df.drop(columns=['reservation_status', 'reservation_status_date', 'company'])

# Results are assigned back instead of calling fillna(inplace=True) on a
# column selection: under pandas copy-on-write the in-place call acts on a
# temporary object and leaves `df` unchanged.
df['country'] = df['country'].fillna('Missing')
df['agent'] = df['agent'].fillna(df['agent'].median())
df = df.dropna(axis=0)

# NOTE: an earlier idea, imputing `company` with the median of its
# is_canceled class, was abandoned. `company` is dropped above, and
# imputing from the target would leak the label into the feature.

### Confirming if any null values

In [60]:
df.isnull().sum()

In [61]:
NUMERICAL_COLS = df.columns[df.dtypes != 'object']
CATEGORICAL_COLS = df.columns[df.dtypes == 'object']

In [62]:
# Label Encoding: replace every categorical column by integer codes.
# The same encoder object is re-fitted for each column in turn.
le = LabelEncoder()

for categorical_column in CATEGORICAL_COLS:
    df[categorical_column] = le.fit_transform(df[categorical_column])

### Correlation Between Numerical Features

In [63]:
correlation_heatmap(df[NUMERICAL_COLS])

We do not see high correlation between features

### Feature importance / selection using Random Forest

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [65]:
X = df.drop('is_canceled', axis=1)
y = df['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=123)

# The forest is seeded (same seed as the split) so the accuracy and the
# feature importances reported below are reproducible across runs.
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train, y_train)

In [66]:
# Prediction
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

In [67]:
# Get feature importance
selected_features = X.columns.to_list()
feature_importance = pd.DataFrame(selected_features, columns = ["Feature Label"])
feature_importance["Feature Importance"] = rf.feature_importances_

# Sort by feature importance
feature_importance = feature_importance.sort_values(by="Feature Importance", ascending=False)

# Set graph style
sns.set(font_scale = 1.75)
sns.set_style({"axes.facecolor": "1.0", "axes.edgecolor": "0.85", "grid.color": "0.85",
               "grid.linestyle": "-", 'axes.labelcolor': '0.4', "xtick.color": "0.4",
               'ytick.color': '0.4'})

# One colour per feature, darkest for the most important one. The reversed
# colormap ('_r') is requested directly as a list sized to the number of
# features, instead of passing a one-shot `reversed` iterator over a fixed
# 15 colours.
importance_palette = sns.color_palette('YlOrRd_r', n_colors=len(feature_importance))

# Set figure size and create barplot
f, ax = plt.subplots(figsize=(12, 9))
sns.barplot(x = "Feature Importance", y = "Feature Label",
            palette = importance_palette, data = feature_importance, ax = ax)

# Generate a bolded vertical line at x = 0
ax.axvline(x = 0, color = 'black', linewidth = 4, alpha = .7)

# Turn frame off
ax.set_frame_on(False)

# Tight layout
plt.tight_layout()

### Recursive Feature Elimination

In [68]:
from sklearn.feature_selection import RFE


# Recursive feature elimination down to 20 features, removing 2 per step.
# The forest is seeded so the set of eliminated features is the same on
# every run.
rfe_random_forrest = RFE(estimator=RandomForestClassifier(random_state=123),
                         n_features_to_select=20, step=2, verbose=1)

rfe_random_forrest.fit(X_train, y_train)
# Boolean mask over the columns of X: True for the features that were kept.
random_forrest_mask = rfe_random_forrest.support_

In [69]:
result = dict(zip(X.columns, rfe_random_forrest.ranking_))
pd.concat([pd.Series(result.keys()), pd.Series(result.values())], axis=1)

In [70]:
# Features that were dropped by RandomForrest

list(df.drop('is_canceled', axis=1).columns[~random_forrest_mask])

Features with an RFE ranking > 1 are less important and were eliminated while fitting the model.<br>
We can see that the features babies & days_in_waiting_list were eliminated first by the model.<br><br>

Even though the feature arrival_date_year was not dropped by the model, we will still drop it as it will not contribute to prediction.

In [71]:
from sklearn.linear_model import LogisticRegression


rfe_log_reg = RFE(estimator=LogisticRegression(solver='liblinear'), n_features_to_select=20, step=2, verbose=1)

rfe_log_reg.fit(X_train, y_train)
log_reg_mask = rfe_log_reg.support_

In [72]:
result = dict(zip(X.columns, rfe_log_reg.ranking_))
#pd.concat([pd.Series(result.keys()), pd.Series(result.values())], axis=1)
result

In [73]:
# Features that were dropped by LogisticRegression

df.drop('is_canceled', axis=1).columns[~log_reg_mask]

In [74]:
_ = sns.violinplot(data=df_hotel, y='days_in_waiting_list', x='is_canceled')

### 4. Find out the best Model

In [75]:
from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder
from feature_engine.imputation import (
    AddMissingIndicator,
    CategoricalImputer,
    MeanMedianImputer,
)
from feature_engine.selection import DropFeatures
from feature_engine.transformation import LogTransformer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer, MinMaxScaler

from sklearn.base import BaseEstimator, TransformerMixin

### Create Mappings

In [76]:
# Integer code for every room type, assigned in alphabetical order.
# The codes are built from BOTH room-type columns because the mapping is
# applied to `reserved_room_type` as well as `assigned_room_type` in the
# pipeline; a type missing from this dict would be mapped to NaN.
room_types = (set(df_hotel['reserved_room_type'].dropna())
              | set(df_hotel['assigned_room_type'].dropna()))
mapping_room_type = {room: code for code, room in enumerate(sorted(room_types))}
mapping_room_type

In [77]:
# Month name -> calendar month number (January = 1 ... December = 12).
mapping_month = {
    month: number
    for number, month in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}

In [78]:
encode_columns = ['country', 'market_segment','distribution_channel', 
                  'deposit_type', 'customer_type']

In [79]:
drop_features = ['hotel','children','babies','meal','arrival_date_year','arrival_date_week_number',
                 'company', 'reservation_status', 'reservation_status_date']

In [80]:
# Mapper class to map Categorical Features

class Mapper(BaseEstimator, TransformerMixin):
    """Transformer that replaces the values of selected columns through a
    fixed lookup dictionary.

    Parameters
    ----------
    variables : list of str
        Columns to map. Anything other than a list raises ``ValueError``.
    mappings : dict
        Lookup passed to ``Series.map`` for each of the columns.
    """

    def __init__(self, variables: List[str], mappings: dict):

        if not isinstance(variables, list):
            raise ValueError("variables should be a list")

        self.variables = variables
        self.mappings = mappings

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """Nothing is learned; present so the class can be used as a step of
        an sklearn pipeline. Returns ``self``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` in which every column listed in
        ``variables`` has been passed through ``Series.map(mappings)``.
        The input frame is left untouched."""
        transformed = X.copy()
        for column in self.variables:
            transformed[column] = transformed[column].map(self.mappings)

        return transformed

In [81]:
def CreatePipe():
    """Assemble the (unfitted) preprocessing pipeline for the booking data.

    The steps, in order:

    1. ``missing_imputation``  - ``CategoricalImputer`` on ``country``
       (imputation_method ``"missing"``).
    2. ``missing_indicator``   - ``AddMissingIndicator`` on ``agent``.
    3. ``mean_imputation``     - ``MeanMedianImputer`` on ``agent``; despite
       the step name the imputation method is the median.
    4. ``drop_features``       - drops the columns in ``drop_features``.
    5. ``categorical_encoder`` - ``OrdinalEncoder`` (``"ordered"``) on
       ``encode_columns``.
    6. ``mapper_month``        - maps ``arrival_date_month`` through
       ``mapping_month``.
    7. ``mapper_room_type``    - maps both room-type columns through
       ``mapping_room_type``.
    8. ``scaler``              - ``MinMaxScaler`` on ``lead_time``,
       ``days_in_waiting_list`` and ``adr``.

    Reads the notebook-level names ``drop_features``, ``encode_columns``,
    ``mapping_month`` and ``mapping_room_type``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # impute categorical variables with string missing
    country_imputer = CategoricalImputer(
        imputation_method="missing",
        variables='country',
    )

    # add missing indicator/ Creates a new binary column indicating 1 for missing values
    agent_missing_flag = AddMissingIndicator(variables=['agent'])

    # Impute median
    agent_imputer = MeanMedianImputer(
        imputation_method="median",
        variables=['agent'],
    )

    feature_dropper = DropFeatures(features_to_drop=drop_features)

    # Encode Categorical Features
    category_encoder = OrdinalEncoder(
        encoding_method="ordered",
        variables=encode_columns,
    )

    # Custom Mapper Functions
    month_mapper = Mapper(
        variables=['arrival_date_month'],
        mappings=mapping_month,
    )
    room_type_mapper = Mapper(
        variables=['reserved_room_type', 'assigned_room_type'],
        mappings=mapping_room_type,
    )

    # Scaling
    numeric_scaler = SklearnTransformerWrapper(
        transformer=MinMaxScaler(),
        variables=['lead_time', 'days_in_waiting_list', 'adr'],
    )

    steps = [
        ("missing_imputation", country_imputer),
        ("missing_indicator", agent_missing_flag),
        ("mean_imputation", agent_imputer),
        ("drop_features", feature_dropper),
        #("log", LogTransformer(variables='adr')),
        ("categorical_encoder", category_encoder),
        ("mapper_month", month_mapper),
        ("mapper_room_type", room_type_mapper),
        ("scaler", numeric_scaler),
    ]

    hotel_pipe = Pipeline(steps)

    return hotel_pipe


### Evaluation Metrics

In [82]:
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score

# More Focus on Recall (Minimizing False Negatives)
def f2_measure(y_true, y_pred):
    """Return the F-beta score of ``y_pred`` against ``y_true`` with
    ``beta=2``."""
    score = fbeta_score(y_true, y_pred, beta=2)
    return score



def evaluate_model(X, y, model):
    """Cross-validate ``model`` behind the preprocessing pipeline.

    Parameters
    ----------
    X : pd.DataFrame
        Raw (unprocessed) feature frame.
    y : pd.Series
        Target aligned with ``X``.
    model : sklearn estimator
        Unfitted classifier to evaluate.

    Returns
    -------
    numpy.ndarray
        One ROC-AUC score per fold (6 folds).

    The preprocessing steps from ``CreatePipe()`` are chained with the model
    and the whole chain is cross-validated, so the imputers, encoder and
    scaler are fitted on the training part of every fold only. Fitting them
    once on all rows before splitting (the encoder is fitted with ``y``)
    lets the validation rows influence the features and inflates the scores.
    """
    # Local import: scikit-learn is already a dependency of this notebook.
    from sklearn.impute import SimpleImputer

    steps = CreatePipe().steps + [
        # Categories that were not present in a fold's training part can
        # come out of the encoding steps as NaN; give them the code -1 so
        # that every model can score the validation rows.
        ("unseen_fill", SimpleImputer(strategy="constant", fill_value=-1)),
        ("model", model),
    ]
    full_pipe = Pipeline(steps)

    scores = cross_val_score(full_pipe, X, y, scoring='roc_auc', cv=6, n_jobs=-1)
    return scores

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import RandomizedSearchCV

In [84]:
X = df_hotel.drop('is_canceled', axis=1)
y = df_hotel['is_canceled']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=123)

def get_models(random_state=123):
    """Build the candidate classifiers to compare.

    Parameters
    ----------
    random_state : int, default 123
        Seed given to every model so the comparison is reproducible.

    Returns
    -------
    (models, names) : tuple of two lists
        Unfitted estimators and their short display names, in matching
        order: LR, BAG, RF, GBM.
    """
    candidates = [
        ('LR', LogisticRegression(solver='liblinear', random_state=random_state)),
        # An SVM (SVC(gamma='scale'), name 'SVM') is intentionally left out
        # of the comparison.
        ('BAG', BaggingClassifier(n_estimators=100, random_state=random_state)),
        ('RF', RandomForestClassifier(n_estimators=100, random_state=random_state)),
        ('GBM', GradientBoostingClassifier(n_estimators=100, random_state=random_state)),
    ]

    names = [name for name, _ in candidates]
    models = [model for _, model in candidates]
    return models, names

# Cross-validate every candidate on the training split and keep the
# per-fold scores for the box plot.
models, names = get_models()
results = list()

for model_name, candidate in zip(names, models):
    scores = evaluate_model(X_train, y_train, candidate)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (model_name, np.mean(scores), np.std(scores)))

In [85]:
plt.figure(figsize=(10,5))
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

### The Random Forest algorithm outperforms the other models and also has nearly the same score (low standard deviation) across the cross-validation splits, as seen in the box plot

### Checking the model's accuracy with balanced class weights, due to the imbalance in the target classes

In [86]:
from sklearn.model_selection import train_test_split

X = df_hotel.drop('is_canceled', axis=1)
y = df_hotel['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=123)

pipe = CreatePipe()
# Fit the preprocessing on the training split only ...
X_train = pipe.fit_transform(X_train, y_train)
# ... and only APPLY it to the test split. Re-fitting on the test split
# would derive the encodings and scaling from test rows and test labels,
# and would encode categories differently from what the model was trained
# on, so the test score would not be a fair estimate.
# Categories that were not present when fitting may come out as NaN; they
# are given the code -1.
X_test = pipe.transform(X_test).fillna(-1)

In [87]:
# Seeded so the reported accuracies are reproducible; n_jobs=-1 only builds
# the 1000 trees in parallel and does not change the result.
rf_model = RandomForestClassifier(n_estimators=1000, class_weight='balanced', oob_score=True,
                                  random_state=123, n_jobs=-1)

rf_model.fit(X_train, y_train)

y_pred_train = rf_model.predict(X_train)

print(f"Training accuracy :{accuracy_score(y_train, y_pred_train)}")
# oob_score_ is the out-of-bag estimate computed by the forest during fit.
print(f"Validation accuracy : {rf_model.oob_score_}")

#### We can see great improvement in training accuracy after using balanced class weights

### Looks like our model has overfitted; let's check the testing accuracy

In [88]:
y_pred = rf_model.predict(X_test)

accuracy_score(y_test, y_pred)

#### Our Model has overfitted the training examples

### Fine Tuning

In [89]:
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier(class_weight='balanced', oob_score = True, random_state=123)
params = {
                'max_depth' : [1,2,3,4,5,6],
               'min_samples_leaf' : [0.01,0.02,0.04,0.06],
                'max_features' : [0.1,0.2,0.4,0.8],
                'n_estimators' : [100,150,200,250,300,500]

        }

# shuffle=True is required for random_state to take effect; current
# scikit-learn raises a ValueError when random_state is set while shuffle
# is left at False.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

# random_state fixes which 50 parameter combinations are sampled.
rf_random_search_cv = RandomizedSearchCV(estimator=rf,
                          param_distributions=params,
                           n_iter=50,
                          cv=cv,
                          scoring='roc_auc',
                          n_jobs=-1,
                           verbose=1,
                           random_state=123
                          )

rf_random_search_cv.fit(X_train, y_train)

In [90]:
rf_random_search_cv.best_params_

In [91]:
random_best_estimator = rf_random_search_cv.best_estimator_
y_pred = random_best_estimator.predict(X_test)
accuracy_score(y_test, y_pred)

### Selecting parameters from Random Search & more Exploration using Grid Search

In [92]:
from sklearn.model_selection import GridSearchCV
# Base estimator for the grid search. It is seeded so the search result is
# reproducible. `min_samples_leaf` is not set here because it is the
# parameter being searched and the grid value always overrides it.
rf = RandomForestClassifier(class_weight='balanced',
                            oob_score = True,
                             n_estimators = 500,
                             max_features= 'sqrt',
                           max_depth=8,
                           random_state=123)



params = {
                'min_samples_leaf': [2,3,4]

        }

rf_cv = GridSearchCV(estimator=rf,
                          param_grid=params,
                          cv=3,
                          scoring='roc_auc',
                          n_jobs=-1,
                           verbose=1
                          )

rf_cv.fit(X_train, y_train)

### Testing accuracy

In [97]:
print(rf_cv.best_params_)
y_pred = rf_cv.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred)

In [98]:
from sklearn.metrics import confusion_matrix, classification_report

cnf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['-VE','+VE']
np.set_printoptions(precision=2)


plt.figure(figsize=(8,6))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, 
                      title='Normalized confusion matrix')

In [99]:
print(classification_report(y_test, y_pred))