<a href="https://colab.research.google.com/github/ashutosh3060/friday-burger-mojito/blob/master/eda_model_build.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Table of Contents:

0. Libraries
1. User-Defined Functions
2. Import Data
4. Exploratory Data Analysis
6. Data Preparation for Model Building
7. Model Build
8. Hyperparameter Tuning
9. Evaluation
10. Final Recommendation

## 0. Libraries

In [1]:
# warnings
import warnings
warnings.filterwarnings("ignore")

# DataFrames, numerical computing, and Python standard-library utilities
import time
from collections import Counter
import pickle
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

# sklearn
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, confusion_matrix, classification_report, f1_score
# XGBoost
from xgboost import XGBClassifier, plot_importance

# imblearn for imbalanced data handling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Display Settings
# seaborn.set() re-applies the whole default theme (style="darkgrid"), which
# overrides any style chosen through an earlier sns.set_style() call. The
# style and the font scale are therefore set together in a single call.
sns.set(style='whitegrid', font_scale=1.25)
# Widen the pandas display limits so large frames are not truncated in output.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)

## 1. User-Defined Functions

In [None]:
def bivar_contns(nonreturn_df, return_df, contns_col):
    '''
    Draws two side-by-side histograms of one continuous variable: the left
    panel for non-returning customers and the right panel for returning customers

    Inputs
    ----------
    nonreturn_df : dataframe
        Dataframe containing Non-returning customers only
    return_df : dataframe
        Dataframe containing Returning customers only
    contns_col : string
        Continuous variable name (column present in both dataframes)

    Output
    -------
    Histogram in subplots (2 plots)
        A 12x4 inch figure with 100-bin histograms, titled 'non-returning' and
        'returning'; the figure is shown and nothing is returned
    '''
    fig, (left_ax, right_ax) = plt.subplots(1, 2)
    fig.set_size_inches(12, 4)

    # (data, target axes, panel title) for each of the two panels, left to right
    panels = (
        (nonreturn_df, left_ax, 'non-returning'),
        (return_df, right_ax, 'returning'),
    )
    for frame, panel_ax, panel_title in panels:
        frame.hist(contns_col, bins=100, ax=panel_ax)
        panel_ax.set_title(panel_title)
        panel_ax.set_xlabel(contns_col)

    plt.show()

def bivar_multi_cat(df, feature, target_ftr, label_rotation=False, horizontal_layout=True):
    '''
    Plots a bar graph of the number of rows in each category of a multi-value
    categorical variable

    Parameters
    ----------
    df : dataframe
        Dataframe to plot. The y-axis is labelled '# Returning Customers', so the
        label is only accurate when df holds returning customers only
    feature : string
        Categorical variable name (column of df)
    target_ftr : string
        Target variable name. Kept for backward compatibility; the plot does not use it
    label_rotation : bool, default False
        If True, the x-axis tick labels are rotated by 60 degrees
    horizontal_layout : bool, default True
        Kept for backward compatibility; the plot does not use it

    Returns
    -------
    Bar-graph
        Bar graph: x-axis->Unique categories, y-axis->row count per category.
        The figure is shown and nothing is returned
    '''
    plt.figure(figsize=(8, 6))
    counts = df[feature].value_counts()
    df1 = pd.DataFrame({feature: counts.index, '# Returning Customers': counts.values})
    # NOTE: this changes seaborn's colour shorthand codes globally, not just for this plot
    sns.set_color_codes("pastel")
    s = sns.barplot(x=feature, y="# Returning Customers", data=df1)
    if label_rotation:
        # Rotate the existing tick labels. Passing '' to set_xticklabels would
        # replace the category names with an empty label list instead.
        s.tick_params(axis='x', labelrotation=60)
    s.set_title(feature)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show()

def bivar_binary_cat(df, feature, target_var):
    '''
    Plots a stacked bar graph, built from a cross-tab, of a binary categorical
    variable against the target class

    Parameters
    ----------
    df : dataframe
        Dataframe containing both the feature and the target columns
    feature : string
        Binary categorical variable name
    target_var : string
        Target variable name

    Returns
    -------
    Bar-graph
        Stacked bar graph: one bar per target_var value, split by the share (in %,
        rounded to 1 decimal) of each feature value within that target class.
        The figure is shown and nothing is returned
    '''
    # Both columns come from the df argument, so the function works on whatever
    # frame the caller passes rather than on a notebook-level global.
    # normalize="index" turns each target-class row into fractions that sum to 1.
    ct = pd.crosstab(df[target_var], df[feature], normalize="index").mul(100).round(1)
    ct.plot.bar(stacked=True)
    plt.xlabel(target_var)
    plt.ylabel(feature)
    plt.title(f"{target_var}  vs  {feature}")
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(bbox_to_anchor=(1.25, 1.15), loc="upper right")
    plt.show()