In [1]:
# importing libraries
import random            as rand                     # random number gen
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
import numpy             as np
from sklearn.model_selection import train_test_split # train-test split
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import export_graphviz             # exports graphics
from six import StringIO                             # saves objects in memory
from IPython.display import Image                    # displays on frontend
                                    
from sklearn.model_selection import RandomizedSearchCV     # hyperparameter tuning
from sklearn.metrics import make_scorer              # customizable scorer
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm

# specifying the file name
file = './Apprentice_Chef_Dataset.xlsx'

# reading the file and renaming it as "apprentice"
apprentice = pd.read_excel(file)


# Renaming variables

In [2]:
# Renaming LARGEST_ORDER_SIZE, since it is mislabeled and represents the 
# average number of meals ordered per customer
apprentice.rename(columns={"LARGEST_ORDER_SIZE" : "AVG_MEALS_ORDERED"}, inplace = True)

# checking the information about each variable to see data types
apprentice.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   REVENUE                      1946 non-null   float64
 1   CROSS_SELL_SUCCESS           1946 non-null   int64  
 2   NAME                         1946 non-null   object 
 3   EMAIL                        1946 non-null   object 
 4   FIRST_NAME                   1946 non-null   object 
 5   FAMILY_NAME                  1899 non-null   object 
 6   TOTAL_MEALS_ORDERED          1946 non-null   int64  
 7   UNIQUE_MEALS_PURCH           1946 non-null   int64  
 8   CONTACTS_W_CUSTOMER_SERVICE  1946 non-null   int64  
 9   PRODUCT_CATEGORIES_VIEWED    1946 non-null   int64  
 10  AVG_TIME_PER_SITE_VISIT      1946 non-null   float64
 11  MOBILE_NUMBER                1946 non-null   int64  
 12  CANCELLATIONS_BEFORE_NOON    1946 non-null   int64  
 13  CANCELLATIONS_AFTE


# Checking missing values

In [3]:
# checking for missing values
apprentice.isnull().sum()

# dropping the only column with missing values
apprentice.drop(['FAMILY_NAME'], axis = 1)

Unnamed: 0,REVENUE,CROSS_SELL_SUCCESS,NAME,EMAIL,FIRST_NAME,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,AVG_TIME_PER_SITE_VISIT,...,EARLY_DELIVERIES,LATE_DELIVERIES,PACKAGE_LOCKER,REFRIGERATED_LOCKER,AVG_PREP_VID_TIME,AVG_MEALS_ORDERED,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED
0,393.0,1,Saathos,saathos@unitedhealth.com,Saathos,14,6,12,10,48.00,...,0,2,0,0,33.4,1,0,1,17,0
1,1365.0,1,Alysanne Osgrey,alysanne.osgrey@ge.org,Alysanne,87,3,8,8,40.35,...,0,2,0,0,84.8,1,0,3,13,170
2,800.0,1,Edwyd Fossoway,edwyd.fossoway@jnj.com,Edwyd,15,7,11,5,19.77,...,0,1,0,0,63.0,1,0,2,16,0
3,600.0,1,Eleyna Westerling,eleyna.westerling@ge.org,Eleyna,13,6,11,5,90.00,...,0,3,0,0,43.8,1,0,2,14,0
4,1490.0,1,Elyn Norridge,elyn.norridge@jnj.com,Elyn,47,8,6,10,40.38,...,0,8,0,0,84.8,1,1,3,12,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941,3450.0,0,Obara Sand,obara.sand@yahoo.com,Obara,87,8,8,7,108.90,...,0,3,0,0,212.5,10,2,3,11,0
1942,5829.0,0,Quentyn Blackwood,quentyn.blackwood@yahoo.com,Quentyn,244,4,7,2,133.91,...,0,3,0,0,282.2,10,1,4,10,424
1943,1900.0,0,Rhonda Rowan,rhonda.rowan@gmail.com,Rhonda,57,2,8,4,102.71,...,3,7,0,0,254.4,10,0,4,12,480
1944,1600.0,0,Turnip,turnip@yahoo.com,Turnip,74,3,10,10,638.87,...,0,3,0,0,564.2,10,3,3,11,796


# Adding Gender

In [4]:
# guessing gender based on FIRST_NAME

# placeholder list
# placeholder_lst = []


# looping to guess gender
# for name in apprentice['FIRST_NAME']:
   # guess = gender.Detector().get_gender(name)
  #  print(guess)
   # placeholder_lst.append(guess)


# converting list into a series
# apprentice['gender_guess'] = pd.Series(placeholder_lst)


# checking results
# apprentice.head(n = 5)



In [5]:
# Since the gender detector didn't work with fantasy names, I decided to import
# a csv file found on internet with the gender of each character of GOT.
# In fact, it is statistically proven that the gender affects heavily alcohol consumption.
# Since we are talking about the success of a promotion that concerns wine,
# I thought it was important to find a way to specify gender. 
# There might be some mistakes, but overall it gives a general understanding of gender.


# Importing a csv where there is a gender classification of Game of Thrones characters

# specifying the file name
file = './character-GOT.csv'

# reading the file and renaming it as "got"
got = pd.read_csv(file)

# creating a list to store first names (easier to compare with our dataset)
firstname = []

# for loop to iterate and split names, to get only first names
for index, col in got.iterrows():
    
    # splitting email domain at '@'
    first_name = got.loc[index, 'Name'].split()[0]
    
    # appending placeholder_lst with the results
    firstname.append(first_name)

# including the list as a new column
got['First Name'] = firstname

#displayin first 10 rows
got.head(10)

Unnamed: 0,Name,Allegiances,Death Year,Book of Death,Death Chapter,Book Intro Chapter,Gender,Nobility,GoT,CoK,SoS,FfC,DwD,First Name
0,Addam Marbrand,Lannister,,,,56.0,1,1,1,1,1,1,0,Addam
1,Aegon Frey (Jinglebell),,299.0,3.0,51.0,49.0,1,1,0,0,1,0,0,Aegon
2,Aegon Targaryen,House Targaryen,,,,5.0,1,1,0,0,0,0,1,Aegon
3,Adrack Humble,House Greyjoy,300.0,5.0,20.0,20.0,1,1,0,0,0,0,1,Adrack
4,Aemon Costayne,Lannister,,,,,1,1,0,0,1,0,0,Aemon
5,Aemon Estermont,Baratheon,,,,,1,1,0,1,1,0,0,Aemon
6,Aemon Targaryen (son of Maekar I),Night's Watch,300.0,4.0,35.0,21.0,1,1,1,0,1,1,0,Aemon
7,Aenys Frey,,300.0,5.0,,59.0,0,1,1,1,1,0,1,Aenys
8,Aeron Greyjoy,House Greyjoy,,,,11.0,1,1,0,1,0,1,0,Aeron
9,Aethan,Night's Watch,,,,0.0,1,0,0,0,1,0,0,Aethan


In [6]:
# creating 2 lists to store male names and female names
males = []
females = []

# creating a new df with only the 2 columns that we will need
df = got.loc[ : , ['First Name', 'Gender']]

# for loop to store the names in the lists based on gender (1=male, 0=female)
for index, val in df.iterrows():
    if df.loc[index, 'Gender'] == 1:
        males.append(df.loc[index , 'First Name'])
    elif df.loc[index, 'Gender'] == 0:
        females.append(df.loc[index , 'First Name'])

        
# creating a new column for gender
apprentice['GENDER']  = 0    

# for loop to compare the names in our dataset with the lists created above
# and storing the results in the new column
for index, val in apprentice.iterrows():
    if apprentice.loc[index, 'FIRST_NAME'] in males:
        apprentice.loc[index , 'GENDER'] = 1
    elif apprentice.loc[index , 'FIRST_NAME'] in females:
        apprentice.loc[index , 'GENDER'] = 0

# displaying first 20 rows to check gender (mostly correct)
apprentice.head(20)

Unnamed: 0,REVENUE,CROSS_SELL_SUCCESS,NAME,EMAIL,FIRST_NAME,FAMILY_NAME,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,...,LATE_DELIVERIES,PACKAGE_LOCKER,REFRIGERATED_LOCKER,AVG_PREP_VID_TIME,AVG_MEALS_ORDERED,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED,GENDER
0,393.0,1,Saathos,saathos@unitedhealth.com,Saathos,Saathos,14,6,12,10,...,2,0,0,33.4,1,0,1,17,0,0
1,1365.0,1,Alysanne Osgrey,alysanne.osgrey@ge.org,Alysanne,Osgrey,87,3,8,8,...,2,0,0,84.8,1,0,3,13,170,0
2,800.0,1,Edwyd Fossoway,edwyd.fossoway@jnj.com,Edwyd,Fossoway,15,7,11,5,...,1,0,0,63.0,1,0,2,16,0,0
3,600.0,1,Eleyna Westerling,eleyna.westerling@ge.org,Eleyna,Westerling,13,6,11,5,...,3,0,0,43.8,1,0,2,14,0,0
4,1490.0,1,Elyn Norridge,elyn.norridge@jnj.com,Elyn,Norridge,47,8,6,10,...,8,0,0,84.8,1,1,3,12,205,0
5,1550.0,1,Genna Lannister,genna.lannister@protonmail.com,Genna,Lannister,36,2,9,1,...,3,1,0,78.8,1,1,3,12,0,0
6,1430.0,1,Olene Tyrell,olene.tyrell@mcdonalds.com,Olene,Tyrell,61,7,6,2,...,2,1,1,84.8,1,0,3,12,169,0
7,1321.25,1,Stevron Frey,stevron.frey@travelers.com,Stevron,Frey,13,1,12,3,...,0,0,0,63.0,1,0,1,15,0,1
8,1505.0,1,Praed,praed@nike.com,Praed,Praed,16,1,12,5,...,1,0,0,63.0,1,0,2,18,0,1
9,1493.0,0,Alysane Mormont,alysane.mormont@caterpillar.com,Alysane,Mormont,95,3,6,8,...,2,1,0,84.8,1,1,3,14,147,0


 # FEATURE ENGINEERING

In [7]:
## FEATURE ENGINEERING

# 1. NEW VARIABLE: N_NAMES - Number of names
#creating a new column to store the number of names
apprentice['N_NAMES']=0

# splitting the names to count them
for index, value in apprentice.iterrows():
    apprentice.loc[index,'N_NAMES'] = len(apprentice.loc[index,'NAME'].split(sep=' '))
    
apprentice.head(10)


Unnamed: 0,REVENUE,CROSS_SELL_SUCCESS,NAME,EMAIL,FIRST_NAME,FAMILY_NAME,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,...,PACKAGE_LOCKER,REFRIGERATED_LOCKER,AVG_PREP_VID_TIME,AVG_MEALS_ORDERED,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED,GENDER,N_NAMES
0,393.0,1,Saathos,saathos@unitedhealth.com,Saathos,Saathos,14,6,12,10,...,0,0,33.4,1,0,1,17,0,0,1
1,1365.0,1,Alysanne Osgrey,alysanne.osgrey@ge.org,Alysanne,Osgrey,87,3,8,8,...,0,0,84.8,1,0,3,13,170,0,2
2,800.0,1,Edwyd Fossoway,edwyd.fossoway@jnj.com,Edwyd,Fossoway,15,7,11,5,...,0,0,63.0,1,0,2,16,0,0,2
3,600.0,1,Eleyna Westerling,eleyna.westerling@ge.org,Eleyna,Westerling,13,6,11,5,...,0,0,43.8,1,0,2,14,0,0,2
4,1490.0,1,Elyn Norridge,elyn.norridge@jnj.com,Elyn,Norridge,47,8,6,10,...,0,0,84.8,1,1,3,12,205,0,2
5,1550.0,1,Genna Lannister,genna.lannister@protonmail.com,Genna,Lannister,36,2,9,1,...,1,0,78.8,1,1,3,12,0,0,2
6,1430.0,1,Olene Tyrell,olene.tyrell@mcdonalds.com,Olene,Tyrell,61,7,6,2,...,1,1,84.8,1,0,3,12,169,0,2
7,1321.25,1,Stevron Frey,stevron.frey@travelers.com,Stevron,Frey,13,1,12,3,...,0,0,63.0,1,0,1,15,0,1,2
8,1505.0,1,Praed,praed@nike.com,Praed,Praed,16,1,12,5,...,0,0,63.0,1,0,2,18,0,1,1
9,1493.0,0,Alysane Mormont,alysane.mormont@caterpillar.com,Alysane,Mormont,95,3,6,8,...,1,0,84.8,1,1,3,14,147,0,2


In [8]:
# 2. NEW VARIABLES: PERSONAL_EMAIL , WORK_EMAIL , JUNK_EMAIL

# placeholder list
placeholder_lst = []

# looping over each email address
for index, col in apprentice.iterrows():
    
    # splitting email domain at '@'
    split_email = apprentice.loc[index, 'EMAIL'].split(sep = '@')[-1]
    
    # appending placeholder_lst with the results
    placeholder_lst.append(split_email)
    

# including the list as a new column
apprentice['EMAIL_TYPE'] = placeholder_lst

# creating new columns for dummies
apprentice['PERSONAL_EMAIL'] = 0
apprentice['WORK_EMAIL'] = 0
apprentice['JUNK_EMAIL'] = 0

# creating lists for domain
junk = ['me.com', 'aol.com', 'hotmail.com', 'live.com', 'msn.com', 'passport.com']
personal = ['gmail.com', 'yahoo.com' 'protonmail.com']


# looping to split the email domains
for index, val in apprentice.iterrows():
    if apprentice.loc[index, 'EMAIL_TYPE'] in junk:
        apprentice.loc[index , 'JUNK_EMAIL'] = 1
    elif apprentice.loc[index , 'EMAIL_TYPE'] in personal:
        apprentice.loc[index , 'PERSONAL_EMAIL'] = 1
    else:
        apprentice.loc[index , 'WORK_EMAIL'] = 1


apprentice['JUNK_EMAIL'] = pd.to_numeric(apprentice['JUNK_EMAIL'])
apprentice['PERSONAL_EMAIL'] = pd.to_numeric(apprentice['PERSONAL_EMAIL'])
apprentice['WORK_EMAIL'] = pd.to_numeric(apprentice['WORK_EMAIL'])

In [9]:
chef = apprentice.drop(['NAME', 'FIRST_NAME', 'FAMILY_NAME', 'EMAIL', 
                        'CROSS_SELL_SUCCESS', 'EMAIL_TYPE'], axis=1)

# Standardizing

In [10]:
# INSTANTIATING a StandardScaler() object
scaler = StandardScaler()


# FITTING the scaler with the data
scaler.fit(chef)


# TRANSFORMING our data after fit
X_scaled = scaler.transform(chef)


# converting scaled data into a DataFrame
X_scaled_df = pd.DataFrame(X_scaled)


# adding labels to the scaled DataFrame
X_scaled_df.columns = chef.columns

#  Checking pre- and post-scaling of the data
print(f"""
Dataset BEFORE Scaling
----------------------
{pd.np.var(chef)}


Dataset AFTER Scaling
----------------------
{pd.np.var(X_scaled_df)}
""")

X_scaled_df.head(10)




Dataset BEFORE Scaling
----------------------
REVENUE                        1.295040e+06
TOTAL_MEALS_ORDERED            3.057600e+03
UNIQUE_MEALS_PURCH             6.257663e+00
CONTACTS_W_CUSTOMER_SERVICE    5.201168e+00
PRODUCT_CATEGORIES_VIEWED      9.261178e+00
AVG_TIME_PER_SITE_VISIT        3.884497e+03
MOBILE_NUMBER                  1.073443e-01
CANCELLATIONS_BEFORE_NOON      2.400263e+00
CANCELLATIONS_AFTER_NOON       1.867359e-01
TASTES_AND_PREFERENCES         2.040816e-01
PC_LOGINS                      3.371049e-01
MOBILE_LOGINS                  2.782182e-01
WEEKLY_PLAN                    1.841399e+02
EARLY_DELIVERIES               5.373137e+00
LATE_DELIVERIES                7.528947e+00
PACKAGE_LOCKER                 2.290003e-01
REFRIGERATED_LOCKER            1.002716e-01
AVG_PREP_VID_TIME              2.443767e+03
AVG_MEALS_ORDERED              2.400232e+00
MASTER_CLASSES_ATTENDED        4.117799e-01
MEDIAN_MEAL_RATING             5.720404e-01
AVG_CLICKS_PER_VISIT         

  {pd.np.var(chef)}
  {pd.np.var(X_scaled_df)}


Unnamed: 0,REVENUE,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,AVG_TIME_PER_SITE_VISIT,MOBILE_NUMBER,CANCELLATIONS_BEFORE_NOON,CANCELLATIONS_AFTER_NOON,TASTES_AND_PREFERENCES,...,AVG_MEALS_ORDERED,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED,GENDER,N_NAMES,PERSONAL_EMAIL,WORK_EMAIL,JUNK_EMAIL
0,-1.506411,-1.096546,0.437758,2.199609,1.51686,-0.827983,0.373288,1.029555,1.93002,0.632456,...,-2.218665,-0.941742,-2.373244,1.496513,-0.588137,-1.046291,-1.072283,-0.42944,0.742855,-0.499839
1,-0.652279,0.223632,-0.761507,0.44569,0.859661,-0.950725,0.373288,-0.906831,-0.384101,0.632456,...,-2.218665,-0.941742,0.271092,-0.217815,0.351258,-1.046291,0.212608,-0.42944,0.742855,-0.499839
2,-1.148765,-1.078461,0.837513,1.761129,-0.126138,-1.280926,0.373288,1.029555,-0.384101,0.632456,...,-2.218665,-0.941742,-1.051076,1.067931,-0.588137,-1.046291,0.212608,-0.42944,0.742855,-0.499839
3,-1.324512,-1.11463,0.437758,1.761129,-0.126138,-0.154104,0.373288,0.384093,-0.384101,0.632456,...,-2.218665,-0.941742,-1.051076,0.210767,-0.588137,-1.046291,0.212608,-0.42944,0.742855,-0.499839
4,-0.542437,-0.499753,1.237268,-0.431269,1.51686,-0.950244,0.373288,-0.906831,-0.384101,-1.581139,...,-2.218665,0.616617,0.271092,-0.646397,0.544663,-1.046291,0.212608,-0.42944,0.742855,-0.499839
5,-0.489713,-0.698684,-1.161262,0.88417,-1.440536,1.453258,0.373288,1.675016,-0.384101,0.632456,...,-2.218665,0.616617,0.271092,-0.646397,-0.588137,-1.046291,0.212608,-0.42944,0.742855,-0.499839
6,-0.595161,-0.246568,0.837513,-0.431269,-1.111936,0.875968,0.373288,-0.261369,-0.384101,0.632456,...,-2.218665,-0.941742,0.271092,-0.646397,0.345732,-1.046291,0.212608,-0.42944,0.742855,-0.499839
7,-0.690724,-1.11463,-1.561017,2.199609,-0.783337,2.071782,0.373288,-0.906831,-0.384101,0.632456,...,-2.218665,-0.941742,-2.373244,0.639349,-0.588137,0.955757,0.212608,-0.42944,0.742855,-0.499839
8,-0.529256,-1.060376,-1.561017,2.199609,-0.126138,-1.369332,0.373288,-0.906831,-0.384101,0.632456,...,-2.218665,-0.941742,-1.051076,1.925095,-0.588137,0.955757,-1.072283,-0.42944,0.742855,-0.499839
9,-0.539801,0.368309,-0.761507,-0.431269,0.859661,-0.807767,0.373288,-0.906831,-0.384101,-1.581139,...,-2.218665,0.616617,0.271092,0.210767,0.224163,-1.046291,0.212608,-0.42944,0.742855,-0.499839


In [11]:
# renaming the standardized dataset 
chef = X_scaled_df 

# Correlation

In [12]:
# taking the CROSS_SELL_SUCCESS from the original dataset
cross = apprentice['CROSS_SELL_SUCCESS']

# adding the column to the standardized dataset
chef = pd.concat([chef, cross],
                        axis = 1)

chef.head(10)
# running a correlation 
df_corr = chef.corr(method='pearson').round(2)

df_corr['CROSS_SELL_SUCCESS'].sort_values(ascending = False)

CROSS_SELL_SUCCESS             1.00
WORK_EMAIL                     0.22
CANCELLATIONS_BEFORE_NOON      0.16
N_NAMES                        0.16
MOBILE_NUMBER                  0.10
TASTES_AND_PREFERENCES         0.08
GENDER                         0.07
REFRIGERATED_LOCKER            0.07
PACKAGE_LOCKER                 0.04
MASTER_CLASSES_ATTENDED        0.04
CONTACTS_W_CUSTOMER_SERVICE    0.04
PC_LOGINS                      0.04
AVG_PREP_VID_TIME              0.03
MEDIAN_MEAL_RATING             0.03
AVG_MEALS_ORDERED              0.02
EARLY_DELIVERIES               0.02
PERSONAL_EMAIL                 0.02
AVG_TIME_PER_SITE_VISIT        0.01
TOTAL_MEALS_ORDERED            0.01
LATE_DELIVERIES                0.01
TOTAL_PHOTOS_VIEWED            0.01
PRODUCT_CATEGORIES_VIEWED      0.00
UNIQUE_MEALS_PURCH             0.00
REVENUE                        0.00
WEEKLY_PLAN                   -0.01
AVG_CLICKS_PER_VISIT          -0.04
CANCELLATIONS_AFTER_NOON      -0.05
MOBILE_LOGINS               

# Splitting original data

In [13]:
# declaring explanatory variables
apprentice_x = apprentice.drop(['NAME', 'FIRST_NAME', 'FAMILY_NAME', 'EMAIL', 
                        'CROSS_SELL_SUCCESS', 'EMAIL_TYPE'], axis=1)

# declaring response variable
apprentice_y = apprentice.loc[ : , 'CROSS_SELL_SUCCESS' ]

# train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
            apprentice_x,
            apprentice_y,
            test_size    = 0.25,
            random_state = 219,
            stratify     = apprentice_y)


# merging training data for statsmodels
apprentice_train = pd.concat([X_train, y_train], axis = 1)

# Splitting standardized data

In [14]:
# declaring explanatory variables
chef_x = chef.drop(['CROSS_SELL_SUCCESS'], axis = 1)

# declaring response variable
chef_y = chef.loc[ : , 'CROSS_SELL_SUCCESS' ]

# train-test split with stratification
X_train_st, X_test_st, y_train_st, y_test_st = train_test_split(
            chef_x,
            chef_y,
            test_size    = 0.25,
            random_state = 219,
            stratify     = chef_y)


# merging training data for statsmodels
chef_train = pd.concat([X_train_st, y_train_st], axis = 1)

In [15]:
for val in apprentice_x:
    print(f" {val} + ")

 REVENUE + 
 TOTAL_MEALS_ORDERED + 
 UNIQUE_MEALS_PURCH + 
 CONTACTS_W_CUSTOMER_SERVICE + 
 PRODUCT_CATEGORIES_VIEWED + 
 AVG_TIME_PER_SITE_VISIT + 
 MOBILE_NUMBER + 
 CANCELLATIONS_BEFORE_NOON + 
 CANCELLATIONS_AFTER_NOON + 
 TASTES_AND_PREFERENCES + 
 PC_LOGINS + 
 MOBILE_LOGINS + 
 WEEKLY_PLAN + 
 EARLY_DELIVERIES + 
 LATE_DELIVERIES + 
 PACKAGE_LOCKER + 
 REFRIGERATED_LOCKER + 
 AVG_PREP_VID_TIME + 
 AVG_MEALS_ORDERED + 
 MASTER_CLASSES_ATTENDED + 
 MEDIAN_MEAL_RATING + 
 AVG_CLICKS_PER_VISIT + 
 TOTAL_PHOTOS_VIEWED + 
 GENDER + 
 N_NAMES + 
 PERSONAL_EMAIL + 
 WORK_EMAIL + 
 JUNK_EMAIL + 


# Logistic Regression

In [16]:
## WITH ORIGINAL DATASET

# building a logistic regression model using all the explanatory variables 

# instantiating a logistic regression model object
logistic_reg = smf.logit(formula = """CROSS_SELL_SUCCESS ~ 

                                             REVENUE + 
                                             TOTAL_MEALS_ORDERED + 
                                             UNIQUE_MEALS_PURCH + 
                                             CONTACTS_W_CUSTOMER_SERVICE + 
                                             PRODUCT_CATEGORIES_VIEWED + 
                                             AVG_TIME_PER_SITE_VISIT + 
                                             MOBILE_NUMBER + 
                                             CANCELLATIONS_BEFORE_NOON + 
                                             CANCELLATIONS_AFTER_NOON + 
                                             TASTES_AND_PREFERENCES + 
                                             PC_LOGINS + 
                                             MOBILE_LOGINS + 
                                             WEEKLY_PLAN + 
                                             EARLY_DELIVERIES + 
                                             LATE_DELIVERIES + 
                                             PACKAGE_LOCKER + 
                                             REFRIGERATED_LOCKER + 
                                             AVG_PREP_VID_TIME + 
                                             AVG_MEALS_ORDERED + 
                                             MASTER_CLASSES_ATTENDED + 
                                             MEDIAN_MEAL_RATING + 
                                             AVG_CLICKS_PER_VISIT + 
                                             TOTAL_PHOTOS_VIEWED + 
                                             GENDER + 
                                             N_NAMES + 
                                             PERSONAL_EMAIL + 
                                             WORK_EMAIL + 
                                             JUNK_EMAIL 
                                             """,
                                                     data    = apprentice_train)


# fitting the model object
results_logistic = logistic_reg.fit()


# checking the results SUMMARY
results_logistic.summary()


Optimization terminated successfully.
         Current function value: 0.536959
         Iterations 6


0,1,2,3
Dep. Variable:,CROSS_SELL_SUCCESS,No. Observations:,1459.0
Model:,Logit,Df Residuals:,1431.0
Method:,MLE,Df Model:,27.0
Date:,"Thu, 28 Jan 2021",Pseudo R-squ.:,0.1449
Time:,20:43:29,Log-Likelihood:,-783.42
converged:,True,LL-Null:,-916.19
Covariance Type:,nonrobust,LLR p-value:,4.864999999999999e-41

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.4602,5.13e+06,-4.79e-07,1.000,-1.01e+07,1.01e+07
REVENUE,-0.0002,9.02e-05,-2.208,0.027,-0.000,-2.24e-05
TOTAL_MEALS_ORDERED,-0.0004,0.001,-0.281,0.779,-0.003,0.002
UNIQUE_MEALS_PURCH,-0.0166,0.026,-0.633,0.527,-0.068,0.035
CONTACTS_W_CUSTOMER_SERVICE,0.0509,0.028,1.802,0.072,-0.004,0.106
PRODUCT_CATEGORIES_VIEWED,-0.0127,0.021,-0.618,0.536,-0.053,0.028
AVG_TIME_PER_SITE_VISIT,5.008e-05,0.001,0.048,0.962,-0.002,0.002
MOBILE_NUMBER,0.8535,0.177,4.813,0.000,0.506,1.201
CANCELLATIONS_BEFORE_NOON,0.2902,0.047,6.181,0.000,0.198,0.382


In [17]:
## WITH THE STANDARDIZED DATASET

logistic_st = smf.logit(formula = """CROSS_SELL_SUCCESS ~ 

                                            REVENUE + 
                                             TOTAL_MEALS_ORDERED + 
                                             UNIQUE_MEALS_PURCH + 
                                             CONTACTS_W_CUSTOMER_SERVICE + 
                                             PRODUCT_CATEGORIES_VIEWED + 
                                             AVG_TIME_PER_SITE_VISIT + 
                                             MOBILE_NUMBER + 
                                             CANCELLATIONS_BEFORE_NOON + 
                                             CANCELLATIONS_AFTER_NOON + 
                                             TASTES_AND_PREFERENCES + 
                                             PC_LOGINS + 
                                             MOBILE_LOGINS + 
                                             WEEKLY_PLAN + 
                                             EARLY_DELIVERIES + 
                                             LATE_DELIVERIES + 
                                             PACKAGE_LOCKER + 
                                             REFRIGERATED_LOCKER + 
                                             AVG_PREP_VID_TIME + 
                                             AVG_MEALS_ORDERED + 
                                             MASTER_CLASSES_ATTENDED + 
                                             MEDIAN_MEAL_RATING + 
                                             AVG_CLICKS_PER_VISIT + 
                                             TOTAL_PHOTOS_VIEWED + 
                                             GENDER + 
                                             N_NAMES + 
                                             PERSONAL_EMAIL + 
                                             WORK_EMAIL + 
                                             JUNK_EMAIL 
                                             """,
                                                     data    = chef_train)


# fitting the model object
results_logistic = logistic_st.fit()


# checking the results SUMMARY
results_logistic.summary()

Optimization terminated successfully.
         Current function value: 0.536959
         Iterations 6


0,1,2,3
Dep. Variable:,CROSS_SELL_SUCCESS,No. Observations:,1459.0
Model:,Logit,Df Residuals:,1431.0
Method:,MLE,Df Model:,27.0
Date:,"Thu, 28 Jan 2021",Pseudo R-squ.:,0.1449
Time:,20:43:29,Log-Likelihood:,-783.42
converged:,True,LL-Null:,-916.19
Covariance Type:,nonrobust,LLR p-value:,4.864999999999999e-41

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.9096,0.065,13.965,0.000,0.782,1.037
REVENUE,-0.2266,0.103,-2.208,0.027,-0.428,-0.026
TOTAL_MEALS_ORDERED,-0.0230,0.082,-0.281,0.779,-0.183,0.137
UNIQUE_MEALS_PURCH,-0.0414,0.065,-0.633,0.527,-0.170,0.087
CONTACTS_W_CUSTOMER_SERVICE,0.1161,0.064,1.802,0.072,-0.010,0.242
PRODUCT_CATEGORIES_VIEWED,-0.0386,0.063,-0.618,0.536,-0.161,0.084
AVG_TIME_PER_SITE_VISIT,0.0031,0.065,0.048,0.962,-0.125,0.131
MOBILE_NUMBER,0.2796,0.058,4.813,0.000,0.166,0.394
CANCELLATIONS_BEFORE_NOON,0.4495,0.073,6.181,0.000,0.307,0.592


In [18]:
# removing insignificant variables

logistic_st = smf.logit(formula = """CROSS_SELL_SUCCESS ~ 

                                            REVENUE + 
                                            
                                             CONTACTS_W_CUSTOMER_SERVICE + 
                    
                                             MOBILE_NUMBER + 
                                             CANCELLATIONS_BEFORE_NOON + 
                                              
                                             TASTES_AND_PREFERENCES + 
                                             PC_LOGINS + 
                                             
                                             EARLY_DELIVERIES + 
                                             
                                            
                                             REFRIGERATED_LOCKER + 
                                             AVG_PREP_VID_TIME + 
                                             
                                            
                                             GENDER + 
                                             N_NAMES + 
                                             PERSONAL_EMAIL + 
                                             WORK_EMAIL + 
                                             JUNK_EMAIL """,
                                                     data    = chef_train)


# fitting the model object
results_logistic = logistic_st.fit()


# checking the results SUMMARY
results_logistic.summary()

Optimization terminated successfully.
         Current function value: 0.542237
         Iterations 7


0,1,2,3
Dep. Variable:,CROSS_SELL_SUCCESS,No. Observations:,1459.0
Model:,Logit,Df Residuals:,1445.0
Method:,MLE,Df Model:,13.0
Date:,"Thu, 28 Jan 2021",Pseudo R-squ.:,0.1365
Time:,20:43:29,Log-Likelihood:,-791.12
converged:,True,LL-Null:,-916.19
Covariance Type:,nonrobust,LLR p-value:,5.993e-46

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.8959,0.064,13.895,0.000,0.769,1.022
REVENUE,-0.1148,0.082,-1.408,0.159,-0.275,0.045
CONTACTS_W_CUSTOMER_SERVICE,0.1060,0.063,1.693,0.090,-0.017,0.229
MOBILE_NUMBER,0.2741,0.057,4.771,0.000,0.162,0.387
CANCELLATIONS_BEFORE_NOON,0.4475,0.072,6.209,0.000,0.306,0.589
TASTES_AND_PREFERENCES,0.1633,0.061,2.680,0.007,0.044,0.283
PC_LOGINS,0.1149,0.062,1.849,0.065,-0.007,0.237
EARLY_DELIVERIES,0.1406,0.064,2.203,0.028,0.016,0.266
REFRIGERATED_LOCKER,0.1543,0.066,2.332,0.020,0.025,0.284


# RandomizedCV for Logistic Regression

In [19]:
dict = {'logit_small'     : ['CANCELLATIONS_BEFORE_NOON', 'MOBILE_NUMBER', 
                             'TASTES_AND_PREFERENCES', 'EARLY_DELIVERIES',
                            'JUNK_EMAIL','GENDER','WORK_EMAIL', 'N_NAMES']
}


In [28]:
# preparing explanatory and response variable

# Original Dataset

# declaring explanatory variables
apprentice_data = apprentice.loc[:, dict['logit_small']]

# declaring response variable
apprentice_target = apprentice['CROSS_SELL_SUCCESS']

In [21]:
# standardized dataset
# declaring explanatory variables
chef_data = chef.loc[:, dict['logit_small']]

# declaring response variable
chef_target = chef['CROSS_SELL_SUCCESS']

In [22]:
# preparing the train and test datasets with a stratified argument
# in order to balance the response variable in both datasets

# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
            chef_data,
            chef_target,
            test_size    = 0.25,
            random_state = 219,
            stratify     = chef_target)  #use stratify on your y variable


# merging training data for statsmodels
chef_train = pd.concat([x_train, y_train], axis = 1)

# Randomized CV for C-Tree

In [23]:
# declaring a hyperparameter space
criterion_space = ['gini', 'entropy']
splitter_space  = ['best', 'random']
depth_space     = pd.np.arange(1, 9, 1)
leaf_space      = pd.np.arange(1, 100, 1)


# creating a hyperparameter grid
param_grid = {'criterion'        : criterion_space,
              'splitter'         : splitter_space,
              'max_depth'        : depth_space,
              'min_samples_leaf' : leaf_space}


# INSTANTIATING the model object without hyperparameters
tuned_tree = DecisionTreeClassifier(random_state = 219)


# RandomizedSearchCV object
tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
                                   param_distributions   = param_grid,
                                   cv                    = 3,
                                   n_iter                = 1000,
                                   random_state          = 219,
                                   scoring = make_scorer(roc_auc_score,
                                             needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_tree_cv.fit(chef_data, chef_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", tuned_tree_cv.best_params_)
print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

  depth_space     = pd.np.arange(1, 9, 1)
  leaf_space      = pd.np.arange(1, 100, 1)


Tuned Parameters  : {'splitter': 'best', 'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'gini'}
Tuned Training AUC: 0.7036


In [24]:
tuned_tree_cv.best_estimator_

DecisionTreeClassifier(max_depth=3, random_state=219)

In [25]:
y_train=y_train.astype(int)

In [26]:
## FINAL MODEL
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
tree_tuned = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=1,
                       random_state=219, splitter='best')


# FIT step is not needed
tree_tuned_fit = tree_tuned.fit(x_train,y_train)

# PREDICTING based on the testing set
tree_tuned_pred = tree_tuned_fit.predict(x_test)


# SCORING the results
print('FINAL MODEL: TREE CLASSIFIER')
print('Training ACCURACY:', tree_tuned.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', tree_tuned.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = tree_tuned_pred).round(4))


# saving scoring data for future use
tree_tuned_train_score = tree_tuned.score(x_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = tree_tuned_pred).round(4) # auc

FINAL MODEL: TREE CLASSIFIER
Training ACCURACY: 0.7402
Testing  ACCURACY: 0.7762
AUC Score        : 0.732


In [27]:
# unpacking the confusion matrix
rf_tn, \
rf_fp, \
rf_fn, \
rf_tp = confusion_matrix(y_true = y_test, y_pred = tree_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")


True Negatives : 95
False Positives: 61
False Negatives: 48
True Positives : 283

