In [1]:
# timeit

#####################
#### Final Model ####
#####################

#############################################################################
##  LOADING THE DATA
#############################################################################


# importing libraries
import pandas as pd                                                   # data science essentials
import numpy as np                                                    # data science essentials
from sklearn.model_selection import train_test_split, cross_val_score # train/test split & Cross Validation
import sklearn.linear_model                                           # Linear models
from sklearn.metrics import roc_auc_score                             # auc score
from sklearn.metrics import make_scorer                               # customizable scorer
from sklearn.model_selection import GridSearchCV                      # hyperparameter tuning

# CART model packages
from sklearn.tree import DecisionTreeClassifier                       # classification trees


# Loading the dataset
original_df =  pd.read_excel('Apprentice_Chef_Dataset.xlsx')

chef = original_df.copy()

#############################################################################

# Missing Value Imputation
# creating an imputation value
fill = 'Unknown'

# imputing 'FAMILY_NAME'
chef['FAMILY_NAME'] = chef['FAMILY_NAME'].fillna(fill)


#############################################################################
# FEATURE ENGINEERING
#############################################################################

# Feature Engineering Categorical Data

## Part 1: Working with Email Addresses
# Splitting EMAIL

# placeholder list
placeholder_lst = []

# looping over each email address
for index, col in chef.iterrows():
    
    # splitting email domain at '@'
    split_email = chef.loc[index, 'EMAIL'].split(sep = '@')
    
    # appending placeholder_lst with the results
    placeholder_lst.append(split_email)
    

# converting placeholder_lst into a DataFrame 
email_df = pd.DataFrame(placeholder_lst)


# renaming column to concatenate
email_df.columns = ['0' , 'EMAIL_DOMAIN']


# concatenating EMAIL_DOMAIN with chef DataFrame
chef = pd.concat([chef, email_df['EMAIL_DOMAIN']],
                     axis = 1)

# email domain types
PERSONAL_EMAIL_DOMAIN     = ['@gmail.com','@yahoo.com','@protonmail.com']

JUNK_EMAIL_DOMAIN         = ['@me.com','@aol.com','@hotmail.com','@live.com','@msn.com', '@passport.com']

PROFESSIONAL_EMAIL_DOMAIN = ['@mmm.com','@amex.com','@apple.com','@boeing.com','@caterpillar.com','@chevron.com',
                             '@cisco.com','@cocacola.com','@disney.com','@dupont.com','@exxon.com','@ge.org',
                             '@goldmansacs.com','@homedepot.com','@ibm.com','@intel.com','@jnj.com','@jpmorgan.com',
                             '@mcdonalds.com','@merck.com','@microsoft.com','@nike.com','@pfizer.com','@pg.com',
                             '@travelers.com','@unitedtech.com','@unitedhealth.com','@verizon.com','@visa.com',
                             '@walmart.com']


# placeholder list
placeholder_lst = []


# looping to group observations by domain type
for domain in chef['EMAIL_DOMAIN']:
    
    if '@' + domain in PERSONAL_EMAIL_DOMAIN:
        placeholder_lst.append('personal')
        

    elif '@' + domain in JUNK_EMAIL_DOMAIN:
        placeholder_lst.append('junk')
        
        
    elif '@' + domain in PROFESSIONAL_EMAIL_DOMAIN:
        placeholder_lst.append('professional')

        
    else:
            print('Unknown')


# concatenating with original DataFrame
chef['DOMAIN_GROUP'] = pd.Series(placeholder_lst)

#############################################################################

## Part 2: Working with First Name to identify Gender

# guessing gender based on NAME

# Gender Guesser (Python package) was used to identify the gender (Note: saving Runtime)
# placeholder list
placeholder_lst = ['unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','male','male','female','mostly_female','mostly_female','mostly_female','unknown','male','male','unknown','male','unknown','male','unknown','unknown','female','andy','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','male','unknown','unknown','unknown','unknown','female','female','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','mostly_male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','mostly_male','female','female','unknown','unknown','female','unknown','male','mostly_male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','andy','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','female','male','female','male','female','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','male','unknown','unknown','unknown','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','male','unknown','female','unknown','unknown','male','unknown','male','male','male','male','unknown','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','female','mostly_female','female','unknown','unknown','unknown','female','female','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','female','female','unknown','unknown','unknown','male','male','unknown','male','male','male','unknown','unknown','unknown','unknown','unknown','mostly_female','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','mostly_male','male','male','male','male','male','female','unknown','unknown','male','male','male','male','unknown','male','male','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','mostly_male','unknown','unknown','unknown','unknown','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','male','male','male','male','male','male','male','male','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','male','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','mostly_female','mostly_female','mostly_female','unknown','unknown','unknown','male','male','unknown','male','male','male','unknown','male','male','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','female','unknown','unknown','unknown','female','female','unknown','male','unknown','unknown','female','female','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','male','unknown','unknown','male','unknown','unknown','male','unknown','unknown','unknown','male','unknown','unknown','mostly_male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','male','male','female','male','unknown','unknown','unknown','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','male','unknown','male','unknown','male','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','unknown','unknown','male','male','male','male','unknown','unknown','unknown','unknown','female','male','male','male','male','male','male','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','female','male','male','unknown','male','female','unknown','unknown','mostly_male','male','unknown','male','unknown','male','male','unknown','unknown','unknown','unknown','unknown','mostly_female','male','male','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','female','unknown','female','male','male','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','female','unknown','male','unknown','male','unknown','unknown','male','unknown','female','male','unknown','unknown','unknown','unknown','unknown','unknown','male','male','male','male','male','male','male','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','male','male','male','male','male','unknown','unknown','male','female','unknown','unknown','male','unknown','female','male','male','male','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','andy','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','male','male','male','male','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','mostly_female','male','unknown','unknown','unknown','unknown','male','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','male','male','male','male','male','female','female','unknown','unknown','unknown','male','unknown','female','unknown','male','male','unknown','unknown','unknown','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown',
                   'male','unknown','male','unknown','male','unknown','unknown','male','male','male','unknown','unknown','unknown','unknown','unknown','male','male','male','male','male','mostly_male','mostly_male','mostly_male','mostly_male','unknown','unknown','unknown','male','male','unknown','unknown','male','unknown','unknown','male','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','mostly_male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','male','male','unknown','unknown','female','unknown','male','unknown','unknown','male','male','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','male','mostly_male','mostly_male','mostly_male','unknown','male','male','male','male','male','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','male','male','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','mostly_female','unknown','male','unknown','unknown','male','unknown','andy','unknown','unknown','unknown','unknown',
                   'female','female','male','unknown','unknown','male','male','unknown','mostly_male','female','unknown','unknown','male','unknown','male','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','andy','unknown','unknown','unknown','unknown','male','male','male','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','unknown','unknown','male','male','unknown','mostly_male','mostly_male','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','female','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','male','male','unknown','unknown','unknown','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','male','male','male','male','male','male','male','mostly_male','mostly_male','unknown','male','unknown','unknown','unknown','female','unknown','unknown','male','andy','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','mostly_female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','male','unknown','male','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','andy','unknown','unknown','unknown','unknown','unknown','unknown','andy','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','female','unknown','mostly_female','mostly_female','unknown','unknown','unknown','unknown','unknown','female','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','male','male','unknown','unknown','unknown','female','unknown','female','female','unknown','unknown','unknown','male','male','male','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','male','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','andy','unknown','male','unknown','female','female','unknown','male','unknown','unknown','male','unknown','female','unknown','unknown','unknown','male','unknown','female','unknown','male','unknown','female','male','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','female','female','unknown','unknown','unknown','unknown','unknown','male','unknown','male','male','male','unknown','male','unknown','male','unknown','male','mostly_female','male','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','male','female','unknown','female','unknown','unknown','male','male','male','unknown','unknown','male','unknown','unknown','female','unknown','unknown','unknown','unknown','male','mostly_male','mostly_female','mostly_female','female','unknown','unknown','unknown','unknown','male','male','female','unknown','male','unknown','unknown','unknown','female','unknown','unknown','unknown','male','unknown','male','unknown','male','unknown','male','unknown','unknown','unknown','unknown','unknown','male','female','female','male','female','unknown','male','male','unknown','unknown','male','female','female','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','female','unknown','male','male','unknown','female','unknown','mostly_female','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','male','male','male','mostly_male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','female','unknown','female','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','mostly_male','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','female','female','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','male','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','male','female','unknown','unknown','unknown','unknown','unknown','unknown','male','female','female','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','andy','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','mostly_female','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','female','female','unknown','female','female','female','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','female','male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','mostly_female','unknown','unknown','unknown','unknown','unknown','unknown','female','unknown','unknown','female','male','male','mostly_male','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','unknown','male','unknown','unknown','male','unknown','female','female','unknown','male','unknown','unknown','unknown','unknown','unknown','unknown','mostly_female','unknown','unknown']


# converting list into a series
chef['GENDER_GUESS_FIRST_NAME'] = pd.Series(placeholder_lst)

# Replace values (Grouping similar features)
chef['GENDER_GUESS_FIRST_NAME'] = chef["GENDER_GUESS_FIRST_NAME"].replace({"mostly_male": "male", 
                                                                           "mostly_female": "female",
                                                                           "andy": "unknown"})

#############################################################################

## Part 3: One Hot Encoding

# one hot encoding categorical variables
one_hot_DOMAIN_GROUP            = pd.get_dummies(chef['DOMAIN_GROUP'])
one_hot_GENDER_GUESS_FIRST_NAME = pd.get_dummies(chef['GENDER_GUESS_FIRST_NAME'])


# dropping categorical variables after they've been encoded
chef = chef.drop('DOMAIN_GROUP', axis = 1)
chef = chef.drop('GENDER_GUESS_FIRST_NAME', axis = 1)


# joining codings together
chef = chef.join([one_hot_DOMAIN_GROUP, one_hot_GENDER_GUESS_FIRST_NAME])


#############################################################################
# TRAIN/TEST SPLIT
#############################################################################

# declaring set of x-variables

# significant variables only
logit_sig = ['MOBILE_NUMBER', 'CANCELLATIONS_BEFORE_NOON', 'CANCELLATIONS_AFTER_NOON', 'TASTES_AND_PREFERENCES',
             'MOBILE_LOGINS', 'FOLLOWED_RECOMMENDATIONS_PCT', 'junk', 'professional', 'female', 'male','TOTAL_PHOTOS_VIEWED']

# preparing x-variables
chef_data = chef.loc[ : , logit_sig]

# preparing response variable
chef_target = chef.loc[ : , 'CROSS_SELL_SUCCESS']


# running train/test split 
X_train,X_test,y_train,y_test = train_test_split(chef_data,
                                                chef_target,
                                                test_size = 0.25,
                                                random_state = 222,
                                                stratify = chef_target)


#############################################################################
# Final Model: DECISION TREE CLASSIFIER (Tuned Hyperparameters)
#############################################################################

## building a model based on hyperparameter tuning results (GridSearchCV - 3 Fold Cross-Validation)

# declaring a hyperparameter space

criterion_space = ["gini", "entropy"]
max_depth_space = np.arange(1,15,1)
min_samples_split_space = np.arange(2,50)


# creating a hyperparameter grid

param_grid = {'criterion': criterion_space,
               'max_depth': max_depth_space,
               'min_samples_split' : min_samples_split_space}


# INSTANTIATING the model object 

dtree_model = DecisionTreeClassifier(random_state = 222)


# GridSearchCV object

dtree_cv = GridSearchCV(dtree_model, 
                        param_grid, 
                        cv=3,
                        scoring = make_scorer(roc_auc_score,
                                              needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)

dtree_cv.fit(chef_data,chef_target)


# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
dtree_tuned = dtree_cv.best_estimator_


# FIT step is not needed


# PREDICTING based on the testing set
dtree_tuned_pred = dtree_tuned.predict(X_test)


#############################################################################
# Final Model Score (score)
#############################################################################

# SCORING the results
dtree_tuned_train_score = dtree_tuned.score(X_train, y_train).round(4)
dtree_tuned_test_score  = dtree_tuned.score(X_test, y_test).round(4)
dtree_tuned_AUC_score   = roc_auc_score(y_true  = y_test,
                                       y_score = dtree_tuned_pred).round(4)


#Saving test score
test_score = dtree_tuned_AUC_score.round(4)

test_score


0.91