# **Part 3: Modeling and Evaluation**

## **Prepare Environment**

<br/>

### Imports

In [2]:
# Data analysis and data wrangling
import numpy as np
import pandas as pd

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # missing values

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

# Metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Other
from IPython.display import Image
import configparser
import warnings
import os
import time
import pprint

<br/>

### Prepare Principal Directory

In [3]:
def path_to_work(end_directory: str='notebooks'):
    curr_dir = os.path.dirname(os.path.realpath ("__file__")) 
    
    if curr_dir.endswith(end_directory):
        os.chdir('..')
        return f'Change directory to: {curr_dir}'
    
    return f'Current working directory: {curr_dir}'

In [4]:
path_to_work(end_directory='notebooks')

'Change directory to: /home/campos/projects/predict-which-customers-a-call-center-should-contact/notebooks'

<br/>

### Set Config

In [5]:
# Visualization inside the jupyter
%matplotlib inline

# Load the "autoreload" extension so that code can change
%load_ext autoreload

# ----------
# Plot
# ----------
# graph style
sns.set_style("darkgrid")
plt.style.use('fivethirtyeight')

# ----------
# Seaborn rcParams
# ----------
rc={'savefig.dpi': 500, 
    'figure.autolayout': True, 
    'figure.figsize': [17, 12], 
    'axes.labelsize': 18,
    'axes.titlesize': 18, 
    'font.size': 10, 
    'lines.linewidth': 1.0, 
    'lines.markersize': 8, 
    'legend.fontsize': 15,
    'xtick.labelsize': 15, 
    'ytick.labelsize': 15}

sns.set(context='notebook',  # notebook
        style='darkgrid',
        palette='deep',
        color_codes=True, 
        rc=rc)

# ----------
# Pandas
# ----------
# Floating point
pd.options.display.float_format = '{:.2f}'.format

# Print xxxx rows and all columns
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', None)

# ----------
# Python
# ----------
# pretty print
pp = pprint.PrettyPrinter(indent=4)

# Supress unnecessary warnings so that presentation looks clean
warnings.filterwarnings('ignore')

<br/>

### Load Data

In [6]:
%%time

df_callcenter = pd.read_csv('data/cleansing/callcenter_marketing_clenning.csv', 
                            encoding='utf8',
                            delimiter=',',
                            verbose=True)

Tokenization took: 20.55 ms
Type conversion took: 66.34 ms
Parser memory cleanup took: 0.05 ms
CPU times: user 97.7 ms, sys: 24.5 ms, total: 122 ms
Wall time: 119 ms


In [7]:
df_callcenter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41167 entries, 0 to 41166
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   idade                        41167 non-null  int64  
 1   profissao                    41167 non-null  int64  
 2   educacao                     41167 non-null  int64  
 3   meio_contato                 41167 non-null  int64  
 4   mes                          41167 non-null  int64  
 5   dia_da_semana                41167 non-null  int64  
 6   duracao                      41167 non-null  int64  
 7   dias_ultimo_contato          41167 non-null  int64  
 8   qtd_contatos_total           41167 non-null  int64  
 9   campanha_anterior            41167 non-null  int64  
 10  indice_precos_consumidor     41167 non-null  float64
 11  indice_confianca_consumidor  41167 non-null  float64
 12  euribor3m                    41167 non-null  float64
 13  numero_empregado

<br/>

### Global Variables

In [8]:
# Lists that will be manipulated in the data processing
list_columns = []
list_categorical_col = []
list_numerical_col = []
list_without_target_col = []

In [10]:
def get_col(df: pd.core.frame.DataFrame, type_descr: 'numpy') -> list:
    """
    Function get list columns 
    
    Args:
    type_descr
        np.number, np.object -> return list with all columns
        np.number            -> return list numerical columns 
        np.object            -> return list object columns
    """
    try:
        col = (df.describe(include=type_descr).columns) 
    except ValueError:
        print(f'Dataframe not contains {type_descr} columns !', end='\n')    
    else:
        return col.tolist()

In [13]:
def get_col_without_target(df: pd.core.frame.DataFrame,
                           list_columns: list,
                           target_col: str) -> list:

    col_target = list_columns.copy()
    
    col_target.remove(target_col)
    print(type(col_target))
    
    return col_target

In [14]:
list_numerical_col = get_col(df=df_callcenter,
                             type_descr=np.number)
list_categorical_col = get_col(df=df_callcenter,
                               type_descr=np.object)
list_columns = get_col(df=df_callcenter,
                       type_descr=[np.object, np.number])
list_without_target_col = get_col_without_target(df=df_callcenter,
                                                 list_columns=list_columns,
                                                 target_col='resultado')

Dataframe not contains <class 'object'> columns !
<class 'list'>


---

## **Training and Testing Data**

In [15]:
def cross_val_model(X,y, model, n_splits=3):
    'Do split dataset and calculate cross_score'
    print("Begin training", end='\n\n')
    start = time.time()
    
    X = np.array(X)
    y = np.array(y)
    folds = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2017).split(X, y))

    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_holdout = X[test_idx]
        y_holdout = y[test_idx]

        print ("Fit %s fold %d" % (str(model).split('(')[0], j+1))
        model.fit(X_train, y_train)
        cross_score = cross_val_score(model, X_holdout, y_holdout, cv=3, scoring='roc_auc')
        print("\tcross_score: %.5f" % cross_score.mean())
    
    end = time.time()
    print("\nTraining done! Time Elapsed:", end - start, " seconds.")

In [17]:
X = df_callcenter[list_without_target_col]
y = df_callcenter['resultado'] # target

---

## **Build Models**

### Baseline

#### Linear Regression

In [18]:
# training model
X = df_callcenter[list_without_target_col]
y = df_callcenter['resultado']

print(X.shape)
print(y.shape)

(41167, 14)
(41167,)


In [19]:
# Visualize params

LinearRegression(n_jobs=-1)

LinearRegression(n_jobs=-1)

In [20]:
# create model
lr_model = LinearRegression(n_jobs=-1, normalize=False)

In [21]:
# split dataset and calculate cross_score
cross_val_model(X, y, lr_model)

Begin training

Fit LinearRegression fold 1
	cross_score: 0.78831
Fit LinearRegression fold 2
	cross_score: 0.82517
Fit LinearRegression fold 3
	cross_score: 0.82065

Training done! Time Elapsed: 0.3465287685394287  seconds.


#### Linear Regression with Regularization

In [22]:
# create model
lr_ridge_model = Ridge()

In [23]:
# split dataset and calculate cross_score
cross_val_model(X, y, lr_ridge_model)

Begin training

Fit Ridge fold 1
	cross_score: 0.78937
Fit Ridge fold 2
	cross_score: 0.82620
Fit Ridge fold 3
	cross_score: 0.81947

Training done! Time Elapsed: 0.24179339408874512  seconds.


#### Polynomial Regression

In [24]:
poly = PolynomialFeatures(degree=2)

X_poly = poly.fit_transform(X)
print(X_poly.shape)

(41167, 120)


In [25]:
# split dataset and calculate cross_score
cross_val_model(X_poly, y, lr_model)

Begin training

Fit LinearRegression fold 1
	cross_score: 0.59654
Fit LinearRegression fold 2
	cross_score: 0.77486
Fit LinearRegression fold 3
	cross_score: 0.59465

Training done! Time Elapsed: 4.3111491203308105  seconds.


### Benckmarks

#### RandomForest

In [28]:
# RandomForest params dict
rf_params_one = {}

rf_params_one['n_estimators'] = 10
rf_params_one['max_depth'] = 10
rf_params_one['min_samples_split'] = 10
rf_params_one['min_samples_leaf'] = 10 # end tree necessary 30 leaf
rf_params_one['n_jobs'] = -1 # run all process

In [29]:
# create model
rf_model_one = RandomForestClassifier(**rf_params_one)

# training model
X = df_callcenter[list_without_target_col]
y = df_callcenter['resultado']

In [30]:
# split dataset and calculate cross_score
cross_val_model(X, y, rf_model_one)

Begin training

Fit RandomForestClassifier fold 1
	cross_score: 0.22592
Fit RandomForestClassifier fold 2
	cross_score: 0.27058
Fit RandomForestClassifier fold 3
	cross_score: 0.18815

Training done! Time Elapsed: 1.903247356414795  seconds.


In [31]:
# RandomForest params dict
rf_params_two = {}

rf_params_two['n_estimators'] = 1
rf_params_two['max_depth'] = len(list_numerical_col)*2
rf_params_two['min_samples_split'] = len(list_numerical_col)
rf_params_two['min_samples_leaf'] = len(list_numerical_col)
rf_params_two['n_jobs'] = -1 # run all process

In [32]:
# create model
rf_model = RandomForestClassifier(**rf_params_two, criterion='entropy')

# training model
X = df_callcenter[list_without_target_col]
y = df_callcenter['resultado']

In [33]:
# split dataset and calculate cross_score
cross_val_model(X, y, rf_model)

Begin training

Fit RandomForestClassifier fold 1
	cross_score: 0.43202
Fit RandomForestClassifier fold 2
	cross_score: 0.45268
Fit RandomForestClassifier fold 3
	cross_score: 0.44761

Training done! Time Elapsed: 0.3521687984466553  seconds.


#### Random Forest Regressor

In [34]:
# 1st model Random Forest
rf_regressor_one = RandomForestRegressor(n_jobs = -1,
                                         verbose = 0)

In [35]:
# split dataset and calculate cross_score
cross_val_model(X, y, rf_regressor_one)

Begin training

Fit RandomForestRegressor fold 1
	cross_score: 0.80106
Fit RandomForestRegressor fold 2
	cross_score: 0.73401
Fit RandomForestRegressor fold 3
	cross_score: 0.70490

Training done! Time Elapsed: 6.8482842445373535  seconds.


In [36]:
# 2st model Random Forest
rf_regressor_two = RandomForestRegressor(n_estimators = 1000,
                                         max_leaf_nodes = len(list_numerical_col)*8,
                                         min_samples_leaf = len(list_numerical_col),
                                         max_depth = len(list_numerical_col)*4,
                                         n_jobs = -1,
                                         verbose = 0)

In [37]:
# split dataset and calculate cross_score
cross_val_model(X, y, rf_regressor_two)

Begin training

Fit RandomForestRegressor fold 1
	cross_score: 0.84680
Fit RandomForestRegressor fold 2
	cross_score: 0.81629
Fit RandomForestRegressor fold 3
	cross_score: 0.83830

Training done! Time Elapsed: 44.41213035583496  seconds.


---

## **Better Model**
Based on cross_score the model choose is **random forest regressor** with the parametersof 2º model that given a score > 0.84.

---