# 0.0 Imports

In [19]:
!pip install inflection

Collecting inflection
  Using cached inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: inflection
Successfully installed inflection-0.5.1


In [20]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import inflection

from IPython.display import Image
from IPython.core.display import HTML

## 0.1 Helper functions

In [21]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:90% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()
jupyter_settings();

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [31]:
def get_summary(numerical_attributes):
    """Build a per-column table of descriptive statistics.

    Parameters
    ----------
    numerical_attributes : pd.DataFrame
        Frame whose columns are summarised one by one.

    Returns
    -------
    pd.DataFrame
        One row per input column, with the columns 'attributes', 'min',
        'max', 'range', 'mean', 'median', 'std', 'skew' and 'kurtosis'.
    """
    # (output column, reducer applied to every input column), in output order.
    # Central tendency: mean, median. Dispersion: std, min, max, range,
    # skew, kurtosis.
    reducers = [
        ('min', min),
        ('max', max),
        ('range', lambda column: column.max() - column.min()),
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('skew', lambda column: column.skew()),
        ('kurtosis', lambda column: column.kurtosis()),
    ]

    # Each statistic becomes a single-row frame; stacking the rows and
    # transposing yields one row per attribute.
    stat_rows = [
        pd.DataFrame( numerical_attributes.apply( reducer ) ).T
        for _, reducer in reducers
    ]
    summary = pd.concat( stat_rows ).T.reset_index()
    summary.columns = ['attributes'] + [label for label, _ in reducers]

    return summary

In [22]:
def cross_validation(X_train, y_train, n_folds, model_name, model, verbose=True):
    """Estimate a model's F1-score with shuffled K-fold cross-validation.

    NOTE(review): `KFold` and `f1_score` are not imported in the visible
    cells; presumably sklearn.model_selection.KFold and
    sklearn.metrics.f1_score -- make sure both are in scope before calling.

    Parameters
    ----------
    X_train, y_train
        Features and target; both are sliced positionally with `.iloc`.
    n_folds : int
        Number of folds passed to `KFold(n_splits=...)`.
    model_name : str
        Label written to the 'Model Name' column of the result.
    model
        Object exposing `fit(X, y)` (returning the fitted estimator) and
        `predict(X)`. The same instance is re-fitted on every fold.
    verbose : bool, default True
        Print a progress line per fold. It only controls printing; the folds
        are evaluated either way.

    Returns
    -------
    pd.DataFrame
        Single row with 'Model Name' and 'Avg F1-Score', the latter formatted
        as '<mean>+/-<std>' with both values rounded to 3 decimals.
    """
    # Shuffled split with a fixed seed so the folds are reproducible.
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    f1_scores = []

    for fold_number, (train_ix, val_ix) in enumerate(kfold.split(X_train, y_train), start=1):
        if verbose:
            print('Fold Nº: {}/{}'.format(fold_number, n_folds))

        # get fold
        X_train_fold = X_train.iloc[train_ix]
        y_train_fold = y_train.iloc[train_ix]
        X_val_fold = X_train.iloc[val_ix]
        y_val_fold = y_train.iloc[val_ix]

        # train on the fold and predict its held-out part
        fitted_model = model.fit(X_train_fold, y_train_fold)
        pred_val = fitted_model.predict(X_val_fold)

        # Ground truth first, predictions second (the order sklearn's
        # f1_score documents, assuming that is the metric in scope).
        f1_scores.append(f1_score(y_val_fold, pred_val))

    mean_f1 = np.round(np.mean(f1_scores), 3).astype(str)
    std_f1 = np.round(np.std(f1_scores), 3).astype(str)

    return pd.DataFrame({'Model Name': model_name,
                         'Avg F1-Score': mean_f1 + '+/-' + std_f1}, index=[0])

In [23]:
def f1_score_metric(model_name, y_test, pred):
    """Return a one-row frame with a model's F1-score rounded to 3 decimals.

    NOTE(review): `f1_score` is not imported in the visible cells; presumably
    sklearn.metrics.f1_score -- make sure it is in scope before calling.

    Parameters
    ----------
    model_name : str
        Label written to the 'Model Name' column.
    y_test
        True target values.
    pred
        Predicted target values.

    Returns
    -------
    pd.DataFrame
        Single row with the columns 'Model Name' and 'F1 Score'.
    """
    # np.round works whether the metric returns a NumPy scalar or a plain
    # Python float (which has no .round method); it also matches how
    # cross_validation rounds its scores. Ground truth is passed first.
    score = np.round(f1_score(y_test, pred), 3)
    return pd.DataFrame({'Model Name': model_name,
                         'F1 Score': score}, index=[0])

In [42]:
def _two_decimals(value):
    """Format a float with exactly two digits after the decimal point."""
    return '%.2f' % value


# Every float shown by pandas is rendered through the formatter above.
pd.set_option('display.float_format', _two_decimals)

## 0.2 Load Data

In [24]:
# Load the raw churn extract and discard its 'RowNumber' column in one step.
df_raw = pd.read_csv('../data/raw/churn.csv').drop(columns='RowNumber')

# 1.0 Data Description

In [25]:
df1 = df_raw.copy()
df1.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## 1.1 Data dimension

In [30]:
# shape is (rows, columns); unpack it once and report both.
n_rows, n_cols = df1.shape
print('Number of rows: {}'.format(n_rows))
print('Number of columns: {}'.format(n_cols))

Number of rows: 10000
Number of columns: 13


## 1.2 Rename Columns

In [26]:
# Convert the original CamelCase column names to snake_case.
cols_old = list(df1.columns)
snakecase = inflection.underscore
cols_new = [snakecase(name) for name in cols_old]

# Apply the new names and show them.
df1.columns = cols_new
df1.columns

Index(['customer_id', 'surname', 'credit_score', 'geography', 'gender', 'age',
       'tenure', 'balance', 'num_of_products', 'has_cr_card',
       'is_active_member', 'estimated_salary', 'exited'],
      dtype='object')

## 1.3 Data Types

In [27]:
df1.dtypes

customer_id           int64
surname              object
credit_score          int64
geography            object
gender               object
age                   int64
tenure                int64
balance             float64
num_of_products       int64
has_cr_card           int64
is_active_member      int64
estimated_salary    float64
exited                int64
dtype: object

## 1.4 Check NAs

In [28]:
df1.isna().sum()

customer_id         0
surname             0
credit_score        0
geography           0
gender              0
age                 0
tenure              0
balance             0
num_of_products     0
has_cr_card         0
is_active_member    0
estimated_salary    0
exited              0
dtype: int64

## 1.5 Descriptive Statistics

In [33]:
# Numeric columns go to num_attributes; everything that is neither numeric
# nor a datetime goes to cat_attributes.
numeric_dtypes = ['int64', 'float64']
num_attributes = df1.select_dtypes( include=numeric_dtypes )
cat_attributes = df1.select_dtypes( exclude=numeric_dtypes + ['datetime64[ns]'] )

In [43]:
get_summary(num_attributes)

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,customer_id,15565701.0,15815690.0,249989.0,15690940.57,15690738.0,71932.59,0.0,-1.2
1,credit_score,350.0,850.0,500.0,650.53,652.0,96.65,-0.07,-0.43
2,age,18.0,92.0,74.0,38.92,37.0,10.49,1.01,1.4
3,tenure,0.0,10.0,10.0,5.01,5.0,2.89,0.01,-1.17
4,balance,0.0,250898.09,250898.09,76485.89,97198.54,62394.29,-0.14,-1.49
5,num_of_products,1.0,4.0,3.0,1.53,1.0,0.58,0.75,0.58
6,has_cr_card,0.0,1.0,1.0,0.71,1.0,0.46,-0.9,-1.19
7,is_active_member,0.0,1.0,1.0,0.52,1.0,0.5,-0.06,-2.0
8,estimated_salary,11.58,199992.48,199980.9,100090.24,100193.91,57507.62,0.0,-1.18
9,exited,0.0,1.0,1.0,0.2,0.0,0.4,1.47,0.17


# 2.0 Feature Engineering

In [29]:
df2 = df1.copy()

In [None]:
# TODO: create a binned-age feature

# 3.0 Feature Filtering

In [37]:
df3 = df2.copy()

In [44]:
# Drop the customer identifier and surname columns. df3 is rebuilt from df2
# without inplace, so re-running this cell gives the same frame instead of
# raising KeyError on the already-removed columns.
df3 = df2.drop(columns=['customer_id', 'surname'])

# 4.0 Exploratory Data Analysis

In [38]:
df4 = df3.copy()

# 5.0 Data Preparation

In [40]:
# NOTE(review): this copies df3, not df4, so anything done to df4 in the
# exploratory section is not carried into data preparation -- confirm that
# skipping df4 is intentional.
df5 = df3.copy()

# 6.0 Feature Selection

In [41]:
df6 = df5.copy()

# 7.0 Machine Learning Modelling

In [72]:
df7 = df6.copy()

# 8.0 Hyperparameter Fine-Tuning

In [92]:
df8 = df7.copy()

# 9.0 Churn Analysis

In [93]:
df9 = df8.copy()

# 10.0 Business Results

## 10.1 Business Analysis

# 11.0 Deploy

## 11.1 Churn Probability Class