# Modeling

From this notebook onwards we develop the classification model. The first step is feature engineering: scaling the numerical variables and encoding the categorical ones.

In [1]:
#libs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os

In [2]:
# Folder holding the cleaned dataset, resolved relative to the current working
# directory: one level up, then into 'data_sent_github'. The path is left
# un-normalised, so it still contains the '..' component.
DATA_CLEANED_DIR = os.path.join(
    os.getcwd(),
    os.pardir,
    'data_sent_github',
)
print(DATA_CLEANED_DIR)

/work/churn_predictive_model/notebooks/../data_sent_github


In [3]:
# Load the cleaned dataset; the first CSV column becomes the row index.
# The file path is assembled with os.path.join (the same way DATA_CLEANED_DIR
# itself is built) instead of concatenating a hard-coded '/' separator.
df = pd.read_csv(os.path.join(DATA_CLEANED_DIR, 'cleaned_dataset.csv'), index_col=[0])

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,CustomerId,Surname,Geography,Gender,HasCrCard,IsActiveMember,EstimatedSalary,application_date,exit_date,birth_date,...,birth_date_year,birth_date_month,birth_date_day,ContractId,Products,Date,Score,Value,eligible,client_age
0,14648573,NALLS,Spain,Male,1.0,0.0,140827.98,2019-06-19,,1979-02-27,...,1979.0,2.0,2.0,WWlyRDX8AsGnWUPYiYGjidGKI,Product B,2019-06,683,0.0,1,40
1,15165393,LABIANCA,Spain,Male,1.0,1.0,2612.65,2018-02-22,2019-06-11,1974-07-11,...,1974.0,7.0,7.0,orIBQM9pK7Z7AYBQok1jypK6t,Product D,2018-02,487,119657.24,0,43
3,15982728,GOUDEAU,France,Male,0.0,1.0,66465.09,2018-02-02,2019-06-01,1972-12-18,...,1972.0,12.0,12.0,sdnOZ3fBpu1sKsYrZJE9tfcRP,Product B,2018-02,595,43353.5,0,45
4,15434700,STIMMELL,Germany,Male,1.0,0.0,138615.32,2018-06-06,,1994-07-22,...,1994.0,7.0,7.0,dF2Z2DZoUYMmf0ST8keQjvKQL,Product A,2018-06,706,92357.9,1,23
5,15898769,ALCOCK,Germany,Female,0.0,0.0,197095.28,2018-06-09,2019-06-16,1998-05-14,...,1998.0,5.0,5.0,gfCNu3alAqn2HgWKamcXqOaij,Product D,2018-06,532,0.0,0,20


In [5]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
df.dtypes

CustomerId                  int64
Surname                    object
Geography                  object
Gender                     object
HasCrCard                 float64
IsActiveMember            float64
EstimatedSalary           float64
application_date           object
exit_date                  object
birth_date                 object
application_date_year       int64
application_date_month      int64
application_date_day        int64
exit_date_year            float64
exit_date_month           float64
exit_date_day             float64
birth_date_year           float64
birth_date_month          float64
birth_date_day            float64
ContractId                 object
Products                   object
Date                       object
Score                       int64
Value                     float64
eligible                    int64
client_age                  int64
dtype: object

In [69]:
# Cast the five categorical feature columns to pandas' `category` dtype,
# one column at a time (the frame is modified in place).
for column_name in ('Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Products'):
    df[column_name] = df[column_name].astype('category')

In [6]:
# Names of the numerical feature columns (the ones rescaled further below).
numerical_vars = [
    'EstimatedSalary',
    'Score',
    'Value',
    'client_age',
]

In [7]:
# Names of the categorical feature columns (the ones one-hot encoded further below).
categorical_vars = [
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
    'Products',
]

In [10]:
# Name of the target column, kept as a one-element list.
target_var = [
    'eligible',
]

## Normalization of numerical variables

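Since the distributions are not normal and the scales differ a lot from one variable to another, the numerical variables need to be normalized. We use Min-Max scaling, which maps each variable to the $[0, 1]$ range: $x' = \dfrac{x - x_{\min}}{x_{\max} - x_{\min}}$.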

![](https://cdn-images-1.medium.com/max/253/1*Dl3P3Rrzto258X0Ales9Xw.png)

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max scaler using its default target range, written out explicitly.
scaler = MinMaxScaler(feature_range=(0, 1))

In [16]:
# Rescale EstimatedSalary with the shared Min-Max scaler. The scaler is
# re-fitted on this single column, so afterwards it only holds this column's
# statistics.
salary_column = df[['EstimatedSalary']]
estimated_salary_scaled = scaler.fit(salary_column).transform(salary_column)

In [18]:
# Rescale Score with the shared Min-Max scaler. The scaler is re-fitted on
# this single column, so afterwards it only holds this column's statistics.
score_column = df[['Score']]
score_scaled = scaler.fit(score_column).transform(score_column)

In [20]:
# Rescale Value with the shared Min-Max scaler. The scaler is re-fitted on
# this single column, so afterwards it only holds this column's statistics.
value_column = df[['Value']]
value_scaled = scaler.fit(value_column).transform(value_column)

In [22]:
# Rescale client_age with the shared Min-Max scaler. The scaler is re-fitted
# on this single column, so afterwards it only holds this column's statistics.
client_age_column = df[['client_age']]
client_age_scaled = scaler.fit(client_age_column).transform(client_age_column)

Convert each scaled array into a flat list of Python floats.

In [31]:
# Flatten each scaled result into a plain list of Python floats.
#
# The scaler output is iterated row by row, and each row is itself an array.
# Calling float() on such a row (an array with ndim > 0) is deprecated since
# NumPy 1.25, so the array is flattened explicitly instead. np.asarray also
# accepts an already-converted list, which keeps this cell safe to re-run.
estimated_salary_scaled = np.asarray(estimated_salary_scaled, dtype=float).ravel().tolist()
score_scaled = np.asarray(score_scaled, dtype=float).ravel().tolist()
value_scaled = np.asarray(value_scaled, dtype=float).ravel().tolist()
client_age_scaled = np.asarray(client_age_scaled, dtype=float).ravel().tolist()

## One-hot encoding of categorical variables

Now, let's encode the categorical variables using one-hot encoding.

In [32]:
from sklearn import preprocessing

# One-hot encoder configured with handle_unknown='ignore' (no error is raised
# for categories that were not present when the encoder was fitted).
encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

Geography

In [34]:
# Fit the encoder on the Geography column, passed as a single-column 2-D array.
encoder.fit(df[['Geography']].to_numpy())

OneHotEncoder(handle_unknown='ignore')

In [38]:
# Encode one sample per country and keep the dense one-hot rows.
geography_samples = [['France'], ['Germany'], ['Spain']]
geography_encoded = encoder.transform(geography_samples).toarray()

Gender

In [39]:
# Re-fit the same encoder on the Gender column, passed as a single-column 2-D array.
encoder.fit(df[['Gender']].to_numpy())

OneHotEncoder(handle_unknown='ignore')

In [40]:
# Display the dense one-hot rows for one sample of each gender.
gender_samples = [['Male'], ['Female']]
encoder.transform(gender_samples).toarray()

array([[0., 1.],
       [1., 0.]])

In [73]:
# One-hot encode every categorical feature with pandas.
#
# Series input: the dummy columns are named after the category values.
geography_encoded = pd.get_dummies(df['Geography'])
gender_encoded = pd.get_dummies(df['Gender'])
# DataFrame input: the dummy columns are prefixed with the source column name.
# `columns=` is passed explicitly because, for a DataFrame, get_dummies only
# auto-encodes object/string/category columns. HasCrCard and IsActiveMember are
# float64 as read from the CSV, so without `columns=` they would be returned
# unencoded whenever the cast to `category` has not been executed beforehand.
# When the columns are already categorical the result is unchanged.
has_cr_card_encoded = pd.get_dummies(df[['HasCrCard']], columns=['HasCrCard'])
is_active_member_encoded = pd.get_dummies(df[['IsActiveMember']], columns=['IsActiveMember'])
products_encoded = pd.get_dummies(df[['Products']], columns=['Products'])

## Concatenating a feature dataframe 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fb2ec55e-ada8-4de4-93de-2d05f236c13b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>