## Exploratory Data Analysis

In [13]:
import pandas as pd
pd.set_option('display.max_columns', None) 

import numpy as np

In [69]:
dmo = pd.read_csv("../data/dmo.csv", sep = ';')
dmo.head()

Unnamed: 0,Client,Sex,Age,Tenure,Count_CA,Count_SA,Count_MF,Count_OVD,Count_CC,Count_CL,ActBal_CA,ActBal_SA,ActBal_MF,ActBal_OVD,ActBal_CC,ActBal_CL,VolumeCred,VolumeCred_CA,TransactionsCred,TransactionsCred_CA,VolumeDeb,VolumeDeb_CA,VolumeDebCash_Card,VolumeDebCashless_Card,VolumeDeb_PaymentOrder,TransactionsDeb,TransactionsDeb_CA,TransactionsDebCash_Card,TransactionsDebCashless_Card,TransactionsDeb_PaymentOrder,Sale_MF,Sale_CC,Sale_CL,Revenue_MF,Revenue_CC,Revenue_CL
0,1,0,51,7,1,,,1.0,,,1333.802857,,,0.0,,,1678.946429,1318.698214,25.0,23.0,1891.353214,1335.619643,250.0,283.089286,247.607143,50.0,30.0,3.0,12.0,9.0,1.0,0.0,0.0,26.972679,0.0,0.0
1,2,1,43,152,1,1.0,,,,,110.768571,13326.190357,,,,,708.129643,704.935714,4.0,3.0,386.442857,386.442857,307.142857,38.407143,35.714286,6.0,6.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,17,140,1,,1.0,,,,482.654643,,107.251786,,,,1607.149643,1607.149643,4.0,4.0,123.75,123.75,0.0,112.857143,10.678571,3.0,3.0,0.0,1.0,1.0,,,,,,
3,4,1,24,153,1,1.0,,,1.0,,1599.840714,76.437143,,,1110.381786,,1872.483571,1822.589643,9.0,4.0,3614.7475,3598.950357,714.285714,526.0375,1963.165357,41.0,36.0,6.0,12.0,13.0,,,,,,
4,5,0,58,200,1,1.0,,,,,5353.483929,8079.715714,,,,,4372.773929,2964.290357,23.0,14.0,5868.460714,4031.925,157.142857,832.175,1779.571429,44.0,41.0,4.0,17.0,13.0,,,,,,


### 1. Data dim

- 1,615 clients with "social-demographical" and "products owned + actual volumes" data
- 1,587 clients with "inflow/outflow" data
- 969 clients with "Sales and Revenues" data
- Missing values have been replaced with 'NaN'.

### 2. Missing values

In [70]:
dmo.isna().sum()

Client                             0
Sex                                0
Age                                0
Tenure                             0
Count_CA                           0
Count_SA                        1189
Count_MF                        1309
Count_OVD                       1196
Count_CC                        1445
Count_CL                        1480
ActBal_CA                          0
ActBal_SA                       1189
ActBal_MF                       1309
ActBal_OVD                      1196
ActBal_CC                       1445
ActBal_CL                       1480
VolumeCred                        28
VolumeCred_CA                     28
TransactionsCred                  28
TransactionsCred_CA               28
VolumeDeb                         28
VolumeDeb_CA                      28
VolumeDebCash_Card                28
VolumeDebCashless_Card            28
VolumeDeb_PaymentOrder            28
TransactionsDeb                   28
TransactionsDeb_CA                28
T

1. 28 clients are missing "inflow/outflow" data => their accounts aren't active => assign '0'
2. The same assumption applies for the clients with missingness for the current and saving accounts, mutual funds, overdrafts, credit card and loans => assign 0
3. 646 clients are missing sale and revenue values will be considered as test set

In [71]:
# Separate features to replace NaN with zeros

soc_dem = ['Sex', 'Age', 'Tenure']

products_act_bal = ['Count_CA', 'Count_SA', 'Count_MF', 'Count_OVD', 'Count_CC', 'Count_CL', 'ActBal_CA',\
                       'ActBal_SA', 'ActBal_MF', 'ActBal_OVD', 'ActBal_CC', 'ActBal_CL']

inflow_outflow = ['VolumeCred', 'VolumeCred_CA', 'TransactionsCred', 'TransactionsCred_CA', 'VolumeDeb',\
                  'VolumeDeb_CA', 'VolumeDebCash_Card', 'VolumeDebCashless_Card', 'VolumeDeb_PaymentOrder',\
                  'TransactionsDeb', 'TransactionsDeb_CA', 'TransactionsDebCash_Card',\
                  'TransactionsDebCashless_Card', 'TransactionsDeb_PaymentOrder']

sales = ['Sale_MF', 'Sale_CC', 'Sale_CL']
revenue = ['Revenue_MF', 'Revenue_CC', 'Revenue_CL']

In [74]:
dmo[products_act_bal] = dmo[products_act_bal].fillna(0)
dmo[inflow_outflow] = dmo[inflow_outflow].fillna(0)
dmo.isna().sum()

Client                            0
Sex                               0
Age                               0
Tenure                            0
Count_CA                          0
Count_SA                          0
Count_MF                          0
Count_OVD                         0
Count_CC                          0
Count_CL                          0
ActBal_CA                         0
ActBal_SA                         0
ActBal_MF                         0
ActBal_OVD                        0
ActBal_CC                         0
ActBal_CL                         0
VolumeCred                        0
VolumeCred_CA                     0
TransactionsCred                  0
TransactionsCred_CA               0
VolumeDeb                         0
VolumeDeb_CA                      0
VolumeDebCash_Card                0
VolumeDebCashless_Card            0
VolumeDeb_PaymentOrder            0
TransactionsDeb                   0
TransactionsDeb_CA                0
TransactionsDebCash_Card    

In [75]:
dmo.head()

Unnamed: 0,Client,Sex,Age,Tenure,Count_CA,Count_SA,Count_MF,Count_OVD,Count_CC,Count_CL,ActBal_CA,ActBal_SA,ActBal_MF,ActBal_OVD,ActBal_CC,ActBal_CL,VolumeCred,VolumeCred_CA,TransactionsCred,TransactionsCred_CA,VolumeDeb,VolumeDeb_CA,VolumeDebCash_Card,VolumeDebCashless_Card,VolumeDeb_PaymentOrder,TransactionsDeb,TransactionsDeb_CA,TransactionsDebCash_Card,TransactionsDebCashless_Card,TransactionsDeb_PaymentOrder,Sale_MF,Sale_CC,Sale_CL,Revenue_MF,Revenue_CC,Revenue_CL
0,1,0,51,7,1,0.0,0.0,1.0,0.0,0.0,1333.802857,0.0,0.0,0.0,0.0,0.0,1678.946429,1318.698214,25.0,23.0,1891.353214,1335.619643,250.0,283.089286,247.607143,50.0,30.0,3.0,12.0,9.0,1.0,0.0,0.0,26.972679,0.0,0.0
1,2,1,43,152,1,1.0,0.0,0.0,0.0,0.0,110.768571,13326.190357,0.0,0.0,0.0,0.0,708.129643,704.935714,4.0,3.0,386.442857,386.442857,307.142857,38.407143,35.714286,6.0,6.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,17,140,1,0.0,1.0,0.0,0.0,0.0,482.654643,0.0,107.251786,0.0,0.0,0.0,1607.149643,1607.149643,4.0,4.0,123.75,123.75,0.0,112.857143,10.678571,3.0,3.0,0.0,1.0,1.0,,,,,,
3,4,1,24,153,1,1.0,0.0,0.0,1.0,0.0,1599.840714,76.437143,0.0,0.0,1110.381786,0.0,1872.483571,1822.589643,9.0,4.0,3614.7475,3598.950357,714.285714,526.0375,1963.165357,41.0,36.0,6.0,12.0,13.0,,,,,,
4,5,0,58,200,1,1.0,0.0,0.0,0.0,0.0,5353.483929,8079.715714,0.0,0.0,0.0,0.0,4372.773929,2964.290357,23.0,14.0,5868.460714,4031.925,157.142857,832.175,1779.571429,44.0,41.0,4.0,17.0,13.0,,,,,,


### 3. Correct clients' age

1. Check if all clients are eligible to take a loan or posses a credit card
2. Check if the tenure < actual age

In [77]:
# To be eligible for a loan in Germany, a client must meet be at least 18 years old

print("{0:.0%} clients are under 18".format(len(dmo['Age'][dmo['Age'] < 18]) / dmo.shape[0]))

6% clients are under 18


Person under 18 years old can be clients of the bank but they cannot posses some products. Therefore, we cannot take them into account while building the models. Hence, we exclude theam from the dataset.

In [78]:
dmo_18plus = dmo.loc[(dmo["Age"] > 18)]
dmo_18plus.shape

(1506, 36)

In [79]:
# Check clients who have a tenure higher than their age
print("{0:.0%} of all clients have a tenure higher than their Age".format(len(dmo[dmo['Age'] < dmo['Tenure']/12])/ dmo.shape[0]))
print("{0:.0%} of clients over 18 have a tenure higher than their Age".format(len(dmo_18plus[dmo_18plus['Age'] < dmo_18plus['Tenure']/12])/ dmo_18plus.shape[0]))


3% of all clients have a tenure higher than their Age
0% of clients over 18 have a tenure higher than their Age


After excluding the clients under 18, there is no abnormalities in age - tenure were returned. If we eanted to proceed with the entire dataste, we would impute the data. For the sake of time, we will proceed with reduced DS.

### 4. Explore features

### 5. Split dataset into training, test and validation sets

In [81]:
dmo_train = dmo_18plus[~dmo_18plus['Sale_MF'].isna()]
dmo_test = dmo_18plus[dmo_18plus['Sale_MF'].isna()]