In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_excel("C:/Users/ManiShankar/Documents/LetsUpgrade/AI_ML/Materials/Datasets/Bank_Personal_Loan_Modelling.xlsx",sheet_name=1)

In [3]:
data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [4]:
data.tail()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0
4999,5000,28,4,83,92612,3,0.8,1,0,0,0,0,1,1


In [5]:
data.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [6]:
# checking for null values
data.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [7]:
len(data)

5000

In [8]:
# checking for duplicated rows
data = data.drop_duplicates()
len(data)

5000

No rows were dropped and has any Nan Values

In [9]:
data.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object

No categorical Variable

In [10]:
data = data.astype(np.int64)

In [11]:
data.dtypes

ID                    int64
Age                   int64
Experience            int64
Income                int64
ZIP Code              int64
Family                int64
CCAvg                 int64
Education             int64
Mortgage              int64
Personal Loan         int64
Securities Account    int64
CD Account            int64
Online                int64
CreditCard            int64
dtype: object

In [12]:
data.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [13]:
# ZIP_Code does not seem to have any significance so we drop it
data.drop(['ZIP Code'], inplace = True, axis = 1)

In [14]:
data.columns

Index(['ID', 'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

# Random Forest for finding important features for DecisionTree


In [15]:
rf_model = RandomForestClassifier(n_estimators = 1000, max_features = 2, oob_score = True)

In [16]:
features = ['ID', 'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Securities Account', 'CD Account',
       'Online', 'CreditCard']

In [17]:
x = data[features]
y = pd.DataFrame({'BankLoan':data['Personal Loan']})

In [18]:
print(len(x))
print(len(y))

5000
5000


In [19]:
x.head(2)

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,4,1,1,0,1,0,0,0
1,2,45,19,34,3,1,1,0,1,0,0,0


In [20]:
y.head(2)

Unnamed: 0,BankLoan
0,0
1,0


In [21]:
rf_model.fit(x,y)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
rf_model.oob_score_

0.987

In [23]:
for feature,percent in zip(features,rf_model.feature_importances_) :
    print(feature,percent)

ID 0.05165575564698184
Age 0.04479060325260487
Experience 0.04394673690147719
Income 0.3384189503804467
Family 0.09381140201649593
CCAvg 0.15112925652089962
Education 0.15213916783718237
Mortgage 0.04331075212227144
Securities Account 0.005745603225654864
CD Account 0.056303316236331634
Online 0.00858136503080811
CreditCard 0.010167090828845518


# From above result we can colnclude
1. Income has high significance in Attrition determination
2. CCAvg, Education, Family has next significance after Income
3. CD Account, Mortage and Experience, Age has next significance
4. Securities Account, Online, CreditCard has least significance

In [24]:
data.columns

Index(['ID', 'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

So we consider only ['Income', 'CCAvg', 'Education', 'Family', 'CD Account', 'Mortgage', 'Experience', 'Age']

# Decision Tree

In [25]:
tree_model = tree.DecisionTreeClassifier(max_depth=5)

In [26]:
x = data[['Income', 'CCAvg', 'Education', 'Family', 'CD Account', 'Mortgage', 'Experience', 'Age']]
y = data['Personal Loan']

In [27]:
print(len(x))
print(len(y))

5000
5000


In [28]:
tree_model.fit(x,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [29]:
with open("Model5.dot",'w') as f :
    f = tree.export_graphviz(tree_model,feature_names=['Income', 'CCAvg', 'Education', 'Family', 'CD Account', 'Mortgage', 'Experience', 'Age'],out_file=f)

# insights from DecisionTree

1. If Education <= 1.5 and Family <= 3.5 and Income >= 106.5 and CCAvg <= 2.5 then returns True.
2. If Income <= 106.5 and CCAvg <= 2.5 then returns true.
3. If Family <= 2.5 and Education >= 1.5 then returns False.
4. If Age >= 60.5 and Income <= 116.5 and Education <= 1.5 then returns False.