### 1. Which Consumer will Subscribe to a Term Loan?
* Dataset: Bank Marketing Campaign
* Source: UCI Machine Learning Repository

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

### Load the data

In [2]:
# Bring the dataset into the environment
bank = pd.read_csv("datasets/banks.csv", sep=";")

# view first 5 rows
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
# Check the shape of the dataset - observation and variables
bank.shape

(41188, 21)

In [4]:
# Show any missing values in the variables
bank.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [5]:
# Show information about the data
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

In [6]:
# Display column names
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [7]:
# Select features
bank.drop(['job', 'marital', 'education', 'poutcome','default','contact', 'month', 
           'day_of_week', 'duration', 'campaign', 'pdays','emp.var.rate', 'cons.price.idx',
           'cons.conf.idx', 'euribor3m', 'nr.employed'],axis=1, inplace=True)

In [8]:
bank.head()

Unnamed: 0,age,housing,loan,previous,y
0,56,no,no,0,no
1,57,no,no,0,no
2,37,yes,no,0,no
3,40,no,no,0,no
4,56,no,yes,0,no


In [9]:
# create a lamdba object
convert = lambda x: 0 if x=='no' else 1

# Apply lambda object to convert variable from categorical
# to nominal values
bank['HOUSING'] = bank['housing'].apply(convert)
bank['LOAN'] = bank['loan'].apply(convert)
bank['SUBSCRIPTION']=bank['y'].apply(convert)

# Create new columns
bank['AGE']=bank['age']
bank['PREVIOUS']=bank['previous']

In [10]:
bank.head()

Unnamed: 0,age,housing,loan,previous,y,HOUSING,LOAN,SUBSCRIPTION,AGE,PREVIOUS
0,56,no,no,0,no,0,0,0,56,0
1,57,no,no,0,no,0,0,0,57,0
2,37,yes,no,0,no,1,0,0,37,0
3,40,no,no,0,no,0,0,0,40,0
4,56,no,yes,0,no,0,1,0,56,0


In [11]:
# select the coded variables

bank.drop(['age','housing','loan','previous','y'],axis=1, inplace=True)

In [12]:
# show new dataset

bank.head()

Unnamed: 0,HOUSING,LOAN,SUBSCRIPTION,AGE,PREVIOUS
0,0,0,0,56,0
1,0,0,0,57,0
2,1,0,0,37,0
3,0,0,0,40,0
4,0,1,0,56,0


### Build First Model
* Use 3 features


In [13]:
# select 3 features
X = bank[['AGE','HOUSING','LOAN']]

# select target
y = bank['SUBSCRIPTION']

In [14]:
# Split the data set into training and testing sets

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3, random_state=20)

# Show shape of training and testing sets
X_train.shape, X_test.shape

((28831, 3), (12357, 3))

In [15]:
#Show shape of the trainging and testing target sets

y_train.shape, y_test.shape

((28831,), (12357,))

In [16]:
# Call decision tree classifier

model3 = DecisionTreeClassifier()

model3.fit(X_train,y_train)

prediction = model3.predict(X_test)

In [17]:
# show classification report

print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     11000
           1       0.47      0.03      0.06      1357

    accuracy                           0.89     12357
   macro avg       0.68      0.51      0.50     12357
weighted avg       0.85      0.89      0.84     12357



### Build Second Model
* Use 4 features

In [18]:
# Select 4 features
X4 = bank[['AGE','HOUSING','LOAN','PREVIOUS']]
y4 = bank['SUBSCRIPTION']

In [19]:
# Split the data set into training and testing sets

X4_train,X4_test,y4_train,y4_test = train_test_split(X4,y4, test_size=0.3, random_state=20)

# Show shape of training and testing sets
X4_train.shape, X4_test.shape

((28831, 4), (12357, 4))

In [20]:
#Show shape of the trainging and testing target sets

y4_train.shape, y4_test.shape

((28831,), (12357,))

In [21]:
# Call decision tree classifier

model4 = DecisionTreeClassifier()

# fit model
model4.fit(X4_train,y4_train)

# make prediction
prediction4 = model4.predict(X4_test)

In [22]:
# show classification report for 4 features

print(classification_report(y4_test,prediction4))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     11000
           1       0.44      0.08      0.14      1357

    accuracy                           0.89     12357
   macro avg       0.67      0.53      0.54     12357
weighted avg       0.85      0.89      0.85     12357



### 2. Does Random Forest Classifier Predicts Better than Decision Tree?
#### Random Forest Model for 3 Features

In [23]:
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Instantiate Random Forest Classifier for 3 features
randForest_3 = RandomForestClassifier(n_estimators=100)

In [25]:
# Fit model
randForest_3.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
# Make prediction
randForestPred_3 = randForest_3.predict(X_test)

In [27]:
# Generate confusion matrix output
confusion_matrix(y_test,randForestPred_3)

array([[10944,    56],
       [ 1303,    54]], dtype=int64)

In [28]:
# Print model 3 accuracy report
print(classification_report(y_test, randForestPred_3))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94     11000
           1       0.49      0.04      0.07      1357

    accuracy                           0.89     12357
   macro avg       0.69      0.52      0.51     12357
weighted avg       0.85      0.89      0.85     12357



#### Random Forest Model for 4 Features

In [29]:
# Use 4 features
# Instantiate Random Forest Classifier for 4 features
randForest_4 = RandomForestClassifier(n_estimators=100)

In [30]:
# fit model
# Fit model
randForest_4.fit(X4_train,y4_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
# Make prediction
randForestPred_4 = randForest_4.predict(X4_test)

In [32]:
# Generate confusion matrix output
confusion_matrix(y4_test,randForestPred_4)

array([[10843,   157],
       [ 1230,   127]], dtype=int64)

In [33]:
# Print model 4 accuracy report
print(classification_report(y4_test, randForestPred_4))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     11000
           1       0.45      0.09      0.15      1357

    accuracy                           0.89     12357
   macro avg       0.67      0.54      0.55     12357
weighted avg       0.85      0.89      0.85     12357



### Conclusion
* Both models predict quiet well on average (f1-score = 0.85)
* Rarely no difference in accuracy of model 3 and model 4
* Decision Tree and Random Forest classifiers have similar prediction accuracy on average

In [1]:
###