In [1]:
import warnings
warnings.filterwarnings('ignore')

#Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Display floats in fixed-point notation (no scientific notation) with thousands separators and 4 decimal places
pd.options.display.float_format = '{:,.4f}'.format

In [2]:
# Load the dataset; the path is relative, so the CSV must sit in the working directory.
# NOTE(review): the recorded output of this cell is a FileNotFoundError - confirm the file location before running.
data = pd.read_csv('40_bank-full.csv')
data.head()

FileNotFoundError: [Errno 2] File 40_bank-full.csv does not exist: '40_bank-full.csv'

In [None]:
# Structural overview of the loaded frame: dtypes, shape, summary statistics, null counts.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit a stray "None" line).
print('*******************Dataset Info************************')
data.info()
print('*******************DataSet Shape***********************')
print(data.shape)
print('*********************Describe**************************')
print(data.describe())
print('*******************Number of nulls*********************')
print(data.isnull().sum())
print('*******************************************************')

- Several features have the `object` data type; they need to be encoded as 0/1 indicator columns using one-hot encoding.
- A few features contain the placeholder value `unknown`.
- There are no missing values.



In [None]:
# Convert `day` to a categorical column because its numerical value is not relevant.
data['day'] = data['day'].astype('category')

# Convert `Target` to a categorical column as well (it is read in as object type).
data['Target'] = data['Target'].astype('category')

# info() prints directly and returns None, so do not wrap it in print().
data.info()

In [None]:
# Count how often each distinct value occurs in the `job` feature
data.job.value_counts()

In [None]:
# Discard, in place, every row whose job is recorded as "unknown".
unknown_job_rows = data.index[data.job == "unknown"]
data.drop(index=unknown_job_rows, inplace=True)

In [None]:
# Count how often each distinct value occurs in the `marital` feature
data.marital.value_counts()

In [None]:
# Count how often each distinct value occurs in the `education` feature
data.education.value_counts()

In [None]:
# Original note: there are 1822 rows with missing ("unknown") education data, so those rows are removed in place.
unknown_education_rows = data.index[data.education == "unknown"]
data.drop(index=unknown_education_rows, inplace=True)

In [None]:
# Count how often each distinct value occurs in the `default` feature
data.default.value_counts()

In [None]:
# Count how often each distinct value occurs in the `contact` feature
data.contact.value_counts()

In [None]:
# `contact` has a high number of unknown entries, so the whole column is removed in place.
data.drop(columns="contact", inplace=True)

In [None]:
# Count how often each distinct value occurs in the `month` feature
data.month.value_counts()

In [None]:
# Count how often each distinct value occurs in the `poutcome` feature
data.poutcome.value_counts()

In [None]:
# `poutcome` has a high number of unknown entries, so the whole column is removed in place.
data.drop(columns="poutcome", inplace=True)

In [None]:
# Summary statistics of the remaining numerical columns
data.describe()

There are a few outliers in the numerical attributes.

1. The "balance" column has a minimum value of -8019, which is negative, but the average annual balance should not be negative. This can either be a typo or an outlier.
2. There is a significant difference between the 75% and max values of the "age" attribute.
3. Similarly, there are outliers in "duration" and "campaign".
4. In "previous" and "pdays" the majority of entries are -1 or 0 (missing data or client not contacted), so we can drop these columns.

Univariate Analysis

In [None]:
# Pairwise scatter plots / distributions of the columns seaborn can plot from `data`
sns.pairplot(data)

The distribution of all numerical variables other than age is highly skewed - hence we might want to transform or bin some of these variables

*** Check for the presence of outliers

In [None]:
# Box plot of `age` to look for outliers.
# The series is passed as `x=` explicitly: recent seaborn releases take `data` as the
# first positional parameter, so the positional form is not portable across versions.
sns.boxplot(x=data["age"])
plt.show()

In [None]:
# Largest value in the `age` column
data.age.max()

In [None]:
# Smallest value in the `age` column
data.age.min()

In [None]:
#both max and min ages (95,18) are realistic, and thus, we won't remove them

In [None]:
# Box plot of `balance` to look for outliers.
# Pass the series as `x=`: the first positional parameter of seaborn's boxplot is `data`
# in recent releases, so relying on position is not portable.
sns.boxplot(x=data["balance"])
plt.show()

In [None]:
# Mean of the `balance` column
data.balance.mean()

In [None]:
# Standardise `balance` and remove, in place, every row more than 3 standard deviations from the mean.
# The helper column is kept here because a later cell drops it explicitly.
data["balance_outliers"] = zscore(data["balance"])
beyond_three_sigma = data["balance_outliers"].abs() > 3
data.drop(index=data.index[beyond_three_sigma], inplace=True)

In [None]:
# The z-score helper column has served its purpose; remove it from the frame.
del data["balance_outliers"]

In [None]:
# Re-plot `balance` after the z-score based outlier removal.
# `x=` is given explicitly because seaborn's first positional parameter is `data` in recent releases.
sns.boxplot(x=data["balance"])
plt.show()

In [None]:
# Number of rows whose balance is below -2500
int((data.balance < -2500).sum())

In [None]:
# Remove, in place, the rows whose balance is below -2500, then re-plot the distribution.
extreme_negative_rows = data.index[data.balance < -2500]
data.drop(index=extreme_negative_rows, inplace=True)
# `x=` is given explicitly because seaborn's first positional parameter is `data` in recent releases.
sns.boxplot(x=data["balance"])
plt.show()

In [None]:
# Box plot of `day`.
# `day` was converted to a categorical dtype earlier in the notebook; cast it back to int so the
# box plot is drawn over numeric values, and pass it as `x=` because seaborn's first positional
# parameter is `data` in recent releases.
# NOTE(review): assumes every `day` value is an integer with no missing entries - confirm.
sns.boxplot(x=data["day"].astype(int))
plt.show()

In [None]:
# Box plot of `duration` to look for outliers.
# `x=` is given explicitly because seaborn's first positional parameter is `data` in recent releases.
sns.boxplot(x=data["duration"])
plt.show()

We know that duration is not known in advance and by the end of the call, when we know the duration, we also know the result. So, we can drop this attribute.

In [None]:
# Remove the `duration` column in place.
data.drop(columns="duration", inplace=True)

In [None]:
# Box plot of `pdays` to inspect its value distribution.
# `x=` is given explicitly because seaborn's first positional parameter is `data` in recent releases.
sns.boxplot(x=data["pdays"])
plt.show()

Because of the unbalanced distribution of value ranges in the pdays attribute, it is better to drop it.

In [None]:
# Remove the `pdays` column in place.
data.drop(columns="pdays", inplace=True)

In [None]:
# Box plot of `previous` to inspect its value distribution.
# `x=` is given explicitly because seaborn's first positional parameter is `data` in recent releases.
sns.boxplot(x=data["previous"])
plt.show()

In [None]:
# Number of rows with `previous` greater than 50
int((data.previous > 50).sum())

In [None]:
# Number of rows with `previous` equal to 0
int((data.previous == 0).sum())

In [None]:
# Number of rows with `previous` greater than 0
int((data.previous > 0).sum())

*** Very few records have a non-zero value and the rest are 0, so it is better to drop this column

In [None]:
# Remove the `previous` column in place.
data.drop(columns="previous", inplace=True)

In [None]:
# (rows, columns) remaining after the drops above
data.shape

In [None]:
# Frequency of each `job` value in the current frame
data['job'].value_counts()

In [None]:
# Bar chart of the number of rows per `marital` value.
# Pass the series as `x=`: seaborn's first positional parameter is `data` in recent releases.
sns.countplot(x=data['marital']);

In [None]:
# Bar chart of the number of rows per `education` value.
# Pass the series as `x=`: seaborn's first positional parameter is `data` in recent releases.
sns.countplot(x=data['education']);

In [None]:
# Bar chart of the number of rows per `default` value.
# Pass the series as `x=`: seaborn's first positional parameter is `data` in recent releases.
sns.countplot(x=data['default']);

In [None]:
#**default - yes is a very small % - we can consider deleting this column**

In [None]:
# Bar chart of the number of rows per `housing` value.
# Pass the series as `x=`: seaborn's first positional parameter is `data` in recent releases.
sns.countplot(x=data['housing']);

In [None]:
# Bar chart of the number of rows per `loan` value.
# Pass the series as `x=`: seaborn's first positional parameter is `data` in recent releases.
sns.countplot(x=data['loan']);

In [None]:
# Bar chart of the number of rows per `Target` class.
# Pass the series as `x=`: seaborn's first positional parameter is `data` in recent releases.
sns.countplot(x=data['Target']);

In [None]:
# Relative frequency (share) of each `Target` class
data['Target'].value_counts(normalize=True)

### The response rate is only 11.6%. Hence the Y variable has a high class imbalance. Hence accuracy will not be a reliable model performance measure. 

### FN is very critical for this business case because a false negative is a customer who would potentially subscribe to the term deposit but who has been classified as 'will not subscribe'. Hence the most relevant model performance measure is recall

## Bivariate analysis

In [None]:
# Mean of every numeric column per Target class.
# numeric_only=True is required because the frame still holds string/categorical columns
# at this point; without it pandas >= 2.0 raises a TypeError instead of skipping them.
data.groupby(["Target"]).mean(numeric_only=True)

#### The mean balance is higher for customers who subscribe to the term deposit compared to those who don't


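#### The number of days that passed after the client was last contacted in a previous campaign is higher for people who have subscribed (note: this appears to refer to `pdays`, which was dropped earlier in the notebook and so is no longer part of the table above)

#### The number of contacts performed before this campaign is also higher for customers who subscribe (note: this appears to refer to `previous`, which was likewise dropped earlier)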


### All of the above facts indicate that customers with a higher balance and those who have been contacted frequently before the campaign tend to subscribe for the term deposit

In [None]:
# Share of each Target class within every `job` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['job'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

*** The highest conversion is for students (28%) and lowest is for blue-collar(7%)

In [None]:
# Share of each Target class within every `marital` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['marital'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
# Share of each Target class within every `education` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['education'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
# Share of each Target class within every `default` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['default'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
# Relative frequency (share) of each `default` value
data['default'].value_counts(normalize=True)

In [None]:
# default == "yes" is only about 2% of the data and its conversion is also comparatively lower,
# so the `default` column is removed in place.
data.drop(columns=['default'], inplace=True)

In [None]:
# Column labels remaining in the frame
data.columns

In [None]:
# Share of each Target class within every `housing` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['housing'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
# Share of each Target class within every `loan` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['loan'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
# Share of each Target class within every `day` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['day'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
# Share of each Target class within every `month` value (rows sum to 1), sorted by the 'yes' share, highest first
pd.crosstab(data['month'], data['Target'], normalize='index').sort_values(by='yes',ascending=False )

### Data Preparation

In [None]:
# Distinct values present in the `job` column
data.job.unique()

In [None]:
# One-hot encode `job` (first level dropped) and replace the original column with the indicator columns.
job_dummies = pd.get_dummies(data["job"], drop_first=True)
data = pd.concat([data.drop(columns="job"), job_dummies], axis=1)
data.head()

In [None]:
# Distinct values present in the `marital` column
data.marital.unique()

In [None]:
# One-hot encode `marital` (first level dropped) and replace the original column with the indicator columns.
marital_dummies = pd.get_dummies(data["marital"], drop_first=True)
data = pd.concat([data.drop(columns="marital"), marital_dummies], axis=1)
data.head()

In [None]:
# Distinct values present in the `education` column
data.education.unique()

In [None]:
# One-hot encode `education` (first level dropped) and replace the original column with the indicator columns.
education_dummies = pd.get_dummies(data["education"], drop_first=True)
data = pd.concat([data.drop(columns="education"), education_dummies], axis=1)
data.head()

In [None]:
# Distinct values present in the `housing` column
data.housing.unique()

In [None]:
# Encode `housing` as 1 for "yes" and 0 for "no".
housing_codes = {"yes": 1, "no": 0}
data["housing"] = data["housing"].map(housing_codes)
data.head()

In [None]:
# Encode `loan` as 1 for "yes" and 0 for "no".
loan_codes = {"yes": 1, "no": 0}
data["loan"] = data["loan"].map(loan_codes)
data.head()

In [None]:
# Encode the label `Target` as 1 for "yes" and 0 for "no".
target_codes = {"yes": 1, "no": 0}
data["Target"] = data["Target"].map(target_codes)
data.head()

In [None]:
# One-hot encode `month` (first level dropped) and replace the original column with the indicator columns.
month_dummies = pd.get_dummies(data["month"], drop_first=True)
data = pd.concat([data.drop(columns="month"), month_dummies], axis=1)
data.head()

In [None]:
# (rows, columns) of the frame after encoding
data.shape

### Model Building

In [None]:
# Separating dependent and independent variables
X = data.drop(['Target'], axis = 1)
y = data['Target']

# Splitting the data into training and test set in the ratio of 70:30 respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [None]:
# Shapes of the four partitions produced by the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Logistic Regression Classifier

In [None]:
# Fit a logistic-regression model and report its train and test accuracy.
lr = LogisticRegression(random_state=7)
lr.fit(X_train, y_train)
train_acc_lr = lr.score(X_train, y_train)
print(train_acc_lr)
lr_acc = lr.score(X_test, y_test)
print(lr_acc)

# Test-set predictions, accuracy, confusion matrix and per-class report.
predictionsLR = lr.predict(X_test)
print('Logistic Regression:', accuracy_score(y_test, predictionsLR))
cmLR = confusion_matrix(y_test, predictionsLR)
print(cmLR)
print(classification_report(y_test, predictionsLR))

In [None]:
# Heatmap of the logistic-regression confusion matrix.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label value, and
# Target is encoded as no -> 0, yes -> 1, so the first row/column is "No", not "Yes".
class_labels = ["No", "Yes"]
df_cmLR = pd.DataFrame(cmLR, index=class_labels, columns=class_labels)
plt.figure(figsize=(7, 5))
ax = sns.heatmap(df_cmLR, annot=True, fmt='g')
ax.set(xlabel="Predicted", ylabel="Actual");

In [None]:
# Start the model-comparison table with the logistic-regression test accuracy.
resultsDf = pd.DataFrame(
    {'Method': ['Logistic Regression'], 'accuracy': [lr_acc]},
    columns=['Method', 'accuracy'],
)
resultsDf

### Decision Tree Classifier

In [None]:
# Fit a depth-3 Gini decision tree and report its train and test accuracy.
dTree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTree.fit(X_train, y_train)
train_acc_dt = dTree.score(X_train, y_train)
print(train_acc_dt)
dt_acc = dTree.score(X_test, y_test)
print(dt_acc)

# Test-set predictions, accuracy, confusion matrix and per-class report.
predictionsDT = dTree.predict(X_test)
print('Decision Tree:', accuracy_score(y_test, predictionsDT))
cmDT = confusion_matrix(y_test, predictionsDT)
print(cmDT)
print(classification_report(y_test, predictionsDT))

In [None]:
# Heatmap of the decision-tree confusion matrix.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label value, and
# Target is encoded as no -> 0, yes -> 1, so the first row/column is "No", not "Yes".
class_labels = ["No", "Yes"]
df_cmDT = pd.DataFrame(cmDT, index=class_labels, columns=class_labels)
plt.figure(figsize=(7, 5))
ax = sns.heatmap(df_cmDT, annot=True, fmt='g')
ax.set(xlabel="Predicted", ylabel="Actual");

In [None]:
# Append the decision-tree test accuracy to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps index 0.
tempResultsDf = pd.DataFrame({'Method': ['Decision Tree'], 'accuracy': [dt_acc]})
resultsDf = pd.concat([resultsDf, tempResultsDf], ignore_index=True)
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf

### Ensemble Models

*** Bagging ***

In [None]:
# Fit a 50-estimator bagging ensemble and report its train and test accuracy.
bgcl = BaggingClassifier(n_estimators=50, random_state=1)
bgcl = bgcl.fit(X_train, y_train)
print(bgcl.score(X_train, y_train))
bg_acc = bgcl.score(X_test, y_test)
print(bg_acc)
predictionsBG = bgcl.predict(X_test)
# Evaluate the bagging predictions themselves (the decision-tree predictions were used here by mistake).
print('Bagging Classifier ', accuracy_score(y_test, predictionsBG))
# confusion matrix
cmBG = confusion_matrix(y_test, predictionsBG)
print(cmBG)
# classification report
print(classification_report(y_test, predictionsBG))

In [None]:
# Heatmap of the bagging confusion matrix.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label value, and
# Target is encoded as no -> 0, yes -> 1, so the first row/column is "No", not "Yes".
class_labels = ["No", "Yes"]
df_cmBG = pd.DataFrame(cmBG, index=class_labels, columns=class_labels)
plt.figure(figsize=(7, 5))
ax = sns.heatmap(df_cmBG, annot=True, fmt='g')
ax.set(xlabel="Predicted", ylabel="Actual");

In [None]:
# Append the bagging test accuracy to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps index 0.
tempResultsDf = pd.DataFrame({'Method': ['Bagging'], 'accuracy': [bg_acc]})
resultsDf = pd.concat([resultsDf, tempResultsDf], ignore_index=True)
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf

*** AdaBoost ***

In [None]:
# Fit a 10-estimator AdaBoost ensemble and report its train and test accuracy.
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
abcl = abcl.fit(X_train, y_train)
print(abcl.score(X_train, y_train))
ab_acc = abcl.score(X_test, y_test)
print(ab_acc)
predictionsAB = abcl.predict(X_test)
print('AdaBoost Classifier ', accuracy_score(y_test, predictionsAB))
# confusion matrix
cmAB = confusion_matrix(y_test, predictionsAB)
# Print the AdaBoost matrix (the bagging matrix was printed here by mistake).
print(cmAB)
# classification report
print(classification_report(y_test, predictionsAB))

In [None]:
# Heatmap of the AdaBoost confusion matrix.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label value, and
# Target is encoded as no -> 0, yes -> 1, so the first row/column is "No", not "Yes".
class_labels = ["No", "Yes"]
df_cmAB = pd.DataFrame(cmAB, index=class_labels, columns=class_labels)
plt.figure(figsize=(7, 5))
ax = sns.heatmap(df_cmAB, annot=True, fmt='g')
ax.set(xlabel="Predicted", ylabel="Actual");

In [None]:
# Append the AdaBoost test accuracy to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps index 0.
tempResultsDf = pd.DataFrame({'Method': ['Ada boost'], 'accuracy': [ab_acc]})
resultsDf = pd.concat([resultsDf, tempResultsDf], ignore_index=True)
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf

*** Gradient Boost ***

In [None]:
# Fit a 50-estimator gradient-boosting ensemble and report its train and test accuracy.
gbcl = GradientBoostingClassifier(n_estimators=50, random_state=1)
gbcl = gbcl.fit(X_train, y_train)
train_acc_gb = gbcl.score(X_train, y_train)
print(train_acc_gb)
gb_acc = gbcl.score(X_test, y_test)
print(gb_acc)

# Test-set predictions, accuracy, confusion matrix and per-class report.
predictionsGB = gbcl.predict(X_test)
print('Gradient Classifier ', accuracy_score(y_test, predictionsGB))
cmGB = confusion_matrix(y_test, predictionsGB)
print(cmGB)
print(classification_report(y_test, predictionsGB))

In [None]:
# Heatmap of the gradient-boosting confusion matrix.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label value, and
# Target is encoded as no -> 0, yes -> 1, so the first row/column is "No", not "Yes".
class_labels = ["No", "Yes"]
df_cmGB = pd.DataFrame(cmGB, index=class_labels, columns=class_labels)
plt.figure(figsize=(7, 5))
ax = sns.heatmap(df_cmGB, annot=True, fmt='g')
ax.set(xlabel="Predicted", ylabel="Actual");

In [None]:
# Append the gradient-boosting test accuracy to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps index 0.
tempResultsDf = pd.DataFrame({'Method': ['Gradient boost'], 'accuracy': [gb_acc]})
resultsDf = pd.concat([resultsDf, tempResultsDf], ignore_index=True)
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf

*** Random Forest Classifier ***

In [None]:
# Fit a 50-tree random forest (12 candidate features per split) and report its train and test accuracy.
rfcl = RandomForestClassifier(n_estimators=50, random_state=1, max_features=12)
rfcl = rfcl.fit(X_train, y_train)
print(rfcl.score(X_train, y_train))
rf_acc = rfcl.score(X_test, y_test)
print(rf_acc)
predictionsRF = rfcl.predict(X_test)
# Label the line with the model actually evaluated (it was mislabelled 'Gradient Classifier').
print('Random Forest Classifier ', accuracy_score(y_test, predictionsRF))
# confusion matrix
cmRF = confusion_matrix(y_test, predictionsRF)
print(cmRF)
# classification report
print(classification_report(y_test, predictionsRF))

In [None]:
# Heatmap of the random-forest confusion matrix.
# confusion_matrix orders rows (actual) and columns (predicted) by sorted label value, and
# Target is encoded as no -> 0, yes -> 1, so the first row/column is "No", not "Yes".
class_labels = ["No", "Yes"]
df_cmRF = pd.DataFrame(cmRF, index=class_labels, columns=class_labels)
plt.figure(figsize=(7, 5))
ax = sns.heatmap(df_cmRF, annot=True, fmt='g')
ax.set(xlabel="Predicted", ylabel="Actual");

In [None]:
# Append the random-forest test accuracy to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps index 0.
tempResultsDf = pd.DataFrame({'Method': ['Random Forest'], 'accuracy': [rf_acc]})
resultsDf = pd.concat([resultsDf, tempResultsDf], ignore_index=True)
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf

*** Based on the accuracy scores above, Gradient Boost is the best model for prediction ***