# BANK MARKETING PREDICTION

### Importing all the libraries

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.special import expit


In [None]:
# Load the bank-marketing dataset and report its dimensions.
# NOTE(review): the path is an absolute, machine-specific Windows location.
df = pd.read_csv('C:\\Users\\Darshu\\Documents\\bank_prediction.csv')
n_rows, n_cols = df.shape
print(f"Dataset with rows {n_rows} and columns {n_cols}")
df.head()

In [None]:
df.head()
df.shape

In [None]:
# Basic information about the data
df.info()

In [None]:
#Finding null values
df.isnull().sum()

## Univariate Analysis 

In [None]:
df.describe()

In [None]:
df.pdays.describe()

The statistical summary of the ‘pdays’ column looks a little off: all the percentiles are -1, and the mean seems low compared to the maximum value.

In [None]:
# Print the pdays summary statistics one per line: mean, median, min, max
for stat in ("mean", "median", "min", "max"):
    print(getattr(df.pdays, stat)())

In [None]:
df["pdays"].value_counts()

-1 marks a missing value, so we can exclude it when describing the column.

In [None]:
# Describe pdays again, restricted to strictly positive values
# (this leaves out the -1 placeholder rows).
df.loc[df["pdays"] > 0, "pdays"].describe()

In [None]:
#check
#Now we can see that the mean and median are acceptable values; since the mean is greater than the median,
#we can say that the distribution is right-skewed and therefore we should standardize it

In [None]:
# For every object-dtype column, print its name followed by its distinct values
for col, values in df.select_dtypes(include='object').items():
    print(col)
    print(values.unique())

In [None]:
#Plotting a horizontal bar graph with the median values of balance for each education level value. 
df.groupby(['education'])['balance'].median().plot.barh()

Thus, we can conclude from the graph that customers with a tertiary level of education have the highest median balance.

In [None]:
# Box plot for pdays (without considering -1 values)
data = df[df.pdays != -1]
# Plot the filtered frame: plotting `df.pdays` would still include the -1
# placeholder rows that the line above is meant to exclude.
data.pdays.plot.box()
plt.show()

From the above box plot we can see that there are outliers present in pdays.

In [None]:
df.salary.plot.box()
plt.show()

#### The final goal is to make a predictive model to predict if the customer will respond positively to the campaign or not. The target variable is “response”.


In [None]:
df1=df.copy()

In [None]:
# Remove, in place, every row whose pdays value is negative
missing_pdays = df1['pdays'] < 0
df1.drop(index=df1.index[missing_pdays], inplace=True)

In [None]:
df1.pdays.describe()

In [None]:
#Plotting a horizontal bar graph with the median values of balance for each education level value. 
df1.groupby(['education'])['balance'].median().plot.barh()

In [None]:
# Box plot for pdays (without considering -1 values)
#data = df[df.pdays != -1]
df1.pdays.plot.box()
plt.show()

 From the above box plot we can see that there are outliers present in pdays.

In [None]:
df1.salary.plot.box()
plt.show()

### Bivariate Analysis

#### Converting the response variable to a convenient form

In [None]:
df1.response.value_counts(normalize=True)

In [None]:
# Encode the target in place: within the 'response' column only, "yes" -> 1 and "no" -> 0
df1.replace({'response': {"yes": 1,'no':0}},inplace=True)

In [None]:
df1.response.value_counts()

In [None]:
# Separate the column names into object-dtype ("categorical") and all other ("numeric") features.
# NOTE(review): dtypes are read from `df`, not `df1`, so a column that was
# re-encoded only in `df1` is still classified by its original dtype — confirm this is intended.
categorical = [col for col in df.columns if df[col].dtype == 'O']
numeric = [col for col in df.columns if df[col].dtype != 'O']

In [None]:
print("Object data type features ",categorical)
print("Numerical data type features ",numeric)

In [None]:
from numpy import median
# Draw one violin plot of the response per categorical column
# (the first entry of `categorical` is skipped by the [1:] slice).
for col in categorical[1:]:
    plt.figure(figsize=(8,6))
    # Pass x and y by keyword: newer seaborn releases no longer accept them
    # positionally, while the keyword form works on old and new versions alike.
    sns.violinplot(x=df1[col], y=df1["response"])
    plt.title("Response vs "+col,fontsize=15)
    plt.xlabel(col,fontsize=10)
    plt.ylabel("Response",fontsize=10)
    plt.show()

In [None]:
### Identifying categorical and numerical columns
#cols = df.columns
#numeric= df._get_numeric_data().columns
#categorical = list(set(cols) - set(numeric))

In [None]:
# Heatmap of the pairwise correlations between the numeric features of df1.
plt.figure(figsize=(8,6))
# Restrict to numeric/bool columns explicitly: recent pandas versions raise when
# corr() meets string columns, whereas older versions silently dropped them.
numeric_view = df1.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_view.corr(),annot=True,cmap='RdBu_r')
plt.title("Correlation Of Each Numerical Features")
plt.show()

#### Visualizing Categorical Features with Response

In [None]:
# One bar plot per categorical column, with the response on the x axis.
for i in df1[categorical]:
    # x and y are passed by keyword because newer seaborn releases no longer
    # accept them positionally; the keyword form works on every version.
    sns.barplot(x=df1.response, y=df1[i])
    plt.show()

In [None]:
# Side-by-side strip plots of balance against the housing-loan and personal-loan flags
plt.rcParams['figure.figsize'] = (10, 5)
for panel, loan_col in enumerate(('housing', 'loan'), start=1):
    plt.subplot(1, 2, panel)
    sns.stripplot(x=loan_col, y='balance', data=df1)

Having or not having housing loans and personal loans greatly affects the balance: customers without housing loans and personal loans tend to have a higher balance.

#### Marketing analysis

In [None]:
# Age distribution of customers, split by response: positive on top, negative below
plt.figure(figsize=(16,8))
for position, flag, label in ((211, 1, 'response=yes'), (212, 0, 'response=no')):
    plt.subplot(position)
    sns.distplot(df1[df1.response == flag].age)
    plt.ylabel(label)

##### It can be seen that most responses come from customers aged 20-60. People over the age of 20 are happy to have a deposit, while younger people under 20 are more likely to have no deposit.

### Label Encoding of Categorical Variables 

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode each categorical column independently (the encoder is fitted per column)
df2 = df1[categorical].apply(lambda column: LabelEncoder().fit_transform(column))
df2

In [None]:
f_data = df2.join(df1[numeric])

In [None]:
f_data

In [None]:
f_data.shape

In [None]:
f_data.head()

In [None]:
f_data.shape

In [None]:
df.shape

In [None]:
f_data.columns

## Predictive model 1: Logistic regression
 

In [None]:
#Predictive model 1: Logistic regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
np.random.seed(42)


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
X = f_data.drop("response", axis=1)
X.head()

In [None]:
# Select the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect y of shape (n_samples,) and emit a
# DataConversionWarning when handed a column vector.
y = f_data['response']
y.head()

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
cv_score= cross_val_score(lr,X_train,y_train, cv=5)
np.mean(cv_score)

In [None]:
lr.classes_

In [None]:
lr.coef_

In [None]:
y_pred =lr.predict(X_test)

In [None]:
# Evaluation of the logistic regression fitted on all features.
# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# swapped the confusion matrix comes out transposed (rows would be predictions).
print('Report:\n',classification_report(y_test, y_pred))
print("F1 Score:",f1_score(y_test, y_pred))
print('confusion Matrix:\n',confusion_matrix(y_test, y_pred))
print('cross validation:',cross_val_score(lr, X, y, cv=5))

In [None]:
# (y_true, y_pred) order so that rows are actual classes and columns are predictions
confusion_matrix(y_test, y_pred)

In [None]:
# (y_true, y_pred) order, matching the scikit-learn signature and the random-forest cells
f1_score(y_test, y_pred)

In [None]:
# Use RFE to select top n features in an automated fashion (choose n as you see fit)

In [None]:
from sklearn.feature_selection import RFE
# Keep the 5 top-ranked features. The count is passed by keyword because recent
# scikit-learn releases reject it as a positional argument.
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X_train,y_train)

In [None]:
print(X_train.columns[rfe.support_])

In [None]:
rfe.ranking_

In [None]:
cols = X_train.columns[rfe.support_]
cols

In [None]:
import statsmodels.api as sm

In [None]:
log1 = sm.GLM(y_train,(sm.add_constant(X_train[cols])), family = sm.families.Binomial())
log1.fit().summary()

RFE has selected the features, but some of them have high p-values; we build a model with them anyway.

In [None]:
lr.fit(X_train[cols],y_train)

In [None]:
lr.classes_

In [None]:
lr.coef_

In [None]:
y_pred2 = lr.predict(X_test[cols])

In [None]:
# Evaluation of the logistic regression refitted on the RFE-selected columns.
# Metrics take (y_true, y_pred); swapping them transposes the confusion matrix.
print('Report:\n',classification_report(y_test, y_pred2))
print("F1 Score:",f1_score(y_test, y_pred2))
print('confusion Matrix:\n',confusion_matrix(y_test, y_pred2))
# Cross-validate on the same reduced feature set the model was fitted on;
# using the full X here would just repeat the all-features score.
print('kfold cross validation:\n',cross_val_score(lr, X[cols], y, cv=5))


In [None]:
# Some features have a high p-value, so let's check with another approach, i.e. VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
X_train_sm = sm.add_constant(X_train[cols])
X_train_sm.head()

In [None]:
lr1 = sm.OLS(y_train, X_train_sm).fit()

In [None]:
lr1.summary()

In [None]:
# Build a table of every training feature with its variance inflation factor,
# rounded to two decimals and ordered from highest to lowest VIF.
# NOTE(review): the VIFs are computed over all X_train columns, not only the RFE-selected ones.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, idx) for idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

#### After looking at various factors such as VIF, let's select the features manually and build a model

In [None]:
F_cols = ['education', 'job', 'poutcome', 'marital', 'month','targeted', 'housing', 'age', 'salary','day','duration']

In [None]:
log = sm.GLM(y_train,(sm.add_constant(X_train[F_cols])), family = sm.families.Binomial())
log.fit().summary()

In [None]:
lr.fit(X_train[F_cols],y_train)

In [None]:
lr.classes_

In [None]:
lr.coef_

In [None]:
predf = lr.predict(X_test[F_cols])

In [None]:
def LR_matric():
    """Print evaluation metrics for the logistic regression fitted on ``F_cols``.

    Reads the notebook globals ``y_test``, ``predf``, ``lr``, ``X``, ``y`` and
    ``F_cols``. Prints the classification report, F1 score, confusion matrix
    and 5-fold cross-validation scores. Returns nothing.
    """
    # scikit-learn metrics take (y_true, y_pred); with the arguments swapped
    # the confusion matrix is transposed.
    print('Report:\n',classification_report(y_test, predf))
    print("F1 Score:",f1_score(y_test, predf))
    print('confusion Matrix:\n',confusion_matrix(y_test, predf))
    # Cross-validate on the manually selected features this model was fitted on,
    # not on the full feature matrix.
    print('kfold cross validation:\n',cross_val_score(lr, X[F_cols], y, cv=5))
  

In [None]:
LR_matric()

In [None]:
print("Important Features are :", F_cols)

### Predictive model 2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=5,random_state=42,max_leaf_nodes=50)
rfc.fit(X_train,y_train)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
cv1_score= cross_val_score(rfc,X_train,y_train, cv=5)
np.mean(cv1_score)

In [None]:
y_pred1 = rfc.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred1))

In [None]:
f1_score(y_test,y_pred1)

In [None]:
confusion_matrix(y_test,y_pred1)

In [None]:
def RF_matric():
    """Print evaluation metrics for the random forest fitted on all features.

    Reads the notebook globals ``y_test``, ``y_pred1``, ``rfc``, ``X`` and ``y``.
    Prints the classification report, F1 score, confusion matrix and 5-fold
    cross-validation scores. Returns nothing.
    """
    print('Report:\n',classification_report(y_test, y_pred1))
    # (y_true, y_pred) order, consistent with the standalone metric cells for
    # this model; swapping them transposes the confusion matrix.
    print("F1 Score:",f1_score(y_test, y_pred1))
    print('confusion Matrix:\n',confusion_matrix(y_test, y_pred1))
    print('cross validation:',cross_val_score(rfc, X, y, cv=5))
 

In [None]:
RF_matric()

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC is a ranking metric, so score it with the predicted probability of the
# positive class; hard 0/1 labels collapse the curve to a single threshold.
roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

## RFE

In [None]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()  # NOTE(review): not used anywhere in the visible cells
# Keep the 5 top-ranked features. The count is passed by keyword because recent
# scikit-learn releases reject it as a positional argument.
rfe1 = RFE(rfc, n_features_to_select=5)
rfe1.fit(X_train,y_train)

In [None]:
rfe1.support_

In [None]:
X_train.columns[rfe1.support_]

In [None]:
cols = X_train.columns[rfe1.support_]

In [None]:
rfc.fit(X_train[cols],y_train)

In [None]:
y_pred3 = rfc.predict(X_test[cols])

In [None]:
# (y_true, y_pred) order, matching the scikit-learn signature
f1_score(y_test, y_pred3)

In [None]:
# (y_true, y_pred) order so that rows are actual classes and columns are predictions
confusion_matrix(y_test, y_pred3)

Housing, month, pdays, poutcome and duration are the important features from the Random Forest perspective.

Housing, month, pdays, poutcome and duration are the important factors for predicting the y variable in both the logistic regression and random forest models.