In [2]:
from pandas import read_csv, get_dummies, Series, DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline    #method 3
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV   #method 3

In [3]:
#Reading the Dataset
data = read_csv('/content/drive/MyDrive/ML_Stats/bank.csv')
data.info()

#Exploring the data
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


(4521, 17)

In [4]:
# Data Cleaning
# Check for missing values
print(data.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [6]:
#Encoding
# a. Binary Encoding
data['default'] = data['default'].map({'yes':1, 'no':0})
data['housing'] = data['housing'].map({'yes':1, 'no':0})
data['loan'] = data['loan'].map({'yes':1, 'no':0})
data['y'] = data['y'].map({'yes':1, 'no':0})    #converting target variable to binary

# b. One-hot encoding - Dimentionality reduction
#to avoid overfitting - "Curse of Dimentionality"
#Label Encoding
data['education'] = data['education'].map({'primary':1,'secondary':2,'tertiary':3, 'unknown':4})
data['contact'] = data['contact'].map({'cellular':1,'unknown':2,'telephone':3})
data['month'] = data['month'].map({'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12})
data['poutcome'] = data['poutcome'].map({'unknown':1,'failure':2,'other':3,'success':4})
#data.info()

#get dummies
data2 = get_dummies(data, columns = ['job', 'marital'], dtype=int)
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   age                4521 non-null   int64
 1   education          4521 non-null   int64
 2   default            4521 non-null   int64
 3   balance            4521 non-null   int64
 4   housing            4521 non-null   int64
 5   loan               4521 non-null   int64
 6   contact            4521 non-null   int64
 7   day                4521 non-null   int64
 8   month              4521 non-null   int64
 9   duration           4521 non-null   int64
 10  campaign           4521 non-null   int64
 11  pdays              4521 non-null   int64
 12  previous           4521 non-null   int64
 13  poutcome           4521 non-null   int64
 14  y                  4521 non-null   int64
 15  job_admin.         4521 non-null   int64
 16  job_blue-collar    4521 non-null   int64
 17  job_entreprene

In [7]:
#X & Y - dropping the output
X = data2.drop('y', axis = 1)
Y = data2['y']

#Scaling
X_scaled = StandardScaler().fit_transform(X)

#Spliting data
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.3, random_state=61)

#balancing the data - SMOTE
X_train, Y_train = SMOTE (random_state = 61).fit_resample(X_train,Y_train)


## Random-forest Classifier

In [8]:
# Random Forest Classifier (method 2) - to get the best value for Hpyprparameter - n_estimators
RF_classifier2 = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=36)  #building model/classifier

no_trees = {'n_estimators': [50, 100, 200, 250, 300, 350, 400, 450]}
RF_grid_search = GridSearchCV(estimator=RF_classifier2, param_grid=no_trees, scoring='recall', cv=5)

RF_grid_search.fit(X_scaled, Y)   #fit: training, testing, evaluating & ranking

RF_best_parameters = RF_grid_search.best_params_
print(RF_best_parameters)
RF_best_result = RF_grid_search.best_score_
print(RF_best_result)

{'n_estimators': 50}
0.2379120879120879


In [12]:
#n_etimators found - go ahead with Method 1
RF_classifier = RandomForestClassifier(n_estimators=50, criterion='entropy', max_features='sqrt', random_state=36)  #building model/classifier
RF_classifier.fit(X_train, Y_train)        #training
Y_pred_RF = RF_classifier.predict(X_test)  #testing

#evaluation  &  confusion matrix
RF_Accuracy = metrics.accuracy_score(Y_test, Y_pred_RF)
print('Accuracy of Random Forest is :', round(RF_Accuracy,4))

RF_recall = metrics.recall_score(Y_test, Y_pred_RF)
print('Recall value for Random Forest is :', round(RF_recall,4))

RF_precision = metrics.precision_score(Y_test, Y_pred_RF)
print('Precisoin value for Random Forest is :', round(RF_precision,4))

RF_f1 = metrics.f1_score(Y_test, Y_pred_RF)
print('f1 score of Random forest is :', round(RF_f1,4))

print('')
RF_conf_matrix = metrics.confusion_matrix(Y_test, Y_pred_RF)
print("Confusion MAtrix of Random Forest is :")
print(RF_conf_matrix)

#important features
RF_imp_features = Series(RF_classifier.feature_importances_, index=list(X)).sort_values(ascending=False)
print('')
print(RF_imp_features)

Accuracy of Random Forest is : 0.9013
Recall value for Random Forest is : 0.4204
Precisoin value for Random Forest is : 0.6055
f1 score of Random forest is : 0.4962

Confusion MAtrix of Random Forest is :
[[1157   43]
 [  91   66]]


## Support Vector Classification

In [19]:
#Initiating with Method 3 - to get values of Hyperparameters: c & kernel
SV_classifier2 = Pipeline([('balancing', SMOTE(random_state = 101)), ('classification', SVC())])  #building model/classifier

kernels_c = {'classification__kernel': ['linear','poly','rbf','sigmoid'], 'classification__C': [.001,.01,.1,1,10,100]}
SV_grid_search = GridSearchCV(estimator = SV_classifier2, param_grid=kernels_c, scoring='recall', cv=5)  # Initialize GridSearchCV

SV_grid_search.fit(X_scaled, Y)   # Fit the model #fit: training, testing, evaluating & ranking

SV_best_parameters = SV_grid_search.best_params_
print(SV_best_parameters)
SV_best_result = SV_grid_search.best_score_
print(SV_best_result)

{'classification__C': 0.001, 'classification__kernel': 'rbf'}
0.8157326007326008


In [20]:
#building SVM using method 1
#from sklearn.svm import SVC - importing library
SV_classifier = SVC(kernel = 'rbf', C = 0.001)        # 1.building model/classifier
SV_classifier.fit(X_train, Y_train)       # 2.training
Y_pred_SV = SV_classifier.predict(X_test)  # 3.testing

# 4.evaluation          & Confusion Matrix
Accuracy_SV = metrics.accuracy_score(Y_test, Y_pred_SV)      #Calculating Accuracy
print('Support Vector Accuracy is: ', round(Accuracy_SV,4))

recall_SV = metrics.recall_score(Y_test, Y_pred_SV)         #Calculating Recall
print('Support Vector Recall is: ', round(recall_SV,4))

precision_SV = metrics.precision_score(Y_test, Y_pred_SV)    #Calculating Precision
print('Support Vector Precision is: ', round(precision_SV,4))

f1_SV = metrics.f1_score(Y_test, Y_pred_SV)               #Calculating f1
print('Support Vector f1 is: ', round(f1_SV,4))

print('')
conf_matrix_SV = metrics.confusion_matrix(Y_test, Y_pred_SV)   #Calculating Confusion Matrix
print('Support Vector Confusion Matrix is: ')
print(conf_matrix_SV)

Support Vector Accuracy is:  0.6514
Support Vector Recall is:  0.8153
Support Vector Precision is:  0.2238
Support Vector f1 is:  0.3512

Support Vector Confusion Matrix is: 
[[756 444]
 [ 29 128]]
