### Project3a - Majority Voting

<span class="mark">Use base classifiers:
    LogisticRegression,
    KNeighborsClassifier,
    RandomForestClassifier
and VotingClassifier from sklearn to build a majority-voting ensemble model.
Compare the scores of the individual classifiers with the score of the ensemble.</span>

You are working in the credit card division of your bank. The operations head of your
company has requested your help in determining whether a customer is creditworthy
or not. You have been provided with credit card operations data.
This dataset contains credit card applications with around 15 variables. The variables
are a mix of continuous and categorical data pertaining to credit card operations.
The label for the dataset is a flag, which indicates whether the application has been
approved or not.
You want to fit some benchmark models and try some ensemble learning methods on
the dataset to address the problem and come up with a tool for predicting whether or
not a given customer should be approved for their credit application.

In [1]:
import pandas as pd

In [2]:
# Read the raw credit-approval file. It has no header row, so pandas assigns
# integer column labels, and every "?" entry is parsed as a missing value.
credData = pd.read_csv(
    'crx.data',
    sep=",",
    header=None,
    na_values="?",
)

# Preview the first five records.
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [3]:
# Recode the approval flag held in column 15: '+' becomes 1 and '-' becomes 0.
approved_rows = credData[15] == '+'
rejected_rows = credData[15] == '-'
credData.loc[approved_rows, 15] = 1
credData.loc[rejected_rows, 15] = 0

# Inspect the first 100 records after recoding.
credData.head(100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.000,u,g,w,v,1.250,t,t,1,f,g,202.0,0,1
1,a,58.67,4.460,u,g,q,h,3.040,t,t,6,f,g,43.0,560,1
2,a,24.50,0.500,u,g,q,h,1.500,t,f,0,f,g,280.0,824,1
3,b,27.83,1.540,u,g,w,v,3.750,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.710,t,f,0,f,s,120.0,0,1
5,b,32.08,4.000,u,g,m,v,2.500,t,f,0,t,g,360.0,0,1
6,b,33.17,1.040,u,g,r,h,6.500,t,f,0,t,g,164.0,31285,1
7,a,22.92,11.585,u,g,cc,v,0.040,t,f,0,f,g,80.0,1349,1
8,b,54.42,0.500,y,p,k,h,3.960,t,f,0,f,g,180.0,314,1
9,b,42.50,4.915,y,p,w,v,3.165,t,f,0,t,g,52.0,1442,1


In [4]:
# Keep only complete records: a row is discarded if any of its values is missing.
newcred = credData.dropna(how='any')

# Number of rows and columns that remain.
newcred.shape

(653, 16)

In [5]:
# Separate the categorical columns and expand them into dummy (one-hot) variables
categorical_cols = [0, 3, 4, 5, 6, 8, 9, 11, 12]
credCat = pd.get_dummies(newcred[categorical_cols])


In [6]:
# Separate the numerical columns; these are used without any encoding
numeric_cols = [1, 2, 7, 10, 13, 14]
credNum = newcred[numeric_cols]


In [7]:
# Build the feature matrix X by placing the dummy-encoded categorical columns
# side by side with the numerical columns.
X = pd.concat([credCat, credNum], axis=1)

# The dummy columns carry string labels while the numerical columns keep their
# integer labels. Recent scikit-learn releases refuse to fit on a DataFrame whose
# column labels mix types, so convert every label to a string.
X.columns = X.columns.astype(str)
print(X.shape)

# Separate the label (column 15) as the y variable
y = newcred[15]
print(y.shape)
print(y.dtype)
# Cast the label to an integer dtype.
y = y.astype(int)
print(y.dtypes)

(653, 46)
(653,)
int64
int32


In [8]:
# Normalise the features with min-max scaling
from sklearn import preprocessing

# Build the scaler object
minmaxScaler = preprocessing.MinMaxScaler()

# Fit the scaler on X and apply it. The scaled array is wrapped in a new
# DataFrame, which gets default integer row and column labels.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling — confirm this is acceptable.
scaled_values = minmaxScaler.fit_transform(X)
X_tran = pd.DataFrame(scaled_values)

# Show the first rows of the scaled features
X_tran.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.271111,0.0,0.04386,0.014925,0.101,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.713016,0.159286,0.106667,0.089552,0.0215,0.0056
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.170635,0.017857,0.052632,0.0,0.14,0.00824
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.223492,0.055,0.131579,0.074627,0.05,3e-05
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.101905,0.200893,0.06,0.0,0.06,0.0


In [9]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tran,
    y,
    test_size=0.3,
    random_state=123,
)



**Majority Voting**

In [10]:
# Import the voting ensemble and the three individual learners
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Define the three base models: LogisticRegression, KNeighborsClassifier and
# RandomForestClassifier.
log_clf = LogisticRegression(solver='lbfgs')
knn_clf = KNeighborsClassifier()
# The random forest is stochastic (bootstrap samples, random feature subsets).
# Without a seed its scores change on every run, so fix random_state, using the
# same value as the train/test split.
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=123)

In [11]:
# Combine the three base models into one ensemble. voting='hard' selects
# majority voting on the predicted class labels.
base_learners = [('lr', log_clf), ('knn', knn_clf), ('rf', rnd_clf)]
voting_clf = VotingClassifier(estimators=base_learners, voting='hard')

In [12]:
# Fit each individual classifier on the training set
for base_clf in (log_clf, knn_clf, rnd_clf):
    base_clf.fit(X_train, y_train)

# Fit the ensemble last; as the final expression, its repr is the cell output
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                

In [13]:
# Report the accuracy of every fitted model on the training and test sets,
# using each model's .score() method.
scored_models = (
    ("LogisticRegression train/test score ->", log_clf),
    ("KNeighborsClassifier train/test score ->", knn_clf),
    ("RandomForestClassifier train/test score ->", rnd_clf),
    ("Majority Voting train/test score ->", voting_clf),
)
for score_label, fitted_clf in scored_models:
    print(score_label, fitted_clf.score(X_train, y_train), "/", fitted_clf.score(X_test, y_test))

LogisticRegression train/test score -> 0.8774617067833698 / 0.8877551020408163
KNeighborsClassifier train/test score -> 0.8905908096280087 / 0.8673469387755102
RandomForestClassifier train/test score -> 1.0 / 0.9132653061224489
Majority Voting train/test score -> 0.9146608315098468 / 0.9030612244897959


In [14]:
# Predict the test-set labels with each individual model and with the ensemble
log_pred, knn_pred, rnd_pred, mv_pred = (
    fitted_clf.predict(X_test)
    for fitted_clf in (log_clf, knn_clf, rnd_clf, voting_clf)
)

In [15]:
# Print the test-set confusion matrix of every model
from sklearn.metrics import confusion_matrix

matrix_inputs = (
    ("LogisticRegression confusion_matrix", log_pred),
    ("KNeighborsClassifier confusion_matrix", knn_pred),
    ("RandomForestClassifier confusion_matrix", rnd_pred),
    ("Majority Voting confusion_matrix", mv_pred),
)
for matrix_title, test_pred in matrix_inputs:
    print(matrix_title)
    print(confusion_matrix(y_test, test_pred))

LogisticRegression confusion_matrix
[[93 14]
 [ 8 81]]
KNeighborsClassifier confusion_matrix
[[98  9]
 [17 72]]
RandomForestClassifier confusion_matrix
[[97 10]
 [ 7 82]]
Majority Voting confusion_matrix
[[95 12]
 [ 7 82]]


In [16]:
# Print the test-set classification report (precision, recall, f1-score, support)
# of every model
from sklearn.metrics import classification_report

report_inputs = (
    ("LogisticRegression classification_report", log_pred),
    ("KNeighborsClassifier classification_report", knn_pred),
    ("RandomForestClassifier classification_report", rnd_pred),
    ("Majority Voting classification_report", mv_pred),
)
for report_title, test_pred in report_inputs:
    print(report_title)
    print(classification_report(y_test, test_pred))

LogisticRegression classification_report
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       107
           1       0.85      0.91      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196

KNeighborsClassifier classification_report
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       107
           1       0.89      0.81      0.85        89

    accuracy                           0.87       196
   macro avg       0.87      0.86      0.86       196
weighted avg       0.87      0.87      0.87       196

RandomForestClassifier classification_report
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       107
           1       0.89      0.92      0.91        89

    accuracy                           0.91       196