## Importing the important libraries:

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# I will compare the performance of 7 different classification algorithms on the same dataset and decide which is the best.

Reading in the dataset:
This dataset contains information about some features of patients and the likelihood of having a heart attack.
(output : 0= less chance of heart attack | 1= more chance of heart attack)

The features of the dataset:

*   Age : Age of the patient
*   Sex : Sex of the patient
*   exang: exercise induced angina (1 = yes; 0 = no)
*   ca: number of major vessels (0-3)
*   cp : chest pain type (Value 1: typical angina | Value 2: atypical angina | Value 3: non-anginal pain | Value 4: asymptomatic)
*   trtbps : resting blood pressure (in mm Hg)
*   chol : cholesterol in mg/dl fetched via BMI sensor
*   fbs : fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
*   rest_ecg : resting electrocardiographic results(Value 0: normal | Value 1: having ST-T wave abnormality | Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria)
*   thalach : maximum heart rate achieved


*   (https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset)


In [None]:
# Load the heart-attack dataset into a DataFrame.
# NOTE(review): '/content/' looks like a Google Colab upload path — adjust when running elsewhere.
heart_data = pd.read_csv('/content/heart.csv')

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
heart_data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# Number of non-null entries in the 'age' column (used here as the row count).
heart_data['age'].count()

303

We can see here that there are no null values in the heart_data dataset. The duplicate check finds one duplicated row (shape (1, 14)), which is left in the data.

In [None]:
# Missing values per column.
print(heart_data.isna().sum())

# Rows that occur again later in the frame (keep='last' leaves the final
# occurrence unmarked); the shape shows how many such rows exist.
duplicate = heart_data.loc[heart_data.duplicated(keep='last')]
duplicate.shape

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64


(1, 14)

Splitting the data into input and output, train and test

In [None]:
# Separate the feature columns from the target column 'output'.
X = heart_data.drop('output', axis=1)
y = heart_data['output']

# Hold out 25% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits; random_state pins the shuffle so the split (and every
# score computed from it) is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

The classes are roughly balanced, so we don't have to do anything about that:

In [None]:
# Class distribution of the training labels.
y_train.value_counts()

1    124
0    103
Name: output, dtype: int64

We have to scale the features so that the algorithms work efficiently:

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, so that no
# information from the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Next I will instantiate the MLP (Multilayer Perceptron) classifier and fit the model to the data. We have some scores to measure the algorithm.
Scores(The higher the number the better the algorithm):

*   Confusion Matrix
*   Accuracy: (TP+TN)/(TP+FP+TN+FN) 
*   Precision: (TP)/(TP+FP)

*   Recall: (TP)/(TP+FN)
*   F1 score: 2*((precision*recall)/(precision+recall))

In [None]:
# Single hidden layer of 50 ReLU units trained with Adam; random_state fixes
# the weight initialisation, verbose makes fit() print the loss per iteration.
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='relu',
    solver='adam',
    alpha=1e-4,
    learning_rate_init=0.1,
    max_iter=50,
    random_state=1,
    verbose=10,
)
mlp.fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
mlp_score = mlp.score(X_train, y_train)
print("Training set score: %f" % mlp_score)
print("Test set score: %f" % mlp.score(X_test, y_test))

Iteration 1, loss = 0.72680957
Iteration 2, loss = 0.37081074
Iteration 3, loss = 0.35300210
Iteration 4, loss = 0.37889418
Iteration 5, loss = 0.30333785
Iteration 6, loss = 0.35433521
Iteration 7, loss = 0.33134458
Iteration 8, loss = 0.33303388
Iteration 9, loss = 0.26795896
Iteration 10, loss = 0.25501936
Iteration 11, loss = 0.21995701
Iteration 12, loss = 0.25056878
Iteration 13, loss = 0.23825977
Iteration 14, loss = 0.25111930
Iteration 15, loss = 0.25185875
Iteration 16, loss = 0.20013062
Iteration 17, loss = 0.19501216
Iteration 18, loss = 0.21100152
Iteration 19, loss = 0.18530990
Iteration 20, loss = 0.15929954
Iteration 21, loss = 0.16370038
Iteration 22, loss = 0.15044205
Iteration 23, loss = 0.15440229
Iteration 24, loss = 0.15546570
Iteration 25, loss = 0.14546695
Iteration 26, loss = 0.13626121
Iteration 27, loss = 0.13190747
Iteration 28, loss = 0.11542884
Iteration 29, loss = 0.08641921
Iteration 30, loss = 0.07899996
Iteration 31, loss = 0.10117194
Iteration 32, los



Next step is to do the prediction. We can see some more scores.

In [None]:
# Class predictions of the fitted MLP for the held-out test rows.
MLP_predict = mlp.predict(X_test)
accuracy_MLP = accuracy_score(y_true=y_test, y_pred=MLP_predict)

print('Accuracy: ', accuracy_MLP)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=MLP_predict))

Accuracy:  0.7631578947368421
              precision    recall  f1-score   support

           0       0.76      0.71      0.74        35
           1       0.77      0.80      0.79        41

    accuracy                           0.76        76
   macro avg       0.76      0.76      0.76        76
weighted avg       0.76      0.76      0.76        76



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_MLP = confusion_matrix(y_true=y_test, y_pred=MLP_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_MLP = f1_score(y_true=y_test, y_pred=MLP_predict)
recall_MLP = recall_score(y_true=y_test, y_pred=MLP_predict)
print(matrix_MLP)

[[25 10]
 [ 8 33]]


Prediction using the Logistic Regression algorithm:

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline; fit() returns the fitted estimator itself.
classifier = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
lr_score = classifier.score(X_train, y_train)
print("Training set score: %f" % lr_score)
print("Test set score: %f" % classifier.score(X_test, y_test))

Training set score: 0.837004
Test set score: 0.842105


In [None]:
# Class predictions of the logistic regression for the held-out test rows.
LR_predict = classifier.predict(X_test)
accuracy_LR = accuracy_score(y_true=y_test, y_pred=LR_predict)

print('Accuracy: ', accuracy_LR)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=LR_predict))

Accuracy:  0.8421052631578947
              precision    recall  f1-score   support

           0       0.87      0.77      0.82        35
           1       0.82      0.90      0.86        41

    accuracy                           0.84        76
   macro avg       0.85      0.84      0.84        76
weighted avg       0.84      0.84      0.84        76



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_LR = confusion_matrix(y_true=y_test, y_pred=LR_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_LR = f1_score(y_true=y_test, y_pred=LR_predict)
recall_LR = recall_score(y_true=y_test, y_pred=LR_predict)
print(matrix_LR)

[[27  8]
 [ 4 37]]


Prediction using Decision Tree Classifier algorithm:

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state fixes the tie-breaking between equally good splits, so the
# fitted tree and the scores below are the same on every run.
dtc = DecisionTreeClassifier(random_state=0)

dtc.fit(X_train, y_train)
# Mean accuracy on the training split, then on the held-out test split.
dtc_score = dtc.score(X_train, y_train)
print("Training set score: %f" % dtc_score)
print("Test set score: %f" % dtc.score(X_test, y_test))

Training set score: 1.000000
Test set score: 0.697368


In [None]:
# Class predictions of the decision tree for the held-out test rows.
DTC_predict = dtc.predict(X_test)
accuracy_DTC = accuracy_score(y_true=y_test, y_pred=DTC_predict)

print('Accuracy: ', accuracy_DTC)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=DTC_predict))

Accuracy:  0.6973684210526315
              precision    recall  f1-score   support

           0       0.68      0.66      0.67        35
           1       0.71      0.73      0.72        41

    accuracy                           0.70        76
   macro avg       0.70      0.69      0.69        76
weighted avg       0.70      0.70      0.70        76



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_DTC = confusion_matrix(y_true=y_test, y_pred=DTC_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_DTC = f1_score(y_true=y_test, y_pred=DTC_predict)
recall_DTC = recall_score(y_true=y_test, y_pred=DTC_predict)
print(matrix_DTC)

[[23 12]
 [11 30]]


Prediction using Random Forest Classifier algorithm:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and feature sub-sampling, so the
# fitted forest and the scores below are the same on every run.
rfc = RandomForestClassifier(random_state=0)

rfc.fit(X_train, y_train)
# Mean accuracy on the training split, then on the held-out test split.
rfc_score = rfc.score(X_train, y_train)
print("Training set score: %f" % rfc_score)
print("Test set score: %f" % rfc.score(X_test, y_test))

Training set score: 1.000000
Test set score: 0.789474


In [None]:
# Class predictions of the random forest for the held-out test rows.
RFC_predict = rfc.predict(X_test)
accuracy_RFC = accuracy_score(y_true=y_test, y_pred=RFC_predict)

print('Accuracy: ', accuracy_RFC)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=RFC_predict))

Accuracy:  0.7894736842105263
              precision    recall  f1-score   support

           0       0.81      0.71      0.76        35
           1       0.78      0.85      0.81        41

    accuracy                           0.79        76
   macro avg       0.79      0.78      0.79        76
weighted avg       0.79      0.79      0.79        76



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_RFC = confusion_matrix(y_true=y_test, y_pred=RFC_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_RFC = f1_score(y_true=y_test, y_pred=RFC_predict)
recall_RFC = recall_score(y_true=y_test, y_pred=RFC_predict)
print(matrix_RFC)

[[25 10]
 [ 6 35]]


Prediction Using the Support Vector Machine algorithm:

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with library defaults; fit() returns the estimator.
svc = SVC().fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
svc_score = svc.score(X_train, y_train)
print("Training set score: %f" % svc_score)
print("Test set score: %f" % svc.score(X_test, y_test))

Training set score: 0.938326
Test set score: 0.802632


In [None]:
# Class predictions of the SVM for the held-out test rows.
SVC_predict = svc.predict(X_test)
accuracy_SVC = accuracy_score(y_true=y_test, y_pred=SVC_predict)

print('Accuracy: ', accuracy_SVC)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=SVC_predict))

Accuracy:  0.8026315789473685
              precision    recall  f1-score   support

           0       0.83      0.71      0.77        35
           1       0.78      0.88      0.83        41

    accuracy                           0.80        76
   macro avg       0.81      0.80      0.80        76
weighted avg       0.81      0.80      0.80        76



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_SVC = confusion_matrix(y_true=y_test, y_pred=SVC_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_SVC = f1_score(y_true=y_test, y_pred=SVC_predict)
recall_SVC = recall_score(y_true=y_test, y_pred=SVC_predict)
print(matrix_SVC)

[[25 10]
 [ 5 36]]


Prediction using the K-Nearest Neighbour algorithm:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with library defaults; fit() returns the estimator.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
knn_score = knn.score(X_train, y_train)
print("Training set score: %f" % knn_score)
print("Test set score: %f" % knn.score(X_test, y_test))

Training set score: 0.876652
Test set score: 0.802632


In [None]:
# Class predictions of k-NN for the held-out test rows.
KNN_predict = knn.predict(X_test)
accuracy_KNN = accuracy_score(y_true=y_test, y_pred=KNN_predict)

print('Accuracy: ', accuracy_KNN)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=KNN_predict))

Accuracy:  0.8026315789473685
              precision    recall  f1-score   support

           0       0.81      0.74      0.78        35
           1       0.80      0.85      0.82        41

    accuracy                           0.80        76
   macro avg       0.80      0.80      0.80        76
weighted avg       0.80      0.80      0.80        76



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_KNN = confusion_matrix(y_true=y_test, y_pred=KNN_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_KNN = f1_score(y_true=y_test, y_pred=KNN_predict)
recall_KNN = recall_score(y_true=y_test, y_pred=KNN_predict)
print(matrix_KNN)

[[26  9]
 [ 6 35]]


Prediction using the Naive Bayes algorithm:

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library defaults; fit() returns the estimator.
gnb = GaussianNB().fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
gnb_score = gnb.score(X_train, y_train)
print("Training set score: %f" % gnb_score)
print("Test set score: %f" % gnb.score(X_test, y_test))

Training set score: 0.841410
Test set score: 0.802632


In [None]:
# Class predictions of naive Bayes for the held-out test rows.
GNB_predict = gnb.predict(X_test)
accuracy_GNB = accuracy_score(y_true=y_test, y_pred=GNB_predict)

print('Accuracy: ', accuracy_GNB)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=GNB_predict))

Accuracy:  0.8026315789473685
              precision    recall  f1-score   support

           0       0.79      0.77      0.78        35
           1       0.81      0.83      0.82        41

    accuracy                           0.80        76
   macro avg       0.80      0.80      0.80        76
weighted avg       0.80      0.80      0.80        76



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_GNB = confusion_matrix(y_true=y_test, y_pred=GNB_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_GNB = f1_score(y_true=y_test, y_pred=GNB_predict)
recall_GNB = recall_score(y_true=y_test, y_pred=GNB_predict)
print(matrix_GNB)

[[27  8]
 [ 7 34]]


Table of the accuracy scores and the training set scores:

In [None]:
# Row labels of the summary table; every score list below follows this order.
algorithms = ['MLP', 'LR', 'DTC', 'RFC', 'SVC', 'KNN', 'GNB']

# Test-set accuracy, training-set accuracy, F1 and recall for each model.
acc_scores = [accuracy_MLP, accuracy_LR, accuracy_DTC, accuracy_RFC,
              accuracy_SVC, accuracy_KNN, accuracy_GNB]
training_set_score = [mlp_score, lr_score, dtc_score, rfc_score,
                      svc_score, knn_score, gnb_score]
f1_scores = [f1_MLP, f1_LR, f1_DTC, f1_RFC, f1_SVC, f1_KNN, f1_GNB]
recall_scores = [recall_MLP, recall_LR, recall_DTC, recall_RFC,
                 recall_SVC, recall_KNN, recall_GNB]

# One row per algorithm, one column per score.
accuracy_df = pd.DataFrame(
    {
        'Accuracy_score': acc_scores,
        'Trainig_score': training_set_score,
        'F1_scores': f1_scores,
        'Recall': recall_scores,
    },
    index=algorithms,
)

accuracy_df

Unnamed: 0,Accuracy_score,Trainig_score,F1_scores,Recall
MLP,0.763158,0.982379,0.785714,0.804878
LR,0.842105,0.837004,0.860465,0.902439
DTC,0.697368,1.0,0.722892,0.731707
RFC,0.789474,1.0,0.813953,0.853659
SVC,0.802632,0.938326,0.827586,0.878049
KNN,0.802632,0.876652,0.823529,0.853659
GNB,0.802632,0.84141,0.819277,0.829268


# I have tried the same thing with a different dataset which has more data.

This dataset contains data on 32561 persons, and the purpose is to create a model which classifies whether a person's salary is above 50K ('>50K') or not ('<=50K').

(https://www.kaggle.com/datasets/galshochat/classification-problem-yes-or-no-50k-salary)

In [None]:
# Column labels supplied by hand; they are passed to read_csv via names=.
names = [
    'Age', 'Workclass', 'fnlwgt', 'Education', 'Education-num',
    'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native_country',
    'Salary',
]

# Comma is read_csv's default separator, so it does not need to be spelled out.
adult_data = pd.read_csv('/content/adult.data', names=names)
adult_data.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native_country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


There are no null values in adult_data. The duplicate check reports 24 duplicated rows (shape (24, 15)), which are left in the data.

In [None]:
# Missing values per column.
print(adult_data.isna().sum())

# Rows that occur again later in the frame (keep='last' leaves the final
# occurrence unmarked); the shape shows how many such rows exist.
duplicate = adult_data.loc[adult_data.duplicated(keep='last')]
duplicate.shape

Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital-Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native_country    0
Salary            0
dtype: int64


(24, 15)

Listing the numeric and non-numeric columns:

In [None]:
# Split the frame by dtype: object-typed (string) columns versus the rest.
num_df = adult_data.select_dtypes(exclude='object')
cat_df = adult_data.select_dtypes(include='object')

def printColumnTypes(non_numeric_df, numeric_df):
    """Print the column labels of two frames under separate headings.

    Iterates over each argument (iterating a DataFrame yields its column
    labels) and prints one label per line, first under "Non-Numeric columns:"
    and then, after a blank line, under "Numeric columns:".

    Args:
        non_numeric_df: iterable of labels listed under the first heading.
        numeric_df: iterable of labels listed under the second heading.

    Returns:
        None. The function only writes to standard output.
    """
    sections = (
        ("Non-Numeric columns:", non_numeric_df),
        ("Numeric columns:", numeric_df),
    )
    for position, (heading, frame) in enumerate(sections):
        if position:
            # Blank separator line between the two listings.
            print("")
        print(heading)
        for column_label in frame:
            print(f"{column_label}")
        
# List the object-typed columns first, then the remaining (numeric) ones.
printColumnTypes(cat_df, num_df)

Non-Numeric columns:
Workclass
Education
Marital-Status
Occupation
Relationship
Race
Sex
Native_country
Salary

Numeric columns:
Age
fnlwgt
Education-num
Capital-gain
Capital-loss
Hours-per-week


Checking whether the dataset has faulty or 0 values

In [None]:
# Frequency of each Workclass category; placeholder entries such as ' ?' show up here.
adult_data['Workclass'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: Workclass, dtype: int64

This cell removes the rows that contain only the placeholder ' ?' instead of an actual value in Workclass, Occupation or Native_country. It also removes the rows whose Workclass is ' Without-pay'.

In [None]:
# Columns in which the raw file uses ' ?' (with a leading space) as a
# placeholder for a missing value.
to_drop = ['Native_country', 'Workclass', 'Occupation']

# True for every row that has the placeholder in at least one of those columns.
has_placeholder = adult_data[to_drop].eq(' ?').any(axis=1)
# Rows with Workclass ' Without-pay' are removed as well.
is_without_pay = adult_data['Workclass'].eq(' Without-pay')

# A single vectorized filter replaces the row-by-row drop loop; .copy() gives
# an independent frame so later in-place column edits do not raise
# SettingWithCopyWarning.
adult_data = adult_data[~(has_placeholder | is_without_pay)].copy()

adult_data.count()

Age               30148
Workclass         30148
fnlwgt            30148
Education         30148
Education-num     30148
Marital-Status    30148
Occupation        30148
Relationship      30148
Race              30148
Sex               30148
Capital-gain      30148
Capital-loss      30148
Hours-per-week    30148
Native_country    30148
Salary            30148
dtype: int64

Converting non-numeric values into numeric ones, for example
(Salary =>  0 '<=50K')
(Salary =>  1 '>50K')

(I'm not sure if this is the best approach, but I found this)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# The object-typed columns to encode, in the order they are processed.
names = ['Workclass','Education','Marital-Status','Occupation','Relationship','Race','Sex','Native_country','Salary']

# Replace each categorical column with integer codes. The encoder is re-fitted
# per column, so after the loop `le` holds the mapping of the last column
# ('Salary'). Dropping and re-adding a column moves it to the end of the
# frame, which is why the encoded columns end up after the numeric ones.
# NOTE(review): LabelEncoder assigns arbitrary integer codes, which imposes an
# order on nominal features — consider one-hot encoding for the inputs.
for column in names:
    label = le.fit_transform(adult_data[column])
    adult_data.drop(column, axis=1, inplace=True)
    adult_data[column] = label

In [None]:
# Inspect the encoded frame: every column should now be numeric.
adult_data.head(15)

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass,Education,Marital-Status,Occupation,Relationship,Race,Sex,Native_country,Salary
0,39,77516,13,2174,0,40,5,9,4,0,1,4,1,38,0
1,50,83311,13,0,0,13,4,9,2,3,0,4,1,38,0
2,38,215646,9,0,0,40,2,11,0,5,1,4,1,38,0
3,53,234721,7,0,0,40,2,1,2,5,0,2,1,38,0
4,28,338409,13,0,0,40,2,9,2,9,5,2,0,4,0
5,37,284582,14,0,0,40,2,12,2,3,5,4,0,38,0
6,49,160187,5,0,0,16,2,6,3,7,1,2,0,22,0
7,52,209642,9,0,0,45,4,11,2,3,0,4,1,38,1
8,31,45781,14,14084,0,50,2,12,4,9,1,4,0,38,1
9,42,159449,13,5178,0,40,2,9,2,3,0,4,1,38,1


Splitting the data into input and output, train and test

In [None]:
# Separate the feature columns from the encoded target column 'Salary'.
X = adult_data.drop('Salary', axis=1)
y = adult_data['Salary']

print(X.shape,y.shape)
# Hold out 40% of the rows for testing. The classes are imbalanced, so
# stratify=y keeps the class ratio the same in both splits (as done for the
# heart dataset); random_state pins the shuffle so the split and every score
# computed from it are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=0
)

(30148, 14) (30148,)


In [None]:
# Class distribution of the training labels (0 = '<=50K', 1 = '>50K').
y_train.value_counts()

0    13573
1     4515
Name: Salary, dtype: int64

We have to scale the features so that the algorithms work efficiently:

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, so that no
# information from the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Next I will instantiate the MLP classifier and fit the model to the data. We have some scores to measure the fit.

In [None]:
# Single hidden layer of 50 logistic units trained with Adam; random_state
# fixes the weight initialisation, verbose makes fit() print the loss per
# iteration. Training stops after at most 10 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='logistic',
    solver='adam',
    alpha=1e-4,
    learning_rate_init=0.01,
    max_iter=10,
    random_state=1,
    verbose=10,
)
mlp.fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
mlp_score = mlp.score(X_train, y_train)
print("Training set score: %f" % mlp_score)
print("Test set score: %f" % mlp.score(X_test, y_test))

Iteration 1, loss = 0.43919508
Iteration 2, loss = 0.39281770
Iteration 3, loss = 0.37854102
Iteration 4, loss = 0.35846603
Iteration 5, loss = 0.34228414
Iteration 6, loss = 0.33491250
Iteration 7, loss = 0.33189848
Iteration 8, loss = 0.32916024
Iteration 9, loss = 0.32758904
Iteration 10, loss = 0.32549859
Training set score: 0.847800
Test set score: 0.842952




Next step is to do the prediction. We can see some scores here as well and we can evaluate that the prediction is very accurate.

In [None]:
# Class predictions of the fitted MLP for the held-out test rows.
MLP_predict = mlp.predict(X_test)
accuracy_MLP = accuracy_score(y_true=y_test, y_pred=MLP_predict)

print('Accuracy: ', accuracy_MLP)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=MLP_predict))

Accuracy:  0.8429519071310116
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      9067
           1       0.73      0.58      0.65      2993

    accuracy                           0.84     12060
   macro avg       0.80      0.75      0.77     12060
weighted avg       0.84      0.84      0.84     12060



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_MLP = confusion_matrix(y_true=y_test, y_pred=MLP_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_MLP = f1_score(y_true=y_test, y_pred=MLP_predict)
recall_MLP = recall_score(y_true=y_test, y_pred=MLP_predict)
print(matrix_MLP)

[[8434  633]
 [1261 1732]]


Prediction using the Logistic Regression algorithm:

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline; fit() returns the fitted estimator itself.
classifier = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
lr_score = classifier.score(X_train, y_train)
print("Training set score: %f" % lr_score)
print("Test set score: %f" % classifier.score(X_test, y_test))

Training set score: 0.821760
Test set score: 0.818988


In [None]:
# Class predictions of the logistic regression for the held-out test rows.
LR_predict = classifier.predict(X_test)
accuracy_LR = accuracy_score(y_true=y_test, y_pred=LR_predict)

print('Accuracy: ', accuracy_LR)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=LR_predict))

Accuracy:  0.8189883913764511
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      9067
           1       0.71      0.46      0.56      2993

    accuracy                           0.82     12060
   macro avg       0.77      0.70      0.72     12060
weighted avg       0.81      0.82      0.80     12060



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_LR = confusion_matrix(y_true=y_test, y_pred=LR_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_LR = f1_score(y_true=y_test, y_pred=LR_predict)
recall_LR = recall_score(y_true=y_test, y_pred=LR_predict)
print(matrix_LR)

[[8507  560]
 [1623 1370]]


Prediction using Decision Tree Classifier algorithm:

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state fixes the tie-breaking between equally good splits, so the
# fitted tree and the scores below are the same on every run.
dtc = DecisionTreeClassifier(random_state=0)

dtc.fit(X_train, y_train)
# Mean accuracy on the training split, then on the held-out test split.
dtc_score = dtc.score(X_train, y_train)
print("Training set score: %f" % dtc_score)
print("Test set score: %f" % dtc.score(X_test, y_test))

Training set score: 1.000000
Test set score: 0.804395


In [None]:
# Class predictions of the decision tree for the held-out test rows.
DTC_predict = dtc.predict(X_test)
accuracy_DTC = accuracy_score(y_true=y_test, y_pred=DTC_predict)

print('Accuracy: ', accuracy_DTC)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=DTC_predict))

Accuracy:  0.8043946932006634
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      9067
           1       0.60      0.63      0.61      2993

    accuracy                           0.80     12060
   macro avg       0.74      0.74      0.74     12060
weighted avg       0.81      0.80      0.81     12060



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_DTC = confusion_matrix(y_true=y_test, y_pred=DTC_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_DTC = f1_score(y_true=y_test, y_pred=DTC_predict)
recall_DTC = recall_score(y_true=y_test, y_pred=DTC_predict)
print(matrix_DTC)

[[7829 1238]
 [1121 1872]]


Prediction using Random Forest Classifier algorithm:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and feature sub-sampling, so the
# fitted forest and the scores below are the same on every run.
rfc = RandomForestClassifier(random_state=0)

rfc.fit(X_train, y_train)
# Mean accuracy on the training split, then on the held-out test split.
rfc_score = rfc.score(X_train, y_train)
print("Training set score: %f" % rfc_score)
print("Test set score: %f" % rfc.score(X_test, y_test))

Training set score: 1.000000
Test set score: 0.850995


In [None]:
# Class predictions of the random forest for the held-out test rows.
RFC_predict = rfc.predict(X_test)
accuracy_RFC = accuracy_score(y_true=y_test, y_pred=RFC_predict)

print('Accuracy: ', accuracy_RFC)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=RFC_predict))

Accuracy:  0.8509950248756218
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      9067
           1       0.73      0.63      0.68      2993

    accuracy                           0.85     12060
   macro avg       0.81      0.78      0.79     12060
weighted avg       0.85      0.85      0.85     12060



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_RFC = confusion_matrix(y_true=y_test, y_pred=RFC_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_RFC = f1_score(y_true=y_test, y_pred=RFC_predict)
recall_RFC = recall_score(y_true=y_test, y_pred=RFC_predict)
print(matrix_RFC)

[[8376  691]
 [1106 1887]]


Prediction Using the Support Vector Machine algorithm:

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with library defaults; fit() returns the estimator.
svc = SVC().fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
svc_score = svc.score(X_train, y_train)
print("Training set score: %f" % svc_score)
print("Test set score: %f" % svc.score(X_test, y_test))

Training set score: 0.853826
Test set score: 0.844113


In [None]:
# Class predictions of the SVM for the held-out test rows.
SVC_predict = svc.predict(X_test)
accuracy_SVC = accuracy_score(y_true=y_test, y_pred=SVC_predict)

print('Accuracy: ', accuracy_SVC)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=SVC_predict))

Accuracy:  0.8441127694859039
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      9067
           1       0.75      0.55      0.64      2993

    accuracy                           0.84     12060
   macro avg       0.81      0.75      0.77     12060
weighted avg       0.84      0.84      0.84     12060



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_SVC = confusion_matrix(y_true=y_test, y_pred=SVC_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_SVC = f1_score(y_true=y_test, y_pred=SVC_predict)
recall_SVC = recall_score(y_true=y_test, y_pred=SVC_predict)
print(matrix_SVC)

[[8524  543]
 [1337 1656]]


Prediction using the K-Nearest Neighbour algorithm:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with library defaults; fit() returns the estimator.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
knn_score = knn.score(X_train, y_train)
print("Training set score: %f" % knn_score)
print("Test set score: %f" % knn.score(X_test, y_test))

Training set score: 0.872346
Test set score: 0.826036


In [None]:
# Class predictions of k-NN for the held-out test rows.
KNN_predict = knn.predict(X_test)
accuracy_KNN = accuracy_score(y_true=y_test, y_pred=KNN_predict)

print('Accuracy: ', accuracy_KNN)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=KNN_predict))

Accuracy:  0.8260364842454395
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      9067
           1       0.67      0.60      0.63      2993

    accuracy                           0.83     12060
   macro avg       0.77      0.75      0.76     12060
weighted avg       0.82      0.83      0.82     12060



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_KNN = confusion_matrix(y_true=y_test, y_pred=KNN_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_KNN = f1_score(y_true=y_test, y_pred=KNN_predict)
recall_KNN = recall_score(y_true=y_test, y_pred=KNN_predict)
print(matrix_KNN)

[[8165  902]
 [1196 1797]]


Prediction using the Naive Bayes algorithm:

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library defaults; fit() returns the estimator.
gnb = GaussianNB().fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out test split.
gnb_score = gnb.score(X_train, y_train)
print("Training set score: %f" % gnb_score)
print("Test set score: %f" % gnb.score(X_test, y_test))

Training set score: 0.795721
Test set score: 0.797015


In [None]:
# Class predictions of naive Bayes for the held-out test rows.
GNB_predict = gnb.predict(X_test)
accuracy_GNB = accuracy_score(y_true=y_test, y_pred=GNB_predict)

print('Accuracy: ', accuracy_GNB)
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=GNB_predict))

Accuracy:  0.7970149253731343
              precision    recall  f1-score   support

           0       0.81      0.95      0.88      9067
           1       0.68      0.34      0.46      2993

    accuracy                           0.80     12060
   macro avg       0.75      0.64      0.67     12060
weighted avg       0.78      0.80      0.77     12060



In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix_GNB = confusion_matrix(y_true=y_test, y_pred=GNB_predict)
# f1: 2 tp / (2 tp + fp + fn); both scores are collected for the summary table.
f1_GNB = f1_score(y_true=y_test, y_pred=GNB_predict)
recall_GNB = recall_score(y_true=y_test, y_pred=GNB_predict)
print(matrix_GNB)

[[8585  482]
 [1966 1027]]


Table of the accuracy scores and the training set scores:

In [None]:
# Row labels of the second summary table; the '2' suffix marks the adult
# dataset. Every score list below follows this order.
algorithms = ['MLP2', 'LR2', 'DTC2', 'RFC2', 'SVC2', 'KNN2', 'GNB2']

# Test-set accuracy, training-set accuracy, F1 and recall for each model.
acc_scores = [accuracy_MLP, accuracy_LR, accuracy_DTC, accuracy_RFC,
              accuracy_SVC, accuracy_KNN, accuracy_GNB]
training_set_score = [mlp_score, lr_score, dtc_score, rfc_score,
                      svc_score, knn_score, gnb_score]
f1_scores = [f1_MLP, f1_LR, f1_DTC, f1_RFC, f1_SVC, f1_KNN, f1_GNB]
recall_scores = [recall_MLP, recall_LR, recall_DTC, recall_RFC,
                 recall_SVC, recall_KNN, recall_GNB]

# One row per algorithm, one column per score.
accuracy_df_2 = pd.DataFrame(
    {
        'Accuracy_score': acc_scores,
        'Trainig_score': training_set_score,
        'F1_scores': f1_scores,
        'Recall': recall_scores,
    },
    index=algorithms,
)

accuracy_df_2

Unnamed: 0,Accuracy_score,Trainig_score,F1_scores,Recall
MLP2,0.842952,0.8478,0.64651,0.578684
LR2,0.818988,0.82176,0.556571,0.457735
DTC2,0.804395,1.0,0.613469,0.625459
RFC2,0.850995,1.0,0.677437,0.630471
SVC2,0.844113,0.853826,0.637904,0.553291
KNN2,0.826036,0.872346,0.631413,0.600401
GNB2,0.797015,0.795721,0.456242,0.343134


# Comparing the two datasets:




In [None]:
# Stack the two score tables row-wise. The second table's index labels carry
# a '2' suffix, so rows from the two datasets stay distinguishable.
big_df = pd.concat([accuracy_df, accuracy_df_2], axis=0)
big_df

Unnamed: 0,Accuracy_score,Trainig_score,F1_scores,Recall
MLP,0.763158,0.982379,0.785714,0.804878
LR,0.842105,0.837004,0.860465,0.902439
DTC,0.697368,1.0,0.722892,0.731707
RFC,0.789474,1.0,0.813953,0.853659
SVC,0.802632,0.938326,0.827586,0.878049
KNN,0.802632,0.876652,0.823529,0.853659
GNB,0.802632,0.84141,0.819277,0.829268
MLP2,0.842952,0.8478,0.64651,0.578684
LR2,0.818988,0.82176,0.556571,0.457735
DTC2,0.804395,1.0,0.613469,0.625459
