# Implementation of Evaluation Metrics from Scratch
## and comparing the results with sklearn
<ul>
    <li>Confusion Matrix</li>
    <li>Accuracy</li>
    <li>Precision</li>
    <li>Recall</li>
    <li>F1-Score</li>
</ul>

In [1]:
# importing essential libraries
import pandas as pd
import numpy as np

In [2]:
# load the actual labels and both models' predicted scores from data.csv
df = pd.read_csv('data.csv')
# print the (rows, columns) shape, then preview the first rows
print(df.shape)
df.head()

(15758, 3)


Unnamed: 0,y_act,y_pred_random_forest,y_pred_logistic
0,1,0.639816,0.531904
1,0,0.490993,0.414496
2,1,0.623815,0.569883
3,1,0.506616,0.443674
4,0,0.418302,0.369532


In [3]:
# scores below the threshold become 0
# scores at or above the threshold become 1
# hence declaring the threshold once and reusing it for both models
thresh = 0.5

# converting both score columns to 0/1 labels with the shared threshold
df['y_pred_rf'] = (df.y_pred_random_forest >= thresh).astype('int')
df['y_pred_lr'] = (df.y_pred_logistic >= thresh).astype('int')
df.head()

Unnamed: 0,y_act,y_pred_random_forest,y_pred_logistic,y_pred_rf,y_pred_lr
0,1,0.639816,0.531904,1,1
1,0,0.490993,0.414496,0,0
2,1,0.623815,0.569883,1,1
3,1,0.506616,0.443674,1,0
4,0,0.418302,0.369532,0,0


In [4]:
# function to calculate true positives, true negatives, false positives, false negatives
def compute_tp_tn_fp_fn(y_act,y_pred):
    '''
    Count the confusion-matrix cells for binary labels (1 = positive, 0 = negative).

    True Positive: actual = 1, predicted = 1
    True Negatives: actual = 0, predicted = 0
    False Positives: actual = 0, predicted = 1
    False Negative: actual = 1, predicted = 0

    y_act and y_pred may be pandas Series, numpy arrays or plain sequences of
    equal length; they are compared element by element in positional order.

    Returns the tuple (tp, tn, fp, fn) as Python ints.
    Raises ValueError if the two inputs do not have the same shape.
    '''
    # np.asarray makes plain lists work too: `[1, 0] == 1` is a single False,
    # not an element-wise mask
    actual = np.asarray(y_act)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError('y_act and y_pred must have the same shape, got %s and %s'
                         % (actual.shape, predicted.shape))

    actual_pos = actual == 1
    actual_neg = actual == 0
    predicted_pos = predicted == 1
    predicted_neg = predicted == 0

    # count_nonzero counts inside numpy instead of looping element by element
    # in Python the way the builtin sum() does
    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    tn = int(np.count_nonzero(actual_neg & predicted_neg))
    fp = int(np.count_nonzero(actual_neg & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & predicted_neg))
    return tp,tn,fp,fn

In [6]:
# confusion counts (tp, tn, fp, fn) for the thresholded Logistic Regression
# predictions, computed with the hand-written helper
tp_lr, tn_lr, fp_lr, fn_lr = compute_tp_tn_fp_fn(df.y_act, df.y_pred_lr)
print('TP for Logistic Reg :', tp_lr)
print('TN for Logistic Reg :', tn_lr)
print('FP for Logistic Reg :', fp_lr)
print('FN for Logistic Reg :', fn_lr)


TP for Logistic Reg : 4279
TN for Logistic Reg : 5425
FP for Logistic Reg : 2454
FN for Logistic Reg : 3600


In [8]:
# confusion counts (tp, tn, fp, fn) for the thresholded Random Forest
# predictions, computed with the hand-written helper
tp_rf, tn_rf, fp_rf, fn_rf = compute_tp_tn_fp_fn(df.y_act, df.y_pred_rf)
print('TP for Random Forest :', tp_rf)
print('TN for Random Forest :', tn_rf)
print('FP for Random Forest :', fp_rf)
print('FN for Random Forest :', fn_rf)

TP for Random Forest : 5047
TN for Random Forest : 5519
FP for Random Forest : 2360
FN for Random Forest : 2832


In [11]:
# importing confusion matrix from sklearn
from sklearn.metrics import confusion_matrix

# obtaining tp,tn,fp,fn from the confusion matrix formed by sklearn.
# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted),
# so ravel() always yields tn, fp, fn, tp even if one class is absent
tn_rf1, fp_rf1, fn_rf1, tp_rf1 = confusion_matrix(
    df.y_act, df.y_pred_rf, labels=[0, 1]).ravel()

In [12]:
# printing the Random Forest counts unpacked from sklearn's confusion matrix;
# they are expected to equal the counts from the hand-written helper
print('TP for Random Forest :', tp_rf1)
print('TN for Random Forest :', tn_rf1)
print('FP for Random Forest :', fp_rf1)
print('FN for Random Forest :', fn_rf1)

TP for Random Forest : 5047
TN for Random Forest : 5519
FP for Random Forest : 2360
FN for Random Forest : 2832


In [13]:
# accuracy from the four confusion counts
def compute_accuracy(tp,tn,fp,fn):
    '''
    Accuracy as a percentage:
        100 * (tp + tn) / (tp + tn + fp + fn)
    i.e. the share of all samples whose prediction matches the actual label.
    '''
    correct = tp + tn
    total = correct + fp + fn
    return (correct * 100) / float(total)

In [14]:
# accuracy (in percent) of Logistic Regression & Random Forest,
# computed with the hand-written compute_accuracy
print('Accuracy for logistic regression :',
      compute_accuracy(tp_lr, tn_lr, fp_lr, fn_lr))
print('Accuracy for random forest :',
      compute_accuracy(tp_rf, tn_rf, fp_rf, fn_rf))

Accuracy for logistic regression : 61.58141896179718
Accuracy for random forest : 67.05165630156111


In [16]:
# importing accuracy_score from sklearn
from sklearn.metrics import accuracy_score

In [17]:
# accuracy (in percent) of Logistic Regression & Random Forest from sklearn's
# accuracy_score; it returns a fraction, so scale by 100 to compare
print('Accuracy for logistic regression : ',
      100 * accuracy_score(df.y_act, df.y_pred_lr))
print('Accuracy for random forest : ',
      100 * accuracy_score(df.y_act, df.y_pred_rf))

Accuracy for logistic regression :  61.58141896179718
Accuracy for random forest :  67.05165630156111


In [18]:
# defining function to compute precision
def compute_precision(tp,fp):
    '''
    Precision as a percentage:
        100 * tp / (tp + fp)
    i.e. the share of predicted positives that are actually positive.

    Returns 0.0 when nothing was predicted positive (tp + fp == 0) instead of
    dividing by zero; this mirrors the value sklearn's precision_score
    reports by default for that undefined case.
    '''
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # precision is undefined without any positive prediction
        return 0.0
    return (tp*100)/float(predicted_positive)

In [19]:
# precision (in percent) of Logistic Regression & Random Forest,
# computed with the hand-written compute_precision
print('Precision for Logistic Regression: ',
      compute_precision(tp_lr, fp_lr))
print('Precision for Random Forest: ',
      compute_precision(tp_rf, fp_rf))

Precision for Logistic Regression:  63.55265112134264
Precision for Random Forest:  68.1382476036182


In [21]:
# importing precision_score from sklearn
from sklearn.metrics import precision_score
# precision (in percent) of Logistic Regression & Random Forest from sklearn;
# precision_score returns a fraction, so scale by 100 to compare
print('Precision for Logistic Regression :',
      100 * precision_score(df.y_act, df.y_pred_lr))
print('Precision for Random Forest :',
      100 * precision_score(df.y_act, df.y_pred_rf))

Precision for Logistic Regression : 63.55265112134264
Precision for Random Forest : 68.1382476036182


In [22]:
# defining function to compute recall
def compute_recall(tp,fn):
    '''
    Recall as a percentage:
        100 * tp / (tp + fn)
    i.e. the share of actual positives that were predicted positive.

    Returns 0.0 when there are no actual positives (tp + fn == 0) instead of
    dividing by zero; this mirrors the value sklearn's recall_score reports
    by default for that undefined case.
    '''
    actual_positive = tp + fn
    if actual_positive == 0:
        # recall is undefined without any actual positive sample
        return 0.0
    return (tp*100)/float(actual_positive)

In [23]:
# recall (in percent) of Logistic Regression & Random Forest,
# computed with the hand-written compute_recall
print('Recall for Logistic Regression: ',
      compute_recall(tp_lr, fn_lr))
print('Recall for Random Forest: ',
      compute_recall(tp_rf, fn_rf))

Recall for Logistic Regression:  54.30892245208783
Recall for Random Forest:  64.05635232897576


In [24]:
# importing recall_score from sklearn
from sklearn.metrics import recall_score
# recall (in percent) of Logistic Regression & Random Forest from sklearn;
# recall_score returns a fraction, so scale by 100 to compare
print('Recall for Logistic Regression :',
      100 * recall_score(df.y_act, df.y_pred_lr))
print('Recall for Random Forest :',
      100 * recall_score(df.y_act, df.y_pred_rf))

Recall for Logistic Regression : 54.30892245208783
Recall for Random Forest : 64.05635232897576


In [26]:
# defining function to compute f1 score
def compute_f1_score(y_true,y_pred):
    '''
    F1 score as a fraction (not a percentage): the harmonic mean of
    precision and recall,
        f1 = 2 * precision * recall / (precision + recall)

    y_true holds the actual 0/1 labels and y_pred the predicted 0/1 labels.

    Returns 0.0 when there are no true positives. In that case precision and
    recall are both zero or undefined and the formula would divide by zero.
    '''
    # true negatives do not enter the F1 formula
    tp, _, fp, fn = compute_tp_tn_fp_fn(y_true,y_pred)
    if tp == 0:
        return 0.0
    # the helpers return percentages; bring them back to fractions.
    # tp > 0 guarantees both helper denominators are non-zero.
    precision = compute_precision(tp, fp)/100
    recall = compute_recall(tp, fn)/100
    return (2*precision*recall)/(recall+precision)

In [27]:
# F1 score (as a fraction) of Logistic Regression & Random Forest,
# computed with the hand-written compute_f1_score
print('F1 score for Logistic Regression :',
      compute_f1_score(df.y_act, df.y_pred_lr))
print('F1 score for Random Forest :',
      compute_f1_score(df.y_act, df.y_pred_rf))

F1 score for Logistic Regression : 0.5856830002737475
F1 score for Random Forest : 0.660342797330891


In [28]:
# importing f1_score from sklearn
from sklearn.metrics import f1_score
# F1 score of Logistic Regression & Random Forest from sklearn's f1_score,
# printed unscaled for comparison with the hand-written compute_f1_score
print('F1 score for Logistic Regression :', f1_score(df.y_act, df.y_pred_lr))
print('F1 score for Random Forest :', f1_score(df.y_act, df.y_pred_rf))

F1 score for Logistic Regression : 0.5856830002737475
F1 score for Random Forest : 0.660342797330891
