# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
## Loading data

data=pd.read_csv("5_a.csv")
data.head(5)


## defining a function to calculate confusion matrix, F1 score and accuracy

def performance(data):
    
    ## mapping the probability values to class labels
    
    data['y_predicted'] = [0.0 if x <0.5 else 1.0 for x in data['proba']]
    
    ## finding confusion matrix
    ## initializing confusion matrix elements and running loop to get all values(TN,TP,FP,FN)
    
    TP,TN,FN,FP=0,0,0,0
    for i in range(len(data['y'])):
        if data['y'][i]==1 and data['y_predicted'][i]==1:
            TP+=1
        elif data['y'][i]==0 and data['y_predicted'][i]==0:
            TN+=1
        
        elif data['y'][i]==0 and data['y_predicted'][i]==1:
            FP+=1
        elif data['y'][i]==1 and data['y_predicted'][i]==0:
            FN+=1

    confusion_matrix=[[TN,FN],[FP,TP]]

## Finding precision and recall 
        
    total_positive,total_negative=data['y'].value_counts()
    precision=TP/(TP+FP)
    recall=TP/(TP+FN)
    
## Finding F1 score

    F_1_score=2*((precision*recall)/(precision+recall))
    
## finding accuracy
    
    accuracy=(TP + TN)/(TP + TN + FP + FN)
    
## printing the outputs
    
    print('Accurarcy : ',accuracy,'\n\n'+'F1 score : ',F_1_score,'\n\n'+'confusion matrix \n',\
          confusion_matrix[0],'\n',confusion_matrix[1])
    

    
## calling the above defined function to printing the output 

performance(data)



Accurarcy :  0.9900990099009901 

F1 score :  0.9950248756218906 

confusion matrix 
 [0, 0] 
 [100, 10000]


In [3]:
## calculating AUC Score

from tqdm import tqdm
unique_probability=(data['proba'].round(decimals=2)).unique()
list(unique_probability)
unique_probability.sort()
n_thresholds=list(unique_probability)
n_thresholds.reverse()
n_thresholds=n_thresholds


## comparing with different values of thresholds
TPR,FPR=[],[]
for i in tqdm(range(len(n_thresholds))):
    threshold=n_thresholds[i]
    
    data['y_predicted'] = [0.0 if x <threshold else 1.0 for x in data['proba']]
    
    
    
    
    TP,TN,FN,FP=0,0,0,0
    for i in range(len(data['y'])):
        if data['y'][i]==1 and data['y_predicted'][i]==1:
            TP+=1
        elif data['y'][i]==0 and data['y_predicted'][i]==0:
            TN+=1
        
        elif data['y'][i]==0 and data['y_predicted'][i]==1:
            FP+=1
        elif data['y'][i]==1 and data['y_predicted'][i]==0:
            FN+=1
    tpr=TP/(FN+TP)
    fpr=FP/(TN+FP)
    TPR.append(tpr)
    FPR.append(fpr)

## Finding the value of AUC

tpr_array=np.array(TPR)
fpr_array=np.array(FPR)

AUC_Score=np.trapz(tpr_array, fpr_array)

print('AUC score : ',AUC_Score)

100%|██████████| 41/41 [01:05<00:00,  1.60s/it]

AUC score :  0.4875514999999999





<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [4]:
## Loading data

data=pd.read_csv("5_b.csv")
data.head(5)

   
## calling the above defined function to printing the output 

performance(data)



Accurarcy :  0.9718811881188119 

F1 score :  0.2791878172588833 

confusion matrix 
 [9761, 45] 
 [239, 55]


In [5]:
## calculating AUC Score

from tqdm import tqdm
unique_probability=(data['proba'].round(decimals=2)).unique()
list(unique_probability)
unique_probability.sort()
n_thresholds=list(unique_probability)
n_thresholds.reverse()
n_thresholds=n_thresholds


## comparing with different values of thresholds
TPR,FPR=[],[]
for i in tqdm(range(len(n_thresholds))):
    threshold=n_thresholds[i]
    
    data['y_predicted'] = [0.0 if x <threshold else 1.0 for x in data['proba']]
    
    
    
    
    TP,TN,FN,FP=0,0,0,0
    for i in range(len(data['y'])):
        if data['y'][i]==1 and data['y_predicted'][i]==1:
            TP+=1
        elif data['y'][i]==0 and data['y_predicted'][i]==0:
            TN+=1
        
        elif data['y'][i]==0 and data['y_predicted'][i]==1:
            FP+=1
        elif data['y'][i]==1 and data['y_predicted'][i]==0:
            FN+=1
    tpr=TP/(FN+TP)
    fpr=FP/(TN+FP)
    TPR.append(tpr)
    FPR.append(fpr)

## Finding the value of AUC

tpr_array=np.array(TPR)
fpr_array=np.array(FPR)

AUC_Score=np.trapz(tpr_array, fpr_array)

print('AUC score : ',AUC_Score)

100%|██████████| 51/51 [01:16<00:00,  1.49s/it]

AUC score :  0.9372849999999999





<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [6]:
## loading data
data=pd.read_csv('5_c.csv')

thresholds=(data['prob'].round(decimals=3)).unique()
list1=list(thresholds)
list1.sort()



dict={}
for i in tqdm(range(len(list1))):
    
    data['y_predicted'] = [0.0 if x <list1[i] else 1.0 for x in data['prob']]
    
    FN,FP=0,0
    for j in range(len(data['y'])):
        if data['y'][j]==0 and data['y_predicted'][j]==1:
            FP+=1
        elif data['y'][j]==1 and data['y_predicted'][j]==0:
            FN+=1
    A=(500*FN)+(100*FP)
    dict[list1[i]]=A
    

min_A = min(dict.values()) 
min_threshold = [key for key in dict if dict[key] == min_A] 
print("Threshold for minimum value of A : " + str(min_threshold))    
        




100%|██████████| 782/782 [04:08<00:00,  3.15it/s]

Threshold for minimum value of A : [0.23]





<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [7]:
## loading data
data=pd.read_csv('5_d.csv')

## calculating mean square error

MSE=0
for i in range(len(data['y'])):
    diff=data['y'][i]-data['pred'][i]
    diff=diff**2
    MSE+=diff
    
MSE=MSE/len(data['y'])
print('Mean square error : ',MSE)


## calculating mean of actual values and putting for zero values and calculating MAPE
MAPE=0


for i in range(len(data['y'])):
    diff=(data['y'][i]-data['pred'][i])
    MAPE+=abs(diff)
    
MAPE=MAPE/data['y'].sum()
print('Mean absolute percentage error : ',MAPE)

## calculating Total sum of square for calculating R^2 
avg=data['y'].mean()
TSS=0
for i in range(len(data['y'])):
    diff=(data['y'][i]-avg)
    TSS+=diff**2
    


## using above calculated values and putting in formula of R^2 value

R_squared=1-(MSE/TSS)

print('R^2 value is : ',R_squared)
    

Mean square error :  177.16569974554707
Mean absolute percentage error :  0.1291202994009687
R^2 value is :  0.9999997223809077
