# Compute performance metrics for the given Y and Y_score without sklearn

In [86]:
import numpy as np
import pandas as pd
from tqdm import tqdm
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [87]:
def calculateConfusionMatrix(y,proba,threshold):
    ypred = proba.apply(lambda prob: 1 if prob>threshold else 0)
    data = pd.concat([y,ypred],keys=['y','ypred'],axis = 1)
    TN = len(data[lambda row : (row.ypred == 0) & (row.y == 0)]);
    #print(TN)
    FN = len(data[lambda row : (row.ypred == 0) & (row.y == 1)]);
    #print(FN)
    TP = len(data[lambda row : (row.ypred == 1) & (row.y == 1)]);
    #print(TP)
    FP = len(data[lambda row : (row.ypred == 1) & (row.y == 0)]);
    confusion_matrix = [[TN,FN],[FP,TP]];
    return confusion_matrix;

def calculate_auc(data):
    #sort proba values
    data = data.sort_values(by = ['proba'],ascending = [False]);
    thresholds = data.proba.unique();
    confusionMatrix = [];
    tpr_array = [];
    fpr_array = [];
    for threshold in tqdm(thresholds):
        confusionMatrix=calculateConfusionMatrix(data.y,data.proba,threshold);
        TPR = confusionMatrix[1][1]/(confusionMatrix[1][1]+confusionMatrix[0][1]);
        FPR = confusionMatrix[1][0]/(confusionMatrix[0][0]+confusionMatrix[1][0]);
        tpr_array.append(TPR);
        fpr_array.append(FPR);      
    return np.trapz(tpr_array,fpr_array);
    
    

In [88]:
# write your code here
data = pd.read_csv("5_a.csv");
print(data.head(5));
print(data['y'].value_counts());
data['ypred'] = data['proba'].apply(lambda prob : 1 if prob >= 0.5 else 0);
#confusion Matrix
#[TN,FN]
#[FP,TP]
confusionMatrix = calculateConfusionMatrix(data.y,data.proba,0.5);
print( "Confusion Matrix : ",confusionMatrix);

precision = (confusionMatrix[1][1]/(confusionMatrix[1][0]+confusionMatrix[1][1]));
print("precision: ", precision);

recall = (confusionMatrix[1][1]/(confusionMatrix[0][1]+confusionMatrix[1][1]));
print("recall: ", recall);

f1_score = 2*((precision*recall)/(precision+recall))
print("f1_score: ", f1_score);

#calculate AUC 
auc = calculate_auc(data);
print("AUC: ", auc);

#accuracy = (correctly classified points/totoal number of points)
accuracy = (confusionMatrix[0][0]+confusionMatrix[1][1])/(len(data));
print("accuracy:",accuracy);

  0%|                                                                                | 5/10100 [00:00<03:42, 45.44it/s]

     y     proba
0  1.0  0.637387
1  1.0  0.635165
2  1.0  0.766586
3  1.0  0.724564
4  1.0  0.889199
1.0    10000
0.0      100
Name: y, dtype: int64
Confusion Matrix :  [[0, 0], [100, 10000]]
precision:  0.9900990099009901
recall:  1.0
f1_score:  0.9950248756218906


100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [04:07<00:00, 40.78it/s]

AUC:  0.48829900000000004
accuracy: 0.9900990099009901





<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [89]:
# write your code
data = pd.read_csv("5_b.csv");
print(data['y'].value_counts());

confusionMatrix = calculateConfusionMatrix(data.y,data.proba,0.5);
print( "Confusion Matrix : ",confusionMatrix);

precision = (confusionMatrix[1][1]/(confusionMatrix[1][0]+confusionMatrix[1][1]));
print("precision: ", precision);

recall = (confusionMatrix[1][1]/(confusionMatrix[0][1]+confusionMatrix[1][1]));
print("recall: ", recall);

f1_score = 2*((precision*recall)/(precision+recall));
print("f1_score: ", f1_score);

#calculate AUC 
auc = calculate_auc(data);
print("AUC: ", auc);

#accuracy = (correctly classified points/totoal number of points)
accuracy = (confusionMatrix[0][0]+confusionMatrix[1][1])/(len(data));
print("accuracy:",accuracy);

  0%|                                                                                | 6/10100 [00:00<02:55, 57.36it/s]

0.0    10000
1.0      100
Name: y, dtype: int64
Confusion Matrix :  [[9761, 45], [239, 55]]
precision:  0.1870748299319728
recall:  0.55
f1_score:  0.2791878172588833


100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [02:28<00:00, 67.90it/s]

AUC:  0.9376570000000001
accuracy: 0.9718811881188119





<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [84]:
def calculateBestThreshold(data):
    data = data.sort_values(by = ['prob'],ascending = [False])
    thresholds = data.prob.unique();
    minA = -1
    minThreshold = -1
    for threshold in tqdm(thresholds):
        confusionMatrix=calculateConfusionMatrix(data.y,data.prob,threshold);
        FN = confusionMatrix[0][1]
        FP = confusionMatrix[1][0]
        A = 500*FN + 100*FP;
        if(minA == -1):
            minA = A
            minThreshold = threshold
        if(A < minA):
            minA = A
            minThreshold = threshold
    return minThreshold

In [85]:
 # write your code
data = pd.read_csv("5_c.csv");
print(data.describe())
print(data['y'].value_counts());
optimalThreshold = calculateBestThreshold(data);
print("Best Threshold :",optimalThreshold);

  0%|▎                                                                              | 13/2791 [00:00<00:22, 121.46it/s]

                 y         prob
count  2852.000000  2852.000000
mean      0.367111     0.370069
std       0.482102     0.207414
min       0.000000     0.028038
25%       0.000000     0.201460
50%       0.000000     0.336935
75%       1.000000     0.509001
max       1.000000     0.957747
0    1805
1    1047
Name: y, dtype: int64


100%|█████████████████████████████████████████████████████████████████████████████| 2791/2791 [00:25<00:00, 109.98it/s]

Best Threshold : 0.22987164436159915





<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [131]:
def absoluteDifference(num1,num2):
    if(num1 > num2):
        return num1 - num2
    return num2 - num1

def calculateMeanSquaredError(data):
    squaredError = data.apply(lambda row: (row['y'] - row['pred'])**2, axis = 1)
    return squaredError.mean()

def calculateMAPE(data):
    absDiff = data.apply( lambda row: absoluteDifference(row.y,row.pred),axis = 1 )
    data['absDiff'] = absDiff
    return (data['absDiff'].sum())/(data['y'].sum())
    
def calculateR2(data):
    meanOfActualValue = data['y'].mean();
    SSres = data.apply(lambda row: (row['y'] - row['pred'])**2, axis = 1)
    SStotal = data.apply(lambda row: (row['y'] - meanOfActualValue)**2, axis = 1)
    return 1 - (SSres.sum()/SStotal.sum())
    

In [132]:
data = pd.read_csv("5_d.csv");
print(data.head(5))
meanSquaredError = calculateMeanSquaredError(data);
print("Mean Squared Error: ",meanSquaredError )

mape = calculateMAPE(data) 
print("MAPE:",mape)

r2 = calculateR2(data);
print("R^2:",r2)

       y   pred
0  101.0  100.0
1  120.0  100.0
2  131.0  113.0
3  164.0  125.0
4  154.0  152.0
Mean Squared Error:  177.16569974554707
MAPE: 0.1291202994009687
R^2: 0.9563582786990937
