# Compute performance metrics for the given Y and Y_score without sklearn

In [3]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

* Solution 5 A

In [4]:
#load dataset using pandas
df5a =  pd.read_csv('5_a.csv')
print(len(df5a))
df5a.head()

10100


Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [5]:
#function to derive predicted class label
def func1(x):
    return 0 if x<0.5 else 1
print(func1(0.6))

1


In [6]:
#creating a copy of original dataset
df_5a = df5a.copy()
df_5a['proba'] = df_5a['proba'].apply(func1)

In [29]:
print('Positive data points:',len(df_5a[df_5a['proba'] ==1]))
print('Negative data points:',len(df_5a[df_5a['proba'] ==0]))

Positive data points: 10100
Negative data points: 0


In [30]:
df_5a.head()

Unnamed: 0,y,proba
0,1.0,1
1,1.0,1
2,1.0,1
3,1.0,1
4,1.0,1


In [56]:
tp = np.sum(df_5a[(df_5a['y']==1) & (df_5a['proba']==1)])
print(tp.get('y'))

10000.0


In [61]:
tp = len(data.query('y == 1 & proba == 1'))

tp , tn, fp, fn = (0, 0, 0, 0)
for i in range(len(data)):
    if (data['y'][i] == 1.0) and  (data['y_pred'][i] == 1):
        tp += 1
        ...

10000

### Confusion matrix

In [8]:
tp , tn, fp, fn = (0, 0, 0, 0)
for i in range(len(df_5a)):
    if (df_5a['y'][i] == 1.0) and  (df_5a['proba'][i] == 1):
        tp += 1
    if (df_5a['y'][i] == 0.0) and  (df_5a['proba'][i] == 0):
        tn += 1
    if (df_5a['y'][i] == 0.0) and  (df_5a['proba'][i] == 1):
        fp += 1
    if (df_5a['y'][i] == 1.0) and  (df_5a['proba'][i] == 0):
        fn += 1
print(tp , tn, fp, fn)
print(sum((tp , tn, fp, fn)))
print('Confusion matrix : \n', np.array([[tp , fp],[fn, tn]]))

10000 0 100 0
10100
Confusion matrix : 
 [[10000   100]
 [    0     0]]


* F1 = 2 * (precision * recall) / (precision + recall)
* Precision = tp/(tp+fp)
* Recall = tp/(tp+fn)
* F1_Ccore = 2 * (precision * recall) / (precision + recall)
* Accuracy = (tp+tn)/(tp+fp+tn+fn)
* TPR(True Positive Rate) = tp/(tp+fn)
* FPR(False Positive Rate) = fp/(tn+fp

In [9]:
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2 * (precision * recall) / (precision + recall)
accuracy = (tp+tn)/(tp+fp+tn+fn)

### AUC calculation

In [10]:
%%time
# AUC calculation
unique_prob = sorted(set(df5a['proba']),reverse= True)
#Creating list of uniques probabilities so that we will iterate over this and have threshold value each once.

y_act = list(df5a['y'])   # list of actual y values
tpr = []
fpr = []
for thres in unique_prob:  # to keep each prob value as threshold
    tp, fp, tn, fn = (0, 0, 0, 0)
    y_pred = []
    for i in range(len(df5a['proba'])):  # to derive predicted class for each threshold
        if df5a['proba'][i] > thres :
            y_pred.append(1)
        else:
            y_pred.append(0)
        if y_act[i] == 1 and y_pred[i] == 1:
            tp += 1
        if y_act[i] == 0 and y_pred[i] == 1:
            fp += 1
        if y_act[i] == 0  and y_pred[i] == 0:
            tn +=1
        if y_act[i] == 1 and y_pred[i] == 0:
            fn += 1
            
    tpr.append(round(tp/(tp+fn), 3))
    fpr.append(round(fp/(tn+fp), 3))
print(tpr[:10], fpr[:10])

[0.0, 0.0, 0.0, 0.0, 0.0, 0.001, 0.001, 0.001, 0.001, 0.001] [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Wall time: 8min 23s


In [11]:
auc = np.trapz(tpr, fpr)

In [12]:
print('Precision: ', precision)
print('Recall: ', recall)
print('F1 Score: ', f1_score)
print('AUC Score: ', auc)
print('Accuracy: ', accuracy)

Precision:  0.9900990099009901
Recall:  1.0
F1 Score:  0.9950248756218906
AUC Score:  0.48823000000000005
Accuracy:  0.9900990099009901


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

* Solution 5b

In [68]:
# write your code
df5b =  pd.read_csv('5_b.csv')
print(len(df5b))
df5b.head()

10100


Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [69]:
# creating copy of original dataframe
df_5b = df5b.copy()
df_5b['proba'] = df_5b['proba'].apply(func1)
df_5b.head()

Unnamed: 0,y,proba
0,0.0,0
1,0.0,0
2,0.0,0
3,0.0,0
4,0.0,0


In [70]:
print('Positive data points:',len(df_5b[df_5b['proba'] ==1]))
print('Negative data points:',len(df_5b[df_5b['proba'] ==0]))

Positive data points: 294
Negative data points: 9806


### Confusion matrix

In [71]:
tp , tn, fp, fn = (0, 0, 0, 0)
for i in range(len(df_5a)):
    if (df_5b['y'][i] == 1.0) and  (df_5b['proba'][i] == 1):
        tp += 1
    if (df_5b['y'][i] == 0.0) and  (df_5b['proba'][i] == 0):
        tn += 1
    if (df_5b['y'][i] == 0.0) and  (df_5b['proba'][i] == 1):
        fp += 1
    if (df_5b['y'][i] == 1.0) and  (df_5b['proba'][i] == 0):
        fn += 1
print(tp , tn, fp, fn)
print(sum((tp , tn, fp, fn)))
print('Confusion matrix : \n', np.array([[('TP',tp) ,('FP', fp)],[('FN', fn), ('TN', tn)]]))

55 9761 239 45
10100
Confusion matrix : 
 [[['TP' '55']
  ['FP' '239']]

 [['FN' '45']
  ['TN' '9761']]]


In [72]:
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2 * (precision * recall) / (precision + recall)
accuracy = (tp+tn)/(tp+fp+tn+fn)

In [73]:
%%time
# AUC calculation
unique_prob = sorted(set(df5b['proba']),reverse= True)
#Creating list of uniques probabilities so that we will iterate over this and have threshold value each once.

y_act = list(df5b['y'])  # list of actual y values
tpr = []
fpr = []
for thres in unique_prob:  # to keep each prob value as threshold
    tp, fp, tn, fn = (0, 0, 0, 0)
    y_pred = []
    for i in range(len(df5b['proba'])):    # to derive predicted class for each threshold
        if df5b['proba'][i] > thres :
            y_pred.append(1)
        else:
            y_pred.append(0)
        if y_act[i] == 1 and y_pred[i] == 1:
            tp += 1
        if y_act[i] == 0 and y_pred[i] == 1:
            fp += 1
        if y_act[i] == 0  and y_pred[i] == 0:
            tn +=1
        if y_act[i] == 1 and y_pred[i] == 0:
            fn += 1
            
    tpr.append(round(tp/(tp+fn), 3))
    fpr.append(round(fp/(tn+fp), 3))
print(tpr[:10], fpr[:10])

[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09] [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Wall time: 8min 52s


In [74]:
auc = np.trapz(tpr, fpr)

In [75]:
print('Precision: ', precision)
print('Recall: ', recall)
print('F1 Score: ', f1_score)
print('AUC Score: ', auc)
print('Accuracy: ', accuracy)

Precision:  0.1870748299319728
Recall:  0.55
F1 Score:  0.2791878172588833
AUC Score:  0.9377300000000001
Accuracy:  0.9718811881188119


<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

* Solution 5c

In [76]:
# loading data set using pandas
df_5c = pd.read_csv('5_c.csv')
df_5c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [77]:
%%time
# 5c_solution
#Creating list of uniques probabilities so that we will iterate over this and have threshold value each once.

unique_prob = sorted(set(df_5c['prob']),reverse= True)
y_act = list(df_5c['y'])  # list of actual y values
A = []
for thres in unique_prob: # to keep each prob value as threshold
    fp, fn = (0, 0)
    y_pred = []
    for i in range(len(df_5c['prob'])): # to derive predicted class for each threshold
        if df_5c['prob'][i] > thres :
            y_pred.append(1)
        else:
            y_pred.append(0)
        if y_act[i] == 0 and y_pred[i] == 1:
            fp += 1
        if y_act[i] == 1 and y_pred[i] == 0:
            fn += 1
    a = 500*fn + 100*fp        # metric A calculation
    A.append((a, thres))       # appending a in list A with corresponding prob value
print(A[:5])                   # print to check 1st 5 values in A

[(523500, 0.9577467989277196), (523000, 0.9514369163158778), (522500, 0.9486377939984604), (522000, 0.9440936134070964), (521500, 0.9411131844327256)]
Wall time: 38.7 s


In [78]:
dict_a = dict(A)               #converted to dictionary, dict_a
print('Lowest A: ', sorted(dict_a)[0])  #sorted based on key value a, ans 1st item will be lowest
print('Prob which gives lowest value of A:', dict_a[sorted(dict_a)[0]])  #corresponding to that key, will be probabilty which yields lowest A value

Lowest A:  141000
Prob which gives lowest value of A: 0.2298716443615991


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

* Solution 5d

In [79]:
#load data with pandas
df5d = pd.read_csv('5_d.csv')
df5d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [80]:
print(len(df5d))

157200


* MSE = Means Square Error, mean of sq of actual minus predicted.
* MAPE = Here we have calculated modified mean absolute percentage error, here due to zero devision error( some of actual value maybe 0) we have taken mean of actual values.
* SStot = Total squared error(i.e sum of squares of actual - predicted )
* SSres = Sum of residual squared error
* R^2_error = 1 - SSres/SStot

In [81]:
MSE = np.square(np.subtract(df5d['y'], df5d['pred'])).mean()
MAPE = np.mean( np.abs(df5d['y'] - df5d['pred']) )/ np.mean(df5d['y'])
SStot = np.sum(np.square(df5d['y'] - np.mean(df5d['y'])))
SSres = np.sum( np.square(df5d['y'] - df5d['pred']) )
R_Sq_error = 1 - SSres/SStot

In [82]:
print('Mean Squared Error: ',MSE)
print('Modified Mean Absolute Percentage Error: ',MAPE)
print('R^2 Error: ',R_Sq_error)

Mean Squared Error:  177.16569974554707
Modified Mean Absolute Percentage Error:  0.1291202994009687
R^2 Error:  0.9563582786990937


## END