# Compute performance metrics for the given Y and Y_score without sklearn

In [40]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

###### Import Data

In [41]:
df_a=pd.read_csv('5_a.csv')
df_a.head

<bound method NDFrame.head of          y     proba
0      1.0  0.637387
1      1.0  0.635165
2      1.0  0.766586
3      1.0  0.724564
4      1.0  0.889199
...    ...       ...
10095  1.0  0.665371
10096  1.0  0.607961
10097  1.0  0.777724
10098  1.0  0.846036
10099  1.0  0.679507

[10100 rows x 2 columns]>

##### write your code here for task A

In [42]:
# check Imbalance data
df_a.groupby('y').count()

Unnamed: 0_level_0,proba
y,Unnamed: 1_level_1
0.0,100
1.0,10000


##### Converting probality value to output class label

In [44]:
df_a['ypred'] = df_a['proba'].apply(lambda x: 0 if x < 0.5 else 1)
df_a

Unnamed: 0,y,proba,ypred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1
...,...,...,...
10095,1.0,0.665371,1
10096,1.0,0.607961,1
10097,1.0,0.777724,1
10098,1.0,0.846036,1


###### Confusion Matrix

In [45]:
#df_a['y'] == 1
#df_a[df_a['y']==1]
#(df_a['y']==1) & (df_a['ypred']==1)
#df_a[(df_a['y']==1) & (df_a['ypred']==1)]

def confusion_matrix(data):
    connt_tp =len( data[(data['y'] == 1) & (data['ypred'] == 1)] )
    connt_fp =len( data[(data['y'] == 0) & (data['ypred'] == 1)] )
    connt_tn =len( data[(data['y'] == 0) & (data['ypred'] == 0)] )
    connt_fn =len( data[(data['y'] == 1) & (data['ypred'] == 0)] )
    return connt_tp, connt_fp, connt_tn, connt_fn

##### F1 Score

In [46]:
def f1_score(data):
    tp,fp,tn,fn=confusion_matrix(data)
    precision=tp/(tp+fp)                    # calculating precision and recall
    recall=tp/(tp+fn)                    
    f1=2*((precision*recall)/(precision+recall))
    return f1

###### Accuracy Score

In [47]:
def accuracy(data):
    tp,fp,tn,fn = confusion_matrix(data)
    acc=((tp+tn)/(tp+fp+fn+tn))          
    return acc

###### AUC Score

In [54]:
def auc_score(data):
    from tqdm import tqdm
    tpr_lst=[]
    fpr_lst=[]
    
    sort= data.sort_values('proba',ascending=False)  
    sort.drop_duplicates(subset='proba',keep='first',inplace=False)
    #print(sort)
    #sort = sort['proba'].unique()
    for i in tqdm(range(0,len(sort))):
        sort['ypred']=np.where(sort['proba']>=sort.iloc[i]['proba'],1,0) # predicting the y based on each threshold
        TP,FP,TN,FN=confusion_matrix(sort)    # for each threshold calculating confusion matrix
        FPR = FP/(TN+FP)
        TPR = TP/(TP+FN)
        tpr_lst.append(TPR)
        fpr_lst.append(FPR)
    c=np.trapz(tpr_lst, fpr_lst)
    return c

### A. data : '5_a.csv'

###### 1.confusion matrix for 5_a.csv

In [50]:
# 1.confusion matrix of data 5_a.csv
tp,fp,tn,fn = confusion_matrix(df_a)
print("FALSE NEGATIVE :",fn)
print("FALSE POSITIVE :",fp)
print("TRUE NEGATIVE :",tn)
print("TRUE POSITIVE :",tp)

confusion_matrix_show = []
confusion_matrix_show.append(tp)
confusion_matrix_show.append(fp)
confusion_matrix_show.append(fn)
confusion_matrix_show.append(tn)
#print(confusion_matrix_show)
X = np.reshape(confusion_matrix_show, (2, 2))
print("\n\nConfusion Matrix: ")
print(X)

FALSE NEGATIVE : 0
FALSE POSITIVE : 100
TRUE NEGATIVE : 0
TRUE POSITIVE : 10000


Confusion Matrix: 
[[10000   100]
 [    0     0]]


###### 2. f1 score for 5_a.csv

In [51]:
f1=f1_score(df_a)
print("F1 SCORE :",f1)

F1 SCORE : 0.9950248756218906


###### 3. accuracy Score for 5_a.csv

In [52]:
acc=accuracy(df_a)
print('ACCURACY Score :',acc)

ACCURACY Score : 0.9900990099009901


##### 4. auc value for 5_a.csv

In [55]:
auc=auc_score(df_a)
print('AUC VALUE :',auc)

100%|███████████████████████████████████████████████████████████████████████████| 10100/10100 [01:39<00:00, 101.18it/s]

AUC VALUE : 0.48829900000000004







## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [56]:
df_b=pd.read_csv('5_b.csv')
df_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [None]:
# write your code here for task B

In [57]:
# check Imbalance data
df_b.groupby('y').count()

Unnamed: 0_level_0,proba
y,Unnamed: 1_level_1
0.0,10000
1.0,100


In [58]:
df_b['ypred'] = df_b['proba'].apply(lambda x: 0 if x < 0.5 else 1)
df_b

Unnamed: 0,y,proba,ypred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0
...,...,...,...
10095,0.0,0.474401,0
10096,0.0,0.128403,0
10097,0.0,0.499331,0
10098,0.0,0.157616,0


## B. data : '5_b.csv'

##### 1.confusion matrix for 5_b.csv

In [59]:
# 1.confusion matrix of data 5_b.csv
tp,fp,tn,fn = confusion_matrix(df_b)
print("FALSE NEGATIVE :",fn)
print("FALSE POSITIVE :",fp)
print("TRUE NEGATIVE :",tn)
print("TRUE POSITIVE :",tp)

confusion_matrix_show = []
confusion_matrix_show.append(tp)
confusion_matrix_show.append(fp)
confusion_matrix_show.append(fn)
confusion_matrix_show.append(tn)
#print(confusion_matrix_show)
X = np.reshape(confusion_matrix_show, (2, 2))
print("\n\nConfusion Matrix: ")
print(X)

FALSE NEGATIVE : 45
FALSE POSITIVE : 239
TRUE NEGATIVE : 9761
TRUE POSITIVE : 55


Confusion Matrix: 
[[  55  239]
 [  45 9761]]


##### 2. f1 score for 5_b.csv

In [60]:
f1=f1_score(df_b)
print("F1 SCORE :",f1)

F1 SCORE : 0.2791878172588833


###### 3. accuracy Score for 5_b.csv

In [61]:
acc=accuracy(df_b)
print('ACCURACY VALUE :',acc)

ACCURACY VALUE : 0.9718811881188119


##### 4. auc value for 5_b.csv

In [62]:
auc=auc_score(df_b)
print("AUC VALUE :",auc)

100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [01:51<00:00, 90.48it/s]

AUC VALUE : 0.9377570000000001





### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data 
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [63]:
df_c=pd.read_csv('5_c.csv')
df_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [64]:
# check Imbalance data
df_c.groupby('y').count()

Unnamed: 0_level_0,prob
y,Unnamed: 1_level_1
0,1805
1,1047


In [None]:
 # write your code for task C

In [67]:
df_c['ypred'] = df_c['prob'].apply(lambda x: 0 if x < 0.5 else 1)
df_c

Unnamed: 0,y,prob,ypred
0,0,0.458521,0
1,0,0.505037,1
2,0,0.418652,0
3,0,0.412057,0
4,0,0.375579,0
...,...,...,...
2847,1,0.491663,0
2848,1,0.292109,0
2849,1,0.659161,1
2850,1,0.456265,0


In [68]:
from tqdm import tqdm  
def best_threshold(data):
    check=0
    thresh=[]
    A=[]
    sorted= data.sort_values("prob",ascending=False) 
    for i in tqdm(range(0,len(sorted))):
        if check==(sorted.iloc[i]['prob']): # check unique probability
            continue
        check=sorted.iloc[i]['prob'] 
        thresh.append(check)
        sorted['ypred']=np.where(sorted['prob']>=sorted.iloc[i]['prob'],1,0)
        TP,FP,TN,FN=confusion_matrix(sorted) # calculating confusion matrix for each threshold
        value=500*FN+100*FP
        A.append(value)  # calculating the metric A
    index=A.index(min(A)) # finding the index of A with minimium value
    return thresh[index]

In [70]:
best=best_threshold(df_c)
print('BEST THRESHOLD :',best)

100%|██████████████████████████████████████████████████████████████████████████████| 2852/2852 [00:32<00:00, 88.23it/s]

BEST THRESHOLD : 0.2300390278970873






## D.</b></font> Compute performance metrics(for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [71]:
df_d=pd.read_csv('5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [72]:
len(df_d['y']) , len(df_d['pred'])

(157200, 157200)

In [38]:
 # write your code for task 5d

##### Compute Mean Square Error

In [75]:
MSE = np.sum(np.power(df_d['y'] - df_d['pred'], 2)) / len(df_d['y'])
print("Mean Square Error: ", MSE)

Mean Square Error:  177.16569974554707


###### Compute Mean Absolute Percentage Error

In [76]:
MAPE = np.sum(np.absolute(df_d['y'] - df_d['pred'])) / np.sum(df_d['y'])
print("Mean Absolute Percentage Error: ", MAPE*100)

Mean Absolute Percentage Error:  12.91202994009687


###### Compute R^2 error

In [77]:
y_mean = np.mean(np.absolute(df_d['y']))
tot_sum_of_sq = np.sum(np.power(df_d['y'] - y_mean, 2))
sum_sq_of_res = np.sum(np.power(df_d['y'] - df_d['pred'], 2))
coeff_of_det = 1 - (sum_sq_of_res / tot_sum_of_sq)
print("R^2 error: ", coeff_of_det)

R^2 error:  0.9563582786990937
