# Compute performance metrics for the given Y and Y_score without sklearn

In [55]:
# import necessary libraries
import numpy as np
import pandas as pd

# Helper Functions

## Function for predicting Class label given probability scores

In [56]:
def predict_on_probabalities(x, threshold):
    '''
    params:
    x -> probablity value
    threshold -> threshold probablity value
    return -> 0.0 or 1.0 based on x < threshold respectively
    '''
    # any proba less than threshold will be labelled as 0.0 else will be lebelled as 1.0
    return 0.0 if x < threshold else 1.0

## Function for creating confusion_matrix

In [57]:
def create_confusion_matrix(dataset, target_column, predicted_column):
    '''
    params:
    dataset -> pandas dataframe for which the confusion matrix is to be calculated
    target_column -> the depenent feature in the given dataset
    predicted_column -> the predicted feature in the given dataset
    return:
    a multi-index and multi-level column based Pandas DataFrame representing the confusion matrix
    '''
    # sort the unique values to form nice confusion_matrix
    unique_values = list(np.sort(dataset[target_column].unique()))
    # figure out the number of unique classes in the target feature
    number_of_classes = dataset[target_column].nunique()
    # set our multiindex such that the outer level is pred and inner level are the unique_class_labels
    index = pd.MultiIndex.from_product([['pred'], unique_values])
    # set our multi-level column such that outer layer is actual and inner level are the unique_class labels
    columns = pd.MultiIndex.from_product([['actual'], unique_values])
    # create a confusion_matrix dataframe with the above index and columns with c*c(c is number of unique classes) elements set to zero
    confusion_matrix = pd.DataFrame(np.zeros((number_of_classes,number_of_classes)),index = index,columns=columns)
    # loop through every unique_class_label
    for unique_value in unique_values:
        # loop through every unique_class_label to determine the actual true and false number for the outer unique_class_label
        for every_class in unique_values:
            # set the appropriate element of confusion_matrix
            confusion_matrix.loc[('pred', every_class),('actual',unique_value)] = len(dataset[(dataset[target_column] == unique_value) & (dataset[predicted_column] == every_class)])
    # return the confusion_matrix
    return confusion_matrix

## Function for calculating F1-Score

In [58]:
def calculate_f1_score(confusion_matrix, class_label):
    '''
    params:
    confusion_matrix -> The confusion_matrix for which the f1-score has to be calculated
    class_label -> for which class_label in the confusion_matrix you need to calculate f1-score
    returns:
    f1_score (float_type)
    '''
    # calculate number of true_positive + false_positive from the confusion_matrix
    tp_plus_fp = confusion_matrix.loc[('pred',class_label)].sum()
    # calculate number of true_positives
    tp = confusion_matrix.loc[('pred',class_label),('actual',class_label)]
    # calculate number of true_positives + false_negatives
    tp_plus_fn = confusion_matrix['actual',class_label].sum()
    # calculate precision
    precision = tp/tp_plus_fp
    # calculate recall
    recall = tp/tp_plus_fn
    # calculate the harmonic mean of precision and recall to calculate f1-score
    f1_score = (2*precision*recall)/(precision+recall)
    # return f1-score
    return f1_score.round(3)


In [59]:
def calculate_accuracy(data_set, target_column, pred_column):
    '''
    params:
    data_set -> Pandas DataFrame for which the accuracy has to be calculated
    target_column -> The name of the target_column in the data_set
    pred_column -> The name of the pred_column in the dataset
    returns:
    accuracy_score (floating_type)
    '''
    total_number_of_datapoints  = data_set.shape[0]
    total_correctly_classified = data_set[data_set[target_column] == data_set[pred_column]].shape[0]
    accuracy = total_correctly_classified/total_number_of_datapoints
    return round(accuracy,3)

# Function to calculate AUC

In [60]:
from tqdm import tqdm
def calculate_auc(dataset, target_column, proba_column, positive_class_label):
    '''
    params:
    dataset -> Pandas DataFrame on which AUC has to be calculated
    target_column -> The name of the target column in the dataset
    proba_column -> The name of the proba_scores_column in th dataset
    positive_class_label -> The type of the class_label  for which you need to calculate AUC(positive here refers to class of interest)
    '''
    # Identify the unique thresholds and sort them in descending order
    thresholds = sorted(list(dataset[proba_column].unique()),reverse=True)
    # initialize FPR AND TPR as lists
    FPR = list()
    TPR = list()
    # for each threshold
    for threshold in tqdm(thresholds):
        # caclculate the y_pred based on the threshold
        dataset['y_pred'] = dataset[proba_column].apply(predict_on_probabalities, args=(threshold, ))
        # calculate number of actual_non_positive_label_points
        total_actual_negative_points = dataset[dataset[target_column]!=positive_class_label].shape[0]
        # calculate number of actual_positive_label_points
        total_actual_positive_points = dataset[dataset[target_column]==positive_class_label].shape[0]
        # calculate true_positive_rate
        true_positive_rate = dataset[(dataset[target_column]==positive_class_label) & (dataset['y_pred']==positive_class_label)].shape[0]/total_actual_positive_points
        # calculate false_positive rate
        false_positive_rate = dataset[(dataset[target_column]!=positive_class_label) & (dataset['y_pred']==positive_class_label)].shape[0]/total_actual_negative_points
        # append the false_positive_rate to FPR
        FPR.append(false_positive_rate)
        # append the true_positive_rate to TPR
        TPR.append(true_positive_rate)
    # convert FPR AND TPR list to numpy array
    tpr_array = np.array(TPR)
    fpr_array = np.array(FPR)
    # calculate area under the curve by integrating tpr_array wrt fpr_array using np.trapz
    auc = np.trapz(tpr_array, fpr_array)
    # return the auc_score
    return round(auc,3)


<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

In [61]:
# loading the dataset
data_set = pd.read_csv('5_a.csv')
# overview of dataset
print('first 5 rows of the dataset\n',data_set.head())
# datatset columns
print('\nDataset columns are\n',data_set.columns.values)
# target distribution
print('\n The Target distribution\n',data_set['y'].value_counts())

first 5 rows of the dataset
      y     proba
0  1.0  0.637387
1  1.0  0.635165
2  1.0  0.766586
3  1.0  0.724564
4  1.0  0.889199

Dataset columns are
 ['y' 'proba']

 The Target distribution
 1.0    10000
0.0      100
Name: y, dtype: int64


* The dataset is highly imbalanced with # +ve points >> # -ve points

In [62]:
# set the threshold for predicted yq = 1.0
threshold_proba = 0.5
# create a y_pred column using the proba column and threshold value
data_set['y_pred'] = data_set.proba.apply(predict_on_probabalities, args= (threshold_proba,))
# overview of updated dataset
print(data_set.head())

     y     proba  y_pred
0  1.0  0.637387     1.0
1  1.0  0.635165     1.0
2  1.0  0.766586     1.0
3  1.0  0.724564     1.0
4  1.0  0.889199     1.0


In [63]:
# confusion matrix
confusion = create_confusion_matrix(data_set,'y','y_pred')
confusion

Unnamed: 0_level_0,Unnamed: 1_level_0,actual,actual
Unnamed: 0_level_1,Unnamed: 1_level_1,0.0,1.0
pred,0.0,0.0,0.0
pred,1.0,100.0,10000.0


In [64]:
# calculate f1-score
f1_score = calculate_f1_score(confusion, class_label = 1.0)
f1_score

0.995

In [65]:
# calculate accuracy
accuracy = calculate_accuracy(data_set, target_column = 'y', pred_column='y_pred')
accuracy

0.99

In [66]:
# ROC and AUC
data_set_new = data_set.drop('y_pred', axis=1, inplace=False)
area_under_the_curve = calculate_auc(data_set_new, target_column = 'y', proba_column = 'proba',positive_class_label=1.0)
area_under_the_curve

100%|██████████| 10100/10100 [01:13<00:00, 136.72it/s]


0.488

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$


In [67]:
# loading the dataset
data_set = pd.read_csv('5_b.csv')
# overview of dataset
print('first 5 rows of the dataset\n',data_set.head())
# datatset columns
print('\nDataset columns are\n',data_set.columns.values)
# target distribution
print('\n The Target distribution\n',data_set['y'].value_counts())

first 5 rows of the dataset
      y     proba
0  0.0  0.281035
1  0.0  0.465152
2  0.0  0.352793
3  0.0  0.157818
4  0.0  0.276648

Dataset columns are
 ['y' 'proba']

 The Target distribution
 0.0    10000
1.0      100
Name: y, dtype: int64


In [68]:
# set the threshold for predicted yq = 1.0
threshold_proba = 0.5
# create a y_pred column using the proba column and threshold value
data_set['y_pred'] = data_set.proba.apply(predict_on_probabalities, args= (threshold_proba,))
# overview of updated dataset
print(data_set.head())

     y     proba  y_pred
0  0.0  0.281035     0.0
1  0.0  0.465152     0.0
2  0.0  0.352793     0.0
3  0.0  0.157818     0.0
4  0.0  0.276648     0.0


In [69]:
# confusion matrix
confusion = create_confusion_matrix(data_set,'y','y_pred')
confusion

Unnamed: 0_level_0,Unnamed: 1_level_0,actual,actual
Unnamed: 0_level_1,Unnamed: 1_level_1,0.0,1.0
pred,0.0,9761.0,45.0
pred,1.0,239.0,55.0


In [70]:
# calculate f1-score
f1_score = calculate_f1_score(confusion, class_label = 1.0)
f1_score

0.279

In [71]:
# calculate accuracy
accuracy = calculate_accuracy(data_set, target_column = 'y', pred_column='y_pred')
accuracy

0.972

In [72]:
# ROC and AUC
data_set_new = data_set.drop('y_pred', axis=1, inplace=False)
area_under_the_curve = calculate_auc(data_set_new, target_column = 'y', proba_column = 'proba',positive_class_label=1.0)
area_under_the_curve

100%|██████████| 10100/10100 [01:12<00:00, 139.37it/s]


0.938

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [73]:
# loading the dataset
data_set = pd.read_csv('5_c.csv')
# overview of dataset
print('first 5 rows of the dataset\n',data_set.head())
# datatset columns
print('\nDataset columns are\n',data_set.columns.values)
# target distribution
print('\n The Target distribution\n',data_set['y'].value_counts())

first 5 rows of the dataset
    y      prob
0  0  0.458521
1  0  0.505037
2  0  0.418652
3  0  0.412057
4  0  0.375579

Dataset columns are
 ['y' 'prob']

 The Target distribution
 0    1805
1    1047
Name: y, dtype: int64


* dataset is imbalanced with # -ve points > # +ve points

In [74]:
# find all unique proba and sort in decending order
thresholds = sorted(list(data_set['prob'].unique()),reverse=True)
# set the metric_value initially to very high value by running the code with max,Found out that max metric value is 523000, hence set  metricvaue to 6 lakh
metric_value = 600000
# for each threshold in thresholds
for threshold in tqdm(thresholds):
    # predict y_pred using the proba and threshold 
    data_set['y_pred'] = data_set['prob'].apply(predict_on_probabalities, args=(threshold, ))
    # calculate number of false_negatives
    false_negatives = data_set[(data_set['y']== 1.0) & (data_set['y_pred']== 0.0)].shape[0]
    # calculate number of false_positives
    false_positives = data_set[(data_set['y']==0.0) & (data_set['y_pred']==1.0)].shape[0]
    # calculate the custom metrics
    current_metric = 500 * false_negatives + 100 * false_positives
    # if the minimum value is current_metric
    if min(current_metric, metric_value) == current_metric:
        # update metric_value and optimum_threshold_probablity
        metric_value = current_metric
        optimum_threshold_probability = threshold
# print the optimum_threshold_probability
optimum_threshold_probability



100%|██████████| 2791/2791 [00:07<00:00, 368.00it/s]


0.2300390278970873

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [79]:
# loading the dataset
data_set = pd.read_csv('5_d.csv')
# overview of dataset
print('first 5 rows of the dataset\n',data_set.head())
# datatset columns
print('\nDataset columns are\n',data_set.columns.values)

first 5 rows of the dataset
        y   pred
0  101.0  100.0
1  120.0  100.0
2  131.0  113.0
3  164.0  125.0
4  154.0  152.0

Dataset columns are
 ['y' 'pred']


In [80]:
# mean square error
# create a squared_errors column in the data_set
data_set['squared_errors'] = (data_set['y'] - data_set['pred'])**2
# calculate the mean_square_error
mean_square_error = data_set['squared_errors'].sum()/data_set.shape[0]
# print MSE
print('mean_squared_error: ',mean_square_error)

mean_squared_error:  177.16569974554707


In [81]:
# mean absolute percentage error (MAPE)
# create a column of abs_error in the data_set
data_set['abs_error'] = abs(data_set['pred']-data_set['y'])
# calculate MAPE
mape = data_set['abs_error'].sum()/data_set['y'].sum()
# print MAPE
print('MAPE: ',mape)

MAPE:  0.1291202994009687


In [83]:
# r^2 error
# create a simple_squared_errors column in the data_set to account for simple_mean_model
data_set['simple_squared_errors'] = (data_set['y'] - data_set['y'].mean())**2
#calculate sum of squared errors for the simple_mean_model
ss_total = data_set['simple_squared_errors'].sum()
# calculate sum of squared errors for the actual_model
ss_residue = data_set['squared_errors'].sum()
# calculate r2 score
r2_score = 1-(ss_residue/ss_total)
# print r2_score
print('r2_score: ',r2_score)

r2_score:  0.9563582786990937
