In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix

##### This notebook imports the predictions made by the majority-rules voting system and provides a function to quickly look at a random post and see how the models voted.

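There are also DataFrames for each of the six categories of misclassified posts, one for every predicted/actual pairing of Biology, Chemistry, and Physics where the two labels differ.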

Additionally, the `print_metrics` function prints classification metrics (accuracy, misclassification rate, sensitivity, specificity, and precision) for each individual class, using counts taken from the confusion matrix.

In [10]:
# Load the saved predictions of the voting ensemble. Later cells read these columns:
# 'vote' (the voted label), 'y_true' (the actual label), 'nb' / 'svc' / 'ada'
# (each model's individual vote), and 'title' / 'selftext' (the post content).
predictions = pd.read_csv('data/BCP_predictions.csv')

In [11]:
# Keep only the rows where the voted label disagrees with the actual label.
mislabeled = predictions.loc[predictions['vote'].ne(predictions['y_true'])]

### Function Definitions

In [28]:
def print_post(df, random_state=None):
    """Print one randomly sampled post together with how each model voted.

    Parameters
    ----------
    df : pandas.DataFrame
        Posts to sample from. Must contain the columns 'vote', 'y_true',
        'nb', 'svc', 'ada', 'title' and 'selftext'.
    random_state : int, optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to get the same
        post on every run; the default (None) keeps the sampling random.

    Raises
    ------
    ValueError
        If ``df`` has no rows, since there is nothing to sample.
    """
    if len(df) == 0:
        raise ValueError('print_post needs at least one row to sample from')

    post = df.sample(n=1, random_state=random_state)
    bcp_map = {0 : 'Biology', 1 : 'Chemistry', 2 : 'Physics'}

    labeled = post['vote'].map(bcp_map).item()
    actual = post['y_true'].map(bcp_map).item()
    title = post['title'].item()
    text = post['selftext'].item()

    # Pull plain scalars out of the one-row frame: formatting a Series with %d
    # relies on implicit Series -> int conversion, which pandas has deprecated.
    nb_vote = post['nb'].item()
    svc_vote = post['svc'].item()
    ada_vote = post['ada'].item()

    print('Votes : NB - %d SVC - %d Ada - %d' % (nb_vote, svc_vote, ada_vote))
    print(f'Labeled as : {labeled}')
    print(f'Posted in  : {actual}')
    print(f'TITLE: {title}')
    print(f'TEXT : {text}')
    print('-----------------------------------------------------')

In [57]:
def print_metrics(name, t_pos, t_neg, f_pos, f_neg):
    """Print classification metrics for one class from its confusion counts.

    Parameters
    ----------
    name : str
        Class name used in the printed header.
    t_pos, t_neg, f_pos, f_neg : number or iterable of numbers
        True-positive, true-negative, false-positive and false-negative
        counts. Each may be a single count or a collection of counts, since
        several confusion-matrix cells can fall under the same category;
        collections are summed.

    Notes
    -----
    Prints accuracy, misclassification rate, sensitivity, specificity and
    precision, each rounded to 4 decimal places. A metric whose denominator
    is zero is printed as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    def _total(counts):
        # A bare count is not iterable, so sum() raises TypeError; use it as is.
        try:
            return sum(counts)
        except TypeError:
            return counts

    def _ratio(numerator, denominator):
        # An undefined ratio (e.g. no predicted positives) is reported as NaN.
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    t_pos = _total(t_pos)
    t_neg = _total(t_neg)
    f_pos = _total(f_pos)
    f_neg = _total(f_neg)
    total = t_pos + t_neg + f_pos + f_neg

    metrics_dict = {
        'Accuracy' : _ratio(t_pos + t_neg, total),
        'Misclassification' : _ratio(f_pos + f_neg, total),
        'Sensitivity' : _ratio(t_pos, t_pos + f_neg),
        'Specificity' : _ratio(t_neg, t_neg + f_pos),
        'Precision' : _ratio(t_pos, t_pos + f_pos)
        }

    print(f'Metrics for {name} :')
    for met, value in metrics_dict.items():
        print(f'{met} : {round(value, 4)}')

# Mislabeled Post Content

In [29]:
# One frame per (predicted, actual) pairing among the mislabeled posts.
# Label encoding: 0 = Biology, 1 = Chemistry, 2 = Physics.
pred_bio_true_phy = mislabeled[mislabeled['vote'].eq(0) & mislabeled['y_true'].eq(2)]
pred_bio_true_chm = mislabeled[mislabeled['vote'].eq(0) & mislabeled['y_true'].eq(1)]
pred_chm_true_phy = mislabeled[mislabeled['vote'].eq(1) & mislabeled['y_true'].eq(2)]
pred_chm_true_bio = mislabeled[mislabeled['vote'].eq(1) & mislabeled['y_true'].eq(0)]
pred_phy_true_chm = mislabeled[mislabeled['vote'].eq(2) & mislabeled['y_true'].eq(1)]
pred_phy_true_bio = mislabeled[mislabeled['vote'].eq(2) & mislabeled['y_true'].eq(0)]

In [30]:
# Show one random post that was labeled Biology but posted in Chemistry.
# The post is sampled at random, so the output changes on every run.
print_post(pred_bio_true_chm)

Votes : NB - 0 SVC - 0 Ada - 1
Labeled as : Biology
Posted in  : Chemistry
TITLE: Chemist-turned-intelligence expert Rod Schoonover on climate change and COVID-19
TEXT : nan
-----------------------------------------------------


In [31]:
# Show one random post that was labeled Biology but posted in Physics.
# The post is sampled at random, so the output changes on every run.
print_post(pred_bio_true_phy)

Votes : NB - 2 SVC - 0 Ada - 0
Labeled as : Biology
Posted in  : Physics
TITLE: is dE=dE cos theta i^ +d E sin theta j^ correct?
TEXT : nan
-----------------------------------------------------


In [32]:
# Show one random post that was labeled Chemistry but posted in Biology.
# The post is sampled at random, so the output changes on every run.
print_post(pred_chm_true_bio)

Votes : NB - 1 SVC - 1 Ada - 1
Labeled as : Chemistry
Posted in  : Biology
TITLE: Does the pH location for a receipe of a buffer matter?
TEXT : Hi guy, I am trying to make two buffers with the following recipe:

1. 50 mM HEPES, 700 mM NaCl,12.5 mM CaCl2, pH 7.4
2. 100 mM Tris, pH 7.4, 150 mM NaCl, 1 mM CaCl2, 0.5 mM MgCl2, 0.1% Nonidet P-40

I notice that the pHs are at different positions. I wonder if that difference matters? My understanding is they both indicate the pH of the final solution with all components. I just want to make sure I am correct. Thanks.

[View Poll](https://www.reddit.com/poll/hdfzbi)
-----------------------------------------------------


In [33]:
# Show one random post that was labeled Chemistry but posted in Physics.
# The post is sampled at random, so the output changes on every run.
print_post(pred_chm_true_phy)

Votes : NB - 1 SVC - 1 Ada - 0
Labeled as : Chemistry
Posted in  : Physics
TITLE: does any one know what is the weird symbol called
TEXT : nan
-----------------------------------------------------


In [34]:
# Show one random post that was labeled Physics but posted in Biology.
# The post is sampled at random, so the output changes on every run.
print_post(pred_phy_true_bio)

Votes : NB - 2 SVC - 0 Ada - 2
Labeled as : Physics
Posted in  : Biology
TITLE: What exactly is Computational Neuroscience?
TEXT : I'm thinking of researching in that field of study, but I would like to know some more about the topic. Could someone point me in the right direction?
-----------------------------------------------------


In [35]:
# Show one random post that was labeled Physics but posted in Chemistry.
# The post is sampled at random, so the output changes on every run.
print_post(pred_phy_true_chm)

Votes : NB - 2 SVC - 2 Ada - 1
Labeled as : Physics
Posted in  : Chemistry
TITLE: ISOLDE reveals fundamental property of rarest element on Earth
TEXT : nan
-----------------------------------------------------


# Classification Metrics

In [40]:
# Unpack the 3x3 confusion matrix (rows = actual class, columns = predicted class)
# into one count per cell, in row-major order.
# Naming: t = true, f = false; b = biology, c = chemistry, p = physics.
# fc_b means "falsely labeled chemistry, actually biology": it is a false positive for
# chemistry, a false negative for biology, and a true negative for physics.
# labels is passed explicitly so the matrix is always 3x3 in 0/1/2 order; without it,
# a class missing from both columns would shrink the matrix and break the unpacking.

tb, fc_b, fp_b, fb_c, tc, fp_c, fb_p, fc_p, tp = confusion_matrix(
    predictions['y_true'], predictions['vote'], labels=[0, 1, 2]
).ravel()

In [58]:
# Biology vs. rest: true negatives are the cells where biology is neither
# the predicted nor the actual class.
print_metrics(
    "Biology",
    t_pos=tb,
    t_neg=[tc, tp, fc_p, fp_c],
    f_pos=[fb_c, fb_p],
    f_neg=[fc_b, fp_b],
)

Metrics for Biology :
Accuracy : 0.9186
Misclassification : 0.0814
Sensitivity : 0.903
Specificity : 0.9274
Precision : 0.8755


In [59]:
# Chemistry vs. rest: true negatives are the cells where chemistry is neither
# the predicted nor the actual class.
print_metrics(
    "Chemistry",
    t_pos=tc,
    t_neg=[tb, tp, fb_p, fp_b],
    f_pos=[fc_b, fc_p],
    f_neg=[fb_c, fp_c],
)

Metrics for Chemistry :
Accuracy : 0.9076
Misclassification : 0.0924
Sensitivity : 0.8623
Specificity : 0.93
Precision : 0.8585


In [60]:
# Physics vs. rest: true negatives are the cells where physics is neither
# the predicted nor the actual class.
print_metrics(
    "Physics",
    t_pos=tp,
    t_neg=[tc, tb, fc_b, fb_c],
    f_pos=[fp_b, fp_c],
    f_neg=[fc_p, fb_p],
)

Metrics for Physics :
Accuracy : 0.9193
Misclassification : 0.0807
Sensitivity : 0.8485
Specificity : 0.9509
Precision : 0.8851
