In [1]:
import pandas as pd
import numpy as np
import os

# read in qm9 data with Emin

In [2]:
# Location of the cleaned QM9 table; only the two columns used below are loaded.
qm9_csv_path = '../../Computing Emin for QM9/data/clean/qm9.csv'
data = pd.read_csv(qm9_csv_path, usecols=['Reported', 'Emin'])

# `Reported` serves as the label and `Emin` as the score that is thresholded later.
ground_truth, prediction = data['Reported'], data['Emin']

# get quick stats

In [3]:
from scipy import stats

conversion = 627.5 #kcal/mol per Ha

# Split the table on the `Reported` flag.
reported = data.query('`Reported` == True')
unreported = data.query('`Reported` == False')

# Median Emin of the rows flagged as not reported (same units as the Emin column).
median_emin_unreported = np.median(unreported['Emin'])

print(f"Median unreported emin = {median_emin_unreported*conversion} kcal/mol")

# Where that median falls within the Emin distribution of the reported rows.
print(f"Percentile of reported molecules for emin = {median_emin_unreported*conversion} kcal/mol is {stats.percentileofscore(reported['Emin'],median_emin_unreported)}")

Median unreported emin = 58.801769999995166 kcal/mol
Percentile of reported molecules for emin = 58.801769999995166 kcal/mol is 92.15797556252922


# define some helper functions

In [4]:
def get_accuracies(ground_truth, prediction_value, c):
    """Score a thresholded prediction against binary ground-truth labels.

    A sample is predicted positive when its ``prediction_value`` is less than
    or equal to the cutoff ``c``. Inputs are converted with ``np.asarray`` and
    compared position by position, so lists, NumPy arrays and pandas Series
    are all accepted.

    Parameters
    ----------
    ground_truth : array-like of bool or 0/1
        True labels. Entries equal to 1 (True) count as positives and entries
        equal to 0 (False) as negatives.
    prediction_value : array-like of numbers
        Per-sample value that is compared element-wise with ``c``.
    c : number
        Cutoff; values ``<= c`` are predicted positive.

    Returns
    -------
    tuple
        ``(precision, recall, tp, tn, fp, fn)`` where
        ``precision = tp / (tp + fp)`` and ``recall = tp / (tp + fn)``.
        A ratio whose denominator is zero is returned as ``nan`` (no
        RuntimeWarning is emitted).
    """
    truth = 1 * np.asarray(ground_truth)  # change bools to numbers
    predicted = 1 * (np.asarray(prediction_value) <= c)

    actual_pos = truth == 1
    actual_neg = truth == 0
    predicted_pos = predicted == 1
    predicted_neg = predicted == 0

    # Confusion-matrix counts.
    fp = np.sum(actual_neg & predicted_pos)
    fn = np.sum(actual_pos & predicted_neg)
    tp = np.sum(actual_pos & predicted_pos)
    tn = np.sum(actual_neg & predicted_neg)

    # Guard the 0/0 cases explicitly: a cutoff below every prediction value
    # gives tp + fp == 0, and labels with no positives give tp + fn == 0.
    precision = tp / (tp + fp) if (tp + fp) > 0 else np.nan
    recall = tp / (tp + fn) if (tp + fn) > 0 else np.nan

    return precision, recall, tp, tn, fp, fn

# get precision and recall as a function of the Emin cutoff

In [5]:
# These are Emin cutoffs (not to be confused with synthesizability) - units are in Ha
cutoffs = np.arange(0,1,0.01)

# Evaluate the Emin-based prediction at every cutoff; each entry is the
# (precision, recall, tp, tn, fp, fn) tuple returned by get_accuracies.
sweep = [get_accuracies(ground_truth, prediction, cutoff) for cutoff in cutoffs]

# Transpose the per-cutoff tuples into one list per metric.
(precision_list, recall_list,
 tp_list, tn_list, fp_list, fn_list) = (list(column) for column in zip(*sweep))

# One row per cutoff, columns in the order they are written to disk.
columns = {
    'Cutoff': cutoffs,
    'Precision': precision_list,
    'Recall': recall_list,
    'TP': tp_list,
    'TN': tn_list,
    'FP': fp_list,
    'FN': fn_list,
}
precision_recalls = pd.DataFrame(data=columns)

# Create the output folder if needed, then save the table without the index.
os.makedirs('precision_recalls', exist_ok=True)
precision_recalls.to_csv('precision_recalls/qm9_precision_recall_roc_smiles_1.csv', index=False)