In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, RocCurveDisplay, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tqdm import tqdm

# Read in data

In [19]:
feature_names_list = ['Emin','smiles_1 sa_score','smiles_1 sc_score','smiles_1 ra_score','smiles_1 syba_score']
data = pd.read_csv(f'../Computing Mordred Features for QM9/qm9_mordred.csv',usecols=feature_names_list + ['Reported'])
data = data[data['smiles_1 sa_score'] != 'ERROR'] # remove error entries

# Define function to get ROC AUC of logistic regression

In [21]:
def get_ROC_AUC(feature_names,print_stats=True):
    roc_values = []
    coef_values = []
    
    all_test = []
    all_predict = []
    
    feature_values = data[feature_names].to_numpy()
    labels = data['Reported'].to_numpy().astype(int)
    
    n_folds = 10
    
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=1)
    
    
    for i, (train_index, test_index) in tqdm(enumerate(kf.split(feature_values)),desc=f'{feature_names} Folds'):
        train_features, train_labels = feature_values[train_index], labels[train_index]
        test_features, test_labels = feature_values[test_index], labels[test_index]
        
    
        model = LogisticRegression(n_jobs=-1)
        model.fit(train_features,train_labels)
        predictions = model.predict_proba(test_features)
        
        all_test += [test_labels]
        all_predict += [predictions[:,1]]
        
        fpr, tpr, thresholds = roc_curve(test_labels, predictions[:,1])
        roc_auc = auc(fpr, tpr)
        
        roc_values += [roc_auc]
        coef_values += [model.coef_]

    
    mean_roc = np.mean(roc_values)
    std_roc = np.std(roc_values)
    mean_coeff = np.mean(coef_values,axis=0)
    std_coeff = np.std(coef_values,axis=0)
    

    # compute precision and recall
    
    
    precision, recall, thresholds = precision_recall_curve(np.concatenate(all_test),np.concatenate(all_predict))
    
    # set a target recall
    
    recall_target = 0.5
    
    distance_from_target = np.abs(recall - recall_target)
    index_min_distance_from_target = np.argmin(distance_from_target)
    
    recall = recall[index_min_distance_from_target-1:index_min_distance_from_target+1]
    precision = precision[index_min_distance_from_target-1:index_min_distance_from_target+1]
    
    
    if print_stats == True:
        print(f'For feature names {feature_names}')
        print(f'The mean ROC AUC is {mean_roc}')
        print(f'The std dev ROC AUC is {std_roc}')
        print(f'The mean coefficients are {mean_coeff}')
        print(f'The std dev coefficients are {std_coeff}')

    return mean_roc, std_roc, mean_coeff, std_coeff, precision, recall, thresholds



In [25]:
for feature_names in feature_names_list:
    mean_roc, std_roc, mean_coeff, std_coeff, precision, recall, thresholds = get_ROC_AUC([feature_names])
    print(precision, recall)
    mean_roc, std_roc, mean_coeff, std_coeff, precision, recall, thresholds = get_ROC_AUC([feature_names,'Emin'])
    print(precision, recall)

['Emin'] Folds: 10it [00:01,  5.31it/s]


For feature names ['Emin']
The mean ROC AUC is 0.8428462808518387
The std dev ROC AUC is 0.0028288749692470584
The mean coefficients are [[-39.32889261]]
The std dev coefficients are [[0.06954774]]
[0.68252298 0.68250846] [0.50001675 0.49998325]


['Emin', 'Emin'] Folds: 10it [00:07,  1.42it/s]


For feature names ['Emin', 'Emin']
The mean ROC AUC is 0.8428462808518387
The std dev ROC AUC is 0.0028288749692470584
The mean coefficients are [[-20.44868222 -20.44868222]]
The std dev coefficients are [[0.03809001 0.03809001]]
[0.68252298 0.68250846] [0.50001675 0.49998325]


['smiles_1 sa_score'] Folds: 10it [00:02,  4.71it/s]


For feature names ['smiles_1 sa_score']
The mean ROC AUC is 0.8899098151851931
The std dev ROC AUC is 0.002401955055267746
The mean coefficients are [[-2.27638609]]
The std dev coefficients are [[0.00357676]]
[0.75439838 0.75438596] [0.50001675 0.49998325]


['smiles_1 sa_score', 'Emin'] Folds: 10it [00:08,  1.15it/s]


For feature names ['smiles_1 sa_score', 'Emin']
The mean ROC AUC is 0.9117578347477784
The std dev ROC AUC is 0.0018265383433587632
The mean coefficients are [[ -1.91055107 -24.66812502]]
The std dev coefficients are [[0.00388212 0.0766648 ]]
[0.82606289 0.82605326] [0.50001675 0.49998325]


['smiles_1 sc_score'] Folds: 10it [00:01,  5.04it/s]


For feature names ['smiles_1 sc_score']
The mean ROC AUC is 0.6488217678570067
The std dev ROC AUC is 0.00715804323799224
The mean coefficients are [[-1.73703619]]
The std dev coefficients are [[0.00944541]]
[0.35810795 0.35809254] [0.50001675 0.49998325]


['smiles_1 sc_score', 'Emin'] Folds: 10it [00:08,  1.14it/s]


For feature names ['smiles_1 sc_score', 'Emin']
The mean ROC AUC is 0.8552153473562703
The std dev ROC AUC is 0.0031401701506483274
The mean coefficients are [[ -1.45630622 -38.50147922]]
The std dev coefficients are [[0.00931659 0.07265438]]
[0.69498393 0.69496973] [0.50001675 0.49998325]


['smiles_1 ra_score'] Folds: 10it [00:01,  5.55it/s]


For feature names ['smiles_1 ra_score']
The mean ROC AUC is 0.8076192033849405
The std dev ROC AUC is 0.004316293264249086
The mean coefficients are [[9.09011471]]
The std dev coefficients are [[0.04121817]]
[0.55038359 0.55036701] [0.50001675 0.49998325]


['smiles_1 ra_score', 'Emin'] Folds: 10it [00:07,  1.29it/s]


For feature names ['smiles_1 ra_score', 'Emin']
The mean ROC AUC is 0.8787811080357025
The std dev ROC AUC is 0.002561338242785622
The mean coefficients are [[  6.87102928 -33.43717272]]
The std dev coefficients are [[0.04302317 0.07207243]]
[0.73681612 0.73680312] [0.50001675 0.49998325]


['smiles_1 syba_score'] Folds: 10it [00:01,  5.03it/s]


For feature names ['smiles_1 syba_score']
The mean ROC AUC is 0.8188101282809537
The std dev ROC AUC is 0.0025120050481810255
The mean coefficients are [[0.07278361]]
The std dev coefficients are [[0.00011716]]
[0.63079134 0.63077573] [0.50001675 0.49998325]


['smiles_1 syba_score', 'Emin'] Folds: 10it [00:08,  1.14it/s]

For feature names ['smiles_1 syba_score', 'Emin']
The mean ROC AUC is 0.8977586390463828
The std dev ROC AUC is 0.001465299253956221
The mean coefficients are [[  0.06156411 -34.3962393 ]]
The std dev coefficients are [[9.64811682e-05 6.73725730e-02]]
[0.76869977 0.76868786] [0.50001675 0.49998325]



