In [13]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, RocCurveDisplay, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tqdm import tqdm

# Read in data

In [9]:
data = pd.read_csv(f'../machine_learning/mordred_qm9/qm9_features_mordred_SMILES_1.csv')
data = data[data['smiles_1 sa_score'] != 'ERROR'] # remove error entries

  exec(code_obj, self.user_global_ns, self.user_ns)


# Define function to get ROC AUC of logistic regression

In [18]:
def get_ROC_AUC(feature_names,print_stats=True):
    roc_values = []
    coef_values = []
    
    all_test = []
    all_predict = []
    
    feature_values = data[feature_names].to_numpy()
    labels = data['Reported'].to_numpy().astype(int)
    
    n_folds = 10
    
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=1)
    
    
    for i, (train_index, test_index) in tqdm(enumerate(kf.split(feature_values)),desc=f'{feature_names} Folds'):
        train_features, train_labels = feature_values[train_index], labels[train_index]
        test_features, test_labels = feature_values[test_index], labels[test_index]
        
    
        model = LogisticRegression(n_jobs=-1)
        model.fit(train_features,train_labels)
        predictions = model.predict_proba(test_features)
        
        all_test += [test_labels]
        all_predict += [predictions[:,1]]
        
        fpr, tpr, thresholds = roc_curve(test_labels, predictions[:,1])
        roc_auc = auc(fpr, tpr)
        
        roc_values += [roc_auc]
        coef_values += [model.coef_]

    
    mean_roc = np.mean(roc_values)
    std_roc = np.std(roc_values)
    mean_coeff = np.mean(coef_values,axis=0)
    std_coeff = np.std(coef_values,axis=0)
    

    # compute precision and recall
    
    
    precision, recall, thresholds = precision_recall_curve(np.concatenate(all_test),np.concatenate(all_predict))
    
    # set a target recall
    
    recall_target = 0.5
    
    distance_from_target = np.abs(recall - recall_target)
    index_min_distance_from_target = np.argmin(distance_from_target)
    
    recall = recall[index_min_distance_from_target-1:index_min_distance_from_target+1]
    precision = precision[index_min_distance_from_target-1:index_min_distance_from_target+1]
    
    
    if print_stats == True:
        print(f'For feature names {feature_names}')
        print(f'The mean ROC AUC is {mean_roc}')
        print(f'The std dev ROC AUC is {std_roc}')
        print(f'The mean coefficients are {mean_coeff}')
        print(f'The std dev coefficients are {std_coeff}')

    return mean_roc, std_roc, mean_coeff, std_coeff, precision, recall, thresholds



In [19]:
feature_names_list = ['g4mp2_above_min','smiles_1 sa_score','smiles_1 sc_score','smiles_1 ra_score','smiles_1 syba_score']


In [22]:
feature_names_list = ['g4mp2_above_min','smiles_1 sa_score','smiles_1 sc_score','smiles_1 ra_score','smiles_1 syba_score']

for feature_names in feature_names_list:
    mean_roc, std_roc, mean_coeff, std_coeff, precision, recall, thresholds = get_ROC_AUC([feature_names])
    print(precision, recall)
    mean_roc, std_roc, mean_coeff, std_coeff, precision, recall, thresholds = get_ROC_AUC([feature_names,'g4mp2_above_min'])
    print(precision, recall)


['g4mp2_above_min'] Folds: 10it [00:07,  1.35it/s]


For feature names ['g4mp2_above_min']
The mean ROC AUC is 0.8419197321612721
The std dev ROC AUC is 0.0034653919440633552
The mean coefficients are [[-39.28544725]]
The std dev coefficients are [[0.0742375]]
[0.67872747 0.67871285 0.67874374 0.67872912 0.67871449 0.67869987] [0.50006706 0.50003353 0.50003353 0.5        0.49996647 0.49993294]


['g4mp2_above_min', 'g4mp2_above_min'] Folds: 10it [00:08,  1.22it/s]


For feature names ['g4mp2_above_min', 'g4mp2_above_min']
The mean ROC AUC is 0.8419197321612721
The std dev ROC AUC is 0.0034653919440633552
The mean coefficients are [[-20.42771503 -20.42771503]]
The std dev coefficients are [[0.0404189 0.0404189]]
[0.67872747 0.67871285 0.67874374 0.67872912 0.67871449 0.67869987] [0.50006706 0.50003353 0.50003353 0.5        0.49996647 0.49993294]


['smiles_1 sa_score'] Folds: 10it [00:07,  1.28it/s]


For feature names ['smiles_1 sa_score']
The mean ROC AUC is 0.88835780325496
The std dev ROC AUC is 0.002325344579935834
The mean coefficients are [[-2.25759427]]
The std dev coefficients are [[0.00425708]]
[0.75502455 0.75501215 0.75499975 0.75498734 0.75502557 0.75501317] [0.5001006  0.50006706 0.50003353 0.5        0.5        0.49996647]


['smiles_1 sa_score', 'g4mp2_above_min'] Folds: 10it [00:10,  1.09s/it]


For feature names ['smiles_1 sa_score', 'g4mp2_above_min']
The mean ROC AUC is 0.910550686304717
The std dev ROC AUC is 0.0014481110938918806
The mean coefficients are [[ -1.89047849 -24.7129745 ]]
The std dev coefficients are [[0.00546085 0.11717246]]
[0.82502628 0.8250166  0.82500692 0.82499723 0.82504288 0.82508853] [0.5001006  0.50006706 0.50003353 0.5        0.5        0.5       ]


['smiles_1 sc_score'] Folds: 10it [00:04,  2.28it/s]


For feature names ['smiles_1 sc_score']
The mean ROC AUC is 0.6483632637842071
The std dev ROC AUC is 0.006163052231359871
The mean coefficients are [[-1.73009886]]
The std dev coefficients are [[0.00809657]]
[0.35643091 0.35643943 0.35644795 0.35643257 0.35644109 0.35644961] [0.50003353 0.50003353 0.50003353 0.5        0.5        0.5       ]


['smiles_1 sc_score', 'g4mp2_above_min'] Folds: 10it [00:09,  1.06it/s]


For feature names ['smiles_1 sc_score', 'g4mp2_above_min']
The mean ROC AUC is 0.8543894147988507
The std dev ROC AUC is 0.003280940641378941
The mean coefficients are [[ -1.45311367 -38.45692123]]
The std dev coefficients are [[0.00999687 0.06467077]]
[0.692197   0.69222913 0.69226127 0.69224698 0.69227912 0.69231126] [0.50003353 0.50003353 0.50003353 0.5        0.5        0.5       ]


['smiles_1 ra_score'] Folds: 10it [00:02,  4.68it/s]


For feature names ['smiles_1 ra_score']
The mean ROC AUC is 0.8070373543584657
The std dev ROC AUC is 0.003836980847997598
The mean coefficients are [[9.00564064]]
The std dev coefficients are [[0.03864358]]
[0.55043738 0.5504577  0.55044111 0.55042451 0.55040791 0.55039132] [0.50006706 0.50006706 0.50003353 0.5        0.49996647 0.49993294]


['smiles_1 ra_score', 'g4mp2_above_min'] Folds: 10it [00:08,  1.19it/s]


For feature names ['smiles_1 ra_score', 'g4mp2_above_min']
The mean ROC AUC is 0.8778911321195866
The std dev ROC AUC is 0.0022915677435138447
The mean coefficients are [[  6.8004573  -33.44137032]]
The std dev coefficients are [[0.04129499 0.08807094]]
[0.73205695 0.73209289 0.73212883 0.73211568 0.73210252 0.73208937] [0.50003353 0.50003353 0.50003353 0.5        0.49996647 0.49993294]


['smiles_1 syba_score'] Folds: 10it [00:02,  3.79it/s]


For feature names ['smiles_1 syba_score']
The mean ROC AUC is 0.8176904115645011
The std dev ROC AUC is 0.0036157380866182797
The mean coefficients are [[0.07262708]]
The std dev coefficients are [[0.00015579]]
[0.62825968 0.62828615 0.62827049 0.62825482 0.6282813  0.62830777] [0.50006706 0.50006706 0.50003353 0.5        0.5        0.5       ]


['smiles_1 syba_score', 'g4mp2_above_min'] Folds: 10it [00:09,  1.02it/s]

For feature names ['smiles_1 syba_score', 'g4mp2_above_min']
The mean ROC AUC is 0.8970157717681086
The std dev ROC AUC is 0.001365301188746595
The mean coefficients are [[  0.06144838 -34.39281839]]
The std dev coefficients are [[0.00017296 0.08571059]]
[0.76593559 0.76597493 0.76601428 0.76600226 0.76604161 0.76602959] [0.50003353 0.50003353 0.50003353 0.5        0.5        0.49996647]





# vary training set size 

In [22]:
from collections import defaultdict
from sklearn.model_selection import train_test_split


def get_subsampled_train_val_roc_auc(feature_names):

    n_train_sizes = [50,250,1250,6250,31250,100000]
    n_trials = 30
    
    feature_values = data[feature_names].to_numpy()
    labels = data['Reported'].to_numpy().astype(int)

    results = defaultdict(list)
    for n_train_size in n_train_sizes:
        trial_accuracies = []
        for trial in range(n_trials):
            X_train, X_test, y_train, y_test = train_test_split(feature_values, labels, train_size = n_train_size, stratify=labels, random_state=trial)
            model = LogisticRegression(n_jobs=-1)
            model.fit(X_train,y_train)
            predictions = model.predict_proba(X_test)
            fpr, tpr, thresholds = roc_curve(y_test, predictions[:,1])
            roc_auc = auc(fpr, tpr)
            trial_accuracies += [roc_auc]
        
        results[f'{feature_names} {n_train_size} mean'] += [np.mean(trial_accuracies)]
        results[f'{feature_names} {n_train_size} std'] += [np.std(trial_accuracies)]
    return results
            

In [23]:
for feature_names in feature_names_list[:1]:
    print(get_subsampled_train_val_roc_auc([feature_names]))
    print(get_subsampled_train_val_roc_auc([feature_names,'g4mp2_above_min']))

defaultdict(<class 'list'>, {'50 mean': [0.8419091633207749], '50 std': [2.6445528991379074e-05], '250 mean': [0.8419222693980459], '250 std': [6.927197285888112e-05], '1250 mean': [0.8419570813555878], '1250 std': [0.0001358618754090205], '6250 mean': [0.8418936520955917], '6250 std': [0.00022674458954047045], '31250 mean': [0.8420561614983514], '31250 std': [0.0008315784702030514], '100000 mean': [0.8417169836005608], '100000 std': [0.002530050388333317]})
defaultdict(<class 'list'>, {'50 mean': [0.8419091633207749], '50 std': [2.6445528991379074e-05], '250 mean': [0.8419222693980459], '250 std': [6.927197285888112e-05], '1250 mean': [0.8419570813555878], '1250 std': [0.0001358618754090205], '6250 mean': [0.8418936520955917], '6250 std': [0.00022674458954047045], '31250 mean': [0.8420561614983514], '31250 std': [0.0008315784702030514], '100000 mean': [0.8417169836005608], '100000 std': [0.002530050388333317]})
