# Table S3: p-values comparing models using the DeLong test

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from operator import itemgetter

# used to load HSM output
from scripts.utils import load_results

# uses code from https://github.com/yandexdataschool/roc_comparison
from scripts.compare_auc_delong_xu import delong_roc_test
def calc_pval(labels, model1, model2):
    """Return the DeLong-test p-value for the AUC difference of two models.

    Only positions where ``model2`` has a non-NaN prediction are compared,
    because external models only make predictions for a subset of the HSM data.

    Args:
        labels: array of ground-truth labels.
        model1: array of scores from the first model, aligned with ``labels``.
        model2: array of scores from the second model; NaN marks a missing
            prediction.

    Returns:
        10 raised to the [0, 0] entry of ``delong_roc_test``'s result
        (presumably that entry is log10 of the p-value -- confirm against
        scripts.compare_auc_delong_xu).
    """
    has_prediction = np.logical_not(np.isnan(model2))
    log10_p = delong_roc_test(
        labels[has_prediction],
        model1[has_prediction],
        model2[has_prediction],
    )[0, 0]
    return 10 ** log10_p


# for loading PSSM predictions
def load_pssm_predictions(d, domain):
    """Load the PSSM predictions for one domain as a 2 x N array.

    Args:
        d: directory containing ``<domain>_pssm_predictions.csv``.
        domain: domain name used to build the file name.

    Returns:
        numpy array of shape (2, N): row 0 is the 'Bound' column and row 1 is
        the 'PSSM likelihood' column.
    """
    columns = ['Bound', 'PSSM likelihood']
    path = os.path.join(d, '%s_pssm_predictions.csv' % domain)
    # read_csv's usecols ignores the order of the requested names and keeps the
    # file's column order, so reselect explicitly to guarantee that row 0 is
    # always the label and row 1 the score.
    df = pd.read_csv(path, usecols=columns)[columns]
    return df.to_numpy().T

Load the HSM/ID, HSM/D, and PSSM predictions and determine which of these models has the highest AUC for each domain

In [2]:
# Domains to evaluate and the locations of each model's predictions.
domains = ["Kinase_TK", "PDZ", "PTB", "PTP", "SH2", "SH3", "WH1", "WW"]
pssm_dir = "../pssm/output"
hsmid_dir = "../training/hsmid"
hsmd_dir = "../training/hsmd/rand_round2"
names = ['HSM/ID', 'HSM/D', 'PSSM']
# number of CV folds, note all splits the same for HSM/D, HSM/ID, and PSSM (same seeds used)
nfolds = 8

# All four dicts are keyed by domain; the per-domain lists are index-aligned
# with one another (same model order as `names`).
prediction_data = {}
aucs = {}
best_model_ndx = {}
model_names = {}
for domain in domains:
    # Load in the same order as `names`: HSM/ID, HSM/D, PSSM.
    hsmid_preds = load_results('%s/%s' % (hsmid_dir, domain), domain, nfolds)
    hsmd_preds = load_results(hsmd_dir, domain, nfolds)
    pssm_preds = load_pssm_predictions(pssm_dir, domain)
    prediction_data[domain] = [hsmid_preds, hsmd_preds, pssm_preds]
    model_names[domain] = list(names)

    # AUCs to determine best model for each domain
    # (row 0 of each array is used as the labels, row 1 as the scores).
    domain_aucs = []
    for preds in prediction_data[domain]:
        domain_aucs.append(roc_auc_score(preds[0, :], preds[1, :]))
    aucs[domain] = domain_aucs

    # Index of the highest AUC; the first one wins on ties.
    ndx, auc = max(enumerate(domain_aucs), key=lambda pair: pair[1])
    best_model_ndx[domain] = ndx

Kinase_TK 1663636440.2506108.metadata.json
0 17029
Kinase_TK 1663651666.0026891.metadata.json
1 17029
Kinase_TK 1663668084.4151826.metadata.json
2 17029
Kinase_TK 1663637098.666917.metadata.json
3 17029
Kinase_TK 1663653186.274473.metadata.json
4 17029
Kinase_TK 1663633871.6396828.metadata.json
5 17029
Kinase_TK 1663649974.8725417.metadata.json
6 17028
Kinase_TK 1663665586.7061486.metadata.json
7 17028
Kinase_TK 1670298134.5520897.mean.metadata.json
0 17029
Kinase_TK 1670301365.679557.mean.metadata.json
1 17029
Kinase_TK 1670316529.4122164.mean.metadata.json
2 17029
Kinase_TK 1670841924.6732035.mean.metadata.json
3 17029
Kinase_TK 1670840950.3112288.mean.metadata.json
4 17029
Kinase_TK 1670301830.2165387.mean.metadata.json
5 17029
Kinase_TK 1670300339.888463.mean.metadata.json
6 17028
Kinase_TK 1670608643.2700768.mean.metadata.json
7 17028
PDZ 1668398108.8696058.metadata.json
0 63048
PDZ 1668410571.0893161.metadata.json
1 63048
PDZ 1668423324.8478434.metadata.json
2 63048
PDZ 166844571

Load external model predictions

In [3]:
# for loading external models
def reorder_data(df, validation_chunk, n_folds=8, seed=0):
    """Return the rows of ``df`` that fall in one chunk of a seeded shuffle.

    The row positions ``0..len(df)-1`` are permuted with a generator seeded by
    ``seed``, split into ``n_folds`` nearly equal chunks with
    ``np.array_split``, and the rows of chunk ``validation_chunk`` are
    returned in permuted order.

    Args:
        df: DataFrame to select rows from.
        validation_chunk: index of the chunk to return.
        n_folds: number of chunks the permutation is split into.
        seed: seed for the permutation.

    Returns:
        A copy of the selected rows of ``df``.
    """
    # A private RandomState yields exactly the same permutation as
    # np.random.seed(seed) followed by np.random.permutation(...), but does not
    # reset the global NumPy RNG as a side effect of every call.
    rng = np.random.RandomState(seed)
    randomized = rng.permutation(len(df))
    chunks = np.array_split(randomized, n_folds)
    vndxs = chunks[validation_chunk]

    return df.iloc[vndxs].copy()

def load_external_model_predictions(full_csv, preds_csv, predcolname, nfolds=8):
    """Load an external model's predictions aligned to the fold-ordered data.

    The full data set is re-ordered by concatenating the chunks produced by
    ``reorder_data`` for folds ``0..nfolds-1``. The external predictions are
    then left-joined onto it using every column of the full data as the key,
    so rows the external model made no prediction for get NaN in
    ``predcolname``.

    Args:
        full_csv: path to the CSV with the full data; must contain 'Bound'.
        preds_csv: path to the CSV with the external predictions; must contain
            every column of ``full_csv`` plus ``predcolname``.
        predcolname: name of the prediction column in ``preds_csv``.
        nfolds: number of folds used to re-order the full data.

    Returns:
        Tuple ``(data, auc)``: ``data`` is a (2, N) array whose row 0 is
        'Bound' and row 1 is ``predcolname``; ``auc`` is the ROC AUC computed
        on the rows of ``preds_csv`` only.
    """
    df = pd.read_csv(full_csv)
    # Forward nfolds so the number of chunks matches the number of folds being
    # concatenated; previously reorder_data always split into its default of 8,
    # which dropped rows (nfolds < 8) or raised IndexError (nfolds > 8).
    df = pd.concat([reorder_data(df, i, n_folds=nfolds) for i in range(nfolds)])

    preds_df = pd.read_csv(preds_csv)
    auc = roc_auc_score(preds_df['Bound'], preds_df[predcolname])
    colnames = list(df.columns)
    joined = df.join(preds_df.set_index(colnames), on=colnames)
    return joined[['Bound', predcolname]].to_numpy().T, auc


# location of full data, including those that external models cannot make predictions for
data_dir = "../data/data_without_processed_duplicates/raw_data"


def add_external_model(model_domains, model_dir, file_tag, predcolname, model_name):
    """Load one external model's predictions and register them per domain.

    For every domain in ``model_domains`` the predictions are read from
    ``<model_dir>/<domain>/<domain>_<file_tag>_predictions.csv`` and appended
    to ``prediction_data``, ``aucs`` and ``model_names`` for that domain.

    Args:
        model_domains: domains the external model has predictions for.
        model_dir: root directory of the external model's outputs.
        file_tag: model tag used in the prediction file name.
        predcolname: name of the prediction column in the predictions CSV.
        model_name: display name stored in ``model_names``.
    """
    for domain in model_domains:
        # Skip models that are already registered so that re-running this cell
        # does not append duplicate entries to the per-domain lists.
        if model_name in model_names[domain]:
            continue
        csv = os.path.join(model_dir, '%s/%s_%s_predictions.csv' % (domain, domain, file_tag))
        full = os.path.join(data_dir, '%s.csv' % domain)

        d, auc = load_external_model_predictions(full, csv, predcolname, nfolds=nfolds)
        prediction_data[domain].append(d)
        aucs[domain].append(auc)
        model_names[domain].append(model_name)


# NetPhorest models
netphor_dir = "../netphorest"
netphor_domains = ['PTB', 'Kinase_TK', 'SH2', 'PTP']
add_external_model(netphor_domains, netphor_dir, 'netphorest', 'NetPhorest Posterior', 'NetPhorest')

# PepInt models
pepint_dir = "../pepint"
pepint_domains = ['PDZ', 'SH2']
add_external_model(pepint_domains, pepint_dir, 'pepint', 'PepInt Score', 'PepInt')

Compute DeLong p-values comparing each model against the best model for each domain

In [4]:
# For each domain, compare every other model against the best model (chosen
# earlier by AUC) and print the resulting DeLong-test p-value.
for domain in domains:
    best_ndx = best_model_ndx[domain]
    best_name = model_names[domain][best_ndx]
    best_preds = prediction_data[domain][best_ndx]
    labels = best_preds[0,:]
    print("\n%s, best model: %s with AUC of %.5g" % (domain, best_name, aucs[domain][best_ndx]))
    for n, pred, auc in zip(model_names[domain], prediction_data[domain], aucs[domain]):
        if n == best_name: continue
        # Both models must have been evaluated on identically ordered labels.
        # The message previously indexed model_names (a dict keyed by domain)
        # with the integer best-model index, so a mismatch raised KeyError
        # instead of showing this message.
        assert np.array_equal(labels, pred[0,:]), "Ground truth labels for models %s %s do not match. Maybe the data was shuffled differently" % (best_name, n)
        p = calc_pval(labels, best_preds[1,:], pred[1,:])
        # A p-value of exactly 0 is reported as an upper bound instead:
        # np.nextafter(p, 1) is the next float after 0 towards 1.
        if p == 0: print_p = '<%.3g' % np.nextafter(p,1)
        else: print_p = '%.2g' % p
        print("p-value for %10s (AUC: %.5g) compared to %10s: %s" % (n, auc, best_name, print_p))


Kinase_TK, best model: HSM/ID with AUC of 0.97744
p-value for      HSM/D (AUC: 0.97649) compared to     HSM/ID: 4.9e-12
p-value for       PSSM (AUC: 0.56592) compared to     HSM/ID: <4.94e-324
p-value for NetPhorest (AUC: 0.70587) compared to     HSM/ID: <4.94e-324

PDZ, best model: HSM/ID with AUC of 0.98116
p-value for      HSM/D (AUC: 0.98011) compared to     HSM/ID: 0.0074
p-value for       PSSM (AUC: 0.85806) compared to     HSM/ID: <4.94e-324
p-value for     PepInt (AUC: 0.88326) compared to     HSM/ID: 1.1e-248

PTB, best model: HSM/D with AUC of 0.91337
p-value for     HSM/ID (AUC: 0.9082) compared to      HSM/D: 0.29
p-value for       PSSM (AUC: 0.86213) compared to      HSM/D: 9.8e-09
p-value for NetPhorest (AUC: 0.83075) compared to      HSM/D: 9.1e-06

PTP, best model: HSM/ID with AUC of 0.89162
p-value for      HSM/D (AUC: 0.88677) compared to     HSM/ID: 0.00013
p-value for       PSSM (AUC: 0.79458) compared to     HSM/ID: 1.7e-101
p-value for NetPhorest (AUC: 0.74605) c