In [1]:
# Numerical / data libraries.
import numpy as np
import pandas as pd
import scipy as sc

# reload re-imports utils in the simulation cell; time is used for progress timing.
from importlib import reload
from time import time
import matplotlib

# Project-local helpers (provides get_odds_ratio_cis, called in the simulation cell).
import utils

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to figures created after this point.
sns.set_style('whitegrid')

## Load data

In [2]:
# Load the per-residue table with PTM indicator columns plus ground-truth
# ('disordered' / 'ordered') and predicted ('pred_disordered') disorder labels.
# The file is produced by process-data-and-train-model.ipynb.
#
# index_col=0: the first CSV column is the row index written when the file was
# saved; reading it as the index keeps it from showing up as a spurious
# 'Unnamed: 0' data column.
# NOTE(review): assumes nothing downstream reads the 'Unnamed: 0' column or
# relies on positional column offsets -- confirm against utils.
ptm_idr_df = pd.read_csv('data/ptm_idr.csv', index_col=0)
ptm_idr_df

Unnamed: 0,protein_id,AA,position,ac,ac_reg,ga,gl,gl_reg,m,m_reg,...,nAA_28_180_pae_smooth5,nAA_28_180_pae_smooth10,nAA_28_180_pae_smooth15,nAA_28_180_pae_smooth20,nAA_28_180_pae_smooth25,nAA_28_180_pae_smooth30,nAA_28_180_pae_smooth35,disordered,ordered,pred_disordered
0,O00151,K,22,1,0,0,0,0,0,0,...,72.818182,77.428571,74.774194,74.951220,75.319149,76.500000,76.877193,0.0,1.0,0.022424
1,O00151,K,71,0,0,0,0,0,0,0,...,75.000000,77.809524,72.483871,67.268293,64.294118,61.786885,59.450704,0.0,1.0,0.057403
2,O00151,K,87,0,0,0,0,0,1,0,...,33.000000,41.095238,41.419355,43.170732,43.764706,43.491803,44.070423,0.0,1.0,0.245860
3,O00151,T,34,0,0,0,0,0,0,0,...,80.727273,78.380952,79.193548,80.560976,77.980392,77.754098,76.695652,0.0,1.0,0.017603
4,O00168,S,82,0,0,0,0,0,0,0,...,18.454545,16.761905,16.384615,16.806452,17.638889,19.756098,22.000000,0.0,1.0,0.433978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10797,Q9Y6W3,S,697,0,0,0,0,0,0,0,...,101.818182,105.047619,103.870968,113.097561,115.352941,124.655738,129.366197,0.0,1.0,0.006763
10798,Q9Y6W3,S,700,0,0,0,0,0,0,0,...,93.454545,98.857143,102.258065,110.756098,110.725490,121.901639,129.845070,0.0,1.0,0.006944
10799,Q9Y6W6,K,380,0,0,0,0,0,0,0,...,114.909091,129.333333,127.290323,123.536585,124.137255,128.213115,129.056338,0.0,1.0,0.004483
10800,Q9Y6Y9,Y,131,0,0,0,0,0,0,0,...,97.272727,85.666667,88.677419,94.707317,92.862745,87.200000,84.169231,0.0,1.0,0.016282


In [5]:
# PTM indicator columns of ptm_idr_df; a value of 1 marks a row carrying that PTM.
ptm_names = [
    'ac', 'ac_reg', 'ga', 'gl', 'gl_reg', 'm', 'm_reg',
    'p', 'p_reg', 'sm', 'sm_reg', 'ub', 'ub_reg',
]

# Number of rows flagged with each PTM, in the same order as ptm_names.
N_ptm = [int((ptm_idr_df[name] == 1).sum()) for name in ptm_names]

# Positions into ptm_names, ordered from the most to the least frequent PTM.
idx = np.argsort(N_ptm)[::-1]

print('Number of data points with the following PTMs:')
for i in idx:
    label, count = ptm_names[i], N_ptm[i]
    print('  {}: {}'.format(label, count))

Number of data points with the following PTMs:
  p: 6017
  ub: 3738
  ac: 1171
  p_reg: 663
  m: 442
  sm: 361
  ac_reg: 68
  ub_reg: 66
  sm_reg: 56
  ga: 47
  gl: 23
  m_reg: 11
  gl_reg: 3


## Construct confidence intervals on odds ratio

In [4]:
# Repeated-trial comparison of 'prediction-powered' and 'classical' intervals.
#
# For every (PTM, sample size n) setting, utils.get_odds_ratio_cis is called
# n_trial times. Each call returns, for each of mu1, mu0 and the odds ratio o,
# a reference value plus a prediction-powered interval and a classical interval.
# One row per (trial, estimator) is appended to `results`, recording interval
# bounds, width, and whether the interval contains the reference value.
# After each setting the cumulative table is checkpointed to `save_fname`
# (if `save`) and the mean odds-ratio width / coverage for that setting is printed.
#
# NOTE(review): no random seed is set in this cell. If get_odds_ratio_cis draws
# random subsamples (not visible here), results will differ between runs --
# confirm and seed in utils or before this loop.

reload(utils)  # pick up edits to utils.py without restarting the kernel

N = len(ptm_idr_df)
# 20 log-spaced sample sizes from 200 up to N, truncated to integers.
ns = np.logspace(np.log10(200), np.log10(N), num=20, base=10).astype(int)
n_trial = 500          # trials per (PTM, n) setting
ptm_names = ['p', 'ub', 'ac', 'p_reg', 'ac_reg', 'ub_reg']
alpha = 0.1            # passed to get_odds_ratio_cis; presumably the error level -- confirm in utils
grid_spacing = 1e-3    # passed through to get_odds_ratio_cis
print_every = 100      # progress message every this many trials
save = True
save_fname = 'iid-results-010922.csv'

columns = [
    'ptm', 'n', 'estimator',
    'mu1-lower', 'mu1-upper', 'mu1-coverage', 'mu1-width',
    'mu0-lower', 'mu0-upper', 'mu0-coverage', 'mu0-width',
    'o-lower', 'o-upper', 'o-coverage', 'o-width'
]
results = []


def _covers(ci, value):
    """Return whether the interval ci = (lower, upper) contains value (bounds inclusive)."""
    return (ci[0] <= value) & (ci[1] >= value)


for ptm_name in ptm_names:
    for n in ns:

        t0 = time()
        print('PTM = {}, n = {}'.format(ptm_name, n))

        for t in range(n_trial):

            result = utils.get_odds_ratio_cis(
                ptm_idr_df, ptm_name, n, alpha, grid_spacing=grid_spacing, verbose=False
            )
            mu1, mu1_pp_ci, mu1_cl_ci, mu0, mu0_pp_ci, mu0_cl_ci, o, o_pp_ci, o_cl_ci = result

            # One row per estimator; within a row the (lower, upper, coverage,
            # width) group is repeated for mu1, mu0 and o, matching `columns`.
            per_estimator = (
                ('prediction-powered', (mu1_pp_ci, mu0_pp_ci, o_pp_ci)),
                ('classical', (mu1_cl_ci, mu0_cl_ci, o_cl_ci)),
            )
            for estimator, cis in per_estimator:
                row = [ptm_name, n, estimator]
                for ci, value in zip(cis, (mu1, mu0, o)):
                    row.extend([ci[0], ci[1], _covers(ci, value), ci[1] - ci[0]])
                results.append(row)

            if (t + 1) % print_every == 0:
                print('  Done with {} trials. {:d} s'.format(t + 1, int(time() - t0)))

        # Checkpoint everything gathered so far, so an interrupted run keeps
        # the settings that already finished.
        df = pd.DataFrame(results, columns=columns)
        if save:
            df.to_csv(save_fname)

        # Select this setting's rows once, then split by estimator.
        setting = df.loc[(df['ptm'] == ptm_name) & (df['n'] == n)]
        pp_rows = setting.loc[setting['estimator'] == 'prediction-powered']
        cl_rows = setting.loc[setting['estimator'] == 'classical']

        print('  Avg PP width: {:.2f}'.format(pp_rows['o-width'].mean()))
        print('  Avg classical width: {:.2f}'.format(cl_rows['o-width'].mean()))
        print('  PP coverage: {:.2f}'.format(pp_rows['o-coverage'].mean()))
        print('  Classical coverage: {:.2f}'.format(cl_rows['o-coverage'].mean()))
        print('{:d} s'.format(int(time() - t0)))

PTM = p, n = 200
  Done with 100 trials. 7 s
  Done with 200 trials. 14 s
  Done with 300 trials. 21 s
  Done with 400 trials. 28 s
  Done with 500 trials. 35 s
  Avg PP width: inf
  Avg classical width: 23.27
  PP coverage: 1.00
  Classical coverage: 1.00
35 s
PTM = p, n = 246
  Done with 100 trials. 7 s
  Done with 200 trials. 14 s
  Done with 300 trials. 22 s
  Done with 400 trials. 29 s
  Done with 500 trials. 36 s
  Avg PP width: 5.38
  Avg classical width: 10.05
  PP coverage: 1.00
  Classical coverage: 1.00
36 s
PTM = p, n = 304
  Done with 100 trials. 7 s
  Done with 200 trials. 15 s
  Done with 300 trials. 23 s
  Done with 400 trials. 31 s
  Done with 500 trials. 39 s
  Avg PP width: 3.92
  Avg classical width: 7.25
  PP coverage: 1.00
  Classical coverage: 1.00
39 s
PTM = p, n = 375
  Done with 100 trials. 8 s
  Done with 200 trials. 16 s
  Done with 300 trials. 25 s
  Done with 400 trials. 33 s
  Done with 500 trials. 41 s
  Avg PP width: 3.29
  Avg classical width: 6.12
  P

  theta * np.exp(np.cumsum(np.log(1 + lambdaplus_n * (x_n - m)))),


  Done with 100 trials. 43 s
  Done with 200 trials. 87 s
  Done with 300 trials. 131 s
  Done with 400 trials. 174 s
  Done with 500 trials. 218 s
  Avg PP width: 0.51
  Avg classical width: 0.91
  PP coverage: 1.00
  Classical coverage: 1.00
218 s
PTM = p, n = 8756
  Done with 100 trials. 54 s
  Done with 200 trials. 108 s
  Done with 300 trials. 163 s
  Done with 400 trials. 217 s
  Done with 500 trials. 271 s
  Avg PP width: 0.46
  Avg classical width: 0.82
  PP coverage: 1.00
  Classical coverage: 1.00
272 s
PTM = p, n = 10801
  Done with 100 trials. 66 s
  Done with 200 trials. 133 s
  Done with 300 trials. 201 s
  Done with 400 trials. 267 s
  Done with 500 trials. 334 s
  Avg PP width: 0.40
  Avg classical width: 0.73
  PP coverage: 1.00
  Classical coverage: 1.00
334 s
PTM = ub, n = 200
  Done with 100 trials. 6 s
  Done with 200 trials. 13 s
  Done with 300 trials. 19 s
  Done with 400 trials. 26 s
  Done with 500 trials. 33 s
  Avg PP width: 1.10
  Avg classical width: 1.73


  Done with 100 trials. 11 s
  Done with 200 trials. 22 s
  Done with 300 trials. 34 s
  Done with 400 trials. 45 s
  Done with 500 trials. 57 s
  Avg PP width: 1.06
  Avg classical width: 1.66
  PP coverage: 1.00
  Classical coverage: 0.99
57 s
PTM = ac, n = 1323
  Done with 100 trials. 13 s
  Done with 200 trials. 26 s
  Done with 300 trials. 39 s
  Done with 400 trials. 52 s
  Done with 500 trials. 65 s
  Avg PP width: 0.92
  Avg classical width: 1.47
  PP coverage: 0.99
  Classical coverage: 1.00
65 s
PTM = ac, n = 1632
  Done with 100 trials. 14 s
  Done with 200 trials. 29 s
  Done with 300 trials. 43 s
  Done with 400 trials. 58 s
  Done with 500 trials. 72 s
  Avg PP width: 0.81
  Avg classical width: 1.29
  PP coverage: 0.99
  Classical coverage: 0.99
73 s
PTM = ac, n = 2013
  Done with 100 trials. 16 s
  Done with 200 trials. 33 s
  Done with 300 trials. 49 s
  Done with 400 trials. 66 s
  Done with 500 trials. 82 s
  Avg PP width: 0.70
  Avg classical width: 1.11
  PP covera

  Avg PP width: 1.38
  Avg classical width: 2.59
  PP coverage: 1.00
  Classical coverage: 1.00
278 s
PTM = p_reg, n = 10801
  Done with 100 trials. 66 s
  Done with 200 trials. 134 s
  Done with 300 trials. 201 s
  Done with 400 trials. 268 s
  Done with 500 trials. 335 s
  Avg PP width: 1.23
  Avg classical width: 2.33
  PP coverage: 1.00
  Classical coverage: 1.00
336 s
PTM = ac_reg, n = 200
  Done with 100 trials. 8 s
  Done with 200 trials. 16 s
  Done with 300 trials. 24 s
  Done with 400 trials. 32 s
  Done with 500 trials. 41 s
  Avg PP width: inf
  Avg classical width: 52.01
  PP coverage: 1.00
  Classical coverage: 1.00
42 s
PTM = ac_reg, n = 246
  Done with 100 trials. 8 s
  Done with 200 trials. 16 s
  Done with 300 trials. 25 s
  Done with 400 trials. 33 s
  Done with 500 trials. 42 s
  Avg PP width: inf
  Avg classical width: 32.86
  PP coverage: 1.00
  Classical coverage: 1.00
43 s
PTM = ac_reg, n = 304
  Done with 100 trials. 8 s
  Done with 200 trials. 17 s
  Done with

  Done with 100 trials. 14 s
  Done with 200 trials. 28 s
  Done with 300 trials. 42 s
  Done with 400 trials. 57 s
  Done with 500 trials. 71 s
  Avg PP width: 13.75
  Avg classical width: 20.00
  PP coverage: 1.00
  Classical coverage: 0.99
72 s
PTM = ub_reg, n = 1632
  Done with 100 trials. 15 s
  Done with 200 trials. 31 s
  Done with 300 trials. 47 s
  Done with 400 trials. 63 s
  Done with 500 trials. 79 s
  Avg PP width: 13.52
  Avg classical width: 29.22
  PP coverage: 1.00
  Classical coverage: 0.98
80 s
PTM = ub_reg, n = 2013
  Done with 100 trials. 17 s
  Done with 200 trials. 35 s
  Done with 300 trials. 53 s
  Done with 400 trials. 71 s
  Done with 500 trials. 89 s
  Avg PP width: 13.80
  Avg classical width: 19.89
  PP coverage: 1.00
  Classical coverage: 0.98
90 s
PTM = ub_reg, n = 2484
  Done with 100 trials. 20 s
  Done with 200 trials. 40 s
  Done with 300 trials. 60 s
  Done with 400 trials. 80 s
  Done with 500 trials. 101 s
  Avg PP width: 13.20
  Avg classical wid