In [1]:
from time import time
from importlib import reload

import numpy as np
import pandas as pd
import scipy as sc

import utils
from structuremap.processing import get_smooth_score

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

## Load and merge ground-truth labels of IDRs with Alphafold-based structural accessibility scores

`idr-groundtruth.csv`, which contains per-residue ground-truth disordered/ordered labels, is equivalent to the "groundtruth_data" dataframe computed
in the following notebook from [Isabell Bludau et al. (2022), PLOS Biology](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3001636):
https://github.com/MannLabs/structuremap_analysis/blob/master/IDR_benchmark.ipynb

`accessibility_df`, which contains per-residue structural accessibility scores based on Alphafold-predicted structures, is equivalent to the "accessibility_df" dataframe computed in the same notebook.

`rsa_df` contains per-residue relative solvent-accessible surface area (RSA) based on Alphafold-predicted structures and is equivalent to the "RSA_data" dataframe.

In [6]:
# Per-residue ground-truth labels: one row per (protein_id, position) with
# `disordered` / `ordered` flag columns (see the displayed table below).
idr_df = pd.read_csv('idr-groundtruth.csv')
idr_df

Unnamed: 0,protein_id,position,disordered,ordered
0,P49913,134,1.0,0.0
1,P49913,135,1.0,0.0
2,P49913,136,1.0,0.0
3,P49913,137,1.0,0.0
4,P49913,138,1.0,0.0
...,...,...,...,...
485034,P30519,232,0.0,1.0
485035,P30519,233,0.0,1.0
485036,P30519,234,0.0,1.0
485037,P30519,235,0.0,1.0


In [7]:
# The accessibility score, termed prediction-aware part-sphere exposure (pPSE) in the
# main paper (Bludau et al.), is precomputed for a range of hyperparameter settings:
# 12, 16, 20, 24, and 28-angstrom radii around each amino acid.
# Columns are named nAA_<radius>_180_<pae|nopae>.
# NOTE(review): "pae"/"nopae" presumably mean with/without AlphaFold's predicted
# aligned error -- confirm against the structuremap documentation.
accessibility_df = pd.read_csv('accessibility_df.csv')
accessibility_df

Unnamed: 0,protein_id,AA,position,nAA_12_180_nopae,nAA_12_180_pae,nAA_16_180_nopae,nAA_16_180_pae,nAA_20_180_nopae,nAA_20_180_pae,nAA_24_180_nopae,nAA_24_180_pae,nAA_28_180_nopae,nAA_28_180_pae
0,A1A4S6,M,1,5,2,16,3,34,3,52,7,78,11
1,A1A4S6,G,2,9,3,29,9,48,17,72,28,93,41
2,A1A4S6,L,3,20,10,36,21,60,36,83,56,101,72
3,A1A4S6,Q,4,19,10,36,24,52,39,76,51,95,73
4,A1A4S6,P,5,23,12,42,32,58,44,81,59,98,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1622132,Q9ZWM9,I,340,6,4,8,6,10,8,11,9,14,10
1622133,Q9ZWM9,F,341,6,4,7,6,8,7,10,8,12,9
1622134,Q9ZWM9,H,342,5,4,6,5,7,6,9,7,11,8
1622135,Q9ZWM9,A,343,4,3,5,3,6,5,8,6,10,7


In [10]:
# Per-residue relative solvent-accessible surface area (RSA).
# NOTE(review): in the displayed rows neg_RSA equals 1 - RSA -- confirm this holds for the whole file.
rsa_df = pd.read_csv('RSA_data.csv')
rsa_df

Unnamed: 0,protein_id,AA,position,RSA,neg_RSA
0,Q9BUK0,M,1,1.000000,0.000000
1,Q9BUK0,P,2,0.529412,0.470588
2,Q9BUK0,S,3,0.469231,0.530769
3,Q9BUK0,V,4,0.774648,0.225352
4,Q9BUK0,T,5,0.626761,0.373239
...,...,...,...,...,...
1622132,P07273,R,305,0.629032,0.370968
1622133,P07273,W,306,0.251101,0.748899
1622134,P07273,K,307,0.746341,0.253659
1622135,P07273,F,308,0.573604,0.426396


## Smooth scores over sliding windows of residues

In [4]:
# get_smooth_score (called in the next cell) needs a dummy protein_number column,
# so number the proteins 0..N-1 following the sorted order of their protein_id.
protein_ids = sorted(accessibility_df['protein_id'].unique())
id2number = dict(zip(protein_ids, range(len(protein_ids))))
protein_number_col = accessibility_df['protein_id'].map(id2number).tolist()
accessibility_df['protein_number'] = protein_number_col
accessibility_df

Unnamed: 0,protein_id,AA,position,nAA_12_180_nopae,nAA_12_180_pae,nAA_16_180_nopae,nAA_16_180_pae,nAA_20_180_nopae,nAA_20_180_pae,nAA_24_180_nopae,nAA_24_180_pae,nAA_28_180_nopae,nAA_28_180_pae,protein_number
0,A1A4S6,M,1,5,2,16,3,34,3,52,7,78,11,0
1,A1A4S6,G,2,9,3,29,9,48,17,72,28,93,41,0
2,A1A4S6,L,3,20,10,36,21,60,36,83,56,101,72,0
3,A1A4S6,Q,4,19,10,36,24,52,39,76,51,95,73,0
4,A1A4S6,P,5,23,12,42,32,58,44,81,59,98,80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1622132,Q9ZWM9,I,340,6,4,8,6,10,8,11,9,14,10,3061
1622133,Q9ZWM9,F,341,6,4,7,6,8,7,10,8,12,9,3061
1622134,Q9ZWM9,H,342,5,4,6,5,7,6,9,7,11,8,3061
1622135,Q9ZWM9,A,343,4,3,5,3,6,5,8,6,10,7,3061


In [5]:
# Final hyperparameters selected in Bludau et al.: 24 A radius, 180 degrees,
# smoothing window of 10, and a pPSE threshold of 34.27.
# NOTE(review): the smoothed values saved further below are multiples of 1/21, which
# suggests 10 is a half-window (+/-10 residues) -- confirm against structuremap.
score_columns = np.array(['nAA_24_180_pae'])
smoothing_windows = [10]
smoothed_accessibility_df = get_smooth_score(accessibility_df, score_columns, smoothing_windows)

100%|████████████████████████████████████████████████████████████████████████████████████████| 3062/3062 [00:03<00:00, 965.11it/s]


In [6]:
# Attach the ground-truth disordered/ordered labels to the smoothed accessibility
# scores, keeping only residues present in both tables (inner join).
# The labels are loaded into `idr_df` from idr-groundtruth.csv earlier in this
# notebook; the name `groundtruth_data` used here before is never defined, so the
# cell failed with NameError on a fresh kernel (Restart & Run All).
alphafold_disorder_benchmark = smoothed_accessibility_df.merge(
    idr_df, how='inner', on=['protein_id', 'position'])


In [7]:
# Sanity check: total rows vs. number of disordered + ordered labels.
# The two printed numbers match when the label counts account for every row.
n_benchmark_rows = len(alphafold_disorder_benchmark)
n_disordered = (alphafold_disorder_benchmark['disordered'] == 1).sum()
n_ordered = (alphafold_disorder_benchmark['ordered'] == 1).sum()
print(n_benchmark_rows)
print(int(n_disordered + n_ordered))

479275
479275


## Intersect with PTM data

In [8]:
# PTM site annotations: one row per (protein_id, AA, position) with integer flag
# columns per modification type and matching `*_reg` columns (see table below).
# NOTE(review): presumably exported from PhosphoSitePlus, going by the file name -- confirm provenance.
ptm_df = pd.read_csv('phosphositeplus_annotation.csv')
ptm_df

Unnamed: 0,protein_id,AA,position,ac,ac_reg,ga,gl,gl_reg,m,m_reg,p,p_reg,sm,sm_reg,ub,ub_reg
0,A0A024R5B6,K,43,0,0,0,0,0,0,0,0,0,0,0,1,1
1,A0A024RBG1,K,128,0,0,0,0,0,0,0,0,0,0,0,1,0
2,A0A024RBG1,K,134,0,0,0,0,0,0,0,0,0,0,0,1,0
3,A0A024RBG1,K,143,0,0,0,0,0,0,0,0,0,0,0,1,0
4,A0A024RBG1,K,5,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369206,Q9Y6Z7,K,97,1,0,0,0,0,0,0,0,0,0,0,0,0
369207,Q9Y6Z7,S,141,0,0,0,0,0,0,0,1,0,0,0,0,0
369208,Q9Y6Z7,T,155,0,0,0,0,0,0,0,1,0,0,0,0,0
369209,Q9YNA8,S,465,0,0,0,0,0,0,0,1,0,0,0,0,0


In [9]:
# Keep only PTM sites that also appear in the disorder benchmark; matching on AA as
# well as protein_id/position requires the residue letter to agree in both tables.
ptm_idr_merged = ptm_df.merge(
    alphafold_disorder_benchmark,
    how='inner',
    on=['protein_id', 'position', 'AA'],
)
print(len(ptm_idr_merged))  # row count before the label filter below

# Replace missing values with 0, then retain rows labelled ordered or disordered.
ptm_idr_df = ptm_idr_merged.fillna(0)
has_label = ptm_idr_df['ordered'].eq(1) | ptm_idr_df['disordered'].eq(1)
ptm_idr_df = ptm_idr_df[has_label]

10813


In [10]:
# Inspect the merged schema: PTM flags, raw and smoothed accessibility scores, and labels.
ptm_idr_df.columns

Index(['protein_id', 'AA', 'position', 'ac', 'ac_reg', 'ga', 'gl', 'gl_reg',
       'm', 'm_reg', 'p', 'p_reg', 'sm', 'sm_reg', 'ub', 'ub_reg',
       'nAA_12_180_nopae', 'nAA_12_180_pae', 'nAA_16_180_nopae',
       'nAA_16_180_pae', 'nAA_20_180_nopae', 'nAA_20_180_pae',
       'nAA_24_180_nopae', 'nAA_24_180_pae', 'nAA_28_180_nopae',
       'nAA_28_180_pae', 'protein_number', 'nAA_24_180_pae_smooth10',
       'disordered', 'ordered'],
      dtype='object')

In [11]:
# pPSE cutoff selected in Bludau et al.: a residue whose smoothed 24 A / 180 degree
# score is at or below this value is predicted to be disordered.
thresh = 34.27
# `.assign` builds a new frame rather than writing a column into the boolean-filtered
# frame produced by the previous cell; this avoids pandas' SettingWithCopyWarning
# and keeps the cell idempotent on re-run.
ptm_idr_df = ptm_idr_df.assign(
    pred_disordered=(ptm_idr_df['nAA_24_180_pae_smooth10'] <= thresh).astype(int)
)
# Number of PTM sites predicted to be disordered (vectorised sum, shown as a plain int).
int(ptm_idr_df['pred_disordered'].sum())

2355

In [12]:
# Number of distinct protein sequences (protein_id values) in the intersection.
ptm_idr_df['protein_id'].nunique(dropna=False)

1368

In [14]:
# Persist the merged PTM/IDR table (without the index) so it can be reloaded below.
ptm_idr_df.to_csv('ptm_idr.csv', index=False)

In [15]:
# LOAD PRECOMPUTED
# Reads back the ptm_idr.csv file written by the cell above.
# NOTE(review): presumably the entry point for skipping the steps above when the
# file already exists -- confirm.
ptm_idr_df = pd.read_csv('ptm_idr.csv')
ptm_idr_df

Unnamed: 0,protein_id,AA,position,ac,ac_reg,ga,gl,gl_reg,m,m_reg,...,nAA_20_180_pae,nAA_24_180_nopae,nAA_24_180_pae,nAA_28_180_nopae,nAA_28_180_pae,protein_number,nAA_24_180_pae_smooth10,disordered,ordered,pred_disordered
0,O00151,K,22,1,0,0,0,0,0,0,...,37,81,48,110,64,20,65.904762,0.0,1.0,0
1,O00151,K,71,0,0,0,0,0,0,0,...,52,80,74,87,80,20,68.523810,0.0,1.0,0
2,O00151,K,87,0,0,0,0,0,1,0,...,8,98,12,139,19,20,34.238095,0.0,1.0,1
3,O00151,T,34,0,0,0,0,0,0,0,...,41,94,63,113,82,20,65.380952,0.0,1.0,0
4,O00168,S,82,0,0,0,0,0,0,0,...,14,45,16,56,19,21,13.761905,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10808,Q9Y6W3,S,697,0,0,0,0,0,0,0,...,53,82,70,109,84,3046,81.285714,0.0,1.0,0
10809,Q9Y6W3,S,700,0,0,0,0,0,0,0,...,57,91,78,112,92,3046,78.333333,0.0,1.0,0
10810,Q9Y6W6,K,380,0,0,0,0,0,0,0,...,35,112,55,185,79,3047,101.571429,0.0,1.0,0
10811,Q9Y6Y9,Y,131,0,0,0,0,0,0,0,...,63,103,85,120,107,3048,68.238095,0.0,1.0,0
