In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

First, check whether the UniProt protein IDs overlap between the disordered, ordered, and PTM annotation tables.

In [8]:
# File names of the three annotation tables (disordered, ordered, PTM).
dis_fname, ord_fname, ptm_fname = (
    'disordered_data_annotation.csv',
    'ordered_data_annotation.csv',
    'phosphositeplus_annotation.csv',
)

# Read each table into its own DataFrame, in the same order as the file names.
dis_df, ord_df, ptm_df = (
    pd.read_csv(fname) for fname in (dis_fname, ord_fname, ptm_fname)
)

In [10]:
# Display the disordered annotation table.
dis_df

Unnamed: 0,protein_id,AA,position,disordered
0,P49913,L,134,1
1,P49913,L,135,1
2,P49913,G,136,1
3,P49913,D,137,1
4,P49913,F,138,1
...,...,...,...,...
27468,P16471,A,593,1
27469,P16471,N,594,1
27470,P16471,F,595,1
27471,P16471,T,596,1


In [9]:
# Display the PTM annotation table.
ptm_df

Unnamed: 0,protein_id,AA,position,ac,ac_reg,ga,gl,gl_reg,m,m_reg,p,p_reg,sm,sm_reg,ub,ub_reg
0,A0A024R5B6,K,43,0,0,0,0,0,0,0,0,0,0,0,1,1
1,A0A024RBG1,K,128,0,0,0,0,0,0,0,0,0,0,0,1,0
2,A0A024RBG1,K,134,0,0,0,0,0,0,0,0,0,0,0,1,0
3,A0A024RBG1,K,143,0,0,0,0,0,0,0,0,0,0,0,1,0
4,A0A024RBG1,K,5,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369206,Q9Y6Z7,K,97,1,0,0,0,0,0,0,0,0,0,0,0,0
369207,Q9Y6Z7,S,141,0,0,0,0,0,0,0,1,0,0,0,0,0
369208,Q9Y6Z7,T,155,0,0,0,0,0,0,0,1,0,0,0,0,0
369209,Q9YNA8,S,465,0,0,0,0,0,0,0,1,0,0,0,0,0


In [17]:
# Collect the distinct protein IDs found in each annotation table.
ptm_ids, dis_ids, ord_ids = (
    set(table.protein_id.unique()) for table in (ptm_df, dis_df, ord_df)
)

# Report how many distinct IDs each table contains.
print(
    'Unique PTM IDs: {}.\nUnique disordered IDs: {}.\nUnique ordered IDs: {}.'.format(
        *map(len, (ptm_ids, dis_ids, ord_ids))
    )
)

Unique PTM IDs: 20938.
Unique disordered IDs: 232.
Unique ordered IDs: 1471.


In [21]:
# Pairwise overlaps between the three protein-ID sets.
ptm_dis_ids = ptm_ids & dis_ids
dis_ord_ids = dis_ids & ord_ids
ord_ptm_ids = ord_ids & ptm_ids
# PTM IDs that also occur in the disordered or the ordered set.
ptm_dis_or_ord_ids = ptm_ids & (dis_ids | ord_ids)

# Print the size of each overlap.
print(
    'PTM-disordered IDs: {}.\ndisordered-ordered IDs: {}.\nordered-PTM IDs: {}\nPTM-[disordered U ordered] IDs: {}.'.format(
        *map(len, (ptm_dis_ids, dis_ord_ids, ord_ptm_ids, ptm_dis_or_ord_ids))
    )
)

PTM-disordered IDs: 227.
disordered-ordered IDs: 63.
ordered-PTM IDs: 1417
PTM-[disordered U ordered] IDs: 1582.


## Set pPSA hyperparameters based on non-overlapping IDR benchmark data

First, follow much of the preprocessing in Bludau's `IDR_benchmark.ipynb` notebook
to build the `groundtruth_data` dataframe.

In [2]:
from structuremap.processing import format_alphafold_data, annotate_accessibility, get_smooth_score  # HERE

ModuleNotFoundError: No module named 'structuremap'

In [9]:
def extract_region_boundaries(df: pd.DataFrame) -> pd.DataFrame:
    """Split "start-end" boundary strings into numeric start/end columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a "UniProt accession" column and a "UniProt boundaries"
        column whose values are strings of the form "<start>-<end>".

    Returns
    -------
    pd.DataFrame
        A new frame with the columns "protein_id" (the accession), "start"
        and "end" (both converted with ``pd.to_numeric``), on the same index
        as ``df``. The input frame is left unmodified.

    Raises
    ------
    IndexError
        If a boundary string contains no "-".
    ValueError
        If a start or end token cannot be parsed as a number.
    """
    # Tokenise every boundary once; element 0 is the start, element 1 the end.
    pieces = [bounds.split('-') for bounds in df["UniProt boundaries"]]
    start = pd.to_numeric([piece[0] for piece in pieces])
    end = pd.to_numeric([piece[1] for piece in pieces])

    # Build a fresh frame instead of writing columns into the caller's frame
    # (or into a slice of it, which triggers pandas' chained-assignment warning).
    return (
        df[["UniProt accession"]]
        .rename(columns={"UniProt accession": "protein_id"})
        .assign(start=start, end=end)
    )

def format_region_boundaries(prot, start, end):
    """Expand one region into a table with one row per position.

    Parameters
    ----------
    prot
        Protein identifier repeated on every row.
    start, end
        Region limits passed to ``np.arange(start, end)``; the generated
        positions therefore run from ``start`` up to but not including ``end``.

    Returns
    -------
    pd.DataFrame or None
        Frame with the columns "protein_id", "position" and "structure"
        (always 1), or ``None`` when the table cannot be built (for example
        when a limit is NaN).
    """
    # NOTE(review): the `end` position itself is excluded; if the boundaries
    # are meant to be inclusive this drops the last residue of each region —
    # confirm against the data source before changing.
    try:
        position = np.arange(start, end)
        n_positions = len(position)
        res = pd.DataFrame({
            "protein_id": np.repeat(prot, n_positions),
            "position": position,
            "structure": np.repeat(1, n_positions),
        })
    except Exception:
        # Best-effort: an unusable region yields None so the caller can skip
        # it. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        res = None
    return res

def get_disorder_annotation(df: pd.DataFrame) -> pd.DataFrame:
    """Expand every region of ``df`` into per-position rows.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns "protein_id", "start" and "end"; each row is
        handed to ``format_region_boundaries``.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-region tables (columns "protein_id",
        "position", "structure"). Regions for which
        ``format_region_boundaries`` returns ``None`` are skipped. When there
        is nothing to concatenate (empty input or no usable region) an empty
        frame with those three columns is returned.
    """
    # Iterate the three columns directly rather than relying on
    # DataFrame.apply(axis=1) to carry DataFrame objects as row results.
    per_region = [
        format_region_boundaries(prot, start, end)
        for prot, start, end in zip(df['protein_id'], df['start'], df['end'])
    ]
    usable = [table for table in per_region if table is not None]
    if not usable:
        # pd.concat raises on an empty list; return a well-formed empty table.
        return pd.DataFrame(columns=["protein_id", "position", "structure"])
    return pd.concat(usable)

In [19]:
# Read the semicolon-separated region table and parse its boundary strings.
disordered_data = extract_region_boundaries(
    pd.read_csv('disordered_regions.csv', sep=";")
)
print(disordered_data.iloc[:3])

# Expand each region into per-position rows and rename the flag column.
disordered_data_annotation = get_disorder_annotation(df=disordered_data).rename(
    columns={"structure": "disordered"}
)
print(disordered_data_annotation.iloc[:3])

  protein_id  start  end
0     P49913    134  170
1     P27695      1   43
2     Q64693      1  256
  protein_id  position  disordered
0     P49913       134           1
1     P49913       135           1
2     P49913       136           1


In [20]:
# Read the semicolon-separated structure table and parse its boundary strings.
ordered_data = extract_region_boundaries(
    pd.read_csv('ordered_structures.csv', sep=";")
)
print(ordered_data.iloc[:3])

# Expand each region into per-position rows and rename the flag column.
ordered_data_annotation = get_disorder_annotation(df=ordered_data).rename(
    columns={"structure": "ordered"}
)
print(ordered_data_annotation.iloc[:3])

  protein_id  start  end
0     P00918      3  260
1     P61626     19  148
2     Q42449      2  131
  protein_id  position  ordered
0     P00918         3        1
1     P00918         4        1
2     P00918         5        1


In [21]:
# Outer-join the two per-position tables so positions from either one are
# kept; a flag missing on one side is filled with 0.
groundtruth_data = (
    disordered_data_annotation
    .merge(ordered_data_annotation, how='outer', on=['protein_id', 'position'])
    .fillna(0)
)

In [22]:
# Keep only rows whose "disordered" and "ordered" flags differ, then renumber
# the rows from 0.
groundtruth_data = (
    groundtruth_data
    .loc[lambda gt: gt.disordered.ne(gt.ordered)]
    .reset_index(drop=True)
)

In [23]:
# Preview the first three rows of the ground-truth table.
groundtruth_data[0:3]

Unnamed: 0,protein_id,position,disordered,ordered
0,P49913,134,1.0,0.0
1,P49913,135,1.0,0.0
2,P49913,136,1.0,0.0


In [24]:
# Write the ground-truth table to CSV without the row index.
groundtruth_data.to_csv('idr-benchmark-groundtruth.csv', index=False)

In [25]:
# Reload the ground-truth table from 'idr-benchmark-groundtruth.csv' and display it.
groundtruth_data = pd.read_csv('idr-benchmark-groundtruth.csv')
groundtruth_data

Unnamed: 0,protein_id,position,disordered,ordered
0,P49913,134,1.0,0.0
1,P49913,135,1.0,0.0
2,P49913,136,1.0,0.0
3,P49913,137,1.0,0.0
4,P49913,138,1.0,0.0
...,...,...,...,...
485034,P30519,232,0.0,1.0
485035,P30519,233,0.0,1.0
485036,P30519,234,0.0,1.0
485037,P30519,235,0.0,1.0


Load the pPSA, RSA, and IUPred2 predictions that were computed and formatted in `IDR_benchmark.ipynb`.

In [5]:
# Read the accessibility table from CSV and display it.
accessibility_df = pd.read_csv('accessibility_df.csv')
accessibility_df

Unnamed: 0,protein_id,AA,position,nAA_12_180_nopae,nAA_12_180_pae,nAA_16_180_nopae,nAA_16_180_pae,nAA_20_180_nopae,nAA_20_180_pae,nAA_24_180_nopae,nAA_24_180_pae,nAA_28_180_nopae,nAA_28_180_pae
0,A1A4S6,M,1,5,2,16,3,34,3,52,7,78,11
1,A1A4S6,G,2,9,3,29,9,48,17,72,28,93,41
2,A1A4S6,L,3,20,10,36,21,60,36,83,56,101,72
3,A1A4S6,Q,4,19,10,36,24,52,39,76,51,95,73
4,A1A4S6,P,5,23,12,42,32,58,44,81,59,98,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1622132,Q9ZWM9,I,340,6,4,8,6,10,8,11,9,14,10
1622133,Q9ZWM9,F,341,6,4,7,6,8,7,10,8,12,9
1622134,Q9ZWM9,H,342,5,4,6,5,7,6,9,7,11,8
1622135,Q9ZWM9,A,343,4,3,5,3,6,5,8,6,10,7


In [6]:
# Read the RSA table from CSV and display it.
RSA_data = pd.read_csv('RSA_data.csv')
RSA_data

Unnamed: 0,protein_id,AA,position,RSA,neg_RSA
0,Q9BUK0,M,1,1.000000,0.000000
1,Q9BUK0,P,2,0.529412,0.470588
2,Q9BUK0,S,3,0.469231,0.530769
3,Q9BUK0,V,4,0.774648,0.225352
4,Q9BUK0,T,5,0.626761,0.373239
...,...,...,...,...,...
1622132,P07273,R,305,0.629032,0.370968
1622133,P07273,W,306,0.251101,0.748899
1622134,P07273,K,307,0.746341,0.253659
1622135,P07273,F,308,0.573604,0.426396


In [7]:
# Read the IUPred2 table from CSV and display it.
iupred2_data = pd.read_csv('iupred2_df.csv')
iupred2_data

Unnamed: 0,protein_id,AA,position,iupred2,neg_iupred2
0,P53220,M,1,0.250310,0.749690
1,P53220,S,2,0.155435,0.844565
2,P53220,S,3,0.196879,0.803121
3,P53220,S,4,0.236433,0.763567
4,P53220,L,5,0.284882,0.715118
...,...,...,...,...,...
1724887,O09053,K,1397,0.494003,0.505997
1724888,O09053,G,1398,0.480142,0.519858
1724889,O09053,L,1399,0.465241,0.534759
1724890,O09053,F,1400,0.444081,0.555919


Merge the prediction dataframes with the ground-truth dataframe.

In [None]:
# Left-join the ground-truth flags onto the prediction table, keyed on
# protein and position, then replace every NaN in the result with 0.
# NOTE(review): `benchmark_acc_rsa_iupred2_smooth` is not defined in the cells
# shown here — confirm it is created before this cell on a fresh kernel.
# NOTE(review): fillna(0) applies to all columns of the merged frame, not only
# to "disordered"/"ordered" — confirm that is intended.
alphafold_disorder_benchmark = (
    benchmark_acc_rsa_iupred2_smooth
    .merge(groundtruth_data, how='left', on=['protein_id', 'position'])
    .fillna(0)
)