In [1]:
# This notebook maps PDB structures to the MSAs of orthologous kinase proteins.

In [2]:
import pandas as pd
import pyranges as pr

In [3]:
# Load data
# Sift dataset: gzip-compressed TSV whose first line is skipped; column names are lower-cased.
# Segments whose begin and end coincide (single observed residues) are discarded.
sift = (
    pd.read_csv(
        '../datasets/uniprot_segments_observed.tsv.gz',
        sep='\t',
        compression='gzip',
        skiprows=1,
    )
    .rename(columns=str.lower)
    .loc[lambda seg: seg['sp_beg'] != seg['sp_end']]
)

# KD and CRE region coordinates, combined into one row per shared uniprot_acc;
# overlapping column names receive the "_kd" / "_cre" suffixes.
kd = pd.read_csv('../datasets/kd.tsv', sep='\t')
cre = pd.read_csv('../datasets/cre.tsv', sep='\t')
regions = pd.merge(kd, cre, on='uniprot_acc', suffixes=["_kd", "_cre"])

In [4]:
# Unique (protein, pdb, begin, end) segments in UniProt sequence coordinates,
# ordered by UniProt accession.
sift_ = (
    sift.loc[:, ['sp_primary', 'pdb', 'sp_beg', 'sp_end']]
    .drop_duplicates()
    .sort_values("sp_primary")
)
sift_

Unnamed: 0,sp_primary,pdb,sp_beg,sp_end
37784,A0A003,6kvc,13,48
37751,A0A003,6kv9,13,49
37752,A0A003,6kv9,59,320
37785,A0A003,6kvc,59,321
112426,A0A009I821,7ryg,1,109
...,...,...,...,...
204109,X7YCN8,5vba,16,46
204111,X7YCN8,5vba,107,170
204110,X7YCN8,5vba,51,104
78144,X8CHM4,5o5k,210,425


In [5]:
# Collapse the per-segment rows into a single range per (protein, PDB) pair:
# the smallest segment begin and the largest segment end.
per_pdb = sift_.groupby(['sp_primary', 'pdb'])
# Lowest begin coordinate of each PDB
pdb_min = per_pdb[['sp_beg']].min().reset_index()
# Highest end coordinate of each PDB
pdb_max = per_pdb[['sp_end']].max().reset_index()
# Both tables share exactly the two grouping columns, which are the join keys
pdb_coord = pdb_min.merge(pdb_max, on=['sp_primary', 'pdb'])

In [6]:
# Regions "global range": one start/end per row that spans both the KD and the CRE.
# start = smallest of the four KD/CRE coordinates, end = largest of them.
#
# The bounds are computed row-wise on the whole frame instead of writing them one
# element at a time with `regions_gl['start'][i] = ...`: that chained assignment
# triggers pandas' SettingWithCopyWarning, is not guaranteed to modify the frame,
# and never modifies it when Copy-on-Write is enabled.
coord_cols = ['start_cre', 'end_cre', 'start_kd', 'end_kd']
regions_gl = regions[['uniprot_acc', 'term_id_kd', 'term_id_cre']].copy()
# regions_gl keeps the index of `regions`, so the row-wise results align by label
regions_gl['start'] = regions[coord_cols].min(axis=1)
regions_gl['end'] = regions[coord_cols].max(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":


In [7]:
# Sanity check: number of distinct values in each column of the combined regions table
regions_gl.nunique() # Ok

uniprot_acc    280
term_id_kd     286
term_id_cre    286
start          135
end            169
dtype: int64

In [8]:
# Prepare the regions table for PyRanges: the UniProt accession becomes the
# 'Chromosome' column and every other column name is capitalised
# (start -> Start, end -> End, term_id_kd -> Term_id_kd, ...).
regions_gl.columns = [
    'Chromosome' if col == 'uniprot_acc' else col.capitalize()
    for col in regions_gl.columns
]
# Region length, counting both end points
regions_gl["length"] = regions_gl["End"] - regions_gl["Start"] + 1
# Regions as a pyranges object
regions_gl_pr = pr.PyRanges(regions_gl)

# Per-PDB coordinate ranges as a pyranges object, using the same column naming
pdb_pr = pr.PyRanges(
    pdb_coord.rename(
        columns=dict(sp_primary='Chromosome', sp_beg='Start', sp_end='End')
    )
)

In [9]:
# # With intersect the returned intervals are the intersection 
# # of the overlapping intervals in self and other. All overlaps are returned by default
# regions_pdb_both = regions_gl_pr.intersect(pdb_pr)
# regions_pdb_both = regions_pdb_both.as_df()
# regions_pdb_both.columns = regions_pdb_both.columns.str.lower()
# regions_pdb_both.rename(columns= {'chromosome': 'uniprot'}, inplace= True)
# regions_pdb_both

In [10]:
# Join every region with every PDB range of the same protein that touches it.
# slack=1 includes sequence edges; columns coming from the PDB table that clash
# with region columns get the "_pdb" suffix. Column names are lower-cased and
# 'chromosome' is renamed back to 'uniprot'.
regions_pdb_both = (
    regions_gl_pr
    .join(pdb_pr, slack=1, suffix="_pdb", report_overlap=True)
    .as_df()
    .rename(columns=str.lower)
    .rename(columns={'chromosome': 'uniprot'})
)
# `length` is defined as end - start + 1 (both end points counted), and later cells
# test `length == overlap`. Recompute the overlap explicitly in that same inclusive
# convention so the comparison does not depend on how PyRanges derives its Overlap
# column.
# NOTE(review): PyRanges documents its intervals as half-open and the join above
# uses slack=1, so the library-reported value may differ by one from the inclusive
# residue count for some rows - confirm against the installed pyranges version.
regions_pdb_both['overlap'] = (
    regions_pdb_both[['end', 'end_pdb']].min(axis=1)
    - regions_pdb_both[['start', 'start_pdb']].max(axis=1)
    + 1
)
regions_pdb_both

Unnamed: 0,uniprot,term_id_kd,term_id_cre,start,end,length,pdb,start_pdb,end_pdb,overlap
0,A5K0N4,kd10,cre10,1,791,791,5f0a,1,846,791
1,A5K0N4,kd10,cre10,1,791,791,5dzc,1,843,791
2,A5K0N4,kd10,cre10,1,791,791,5dyl,1,816,791
3,A5K0N4,kd10,cre10,1,791,791,4rz7,1,815,791
4,A5K0N4,kd10,cre10,1,791,791,5fet,2,816,790
...,...,...,...,...,...,...,...,...,...,...
2114,Q99683,kd247,cre247,1,938,938,6e2o,670,939,269
2115,Q99683,kd247,cre247,1,938,938,4bf2,671,940,268
2116,Q99683,kd247,cre247,1,938,938,6oyw,671,940,268
2117,Q99683,kd247,cre247,1,938,938,6xih,671,940,268


In [11]:
# The pyranges objects are no longer needed once the join result is a DataFrame
del pdb_pr, regions_gl_pr

In [16]:
# Distinct values per column after joining the regions with the PDB ranges
regions_pdb_both.nunique() # 102 out of 280 proteins have 1904 PDB structures

uniprot         102
term_id_kd      105
term_id_cre     108
start            75
end              88
length           86
pdb            1904
start_pdb       300
end_pdb         357
overlap         213
dtype: int64

In [13]:
# How many PDBs cover both regions completely (overlap equals the region length)?
regions_pdb_both.loc[
    regions_pdb_both['length'] == regions_pdb_both['overlap'], 'pdb'
].nunique()

77

In [14]:
# How many PDBs cover the regions only partially (overlap differs from the region length)?
regions_pdb_both.loc[
    regions_pdb_both['length'] != regions_pdb_both['overlap'], 'pdb'
].nunique()

1895

In [14]:
# Number of distinct proteins that have at least one joined PDB range
regions_pdb_both['uniprot'].nunique()

102

In [15]:
# Number of distinct PDB identifiers in the joined table
regions_pdb_both['pdb'].nunique()

1904

In [32]:
# Table relating MSAs to proteins, used to tell which MSA each PDB belongs to.
# Only MSAs whose name ends in "60" are kept (the 60% identity set).
all_msa = (
    pd.read_csv('../datasets/all_msas.tsv', sep='\t')
    .loc[lambda msas: msas['msa'].str.endswith("60")]
)

In [27]:
# MSA <-> uniprot mapping: the MSA table without its length column
to_add = all_msa.drop(labels='length_msa', axis='columns')

In [45]:
# Attach the MSA name(s) to every region/PDB row. No `on=` is passed, so pandas
# joins on all columns the two frames have in common. how='left' keeps rows whose
# protein has no MSA (their `msa` is NaN), and a protein that appears in several
# MSAs produces one output row per MSA.
msa_regions_pdbs = regions_pdb_both.merge(to_add, how= 'left') #left_on='uniprot', right_on= 'uniprot'

In [46]:
# Which columns contain missing values after the left merge
msa_regions_pdbs.isna().any()

uniprot        False
term_id_kd     False
term_id_cre    False
start          False
end            False
length         False
pdb            False
start_pdb      False
end_pdb        False
overlap        False
msa             True
dtype: bool

In [47]:
# Number of distinct proteins that were left without an MSA by the merge
msa_regions_pdbs.loc[msa_regions_pdbs['msa'].isna(), 'uniprot'].nunique()

3

In [48]:
# Column names of the merged table
msa_regions_pdbs.columns

Index(['uniprot', 'term_id_kd', 'term_id_cre', 'start', 'end', 'length', 'pdb',
       'start_pdb', 'end_pdb', 'overlap', 'msa'],
      dtype='object')

In [49]:
# Discard rows containing any missing value, then put `msa` first followed by the
# remaining columns in their existing order. The row index is left as it is.
msa_regions_pdbs = msa_regions_pdbs.dropna()[
    [
        'msa',
        'uniprot', 'term_id_kd', 'term_id_cre',
        'start', 'end', 'length',
        'pdb', 'start_pdb', 'end_pdb',
        'overlap',
    ]
]
msa_regions_pdbs

Unnamed: 0,msa,uniprot,term_id_kd,term_id_cre,start,end,length,pdb,start_pdb,end_pdb,overlap
0,Q8I719_60,A5K0N4,kd10,cre10,1,791,791,5f0a,1,846,791
1,A5K0N4_60,A5K0N4,kd10,cre10,1,791,791,5f0a,1,846,791
2,Q8I719_60,A5K0N4,kd10,cre10,1,791,791,5dzc,1,843,791
3,A5K0N4_60,A5K0N4,kd10,cre10,1,791,791,5dzc,1,843,791
4,Q8I719_60,A5K0N4,kd10,cre10,1,791,791,5dyl,1,816,791
...,...,...,...,...,...,...,...,...,...,...,...
2757,Q99683_60,Q99683,kd247,cre247,1,938,938,6e2o,670,939,269
2758,Q99683_60,Q99683,kd247,cre247,1,938,938,4bf2,671,940,268
2759,Q99683_60,Q99683,kd247,cre247,1,938,938,6oyw,671,940,268
2760,Q99683_60,Q99683,kd247,cre247,1,938,938,6xih,671,940,268


In [50]:
#msa_regions_pdbs.to_csv('../datasets/msa_regions_pdbs.tsv', sep= '\t', index= False)