In [1]:
from pathlib import Path

import pandas as pd

# protein table  
## must contain the cols: *id_protein, uniprot_acc, hgnc_id, gene_id, gene_name, sequence, length, disorder_content*

In [38]:
# Load the protein table and discard its `organism` column.
proteins = (
    pd.read_csv('../datasets/llps_human_all_proteins.csv')
    .drop(columns='organism')
)

In [39]:
# Remove the "-nan-" entry.
# NOTE: .iloc is positional while .drop takes an index label; the two point at
# the same row only while `proteins` keeps the default 0..n-1 index from read_csv.
proteins.iloc[4266]  # not the last expression of the cell, so nothing is displayed
proteins = proteins.drop(index=4266)

In [37]:
# MobiDB disorder content, with columns renamed to the names used in the protein table
mobidb_columns = {'uniprot': 'uniprot_acc', 'dc': 'disorder_content'}
mobidb = pd.read_csv('dc_mobidb_lite.csv')
mobidb = mobidb.rename(columns=mobidb_columns)

In [41]:
# Canonical sequences from UniProt for each protein
seqs = pd.read_csv('sequences.csv').rename(columns={'uniprot': 'uniprot_acc'})

## The uniprot accession 'Q8NEP4' changed to 'O43236' (2020-12-02 Merged into O43236)  
Here, we will keep Q8NEP4 because this entry didn't change in drLLPS (which is the source of this protein).

In [42]:
# UniProt: 2020-12-02 Merged into O43236.
proteins.loc[proteins['uniprot_acc'] == 'Q8NEP4']

Unnamed: 0,uniprot_acc,mlo,rol,db,hgnc_id,gene_name,approved_name,gene_id
4352,Q8NEP4,postsynaptic density,client,drllps,,,,


In [45]:
# Show the sequence row that is filed under O43236; `proteins` lists the same
# protein as Q8NEP4, which is the accession we keep.
# The row is looked up by accession instead of by the hard-coded position 4282,
# so the check stays valid if the row order of sequences.csv changes. The
# previous cell also evaluated a filter whose result was never displayed.
seqs.loc[seqs['uniprot_acc'] == 'O43236'].iloc[0]

uniprot_acc                                               O43236
sequence       MDRSLGWQGNSVPEDRTEAGIKRFLEDTTDDGELSKFVKDFSGNAS...
Name: 4282, dtype: object

In [46]:
# Relabel the O43236 sequence as Q8NEP4 so it joins with `proteins`.
# Only the accession column is touched; a frame-wide replace would also scan
# every sequence string for a value equal to 'O43236'.
seqs.loc[seqs['uniprot_acc'] == 'O43236', 'uniprot_acc'] = 'Q8NEP4'

In [47]:
# Confirm the relabelled row: look it up by accession rather than by the
# hard-coded position 4282, which silently shows a different protein if the
# row order of `seqs` ever changes.
seqs.loc[seqs['uniprot_acc'] == 'Q8NEP4'].iloc[0]

uniprot_acc                                               Q8NEP4
sequence       MDRSLGWQGNSVPEDRTEAGIKRFLEDTTDDGELSKFVKDFSGNAS...
Name: 4282, dtype: object

In [48]:
# Attach each protein's sequence (left join keeps every protein row).
# The join key is spelled out so the merge cannot silently pick up additional
# shared columns, and validate= fails fast if `seqs` holds more than one row
# per accession, which would otherwise duplicate protein rows.
proteins = proteins.merge(seqs, how='left', on='uniprot_acc', validate='many_to_one')

# Name the accessions without a sequence instead of failing inside len()
# with an opaque "object of type 'float' has no len()".
missing_seq = proteins.loc[proteins['sequence'].isna(), 'uniprot_acc']
assert missing_seq.empty, f"No sequence found for: {missing_seq.tolist()}"

# Sequence length = number of characters in the sequence string
proteins['length'] = proteins['sequence'].str.len()

In [49]:
# Preview the first rows with the new `sequence` and `length` columns
proteins.head(5)

Unnamed: 0,uniprot_acc,mlo,rol,db,hgnc_id,gene_name,approved_name,gene_id,sequence,length
0,A0A024RBG1,null_phasepdb_ht,regulator,phasepdb_ht,HGNC:18012,NUDT4B,nudix hydrolase 4B,440672.0,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,181
1,A0A096LP55,stress granule,regulator,drllps,HGNC:51714,UQCRHL,ubiquinol-cytochrome c reductase hinge protein...,440567.0,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...,91
2,A0A0U1RRE5,p-body,"component, client","drllps, phasepdb_uniprot",HGNC:50713,NBDY,negative regulator of P-body association,550643.0,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...,68
3,A0AV96,"stress granule, p-body, stress granule","component, client","drllps, phasepdb_ht",HGNC:30358,RBM47,RNA binding motif protein 47,54502.0,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,593
4,A0FGR8,"postsynaptic density, nucleolus",client,drllps,HGNC:22211,ESYT2,extended synaptotagmin 2,57488.0,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...,921


In [551]:
# Don't forget: separate table for MLOs
#mlo = proteins.copy()

## Add disorder_content col

In [50]:
# Number of distinct accessions in `mobidb` (1 entry = 1 protein)
mobidb['uniprot_acc'].nunique(dropna=False)

2655

In [51]:
# Add disorder_content; proteins that are absent from `mobidb` keep NaN (left join).
# The join key is explicit, and validate= raises if an accession appears more
# than once in `mobidb`, which would otherwise silently multiply protein rows.
proteins = proteins.merge(mobidb, how='left', on='uniprot_acc', validate='many_to_one')

In [52]:
# Columns of `proteins` that are kept for the protein table
protein_columns = [
    'uniprot_acc',
    'hgnc_id',
    'gene_id',
    'gene_name',
    'sequence',
    'length',
    'disorder_content',
]
protein = proteins.loc[:, protein_columns].copy()

In [54]:
# Preview the first rows of the protein table
protein.head(5)

Unnamed: 0,uniprot_acc,hgnc_id,gene_id,gene_name,sequence,length,disorder_content
0,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,181,
1,A0A096LP55,HGNC:51714,440567.0,UQCRHL,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...,91,0.33
2,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...,68,1.0
3,A0AV96,HGNC:30358,54502.0,RBM47,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,593,0.042
4,A0FGR8,HGNC:22211,57488.0,ESYT2,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...,921,0.215


In [56]:
# col id_protein INT: consecutive ids 1..N in the current row order.
# The length is taken from `protein` itself (the frame being assigned to),
# not from `proteins`, so the range always matches the number of rows.
protein['id_protein'] = range(1, len(protein) + 1)

In [57]:
# Reorder the columns so that id_protein comes first
column_order = [
    'id_protein',
    'uniprot_acc',
    'hgnc_id',
    'gene_id',
    'gene_name',
    'length',
    'sequence',
    'disorder_content',
]
protein = protein[column_order]

In [59]:
# Print column dtypes and non-null counts of the table that is exported below
protein.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4368 entries, 0 to 4367
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id_protein        4368 non-null   int32  
 1   uniprot_acc       4368 non-null   object 
 2   hgnc_id           4283 non-null   object 
 3   gene_id           4280 non-null   float64
 4   gene_name         4283 non-null   object 
 5   length            4368 non-null   int64  
 6   sequence          4368 non-null   object 
 7   disorder_content  2655 non-null   float64
dtypes: float64(2), int32(1), int64(1), object(4)
memory usage: 290.1+ KB


In [62]:
#import os
#os.mkdir('db_tables')

In [64]:
# Export the protein table; the file is named according to our db schema.
out_dir = Path('db_tables')
out_dir.mkdir(exist_ok=True)  # do not rely on the folder having been created by hand

# gene_id is float64 only because some rows have no value. Cast it to the
# nullable integer dtype so ids are written as "440672" instead of "440672.0";
# rows without a gene_id are still written as an empty field.
protein.astype({'gene_id': 'Int64'}).to_csv(out_dir / 'protein.tsv', sep='\t', index=False)
# upload to https://github.com/alvaro-2/mutations