In [65]:
import pandas as pd
import numpy as np

In [66]:
#pip install pyranges

In [67]:
import pyranges as pr

In [68]:
# all LLPS human proteins dataset (Orti et al.)
proteins = pd.read_csv('../datasets/all_proteins_table.csv')

In [69]:
mutations = pd.read_csv('../datasets/clinvar_all_proteins_mutations.csv.gz', compression='gzip')

In [70]:
disorder = pd.read_csv('disorder_lite.csv')
low_complexity = pd.read_csv('low_complexity.csv')
pfam = pd.read_csv('pfam.csv')

---  
## Proteins  
cols: id_protein, uniprot_acc, hgnc_id, gene_id, gene_name, sequence, length

In [71]:
proteins.head()

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name,length,sequence
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,2,A0A096LP55,HGNC:51714,440567.0,UQCRHL,4369,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...
4,5,A0FGR8,HGNC:22211,57488.0,ESYT2,4369,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...


In [72]:
# VER
#proteins.gene_id = proteins.gene_id.fillna(-1).apply(int)
#proteins.hgnc_id = proteins.hgnc_id.fillna('-')
#proteins.gene_name = proteins.gene_name.fillna('-')

In [73]:
proteins

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name,length,sequence
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,2,A0A096LP55,HGNC:51714,440567.0,UQCRHL,4369,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...
4,5,A0FGR8,HGNC:22211,57488.0,ESYT2,4369,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...
...,...,...,...,...,...,...,...
4364,4365,Q9UNL7,,,,4369,KLDVEEPDSANSSFYSTRSAPASQASLRATSSTQSLARLGSPDYGN...
4365,4366,Q9Y4C0,HGNC:8010,9369.0,NRXN3,4369,MSSTLHSVFFTLKVSILLGSLLGLCLGLEFMGLPNQWARYLRWDAS...
4366,4367,Q9Y649,,,,4369,MNDLICFLDNTFKNNVLSQAWWCVHLVPTIWEAEAGGSLEPRSLKL...
4367,4368,R4GMX3,,,,4369,MELSESVQKGFQMLADPRSFDSNAFTLLLRAAFQSLLDAQADEAVL...


## Load Domains and regions tables

In [74]:
pfam.rename(columns={'uniprot': 'uniprot_acc'}, inplace= True)

In [75]:
# Add and unique integer ID fow low_complexity and disorder
low_complexity['id_lc'] = range(1, len(low_complexity)+1)
disorder['id_idr'] = range(1, len(disorder)+1)

---  
## Consequences and Sources tables

In [76]:
cf = mutations.consequence.value_counts()
cf

missense       132500
frameshit       18891
nonsense        11265
deletion         2610
insertion        1103
delins            763
duplication       619
Name: consequence, dtype: int64

In [77]:
Consequences = pd.DataFrame({'id_consequence': range(1, len(cf)+1), 'consequence': cf.index})
Consequences

Unnamed: 0,id_consequence,consequence
0,1,missense
1,2,frameshit
2,3,nonsense
3,4,deletion
4,5,insertion
5,6,delins
6,7,duplication


In [78]:
mutations.source.value_counts()

clinvar    167751
Name: source, dtype: int64

In [79]:
Sources = pd.DataFrame({'id_source': [1,2,3], 'source': ['clinvar', 'disgenet', 'uniprot']})
Sources

Unnamed: 0,id_source,source
0,1,clinvar
1,2,disgenet
2,3,uniprot


In [80]:
Consequences.to_csv('db_tables/Consequences.tsv', sep='\t', index = False)

In [81]:
Sources.to_csv('db_tables/Sources.tsv', sep='\t', index = False)

---  
## Mutations table  
cols: id_mutation, snp_id, chromosome, start_genomic, end_genomic, start_aa, end_aa, from_aa, to_aa, Proteins_id_protein, Sources_id_source, Consequences_id_consequence

In [82]:
mutations[~mutations.end_aa.isnull()][['start_aa',	'end_aa',	'from',	'to',	'consequence',	'source']]

Unnamed: 0,start_aa,end_aa,from,to,consequence,source
5,1755,1757.0,LeuThr,,deletion,clinvar
13,23,24.0,GlyGlu,,deletion,clinvar
14,47,54.0,GlyArg,,deletion,clinvar
16,295,298.0,AspLeu,,deletion,clinvar
18,116,124.0,GluVal,,deletion,clinvar
...,...,...,...,...,...,...
23978,1,2.0,MetGly,Ala,insertion,clinvar
23979,517,518.0,IleLeu,Ter,insertion,clinvar
23981,229,230.0,GlyPro,,insertion,clinvar
23984,1171,1172.0,AspGlu,,insertion,clinvar


In [83]:
mutations.columns

Index(['uniprot_acc', 'organism', 'mlo', 'rol', 'db', 'hgnc_id', 'gene_name',
       'approved_name', 'gene_id', 'geneid', 'genesymbol', 'snpid', 'alleleid',
       'chromosomeaccession', 'chromosome', 'start', 'stop', 'type', 'name',
       'origin', 'phenotypeids', 'phenotypelist', 'otherids', 'nuccore_id',
       'cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence', 'source'],
      dtype='object')

In [84]:
Mutations = mutations[['uniprot_acc', 'snpid', 'chromosome', 'start', 'stop', 'start_aa', 'end_aa', 'from', 'to', 'consequence', 'source']].copy()
Mutations.rename(columns={'snpid': 'snp_id', 'start': 'start_genomic', 'stop': 'end_genomic', 'from': 'from_aa', 'to': 'to_aa'}, inplace= True)

In [85]:
# Add an unique ID for each mutation, type INT
Mutations['id_mutation'] = range(1, len(mutations)+1)

In [86]:
# Aquellas mutaciones que tienen NaN en end sustituir por el start
Mutations.end_aa = Mutations.end_aa.fillna(value= Mutations.start_aa).apply(int)

In [87]:
Mutations

Unnamed: 0,uniprot_acc,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,consequence,source,id_mutation
0,A6H8Y1,879255413,5,71512291,71512293,1371,1371,Arg,,deletion,clinvar,1
1,A6NHR9,886043345,18,2700757,2700759,497,497,Lys,,deletion,clinvar,2
2,A6NHR9,886044914,18,2697985,2697987,430,430,His,,deletion,clinvar,3
3,A6NHR9,1598342592,18,2705766,2705768,639,639,Gly,,deletion,clinvar,4
4,A6NHR9,1598293848,18,2673343,2673345,164,164,Arg,,deletion,clinvar,5
...,...,...,...,...,...,...,...,...,...,...,...,...
167746,Q9ULB1,1064795493,2,51028155,51028155,40,40,Trp,Ter,nonsense,clinvar,167747
167747,Q9ULB1,1553759318,2,50538311,50538311,695,695,Trp,Ter,nonsense,clinvar,167748
167748,Q9ULB1,1201575289,2,50055006,50055006,1253,1253,Arg,Ter,nonsense,clinvar,167749
167749,Q9ULB1,-1,2,49922240,49922240,1410,1410,Arg,Ter,nonsense,clinvar,167750


In [88]:
id_proteins = proteins[['id_protein', 'uniprot_acc']].copy()

In [89]:
# Add IDs from protein, consequences and sources
Mutations = Mutations.merge(id_proteins)
Mutations = Mutations.merge(Consequences)
Mutations = Mutations.merge(Sources)
Mutations.rename(columns= {'id_protein': 'Proteins_id_protein',
                            'id_source': 'Sources_id_source',
                            'id_consequence': 'Consequences_id_consequence'}, inplace= True)

Mutations.drop(columns=['uniprot_acc', 'consequence', 'source'], inplace= True)

In [90]:
Mutations.sort_values('id_mutation')

Unnamed: 0,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_mutation,Proteins_id_protein,Consequences_id_consequence,Sources_id_source
0,879255413,5,71512291,71512293,1371,1371,Arg,,1,19,4,1
1,886043345,18,2700757,2700759,497,497,Lys,,2,25,4,1
2,886044914,18,2697985,2697987,430,430,His,,3,25,4,1
3,1598342592,18,2705766,2705768,639,639,Gly,,4,25,4,1
4,1598293848,18,2673343,2673345,164,164,Arg,,5,25,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
164603,1064795493,2,51028155,51028155,40,40,Trp,Ter,167747,4363,3,1
164604,1553759318,2,50538311,50538311,695,695,Trp,Ter,167748,4363,3,1
164605,1201575289,2,50055006,50055006,1253,1253,Arg,Ter,167749,4363,3,1
164606,-1,2,49922240,49922240,1410,1410,Arg,Ter,167750,4363,3,1


In [91]:
Mutations.to_csv('db_tables/Mutations.tsv', sep='\t', index = False)

---

In [92]:
len(mutations.uniprot_acc.unique()) # analizar esto

2414

In [93]:
len(proteins.uniprot_acc.unique())

4369

In [94]:
mutations[mutations.uniprot_acc.isin(proteins.uniprot_acc)] # esto es, todas las mutaciones de clinvar tienen alguna proteina de llps

Unnamed: 0,uniprot_acc,organism,mlo,rol,db,hgnc_id,gene_name,approved_name,gene_id,geneid,...,phenotypelist,otherids,nuccore_id,cambio,start_aa,end_aa,from,to,consequence,source
0,A6H8Y1,Homo sapiens,"stress granule, p-body",regulator,drllps,HGNC:13652,BDP1,"B double prime 1, subunit of RNA polymerase II...",55814.0,55814,...,not specified,ClinGen:CA3296626,NM_018429.3,Arg1371del,1371,,Arg,,deletion,clinvar
1,A6NHR9,Homo sapiens,nucleolus,"component, client","drllps, phasepdb_ht",HGNC:29090,SMCHD1,structural maintenance of chromosomes flexible...,23347.0,23347,...,not provided,ClinGen:CA10605406,NM_015295.2,Lys497del,497,,Lys,,deletion,clinvar
2,A6NHR9,Homo sapiens,nucleolus,"component, client","drllps, phasepdb_ht",HGNC:29090,SMCHD1,structural maintenance of chromosomes flexible...,23347.0,23347,...,Scapulohumeral muscular dystrophy,ClinGen:CA10607099,NM_015295.2,His430del,430,,His,,deletion,clinvar
3,A6NHR9,Homo sapiens,nucleolus,"component, client","drllps, phasepdb_ht",HGNC:29090,SMCHD1,structural maintenance of chromosomes flexible...,23347.0,23347,...,not provided,-,NM_015295.3,Gly639del,639,,Gly,,deletion,clinvar
4,A6NHR9,Homo sapiens,nucleolus,"component, client","drllps, phasepdb_ht",HGNC:29090,SMCHD1,structural maintenance of chromosomes flexible...,23347.0,23347,...,not provided,-,NM_015295.3,Arg164del,164,,Arg,,deletion,clinvar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167746,Q9ULB1,Homo sapiens,postsynaptic density,client,drllps,HGNC:8008,NRXN1,,9378.0,9378,...,not provided,ClinGen:CA16617739,NM_001330078.2,Trp40Ter,40,,Trp,Ter,nonsense,clinvar
167747,Q9ULB1,Homo sapiens,postsynaptic density,client,drllps,HGNC:8008,NRXN1,,9378.0,9378,...,not provided,ClinGen:CA346770191,NM_001330078.2,Trp695Ter,695,,Trp,Ter,nonsense,clinvar
167748,Q9ULB1,Homo sapiens,postsynaptic density,client,drllps,HGNC:8008,NRXN1,,9378.0,9378,...,not provided,ClinGen:CA346820132,NM_001330078.2,Arg1253Ter,1253,,Arg,Ter,nonsense,clinvar
167749,Q9ULB1,Homo sapiens,postsynaptic density,client,drllps,HGNC:8008,NRXN1,,9378.0,9378,...,Pitt-Hopkins-like syndrome 2,-,NM_001330078.2,Arg1410Ter,1410,,Arg,Ter,nonsense,clinvar


In [95]:
proteins[~proteins.uniprot_acc.isin(mutations.uniprot_acc)]

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name,length,sequence
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...
4,5,A0FGR8,HGNC:22211,57488.0,ESYT2,4369,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...
5,6,A0FGR9,HGNC:24295,83850.0,ESYT3,4369,MRAEEPCAPGAPSALGAQRTPGPELRLSSQLLPELCTFVVRVLFYL...
...,...,...,...,...,...,...,...
4363,4364,Q9UN81,,,,4369,MGKKQNRKTGNSKTQSASPPPKERSSSPATEQSWMENDFDELREEG...
4364,4365,Q9UNL7,,,,4369,KLDVEEPDSANSSFYSTRSAPASQASLRATSSTQSLARLGSPDYGN...
4366,4367,Q9Y649,,,,4369,MNDLICFLDNTFKNNVLSQAWWCVHLVPTIWEAEAGGSLEPRSLKL...
4367,4368,R4GMX3,,,,4369,MELSESVQKGFQMLADPRSFDSNAFTLLLRAAFQSLLDAQADEAVL...


In [96]:
# proteinas del dataset de llps que no mergearon con clinvar
proteins[~proteins.uniprot_acc.isin(mutations.uniprot_acc)][:20]

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name,length,sequence
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...
4,5,A0FGR8,HGNC:22211,57488.0,ESYT2,4369,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...
5,6,A0FGR9,HGNC:24295,83850.0,ESYT3,4369,MRAEEPCAPGAPSALGAQRTPGPELRLSSQLLPELCTFVVRVLFYL...
6,7,A0MZ66,HGNC:29319,57698.0,SHTN1,4369,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...
7,8,A1KXE4,HGNC:27016,130074.0,FAM168B,4369,MNPVYSPGSSGVPYANAKGIGYPAGFPMGYAAAAPAYSPNMYPGAN...
9,10,A1L020,HGNC:33482,92312.0,MEX3A,4369,MPSLVVSGIMERNGGFGELGCFGGSAKDRGLLEDERALQLALDQLC...
10,11,A1L3X0,HGNC:26292,79993.0,ELOVL7,4369,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...
12,13,A2A3K4,HGNC:30184,138639.0,PTPDC1,4369,MAAGVLPQNEQPYSTLVNNSECVANMKGNLERPTPKYTKVGERLRH...


In [97]:
(mutations[mutations.consequence == 'missense'])[['start_aa',	'end_aa',	'from',	'to',	'consequence']]

Unnamed: 0,start_aa,end_aa,from,to,consequence
23986,53,,Tyr,Cys,missense
23987,981,,Met,Val,missense
23988,1133,,Ala,Val,missense
23989,478,,Ala,Val,missense
23990,232,,Gln,Glu,missense
...,...,...,...,...,...
156481,566,,Leu,Pro,missense
156482,439,,Lys,Glu,missense
156483,1481,,Arg,Gln,missense
156484,203,,Gly,Asp,missense


---
## Para asignar los rangos debo tener:  
- Tabla de mutaciones con id_mutation, *id_protein(Chromosome), start_aa(Start), end_aa(End)*  
- Tablas de lc, idr y pfam con id unico 

---  
# Pfam Tables

In [98]:
pfam.head()

Unnamed: 0,uniprot_acc,tipo,start,end
0,O94910,7tm_2,857,1093
1,Q9HAR2,7tm_2,861,1097
2,O14514,7tm_2,944,1180
3,O75899,7tm_3,475,743
4,Q9NZH0,7tm_3,49,291


## pfam_domains  
cols: pfam_id, pfam_domain, por ej: PF00003 7tm_3

In [99]:
pf_domain = pfam.tipo.unique() # unique pfam domains

In [100]:
pfam_domains = pd.DataFrame({'pfam_domain': pf_domain, 'pfam_id': range(1, len(pf_domain)+1)})  # luego cambiar el id por los PF000...
pfam_domains

Unnamed: 0,pfam_domain,pfam_id
0,7tm_2,1
1,7tm_3,2
2,ATP-synt_ab,3
3,GTP_EFTU,4
4,HLH,5
...,...,...
2934,PhoLip_ATPase_C,2935
2935,HIP1_clath_bdg,2936
2936,DAO_C,2937
2937,Armet,2938


In [101]:
pfam_domains.to_csv('db_tables/pfam_domains.tsv', sep='\t', index= False)

## Proteins_has_pfam_domains  
cols: Proteins_id_protein, pfam_domains_pfam_id, start, end, length

In [102]:
# Mapping uniprot_acc with its unique INT id
id_proteins = proteins[['id_protein','uniprot_acc']].copy()

In [103]:
Proteins_has_pfam_domains = pfam.merge(id_proteins) # agregar col id_protein
Proteins_has_pfam_domains['length'] = Proteins_has_pfam_domains.end - Proteins_has_pfam_domains.start + 1 # col length

In [104]:
Proteins_has_pfam_domains.rename(columns= {'tipo': 'pfam_domain'}, inplace= True)
Proteins_has_pfam_domains = Proteins_has_pfam_domains.merge(pfam_domains) # para agregar col pfam_id
Proteins_has_pfam_domains.rename(columns= {'id_protein': 'Proteins_id_protein', 'pfam_id': 'pfam_domains_pfam_id'}, inplace= True)

In [105]:
Proteins_has_pfam_domains = Proteins_has_pfam_domains[['Proteins_id_protein', 'pfam_domains_pfam_id', 'start', 'end', 'length']].sort_values('Proteins_id_protein')
Proteins_has_pfam_domains.head()

Unnamed: 0,Proteins_id_protein,pfam_domains_pfam_id,start,end,length
6140,1,171,17,144,128
9114,2,814,28,91,64
627,4,50,153,220,68
626,4,50,248,312,65
625,4,50,73,141,69


In [106]:
Proteins_has_pfam_domains.to_csv('db_tables/Proteins_has_pfam_domains.tsv', sep='\t', index= False)

## Mutations_has_Proteins_has_pfam_domains  
cols: Mutations_id_mutation, Proteins_has_pfam_domains_Proteins_id_protein, Proteins_has_pfam_domains_pfam_domains_pfam_id

## Pyranges  
columnas obligatorias: *Chromosome	 Start	End*  
Chromosome: id_protein    
otras columnas con ids son opcionales y cualquier nombre  
  
por ejemplo df seria tabla de mutaciones  
df = pr.PyRanges(df.rename(columns={'chromosome':'Chromosome','start_position':'Start','end_position':'End'}))  
  
df = pyrange de mutaciones (columnas: Chromosome, Start, End, id_mutacion)  
low_c = pyrange de low complexity(columnas: Chromosome, Start, End, id_low, id_proteina)  
data = df.join(low_c, strandedness=False, slack=1).drop(like="_b") # mutaciones lo junto con low_complex  
strandedness=False no tener en cuenta el Strand  
slack=1 coincidir los extremos. Importante  
drop(like="_b") eliminar el Chromosome, Start, End de low_c (en pfam no hacer el drop)  
data = data.df[[Chromosome, Start, End, id_mutacion, id_low, id_proteina]] # pasa de pyrange a dataframe

In [107]:
Mutations.head()

Unnamed: 0,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_mutation,Proteins_id_protein,Consequences_id_consequence,Sources_id_source
0,879255413,5,71512291,71512293,1371,1371,Arg,,1,19,4,1
1,886043345,18,2700757,2700759,497,497,Lys,,2,25,4,1
2,886044914,18,2697985,2697987,430,430,His,,3,25,4,1
3,1598342592,18,2705766,2705768,639,639,Gly,,4,25,4,1
4,1598293848,18,2673343,2673345,164,164,Arg,,5,25,4,1


In [108]:
# df has pfam domains data
df = pfam.rename(columns={'tipo': 'pfam_domain'}).merge(pfam_domains)
df = df.merge(id_proteins)                      # mapping uniprot_acc - id_protein
df.drop(columns='uniprot_acc', inplace= True)

In [109]:
df.head()

Unnamed: 0,pfam_domain,start,end,pfam_id,id_protein
0,7tm_2,857,1093,1,477
1,GPS,800,844,690,477
2,Gal_Lectin,48,128,745,477
3,OLF,144,396,773,477
4,Latrophilin,1113,1474,818,477


In [110]:
df.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)

In [111]:
df.head()

Unnamed: 0,pfam_domain,Start,End,pfam_id,Chromosome
0,7tm_2,857,1093,1,477
1,GPS,800,844,690,477
2,Gal_Lectin,48,128,745,477
3,OLF,144,396,773,477
4,Latrophilin,1113,1474,818,477


In [112]:
# Create the pyranges object of pfam domains
df_py = pr.PyRanges(df)

In [113]:
aux = Mutations[['start_aa', 'end_aa', 'id_mutation', 'Proteins_id_protein']].copy()
aux.rename(columns={'Proteins_id_protein': 'Chromosome', 'start_aa': 'Start', 'end_aa': 'End'}, inplace= True)

In [114]:
aux.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,1371,1371,1,19
1,497,497,2,25
2,430,430,3,25
3,639,639,4,25
4,164,164,5,25


In [115]:
# Pyranges object of mutations
aux_py = pr.PyRanges(aux)

In [116]:
# Join both pyranges object: this assings mutations to pfam domains
pfam_py = aux_py.join(df_py, strandedness= False, slack=1)  # strandedness= False doesnt take count of the chain strand; slack= 1 include bounds

In [117]:
pfam_py.head() # Start and End are from the mutation. Start_b and End_b are from the pfam domain

Unnamed: 0,Start,End,id_mutation,Chromosome,pfam_domain,Start_b,End_b,pfam_id
0,53,53,23987,2,UCR_hinge,28,91,814
1,305,305,3993,9,I-set,234,321,2914
2,981,981,23988,9,An_peroxidase,727,1272,947
3,1133,1133,23989,9,An_peroxidase,727,1272,947
4,478,478,23990,9,I-set,419,505,2914
5,1207,1207,23993,9,An_peroxidase,727,1272,947
6,1039,1039,23994,9,An_peroxidase,727,1272,947
7,559,559,23995,9,I-set,511,597,2914


In [118]:
# Pyranges to DataFrame
Mutations_has_Proteins_has_pfam_domains = pfam_py.df[['id_mutation', 'Chromosome', 'pfam_id']] # cols to keep

In [119]:
Mutations_has_Proteins_has_pfam_domains.rename(columns={'id_mutation': 'Mutations_id_mutation', 'Chromosome': 'Proteins_has_pfam_domains_Proteins_id_protein',
                                                        'pfam_id': 'Proteins_has_pfam_domains_pfam_domains_pfam_id'}, inplace= True)

In [120]:
Mutations_has_Proteins_has_pfam_domains.head()

Unnamed: 0,Mutations_id_mutation,Proteins_has_pfam_domains_Proteins_id_protein,Proteins_has_pfam_domains_pfam_domains_pfam_id
0,23987,2,814
1,3993,9,2914
2,23988,9,947
3,23989,9,947
4,23990,9,2914


In [121]:
# control
#Mutations[Mutations.id_mutation == 23987]

In [122]:
# control
#pfam_domains[pfam_domains.pfam_id == 814] # ok!

In [123]:
Mutations_has_Proteins_has_pfam_domains.to_csv('db_tables/Mutations_has_Proteins_has_pfam_domains.tsv', sep='\t', index= False)

---  
# Low-complexity Tables

In [124]:
low_complexity.head()

Unnamed: 0,uniprot,start,end,id_lc
0,P61981,236,243,1
1,P31947,235,247,2
2,P31947,248,247,3
3,P27348,230,244,4
4,P27348,245,244,5


In [125]:
# Add length col 
low_complexity['length'] = low_complexity.end - low_complexity.start + 1 

In [126]:
# Add id_proteins
low_complexity.rename(columns={'uniprot': 'uniprot_acc'}, inplace= True)
low_complexity = low_complexity.merge(id_proteins)
low_complexity.rename(columns={'id_protein': 'Proteins_id_protein'}, inplace= True)
low_complexity.drop(columns='uniprot_acc', inplace= True)

In [127]:
low_complexity.head()

Unnamed: 0,start,end,id_lc,length,Proteins_id_protein
0,236,243,1,8,1602
1,235,247,2,13,1132
2,248,247,3,0,1132
3,230,244,4,15,1049
4,245,244,5,0,1049


In [128]:
low_complexity.to_csv('db_tables/Low_complexity.tsv', sep='\t', index= False)

## Mutations_has_Low_complexity  
cols: Mutations_id_mutation, Low_complexity_id_lc

In [129]:
# Table for LC data
lc_has = low_complexity.copy()
lc_has.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)

In [130]:
lc_has.head()

Unnamed: 0,Start,End,id_lc,length,Proteins_id_protein
0,236,243,1,8,1602
1,235,247,2,13,1132
2,248,247,3,0,1132
3,230,244,4,15,1049
4,245,244,5,0,1049


In [131]:
# Auxiliar table for mutations
aux_lc = Mutations[['start_aa', 'end_aa', 'id_mutation', 'Proteins_id_protein']].copy()
aux_lc.rename(columns={'Proteins_id_protein': 'Chromosome', 'start_aa': 'Start', 'end_aa': 'End'}, inplace= True)

In [132]:
aux_lc.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,1371,1371,1,19
1,497,497,2,25
2,430,430,3,25
3,639,639,4,25
4,164,164,5,25


In [133]:
# Create the Pyranges objects
lc_has_py = pr.PyRanges(lc_has)
aux_lc_py = pr.PyRanges(aux_lc)

AssertionError: The dataframe does not have all the columns Chromosome, Start and End.

In [180]:
# Join both pyranges object: this assings mutations to pfam domains
lc_py = aux_lc_py.join(lc_has_py, strandedness= False, slack=1).drop(like="_b") # strandedness= False doesnt take count of the chain strand;
                                                                                # slack= 1 include bounds; drop(like="_b"): delete those cols (redudants)

In [181]:
lc_py.head()

Unnamed: 0,Start,End,id_mutation,Chromosome,id_lc,length
0,1133,1133,23989,9,5240,12
1,287,287,3994,16,9733,28
2,197,197,3995,16,9732,29
3,336,336,24003,17,5023,26
4,1038,1038,24009,18,11903,20
5,1469,1469,24030,19,10455,18
6,5,5,24048,23,5786,14
7,7,7,24056,23,5786,14


In [182]:
# Pyrange to DataFrame
Mutations_has_Low_complexity = lc_py.df[['id_mutation', 'Chromosome']] # cols to keep

In [183]:
Mutations_has_Low_complexity.rename(columns={'id_mutation': 'Mutations_id_mutation', 'Chromosome': 'Low_complexity_id_lc'}, inplace= True)

In [184]:
Mutations_has_Low_complexity.head()

Unnamed: 0,Mutations_id_mutation,Low_complexity_id_lc
0,23989,9
1,3994,16
2,3995,16
3,24003,17
4,24009,18


In [379]:
Mutations_has_Low_complexity.to_csv('db_tables/Mutations_has_Low_complexity.tsv', sep='\t', index= False)

---  
# Disorder Tables

## Disorder_regions  
cols: id_idr, start, end, length, Proteins_id_protein

In [48]:
# Add length col 
disorder['length'] = disorder.end - disorder.start + 1 

In [55]:
Disorder_regions = disorder.rename(columns={'uniprot': 'uniprot_acc'}).merge(id_proteins).sort_values('id_protein')
Disorder_regions.drop(columns='uniprot_acc', inplace= True)
Disorder_regions.rename(columns={'id_protein': 'Proteins_id_protein'}, inplace= True)
Disorder_regions.head()

Unnamed: 0,start,end,id_idr,length,Proteins_id_protein
2338,1,30,2339,30,2
1426,1,68,1427,68,3
2489,1,25,2490,25,4
5782,660,754,5783,95,5
5781,1,103,5782,103,5


In [56]:
Disorder_regions.to_csv('db_tables/Disorder_regions.tsv', sep='\t', index= False)

## Mutations_has_Disorder_regions  
cols: Mutations_id_mutation, Disorder_regions_id_idr

In [60]:
# Auxiliar table for mutations from low-complexity same for disorder
aux_idr = aux_lc
aux_idr.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,1371,1371,1,19
1,497,497,2,25
2,430,430,3,25
3,639,639,4,25
4,164,164,5,25


In [135]:
# Table for IDRs data
idr_has = Disorder_regions.copy()
idr_has.rename(columns={'Proteins_id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)
idr_has.head()

Unnamed: 0,Start,End,id_idr,length,Chromosome
2338,1,30,2339,30,2
1426,1,68,1427,68,3
2489,1,25,2490,25,4
5782,660,754,5783,95,5
5781,1,103,5782,103,5


In [136]:
# Create the Pyranges objects
idr_has_py = pr.PyRanges(idr_has)
aux_idr_py = pr.PyRanges(aux_idr)

In [137]:
# Join both pyranges object: this assings mutations to pfam domains
idr_py = aux_idr_py.join(idr_has_py, strandedness= False, slack=1).drop(like="_b") # strandedness= False doesnt take count of the chain strand;
                                                                                   # slack= 1 include bounds; drop(like="_b"): delete those cols (redudants)

In [138]:
idr_py.head()

Unnamed: 0,Start,End,id_mutation,Chromosome,id_idr,length
0,70,70,24001,12,4339,28
1,287,287,3994,16,4224,129
2,336,336,24003,17,2225,157
3,1014,1014,24004,17,2227,49
4,1371,1371,1,19,4637,76
5,1180,1180,24014,19,4635,293
6,2580,2580,24016,19,4645,65
7,213,213,24021,19,4630,49


In [141]:
# Pyrange to DataFrame
Mutations_has_Disorder_regions = idr_py.df[['id_mutation', 'Chromosome']] # cols to keep
Mutations_has_Disorder_regions.rename(columns={'id_mutation': 'Mutations_id_mutation', 'Chromosome': 'Disorder_regions_id_idr'}, inplace= True)

In [142]:
Mutations_has_Disorder_regions.head()

Unnamed: 0,Mutations_id_mutation,Disorder_regions_id_idr
0,24001,12
1,3994,16
2,24003,17
3,24004,17
4,1,19


In [143]:
# Control
Mutations[Mutations.id_mutation == 24001]

Unnamed: 0,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_mutation,Proteins_id_protein,Consequences_id_consequence,Sources_id_source
148021,116340837,6,149474335,149474335,70,70,Ala,Val,24001,12,1,1


In [144]:
id_proteins[id_proteins.id_protein == 12]

Unnamed: 0,id_protein,uniprot_acc
11,12,A2A288


In [145]:
disorder[disorder.id_idr == 4339] # It's Ok. A point mutation in position 70 in the idr region between 45-78

Unnamed: 0,uniprot,start,end,id_idr
4338,A2A288,48,75,4339


In [146]:
Mutations_has_Disorder_regions.to_csv('db_tables/Mutations_has_Disorder_regions.tsv', sep='\t', index= False)

---  
## llps: reduced table of proteins

In [197]:
llps = pd.read_csv('../datasets/llps_human_all_proteins.csv')
llps.drop(columns='organism', inplace= True)
llps = llps[['uniprot_acc', 'mlo', 'rol', 'db']]
llps.head()

Unnamed: 0,uniprot_acc,mlo,rol,db
0,A0A024RBG1,null_phasepdb_ht,regulator,phasepdb_ht
1,A0A096LP55,stress granule,regulator,drllps
2,A0A0U1RRE5,p-body,"component, client","drllps, phasepdb_uniprot"
3,A0AV96,"stress granule, p-body, stress granule","component, client","drllps, phasepdb_ht"
4,A0FGR8,"postsynaptic density, nucleolus",client,drllps


In [198]:
llps.mlo[3]

' stress granule, p-body, stress granule'

In [199]:
# Remove blank spaces
llps.mlo = llps.mlo.str.strip()

In [200]:
# Set mlo col as list-like and explode() to separate list elements into separate rows
llps = llps.assign(mlo= llps.mlo.str.split(',')).explode('mlo')

In [201]:
llps

Unnamed: 0,uniprot_acc,mlo,rol,db
0,A0A024RBG1,null_phasepdb_ht,regulator,phasepdb_ht
1,A0A096LP55,stress granule,regulator,drllps
2,A0A0U1RRE5,p-body,"component, client","drllps, phasepdb_uniprot"
3,A0AV96,stress granule,"component, client","drllps, phasepdb_ht"
3,A0AV96,p-body,"component, client","drllps, phasepdb_ht"
...,...,...,...,...
4366,Q9Y649,p-body,client,drllps
4367,R4GMX3,pcg body,client,drllps
4368,V9GYY5,p-body,client,drllps
4368,V9GYY5,paraspeckle,client,drllps


In [202]:
llps.mlo[4368]

4368           p-body
4368      paraspeckle
4368        nucleolus
Name: mlo, dtype: object

In [203]:
llps.mlo.value_counts()[:20] # there's blanks

postsynaptic density             1239
nucleolus                         820
stress granule                    580
 nucleolus                        528
null_phasepdb_ht                  522
centrosome/spindle pole body      411
 stress granule                   392
 p-body                           371
p-body                            276
 null_phasepdb_ht                 228
  stress granule                  205
 centrosome/spindle pole body     123
 droplet                           90
others                             87
 postsynaptic density              74
 nucleus speckles                  65
  postsynaptic density             64
 null_phasepdb_rev                 59
 nuclear speckle                   58
 pml nuclear body                  54
Name: mlo, dtype: int64

In [204]:
# Remove blank spaces
llps.mlo = llps.mlo.str.strip()

In [205]:
llps.mlo.value_counts()

nucleolus                                                                                     1377
postsynaptic density                                                                          1377
stress granule                                                                                1177
null_phasepdb_ht                                                                               750
p-body                                                                                         682
                                                                                              ... 
nuage                                                                                            1
galectin lattice                                                                                 1
u body                                                                                           1
selective hydrogel-like meshwork formed by fg-nucleoporins in nuclear pore central channel       1
intracellu

## Rol table  
cols: id_rol, rol

In [260]:
entrada_dbs = pd.read_csv('entrada_dbs.tsv.txt', sep='\t')

In [261]:
entrada_dbs.head()

Unnamed: 0,uniprot,organism,mlo,rol,db
0,P35637,Homo sapiens,cytoplasmic stress granule,driver,phasepro
1,P35637,Homo sapiens,cytoplasmic ribonucleoprotein granule,driver,phasepro
2,Q06787,Homo sapiens,cytoplasmic stress granule,driver,phasepro
3,Q06787,Homo sapiens,cytoplasmic ribonucleoprotein granule,driver,phasepro
4,Q06787,Homo sapiens,"synaptosome, neuron projection",driver,phasepro


In [262]:
entrada_dbs.rol.unique()

array(['driver', 'component', 'regulator', 'client'], dtype=object)

In [264]:
entrada_dbs.rol.value_counts()

client       5790
regulator    1395
component     836
driver        384
Name: rol, dtype: int64

In [267]:
Rol = pd.DataFrame({'rol': entrada_dbs.rol.value_counts().index, 'id_rol': range(1, len(entrada_dbs.rol.value_counts())+1)})
Rol

Unnamed: 0,rol,id_rol
0,client,1
1,regulator,2
2,component,3
3,driver,4


In [382]:
Rol.to_csv('db_tables/Rol.tsv', sep='\t', index= False)

## Databases table  
cols: id_database, database

In [271]:
entrada_dbs.db.value_counts()

drllps              5050
phasepdb_ht         2546
phasepdb_uniprot     384
phasepdb_rev         297
phasepro             128
Name: db, dtype: int64

In [273]:
Databases = pd.DataFrame({'database': entrada_dbs.db.value_counts().index, 'id_database': range(1, len(entrada_dbs.db.value_counts())+1)})
Databases

Unnamed: 0,database,id_database
0,drllps,1
1,phasepdb_ht,2
2,phasepdb_uniprot,3
3,phasepdb_rev,4
4,phasepro,5


In [383]:
Databases.to_csv('db_tables/Databases.tsv', sep='\t', index= False)

## Proteins_has_Mlos  
cols: Proteins_id_protein, Mlos_id_mlo, Rol_id_rol, Databases_id_database

In [288]:
entrada_dbs.head()

Unnamed: 0,uniprot,organism,mlo,rol,db
0,P35637,Homo sapiens,cytoplasmic stress granule,driver,phasepro
1,P35637,Homo sapiens,cytoplasmic ribonucleoprotein granule,driver,phasepro
2,Q06787,Homo sapiens,cytoplasmic stress granule,driver,phasepro
3,Q06787,Homo sapiens,cytoplasmic ribonucleoprotein granule,driver,phasepro
4,Q06787,Homo sapiens,"synaptosome, neuron projection",driver,phasepro


In [313]:
# There's NaNs uniprots. Why?
entrada_dbs = entrada_dbs[entrada_dbs.uniprot.notnull()]

In [358]:
Proteins_has_Mlos = entrada_dbs.rename(columns= {'uniprot': 'uniprot_acc'}).drop(columns='organism').copy()

In [359]:
# Set mlo col as list-like and explode() to separate list elements into separate rows
Proteins_has_Mlos = Proteins_has_Mlos.assign(mlo= Proteins_has_Mlos.mlo.str.split(',')).explode('mlo')

In [360]:
Proteins_has_Mlos = Proteins_has_Mlos.merge(id_proteins).sort_values('id_protein')

In [361]:
Proteins_has_Mlos

Unnamed: 0,uniprot_acc,mlo,rol,db,id_protein
5641,A0A024RBG1,null_phasepdb_ht,regulator,phasepdb_ht,1
5996,A0A096LP55,Stress granule,regulator,drllps,2
1169,A0A0U1RRE5,P-body,component,phasepdb_uniprot,3
1170,A0A0U1RRE5,P-body,client,drllps,3
5750,A0AV96,Stress granule,component,phasepdb_ht,4
...,...,...,...,...,...
8280,Q9Y649,P-body,client,drllps,4367
6312,R4GMX3,PcG body,client,drllps,4368
7019,V9GYY5,P-body,client,drllps,4369
7018,V9GYY5,Paraspeckle,client,drllps,4369


In [362]:
# Strip blank spaces
Proteins_has_Mlos.mlo = Proteins_has_Mlos.mlo.str.strip()

## MLOs tables  
cols: id_mlo, mlo

In [331]:
entrada_dbs.mlo.value_counts()

Nucleolus               2042
Stress granule           918
Postsynaptic density     917
null_phasepdb_ht         750
P-body                   748
                        ... 
 granular component        1
Heterochromatin            1
Sress granule              1
Histone Locus body         1
heterochromatin            1
Name: mlo, Length: 147, dtype: int64

In [333]:
entrada_dbs.mlo = entrada_dbs.mlo.str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [334]:
entrada_dbs.mlo.value_counts()

Nucleolus                                         2133
Stress granule                                    1407
Postsynaptic density                              1374
P-body                                             880
null_phasepdb_ht                                   750
                                                  ... 
condensed compartments of microtubule bundling       1
Cytoplasmic ribonucleoprotein granule                1
Sress granule                                        1
Cytoplasmic granule                                  1
TIS granule                                          1
Name: mlo, Length: 116, dtype: int64

In [338]:
Mlos = pd.DataFrame({'mlo': entrada_dbs.mlo.unique(), 'id_mlo': range(1, len(entrada_dbs.mlo.unique())+1)})

In [339]:
Mlos

Unnamed: 0,mlo,id_mlo
0,cytoplasmic stress granule,1
1,cytoplasmic ribonucleoprotein granule,2
2,"synaptosome, neuron projection",3
3,neuronal ribonucleoprotein granule,4
4,nuclear protein granule,5
...,...,...
112,Perinucleolar compartment,113
113,U body,114
114,Receptor cluster,115
115,Pericentriolar matrix,116


In [385]:
Mlos.to_csv('db_tables/Mlos.tsv', sep='\t', index= False)

In [363]:
# Add id_mlo
Proteins_has_Mlos = Proteins_has_Mlos.merge(Mlos).sort_values('id_protein')

In [365]:
# Add id_rol and id_database
Proteins_has_Mlos = Proteins_has_Mlos.merge(Rol)
Proteins_has_Mlos = Proteins_has_Mlos.rename(columns={'db': 'database'}).merge(Databases).sort_values('id_protein')

In [366]:
Proteins_has_Mlos.drop(columns=['uniprot_acc',	'mlo',	'rol',	'database'], inplace= True)
Proteins_has_Mlos.rename(columns= {'id_protein': 'Proteins_id_protein',	'id_mlo': 'Mlos_id_mlo',
                                    'id_rol': 'Rol_id_rol', 'id_database': 'Databases_id_database'}, inplace= True)
Proteins_has_Mlos

Unnamed: 0,Proteins_id_protein,Mlos_id_mlo,Rol_id_rol,Databases_id_database
0,1,92,2,2
2546,2,56,2,1
3186,3,24,1,1
7880,3,24,3,3
3187,4,24,1,1
...,...,...,...,...
7319,4367,24,1,1
7320,4368,103,1,1
7321,4369,64,1,1
7322,4369,65,1,1


In [384]:
Proteins_has_Mlos.to_csv('db_tables/Proteins_has_Mlos.tsv', sep='\t', index= False)

# Finally, Proteins table

In [225]:
proteins.head()

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name,length,sequence
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,2,A0A096LP55,HGNC:51714,440567.0,UQCRHL,4369,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...
4,5,A0FGR8,HGNC:22211,57488.0,ESYT2,4369,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...


In [220]:
# Mobidb disorder content
mobidb = pd.read_csv('dc_mobidb_lite.csv').rename(columns={'uniprot': 'uniprot_acc'})
mobidb.head()

Unnamed: 0,uniprot_acc,dc
0,P62258,0.086
1,Q15172,0.107
2,Q14738,0.159
3,Q13362,0.065
4,Q9Y3L3,0.359


In [226]:
# Add disorder content
Proteins = proteins.merge(mobidb, how= 'left')

In [255]:
Proteins.rename(columns= {'dc': 'disorder_content'}, inplace= True)

In [256]:
Proteins

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name,length,sequence,disorder_content
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,
1,2,A0A096LP55,HGNC:51714,440567.0,UQCRHL,4369,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...,0.330
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...,1.000
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,0.042
4,5,A0FGR8,HGNC:22211,57488.0,ESYT2,4369,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...,0.215
...,...,...,...,...,...,...,...,...
4364,4365,Q9UNL7,,,,4369,KLDVEEPDSANSSFYSTRSAPASQASLRATSSTQSLARLGSPDYGN...,
4365,4366,Q9Y4C0,HGNC:8010,9369.0,NRXN3,4369,MSSTLHSVFFTLKVSILLGSLLGLCLGLEFMGLPNQWARYLRWDAS...,
4366,4367,Q9Y649,,,,4369,MNDLICFLDNTFKNNVLSQAWWCVHLVPTIWEAEAGGSLEPRSLKL...,
4367,4368,R4GMX3,,,,4369,MELSESVQKGFQMLADPRSFDSNAFTLLLRAAFQSLLDAQADEAVL...,0.194


In [257]:
Proteins.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4369 entries, 0 to 4368
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id_protein        4369 non-null   int64  
 1   uniprot_acc       4369 non-null   object 
 2   hgnc_id           4283 non-null   object 
 3   gene_id           4280 non-null   float64
 4   gene_name         4283 non-null   object 
 5   length            4369 non-null   int64  
 6   sequence          4367 non-null   object 
 7   disorder_content  2655 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 307.2+ KB


In [386]:
Proteins.to_csv('db_tables/Proteins.tsv', sep='\t', index= False)