In [1]:
#%pip install cython
#%pip install pyranges

In [1]:
import pandas as pd
import numpy as np
import pyranges as pr

In [20]:
# Proteins from each LLPS database with their roles, mlos and dataset
database_entrada = pd.read_csv('database_entrada.csv').drop_duplicates()
database_entrada.rename(columns= {'uniprot': 'uniprot_acc'}, inplace= True)
database_entrada.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8182 entries, 0 to 8381
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uniprot_acc  8182 non-null   object
 1   organism     8182 non-null   object
 2   mlo          7361 non-null   object
 3   rol          8182 non-null   object
 4   db           8182 non-null   object
dtypes: object(5)
memory usage: 383.5+ KB


In [21]:
database_entrada.drop(columns='organism', inplace= True)

# Load proteins, mutations, domains and regions tables

## Load the others tables

In [8]:
# protein table for our db. Same above but one protein by row
protein = pd.read_csv('db_tables/protein.tsv', sep='\t')

In [9]:
# DataFrame with unique id_protein col
id_protein = protein[['id_protein', 'uniprot_acc']].copy()

In [347]:
# only clinvar mutations at the moment
mutations = pd.read_csv('../datasets/clinvar_all_proteins_mutations_v2.csv.gz', compression='gzip') # comes from parse_clinvar.py

  interactivity=interactivity, compiler=compiler, result=result)


In [181]:
disorder = pd.read_csv('disorder_lite.csv').rename(columns={'uniprot': 'uniprot_acc'})
low_complexity = pd.read_csv('low_complexity.csv').rename(columns={'uniprot': 'uniprot_acc'})
pfam = pd.read_csv('pfam.csv').rename(columns={'uniprot': 'uniprot_acc'})

In [182]:
# Add and unique integer ID fow low_complexity and disorder
low_complexity['id_lc'] = range(1, len(low_complexity)+1)
disorder['id_idr'] = range(1, len(disorder)+1)

---  
# consequence and source tables

In [183]:
cf = mutations.consequence.value_counts()
cf

missense       132500
frameshit       18891
nonsense        11265
deletion         2610
insertion        1103
delins            763
duplication       619
Name: consequence, dtype: int64

In [184]:
consequence = pd.DataFrame({'id_consequence': range(1, len(cf)+1), 'consequence': cf.index})
consequence

Unnamed: 0,id_consequence,consequence
0,1,missense
1,2,frameshit
2,3,nonsense
3,4,deletion
4,5,insertion
5,6,delins
6,7,duplication


In [185]:
mutations.source.value_counts()

clinvar    167751
Name: source, dtype: int64

In [186]:
source = pd.DataFrame({'id_source': [1,2,3], 'source': ['clinvar', 'disgenet', 'uniprot']})
source

Unnamed: 0,id_source,source
0,1,clinvar
1,2,disgenet
2,3,uniprot


In [187]:
consequence.to_csv('db_tables/consequence.tsv', sep='\t', index = False)

In [188]:
source.to_csv('db_tables/source.tsv', sep='\t', index = False)

---  
# mutation table  
cols: *id_mutation, snp_id, chromosome, start_genomic, end_genomic, start_aa, end_aa, from_aa, to_aa, id_source, id_protein*

In [189]:
mutations[~mutations.end_aa.isnull()][['start_aa',	'end_aa',	'from',	'to',	'consequence',	'source']]

Unnamed: 0,start_aa,end_aa,from,to,consequence,source
5,1755,1757.0,LeuThr,,deletion,clinvar
13,23,24.0,GlyGlu,,deletion,clinvar
14,47,54.0,GlyArg,,deletion,clinvar
16,295,298.0,AspLeu,,deletion,clinvar
18,116,124.0,GluVal,,deletion,clinvar
...,...,...,...,...,...,...
23978,1,2.0,MetGly,Ala,insertion,clinvar
23979,517,518.0,IleLeu,Ter,insertion,clinvar
23981,229,230.0,GlyPro,,insertion,clinvar
23984,1171,1172.0,AspGlu,,insertion,clinvar


In [324]:
mutations.columns

Index(['id_protein', 'uniprot_acc', 'hgnc_id', 'gene_id', 'gene_name',
       'length', 'sequence', 'disorder_content', 'alleleid', 'variationid',
       'snpid', 'nsv', 'geneid', 'genesymbol', 'chromosomeaccession',
       'chromosome', 'start', 'stop', 'type', 'name', 'origin', 'phenotypeids',
       'phenotypelist', 'otherids', 'citation_source', 'citation_id',
       'nuccore_id', 'cambio', 'start_aa', 'end_aa', 'from', 'to',
       'consequence', 'source'],
      dtype='object')

In [362]:
# Subset by cols to keep for mutation db table
mutation = mutations[['uniprot_acc', 'snpid', 'chromosome', 'start', 'stop', 'start_aa', 'end_aa', 'from', 'to', 'consequence', 'citation_source', 'citation_id']].copy()
mutation.rename(columns={'snpid': 'snp_id', 'start': 'start_genomic', 'stop': 'end_genomic', 'from': 'from_aa', 'to': 'to_aa'}, inplace= True)

In [326]:
mutation.columns

Index(['uniprot_acc', 'snp_id', 'chromosome', 'start_genomic', 'end_genomic',
       'start_aa', 'end_aa', 'from_aa', 'to_aa', 'consequence',
       'citation_source', 'citation_id'],
      dtype='object')

In [363]:
mutation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458111 entries, 0 to 458110
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   uniprot_acc      458111 non-null  object 
 1   snp_id           389760 non-null  float64
 2   chromosome       458111 non-null  object 
 3   start_genomic    458111 non-null  float64
 4   end_genomic      458111 non-null  float64
 5   start_aa         458111 non-null  int64  
 6   end_aa           8790 non-null    float64
 7   from_aa          458111 non-null  object 
 8   to_aa            396154 non-null  object 
 9   consequence      458111 non-null  object 
 10  citation_source  367909 non-null  object 
 11  citation_id      367909 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 41.9+ MB


In [330]:
len(mutation.snp_id.unique())

121157

In [364]:
# Add an unique ID for each mutation, type INT
mutation['id_mutation'] = range(1, len(mutation)+1)

In [365]:
# Fill those mutations containing NaNs in the end_aa col with the start_aa value
mutation.end_aa = mutation.end_aa.fillna(value= mutation.start_aa).apply(int)

In [366]:
mutation.head()

Unnamed: 0,uniprot_acc,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,consequence,citation_source,citation_id,id_mutation
0,A6H8Y1,879255400.0,5,71512291.0,71512293.0,1371,1371,Arg,,deletion,,,1
1,A6NHR9,886043300.0,18,2700757.0,2700759.0,497,497,Lys,,deletion,,,2
2,A6NHR9,886044900.0,18,2697985.0,2697987.0,430,430,His,,deletion,,,3
3,A6NHR9,1598343000.0,18,2705766.0,2705768.0,639,639,Gly,,deletion,PubMed,26467025.0,4
4,A6NHR9,1598294000.0,18,2673343.0,2673345.0,164,164,Arg,,deletion,,,5


In [367]:
# Add IDs from protein and consequence
mutation = mutation.merge(id_protein)
mutation = mutation.merge(consequence)
mutation.drop(columns=['uniprot_acc', 'consequence'], inplace= True)

In [352]:
def format_snp(df, column):
    '''
    format an int snps column in a DataFrame containing -1 values.
    Returns: the snp column in str format ('rs1580653772' or 'nan')
    '''
    #a = df.column.replace(-1, 'nan')
    a = df[column]
    a = a.apply(str)
    a = a.map(lambda x: 'rs' + x if x != 'nan' else x)
    a = a.replace(['rs-1'], 'nan')
    df[column] = a

In [368]:
# Format snp_id col
format_snp(mutation, 'snp_id')

In [369]:
# Format from_aa and to_aa cols to 1 letter code
from Bio.SeqUtils import seq1
mutation['from_aa'] = mutation['from_aa'].map(lambda x: seq1(x))
mutation['to_aa'] = mutation['to_aa'].apply(str).map(lambda x: seq1(x) if x != 'nan' else x)

In [370]:
mutation.columns

Index(['snp_id', 'chromosome', 'start_genomic', 'end_genomic', 'start_aa',
       'end_aa', 'from_aa', 'to_aa', 'citation_source', 'citation_id',
       'id_mutation', 'id_protein', 'id_consequence'],
      dtype='object')

In [371]:
mutation = mutation[['id_mutation', 'snp_id', 'chromosome', 'start_genomic', 'end_genomic', 'start_aa','end_aa',
                    'from_aa', 'to_aa', 'id_protein', 'id_consequence', 'citation_source', 'citation_id']].sort_values('id_mutation')
mutation

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence,citation_source,citation_id
0,1,rs879255413.0,5,71512291.0,71512293.0,1371,1371,R,,19,4,,
1,2,rs886043345.0,18,2700757.0,2700759.0,497,497,K,,25,4,,
2,3,rs886044914.0,18,2697985.0,2697987.0,430,430,H,,25,4,,
3,4,rs1598342592.0,18,2705766.0,2705768.0,639,639,G,,25,4,PubMed,26467025
4,5,rs1598293848.0,18,2673343.0,2673345.0,164,164,R,,25,4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450580,458107,,2,49922240.0,49922240.0,1410,1410,R,*,4362,3,PubMed,28492532
450581,458108,,2,50236893.0,50236893.0,1148,1148,R,*,4362,3,PubMed,19896112
450582,458109,,2,50236893.0,50236893.0,1148,1148,R,*,4362,3,PubMed,21964664
450583,458110,,2,50236893.0,50236893.0,1148,1148,R,*,4362,3,PubMed,25149956


In [372]:
mutation.chromosome = mutation.chromosome.apply(str)

In [373]:
mutation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458111 entries, 0 to 450584
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id_mutation      458111 non-null  int32  
 1   snp_id           458111 non-null  object 
 2   chromosome       458111 non-null  object 
 3   start_genomic    458111 non-null  float64
 4   end_genomic      458111 non-null  float64
 5   start_aa         458111 non-null  int64  
 6   end_aa           458111 non-null  int64  
 7   from_aa          458111 non-null  object 
 8   to_aa            458111 non-null  object 
 9   id_protein       458111 non-null  int64  
 10  id_consequence   458111 non-null  int64  
 11  citation_source  367909 non-null  object 
 12  citation_id      367909 non-null  object 
dtypes: float64(2), int32(1), int64(4), object(6)
memory usage: 47.2+ MB


In [378]:
mutation.duplicated().any()

False

## mutation_has_citation

In [375]:
mutation_has_pubmed = mutation[['id_mutation', 'citation_source', 'citation_id']].copy()

In [376]:
mutation_has_pubmed.citation_source.value_counts()

PubMed           364303
NCBIBookShelf      3460
PubMedCentral       146
Name: citation_source, dtype: int64

In [377]:
mutation_has_pubmed.citation_id.value_counts()

28492532    115074
25741868     58328
26467025      8931
24728327      7861
24033266      7812
             ...  
15050448         1
16476093         1
11641390         1
15219840         1
14576434         1
Name: citation_id, Length: 26365, dtype: int64

In [360]:
mutation.to_csv('db_tables/mutation.tsv', sep='\t', index = False)

---
## Para asignar los rangos debo tener:  
- Tabla de mutaciones con id_mutation, *id_protein(Chromosome), start_aa(Start), end_aa(End)*  
- Tablas de lc, idr y pfam con id unico 

---  
# Pfam Tables

In [204]:
pfam.head()

Unnamed: 0,uniprot_acc,tipo,start,end
0,O94910,7tm_2,857,1093
1,Q9HAR2,7tm_2,861,1097
2,O14514,7tm_2,944,1180
3,O75899,7tm_3,475,743
4,Q9NZH0,7tm_3,49,291


## pfam_domain  
cols: pfam_id, pfam_domain, por ej: PF00003 7tm_3

In [205]:
# Array with unique pfam domains
pf_domain = pfam.tipo.unique() # unique pfam domains (2939 for this set of proteins)

In [206]:
pfam_domain = pd.DataFrame({'pfam_domain': pf_domain, 'id_pfam': range(1, len(pf_domain)+1)})  # luego cambiar el id por los PF000...
pfam_domain

Unnamed: 0,pfam_domain,id_pfam
0,7tm_2,1
1,7tm_3,2
2,ATP-synt_ab,3
3,GTP_EFTU,4
4,HLH,5
...,...,...
2934,PhoLip_ATPase_C,2935
2935,HIP1_clath_bdg,2936
2936,DAO_C,2937
2937,Armet,2938


In [207]:
pfam_domain.to_csv('db_tables/pfam_domain.tsv', sep='\t', index= False)

## protein_has_pfam_domain  
cols: id_protein, id_pfam, start, end, length

In [208]:
protein_has_pfam_domain = pfam.merge(id_protein) # agregar col id_protein
protein_has_pfam_domain['length'] = protein_has_pfam_domain.end - protein_has_pfam_domain.start + 1 # col length

In [209]:
protein_has_pfam_domain.rename(columns= {'tipo': 'pfam_domain'}, inplace= True)
protein_has_pfam_domain = protein_has_pfam_domain.merge(pfam_domain) # to add the col pfam_id

In [210]:
protein_has_pfam_domain = protein_has_pfam_domain[['id_protein', 'id_pfam', 'start', 'end', 'length']].sort_values('id_protein')
protein_has_pfam_domain

Unnamed: 0,id_protein,id_pfam,start,end,length
6140,1,171,17,144,128
9114,2,814,28,91,64
627,4,50,153,220,68
626,4,50,248,312,65
625,4,50,73,141,69
...,...,...,...,...,...
1461,4365,784,55,184,130
1457,4365,784,475,619,145
5600,4367,2244,160,199,40
10551,4367,2529,305,369,65


In [211]:
protein_has_pfam_domain.to_csv('db_tables/protein_has_pfam_domain.tsv', sep='\t', index= False)

## mutation_has_pfam_domain  
cols: id_mutation, id_protein, id_pfam, start, end

### Pyranges  
columnas obligatorias: *Chromosome	 Start	End*  
Chromosome: id_protein    
otras columnas con ids son opcionales y cualquier nombre  
  
por ejemplo df seria tabla de mutaciones  
df = pr.PyRanges(df.rename(columns={'chromosome':'Chromosome','start_position':'Start','end_position':'End'}))  
  
df = pyrange de mutaciones (columnas: Chromosome, Start, End, id_mutacion)  
low_c = pyrange de low complexity(columnas: Chromosome, Start, End, id_low, id_proteina)  
data = df.join(low_c, strandedness=False, slack=1).drop(like="_b") # mutaciones lo junto con low_complex  
strandedness=False no tener en cuenta el Strand  
slack=1 coincidir los extremos. Importante  
drop(like="_b") eliminar el Chromosome, Start, End de low_c (en pfam no hacer el drop)  
data = data.df[[Chromosome, Start, End, id_mutacion, id_low, id_proteina]] # pasa de pyrange a dataframe

In [212]:
mutation.head()

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence
0,1,rs879255413,5,71512291,71512293,1371,1371,R,,19,4
1,2,rs886043345,18,2700757,2700759,497,497,K,,25,4
2,3,rs886044914,18,2697985,2697987,430,430,H,,25,4
3,4,rs1598342592,18,2705766,2705768,639,639,G,,25,4
4,5,rs1598293848,18,2673343,2673345,164,164,R,,25,4


In [213]:
# df has pfam domains data
df = pfam.rename(columns={'tipo': 'pfam_domain'}).merge(pfam_domain)
df = df.merge(id_protein)                      # mapping uniprot_acc - id_protein
df.drop(columns='uniprot_acc', inplace= True)

In [214]:
df.head()

Unnamed: 0,pfam_domain,start,end,id_pfam,id_protein
0,7tm_2,857,1093,1,477
1,GPS,800,844,690,477
2,Gal_Lectin,48,128,745,477
3,OLF,144,396,773,477
4,Latrophilin,1113,1474,818,477


In [215]:
df.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)

In [216]:
df.head()

Unnamed: 0,pfam_domain,Start,End,id_pfam,Chromosome
0,7tm_2,857,1093,1,477
1,GPS,800,844,690,477
2,Gal_Lectin,48,128,745,477
3,OLF,144,396,773,477
4,Latrophilin,1113,1474,818,477


In [217]:
# Create the pyranges object of pfam domains
df_py = pr.PyRanges(df)

In [218]:
aux = mutation[['start_aa', 'end_aa', 'id_mutation', 'id_protein']].copy()
aux.rename(columns={'id_protein': 'Chromosome', 'start_aa': 'Start', 'end_aa': 'End'}, inplace= True)

In [219]:
aux.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,1371,1371,1,19
1,497,497,2,25
2,430,430,3,25
3,639,639,4,25
4,164,164,5,25


In [220]:
# Pyranges object of mutations
aux_py = pr.PyRanges(aux)

In [221]:
# Join both pyranges object: this assings mutations to pfam domains
pfam_py = df_py.join(aux_py, strandedness= False, slack= 1)  # strandedness= False doesnt take count of the chain strand; slack= 1 include bounds

In [222]:
pfam_py.head() # Start and End are from the pfam domain in that protein (a protein may have the same pfam domain repeated at different positions along its sequence).
                # Start_b and End_b are from the mutation in this case

Unnamed: 0,pfam_domain,Start,End,id_pfam,Chromosome,Start_b,End_b,id_mutation
0,UCR_hinge,28,91,814,2,53,53,23987
1,An_peroxidase,727,1272,947,9,981,981,23988
2,An_peroxidase,727,1272,947,9,1039,1039,23994
3,An_peroxidase,727,1272,947,9,1133,1133,23989
4,An_peroxidase,727,1272,947,9,1207,1207,23993
5,LRR_8,50,110,2219,9,65,65,23996
6,Ig_3,329,402,2246,9,391,391,24000
7,I-set,511,597,2914,9,538,538,23997


In [223]:
# Pyranges to DataFrame
mutation_has_pfam_domain = pfam_py.df[['id_mutation', 'Chromosome', 'id_pfam', 'Start', 'End']] # cols to keep

In [224]:
mutation_has_pfam_domain.rename(columns={'Chromosome': 'id_protein', 'Start': 'start', 'End': 'end'}, inplace= True)

In [225]:
mutation_has_pfam_domain.head()

Unnamed: 0,id_mutation,id_protein,id_pfam,start,end
0,23987,2,814,28,91
1,23988,9,947,727,1272
2,23994,9,947,727,1272
3,23989,9,947,727,1272
4,23993,9,947,727,1272


In [226]:
# control
mutation[mutation.id_mutation == 23987]

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence
148020,23987,rs7417535,1,15807492,15807492,53,53,Y,C,2,1


In [227]:
# control
pfam_domain[pfam_domain.id_pfam == 814] 

Unnamed: 0,pfam_domain,id_pfam
813,UCR_hinge,814


In [228]:
mutation_has_pfam_domain[mutation_has_pfam_domain.id_pfam == 814] # ok!

Unnamed: 0,id_mutation,id_protein,id_pfam,start,end
0,23987,2,814,28,91


In [229]:
mutation_has_pfam_domain.to_csv('db_tables/mutation_has_pfam_domain.tsv', sep='\t', index= False)

---  
# low-complexity Tables

## low_complexity  
cols: id_lc, start, end, length, id_protein

In [230]:
low_complexity.head()

Unnamed: 0,uniprot_acc,start,end,id_lc
0,P61981,236,243,1
1,P31947,235,247,2
2,P31947,248,247,3
3,P27348,230,244,4
4,P27348,245,244,5


In [231]:
# Add length col 
low_complexity['length'] = low_complexity.end - low_complexity.start + 1 

In [232]:
# Add id_proteins
low_complexity.rename(columns={'uniprot': 'uniprot_acc'}, inplace= True)
low_complexity = low_complexity.merge(id_protein)
low_complexity.drop(columns='uniprot_acc', inplace= True)

In [233]:
low_complexity.head()

Unnamed: 0,start,end,id_lc,length,id_protein
0,236,243,1,8,1602
1,235,247,2,13,1132
2,248,247,3,0,1132
3,230,244,4,15,1049
4,245,244,5,0,1049


In [234]:
low_complexity.to_csv('db_tables/low_complexity.tsv', sep='\t', index= False)

## mutation_has_low_complexity  
cols: id_mutation, id_lc

In [235]:
# Table for LC data
lc_has = low_complexity.copy()
lc_has.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)

In [236]:
# Auxiliar table for mutations
aux_lc = mutation[['start_aa', 'end_aa', 'id_mutation', 'id_protein']].copy()
aux_lc.rename(columns={'id_protein': 'Chromosome', 'start_aa': 'Start', 'end_aa': 'End'}, inplace= True)

In [237]:
aux_lc.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,1371,1371,1,19
1,497,497,2,25
2,430,430,3,25
3,639,639,4,25
4,164,164,5,25


In [238]:
# Create the Pyranges objects
lc_has_py = pr.PyRanges(lc_has)
aux_lc_py = pr.PyRanges(aux_lc)

In [239]:
# Join both pyranges object: this assings mutations to low-complexity regions
lc_py = aux_lc_py.join(lc_has_py, strandedness= False, slack=1).drop(like="_b") # strandedness= False doesnt take count of the chain strand;
                                                                                # slack= 1 include bounds; drop(like="_b"): delete those cols (redudants)

In [240]:
lc_py.head()

Unnamed: 0,Start,End,id_mutation,Chromosome,id_lc,length
0,1133,1133,23989,9,5240,12
1,287,287,3994,16,9733,28
2,197,197,3995,16,9732,29
3,336,336,24003,17,5023,26
4,1038,1038,24009,18,11903,20
5,1469,1469,24030,19,10455,18
6,5,5,24048,23,5786,14
7,7,7,24056,23,5786,14


In [241]:
# Pyrange to DataFrame
mutation_has_low_complexity = lc_py.df[['id_mutation', 'id_lc']] # cols to keep

In [242]:
mutation_has_low_complexity.head()

Unnamed: 0,id_mutation,id_lc
0,23989,5240
1,3994,9733
2,3995,9732
3,24003,5023
4,24009,11903


In [243]:
# Control
low_complexity[low_complexity.id_lc == 5240]

Unnamed: 0,start,end,id_lc,length,id_protein
5239,1128,1139,5240,12,9


In [244]:
protein.iloc[8]

id_protein                                                          9
uniprot_acc                                                    A1KZ92
hgnc_id                                                    HGNC:26359
gene_id                                                      137902.0
gene_name                                                       PXDNL
length                                                           1463
sequence            MEPRLFCWTTLFLLAGWCLPGLPCPSRCLCFKSTVRCMHLMLDHIP...
disorder_content                                                  NaN
Name: 8, dtype: object

In [245]:
mutation[mutation.id_mutation == 23989] # It's allright! Mutation in aa 1133, which belongs to the low-complexity region between 1128 - 1139 in that protein

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence
138298,23989,rs74731075,8,51408226,51408226,1133,1133,A,V,9,1


In [246]:
mutation_has_low_complexity.to_csv('db_tables/mutation_has_low_complexity.tsv', sep='\t', index= False)

---  
# Disorder Tables

## disorder_region  
cols: id_idr, start, end, length, id_protein

In [247]:
# Add length col 
disorder['length'] = disorder.end - disorder.start + 1 

In [248]:
disorder_region = disorder.rename(columns={'uniprot': 'uniprot_acc'}).merge(id_protein).sort_values('id_protein')
disorder_region.drop(columns='uniprot_acc', inplace= True)
disorder_region.head()

Unnamed: 0,start,end,id_idr,length,id_protein
2338,1,30,2339,30,2
1426,1,68,1427,68,3
2489,1,25,2490,25,4
5782,660,754,5783,95,5
5781,1,103,5782,103,5


In [249]:
disorder_region.to_csv('db_tables/disorder_region.tsv', sep='\t', index= False)

## mutation_has_disorder_region  
cols: id_mutation, id_idr

In [250]:
# Auxiliar table for mutations from low-complexity is the same for disorder. id-protein, start and end of the mutation
aux_idr = aux_lc
aux_idr.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,1371,1371,1,19
1,497,497,2,25
2,430,430,3,25
3,639,639,4,25
4,164,164,5,25


In [251]:
# Table for IDRs data
idr_has = disorder_region.copy()
idr_has.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)
idr_has.head()

Unnamed: 0,Start,End,id_idr,length,Chromosome
2338,1,30,2339,30,2
1426,1,68,1427,68,3
2489,1,25,2490,25,4
5782,660,754,5783,95,5
5781,1,103,5782,103,5


In [252]:
# Create the Pyranges objects
idr_has_py = pr.PyRanges(idr_has)
aux_idr_py = pr.PyRanges(aux_idr)

In [253]:
# Join both pyranges object: this assings mutations to pfam domains
idr_py = aux_idr_py.join(idr_has_py, strandedness= False, slack=1).drop(like="_b") # strandedness= False doesnt take count of the chain strand;
                                                                                   # slack= 1 include bounds; drop(like="_b"): delete those cols (redudants)

In [254]:
idr_py.head()

Unnamed: 0,Start,End,id_mutation,Chromosome,id_idr,length
0,70,70,24001,12,4339,28
1,287,287,3994,16,4224,129
2,336,336,24003,17,2225,157
3,1014,1014,24004,17,2227,49
4,1371,1371,1,19,4637,76
5,1180,1180,24014,19,4635,293
6,2580,2580,24016,19,4645,65
7,213,213,24021,19,4630,49


In [255]:
# Pyrange to DataFrame
mutation_has_disorder_region = idr_py.df[['id_mutation', 'id_idr']] # cols to keep
mutation_has_disorder_region.head()

Unnamed: 0,id_mutation,id_idr
0,24001,4339
1,3994,4224
2,24003,2225
3,24004,2227
4,1,4637


In [256]:
# Control
mutation[mutation.id_mutation == 24001]

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence
148021,24001,rs116340837,6,149474335,149474335,70,70,A,V,12,1


In [257]:
id_protein[id_protein.id_protein == 12]

Unnamed: 0,id_protein,uniprot_acc
11,12,A2A288


In [258]:
disorder[disorder.id_idr == 4339] # It's Ok. A point mutation in position 70 in the idr region between 48-75

Unnamed: 0,uniprot_acc,start,end,id_idr,length
4338,A2A288,48,75,4339,28


In [259]:
mutation_has_disorder_region.to_csv('db_tables/mutation_has_disorder_region.tsv', sep='\t', index= False)

---   
# Rol table  
cols: id_rol, rol

In [10]:
database_entrada.rol.unique()

array(['driver', 'unassigned', 'regulator', 'client'], dtype=object)

In [11]:
database_entrada.rol.value_counts()

client        4138
unassigned    2272
regulator     1395
driver         377
Name: rol, dtype: int64

In [12]:
rol = pd.DataFrame({'rol': database_entrada.rol.value_counts().index, 'id_rol': range(1, len(database_entrada.rol.value_counts())+1)})
rol

Unnamed: 0,rol,id_rol
0,client,1
1,unassigned,2
2,regulator,3
3,driver,4


In [13]:
rol.to_csv('db_tables/rol.tsv', sep='\t', index= False)

---  
# dataset table  
cols: id_dataset, dataset

In [14]:
database_entrada.db.value_counts()

drllps              5034
phasepdb_ht         2346
phasepdb_uniprot     384
phasepdb_rev         297
phasepro             121
Name: db, dtype: int64

In [15]:
database = pd.DataFrame({'database': database_entrada.db.value_counts().index, 'id_database': range(1, len(database_entrada.db.value_counts())+1)})
database

Unnamed: 0,database,id_database
0,drllps,1
1,phasepdb_ht,2
2,phasepdb_uniprot,3
3,phasepdb_rev,4
4,phasepro,5


In [86]:
database.to_csv('db_tables/dataset.tsv', sep='\t', index= False)

---  
# MLOs tables  

In [17]:
database_entrada.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8182 entries, 0 to 8381
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   uniprot  8182 non-null   object
 1   mlo      7361 non-null   object
 2   rol      8182 non-null   object
 3   db       8182 non-null   object
dtypes: object(4)
memory usage: 319.6+ KB


In [18]:
database_entrada.mlo.value_counts()

Nucleolus                       2063
Postsynaptic density            1374
Stress granule                  1334
P-body                           823
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
MORC3-NBs                          1
granular component                 1
TIS granule                        1
extracellular matrix               1
Name: mlo, Length: 113, dtype: int64

In [22]:
len(database_entrada.uniprot_acc.unique())

4368

## Deal with mlos annotations

In [27]:
database_entrada.mlo = database_entrada.mlo.str.strip()

In [28]:
len(database_entrada.mlo.unique()) # no blank spaces

114

In [29]:
database_entrada.mlo.value_counts()

Nucleolus                       2063
Postsynaptic density            1374
Stress granule                  1334
P-body                           823
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
MORC3-NBs                          1
granular component                 1
TIS granule                        1
extracellular matrix               1
Name: mlo, Length: 113, dtype: int64

### Paraspeckle

In [30]:
(database_entrada.mlo == 'Paraspeckle').sum()

101

In [31]:
# Unify paraspeckle with Paraspeckle
database_entrada.replace('paraspeckle', 'Paraspeckle', inplace= True)
(database_entrada.mlo == 'Paraspeckle').sum()

104

### Sam68

In [32]:
(database_entrada.mlo == 'Sam68 nuclear bodies').sum()

11

In [33]:
(database_entrada.mlo == 'Sam68 nuclear bodies (SNBs)').sum()

2

In [34]:
(database_entrada.mlo == 'Sam68 nuclear body').sum()

12

In [35]:
database_entrada.replace(['Sam68 nuclear bodies', 'Sam68 nuclear bodies (SNBs)'], 'Sam68 nuclear body', inplace= True)
(database_entrada.mlo == 'Sam68 nuclear body').sum()

25

### PML body  
**PhaSepDB**: The PML bodies are dynamic nuclear protein aggregates interspersed between chromatin. These punctate nuclear structures are call PML bodies because the PML gene is essential for their formation. are present in most mammalian cell nuclei and typically number 1 to 30 bodies per nucleus.  
**DrLLPS**: PML nuclear bodies are annotetad in the nucleus. They are matrix-associated domains that recruit an astonishing variety of seemingly unrelated proteins.

In [36]:
(database_entrada.mlo == 'PML nuclear body').sum()

97

In [37]:
(database_entrada.mlo == 'PML body').sum()

77

In [38]:
database_entrada.replace('PML body', 'PML nuclear body', inplace= True)
(database_entrada.mlo == 'PML nuclear body').sum()

174

### Polycomb body

In [39]:
(database_entrada.mlo == 'Polycomb bodies').sum()

2

In [40]:
database_entrada.replace('Polycomb bodies', 'Polycomb body', inplace= True)
(database_entrada.mlo == 'Polycomb body').sum()

4

### Pre and postsynaptic density

In [41]:
database_entrada.replace('Pre and postsynaptic densities', 'Pre and postsynaptic density', inplace= True)
(database_entrada.mlo == 'Pre and postsynaptic density').sum()

8

### Nuclear speckle

In [42]:
(database_entrada.mlo == 'Nucleus speckles').sum() #phasepdb

115

In [43]:
(database_entrada.mlo == 'Nuclear speckle').sum() # drllps

110

In [44]:
(database_entrada.mlo == 'Nuclear speckles').sum() #phasepdb

24

In [45]:
(database_entrada.mlo == 'nuclear speckle').sum()

3

In [46]:
database_entrada.replace(['Nucleus speckles', 'Nuclear speckles', 'nuclear speckle'], 'Nuclear speckle', inplace= True)
(database_entrada.mlo == 'Nuclear speckle').sum()

252

### Heterochromatin

In [47]:
(database_entrada.mlo == 'heterochromatin').sum()

2

In [48]:
database_entrada.replace('heterochromatin', 'Heterochromatin', inplace= True)
(database_entrada.mlo == 'Heterochromatin').sum()

3

### Cytoplasmic ribonucleoprotein granule

In [49]:
(database_entrada.mlo == 'cytoplasmic ribonucleoprotein granule').sum()


5

In [50]:
database_entrada.replace('cytoplasmic ribonucleoprotein granule', 'Cytoplasmic ribonucleoprotein granule', inplace= True)

### Membrane cluster

In [51]:
(database_entrada.mlo == 'Membrane clusters').sum()

4

In [52]:
(database_entrada.mlo == 'membrane cluster').sum()

3

In [53]:
database_entrada.replace(['Membrane clusters', 'membrane cluster'], 'Membrane cluster', inplace= True)
(database_entrada.mlo == 'Membrane cluster').sum()

7

### Nuclear body

In [54]:
database_entrada.replace('nuclear body', 'Nuclear body', inplace= True)
(database_entrada.mlo == 'Nuclear body').sum()

11

### Nucleolus

In [55]:
database_entrada.replace('nucleolus', 'Nucleolus', inplace= True)
(database_entrada.mlo == 'Nucleolus').sum()

2064

In [56]:
(database_entrada.mlo == 'Centrosome/Spindle pole body').sum() # keep this annotation

534

## OK, now mlo table  
cols: id_mlo, mlo

In [57]:
database_entrada.mlo.value_counts()

Nucleolus                       2064
Postsynaptic density            1374
Stress granule                  1334
P-body                           823
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
liquid-like DYRK3 speckles         1
MORC3-NBs                          1
granular component                 1
extracellular matrix               1
Name: mlo, Length: 99, dtype: int64

In [66]:
# EXPLODE:
# P-body, Stress granule
# P-body, GW body
# Set mlo col as list-like and explode() to separate list elements into separate rows
# before: 8266 rows
database_entrada = database_entrada.assign(mlo= database_entrada.mlo.str.split(',')).explode('mlo')
database_entrada.mlo = database_entrada.mlo.str.strip()
database_entrada
# after: 8272 rows

Unnamed: 0,uniprot_acc,mlo,rol,db
0,P35637,cytoplasmic stress granule,driver,phasepro
1,P35637,Cytoplasmic ribonucleoprotein granule,driver,phasepro
2,Q06787,cytoplasmic stress granule,driver,phasepro
3,Q06787,Cytoplasmic ribonucleoprotein granule,driver,phasepro
4,Q06787,synaptosome,driver,phasepro
...,...,...,...,...
8377,O95670,Nucleolus,client,drllps
8378,O95670,Postsynaptic density,client,drllps
8379,Q9H269,Postsynaptic density,client,drllps
8380,Q9Y3D7,Postsynaptic density,client,drllps


In [306]:
# GW-body
entrada_dbs.replace('GW body', 'GW-body', inplace= True)
(entrada_dbs.mlo == 'GW-body').sum()

3

In [58]:
# Postsynaptic density
database_entrada.replace('postsynaptic density', 'Postsynaptic density', inplace= True)
(database_entrada.mlo == 'Postsynaptic density').sum()

1375

In [59]:
# Cytoplasmic ribonucleoprotein granule
database_entrada.replace('cytoplasmic ribonucleoprotein granule', 'Cytoplasmic ribonucleoprotein granule', inplace= True)
(database_entrada.mlo == 'Cytoplasmic ribonucleoprotein granule').sum()

6

In [60]:
# Histone locus body
database_entrada.replace('Histone Locus body', 'Histone locus body', inplace= True)
(database_entrada.mlo == 'Histone locus body').sum()

16

In [61]:
# Stress granule
database_entrada.replace('Sress granule', 'Stress granule', inplace= True)
(database_entrada.mlo == 'Stress granule').sum()

1335

In [67]:
database_entrada.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8188 entries, 0 to 8381
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uniprot_acc  8188 non-null   object
 1   mlo          7367 non-null   object
 2   rol          8188 non-null   object
 3   db           8188 non-null   object
dtypes: object(4)
memory usage: 319.8+ KB


In [68]:
database_entrada.mlo.value_counts()

Nucleolus                       2064
Postsynaptic density            1375
Stress granule                  1338
P-body                           827
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
neuron projection                  1
MORC3-NBs                          1
TIS granule                        1
extracellular matrix               1
Name: mlo, Length: 97, dtype: int64

In [72]:
database_entrada.mlo.replace(np.nan, 'nan', inplace= True)

In [73]:
database_entrada.mlo.value_counts()

Nucleolus               2064
Postsynaptic density    1375
Stress granule          1338
P-body                   827
nan                      821
                        ... 
inclusion body             1
neuron projection          1
MORC3-NBs                  1
TIS granule                1
extracellular matrix       1
Name: mlo, Length: 98, dtype: int64

In [74]:
mlo = pd.DataFrame({'mlo': database_entrada.mlo.value_counts().index, 'id_mlo': range(1, len(database_entrada.mlo.unique())+1)})
mlo

Unnamed: 0,mlo,id_mlo
0,Nucleolus,1
1,Postsynaptic density,2
2,Stress granule,3
3,P-body,4
4,,5
...,...,...
93,inclusion body,94
94,neuron projection,95
95,MORC3-NBs,96
96,TIS granule,97


In [75]:
mlo.to_csv('db_tables/mlo.tsv', sep='\t', index= False)

## protein_has_mlo  
cols: id_protein, id_mlo, id_rol, id_database

In [76]:
len(database_entrada.uniprot_acc.unique())

4368

In [77]:
protein_has_mlo = database_entrada.copy()

In [78]:
protein_has_mlo.head()

Unnamed: 0,uniprot_acc,mlo,rol,db
0,P35637,cytoplasmic stress granule,driver,phasepro
1,P35637,Cytoplasmic ribonucleoprotein granule,driver,phasepro
2,Q06787,cytoplasmic stress granule,driver,phasepro
3,Q06787,Cytoplasmic ribonucleoprotein granule,driver,phasepro
4,Q06787,synaptosome,driver,phasepro


In [79]:
# Add id_protein
protein_has_mlo = protein_has_mlo.merge(id_protein)
# Add id_mlo
protein_has_mlo = protein_has_mlo.merge(mlo)
# Add id_rol and id_database
protein_has_mlo = protein_has_mlo.merge(rol)
protein_has_mlo = protein_has_mlo.rename(columns={'db': 'database'}).merge(database).sort_values('id_protein')

In [80]:
protein_has_mlo.drop(columns=['uniprot_acc', 'mlo', 'rol', 'database'], inplace= True)
protein_has_mlo

Unnamed: 0,id_protein,id_mlo,id_rol,id_database
8187,1,5,3,2
567,2,3,3,1
1718,3,4,1,1
5183,3,4,2,3
1912,4,4,1,1
...,...,...,...,...
1946,4366,4,1,1
5083,4367,19,1,1
2873,4368,1,1,1
1935,4368,4,1,1


In [83]:
# protein_has_mlo[protein_has_mlo.duplicated()] # Ver

Unnamed: 0,id_protein,id_mlo,id_rol,id_database
5656,1183,3,2,4
5831,2059,18,2,4


In [85]:
protein_has_mlo.to_csv('db_tables/protein_has_mlo.tsv', sep='\t', index= False)

In [131]:
#aa = entrada_dbs[['uniprot_acc', 'rol']].drop_duplicates().groupby('uniprot_acc').size().reset_index(name='counts')

In [134]:
#aa['counts'].unique().tolist()

[1, 2, 3, 4]

In [135]:
#aa[aa.counts == 4].head()

Unnamed: 0,uniprot_acc,counts
733,P06748,4
1160,P29590,4
1196,P31483,4
1286,P38432,4
1353,P43243,4


In [139]:
#entrada_dbs[entrada_dbs.uniprot_acc == 'P06748'] # es el rol en la db

Unnamed: 0,uniprot_acc,organism,mlo,rol,db
32,P06748,Homo sapiens,Nucleolus,driver,phasepro
33,P06748,Homo sapiens,granular component,driver,phasepro
177,P06748,Homo sapiens,Nucleolus,component,phasepdb_rev
401,P06748,Homo sapiens,null_phasepdb_rev,component,phasepdb_rev
1384,P06748,Homo sapiens,Nucleolus,client,phasepdb_ht
1696,P06748,Homo sapiens,Nucleolus,client,phasepdb_ht
2398,P06748,Homo sapiens,Stress granule,client,phasepdb_ht
3076,P06748,Homo sapiens,null_phasepdb_ht,regulator,phasepdb_ht
3386,P06748,Homo sapiens,Droplet,driver,drllps
3387,P06748,Homo sapiens,Nucleolus,driver,drllps
