In [348]:
#%pip install cython
#%pip install pyranges

In [349]:
import pandas as pd
import numpy as np
import pyranges as pr

In [350]:
# Proteins from each LLPS database with their roles, mlos and dataset
database_entrada = pd.read_csv('database_entrada.csv')
database_entrada.drop(columns='organism', inplace= True)
database_entrada.drop_duplicates(inplace = True)
database_entrada.rename(columns= {'uniprot': 'uniprot_acc'}, inplace= True)
database_entrada.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8182 entries, 0 to 8381
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uniprot_acc  8182 non-null   object
 1   mlo          7361 non-null   object
 2   rol          8182 non-null   object
 3   db           8182 non-null   object
dtypes: object(4)
memory usage: 319.6+ KB


# Load proteins, mutations, domains and regions tables

## Load the others tables

In [351]:
# protein table for our db. Same above but one protein by row
protein = pd.read_csv('db_tables/protein.tsv', sep='\t')

In [352]:
# DataFrame with unique id_protein col
id_protein = protein[['id_protein', 'uniprot_acc']].copy()

In [353]:
# only clinvar mutations at the moment
mutations = pd.read_csv('../datasets/mutations.tsv.gz', sep='\t', compression='gzip') # comes from parse_clinvar.py

  interactivity=interactivity, compiler=compiler, result=result)


In [354]:
disorder = pd.read_csv('disorder_lite.csv').rename(columns={'uniprot': 'uniprot_acc'})
low_complexity = pd.read_csv('low_complexity.csv').rename(columns={'uniprot': 'uniprot_acc'})

In [355]:
pfam = pd.read_csv('pfam.csv').rename(columns={'uniprot': 'uniprot_acc', 'tipo': 'pfam_name'})
pfam_map = pd.read_csv('pfam_map.csv')

In [356]:
pfam_map.duplicated().any()

False

In [357]:
pfam = pfam.merge(pfam_map)

In [358]:
pfam.duplicated().any()

False

In [359]:
# Add and unique integer ID fow low_complexity and disorder
low_complexity['id_lc'] = range(1, len(low_complexity)+1)
disorder['id_idr'] = range(1, len(disorder)+1)

In [360]:
# PMIDs
pmid = pd.read_csv('var_citations.txt', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


---  
# consequence table

In [361]:
cf = mutations.consequence.value_counts()
cf

missense       210190
frameshit       28230
nonsense        16365
deletion         5414
insertion        1868
duplication      1384
delins           1104
Name: consequence, dtype: int64

In [362]:
consequence = pd.DataFrame({'id_consequence': range(1, len(cf)+1), 'consequence': cf.index})
consequence

Unnamed: 0,id_consequence,consequence
0,1,missense
1,2,frameshit
2,3,nonsense
3,4,deletion
4,5,insertion
5,6,duplication
6,7,delins


In [363]:
consequence.to_csv('db_tables/consequence.tsv', sep='\t', index = False)

---  
# mutation table  
cols: *id_mutation, snp_id, chromosome, start_genomic, end_genomic, start_aa, end_aa, from_aa, to_aa, id_source, id_protein, nt_change*

In [364]:
mutations.columns

Index(['snpid', 'chromosome', 'start', 'stop', 'type', 'cambio', 'cambio_nt',
       'id_protein', 'uniprot_acc', 'nuccore_id', 'start_aa', 'end_aa', 'from',
       'to', 'consequence', 'id_mutation'],
      dtype='object')

In [365]:
# Subset by cols to keep for mutation db table
mutation = mutations[['id_protein', 'id_mutation', 'snpid', 'chromosome', 'start', 'stop', 'start_aa', 'end_aa', 'from', 'to', 'consequence', 'cambio_nt']].copy()
mutation.rename(columns={'snpid': 'snp_id', 'start': 'start_genomic', 'stop': 'end_genomic', 'from': 'from_aa', 'to': 'to_aa', 'cambio_nt': 'nt_change'}, inplace= True)

In [366]:
mutation.columns

Index(['id_protein', 'id_mutation', 'snp_id', 'chromosome', 'start_genomic',
       'end_genomic', 'start_aa', 'end_aa', 'from_aa', 'to_aa', 'consequence',
       'nt_change'],
      dtype='object')

In [367]:
mutation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264555 entries, 0 to 264554
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id_protein     264555 non-null  int64  
 1   id_mutation    264555 non-null  int64  
 2   snp_id         209437 non-null  float64
 3   chromosome     264555 non-null  object 
 4   start_genomic  264555 non-null  int64  
 5   end_genomic    264555 non-null  int64  
 6   start_aa       264555 non-null  int64  
 7   end_aa         264555 non-null  int64  
 8   from_aa        264545 non-null  object 
 9   to_aa          229506 non-null  object 
 10  consequence    264555 non-null  object 
 11  nt_change      264555 non-null  object 
dtypes: float64(1), int64(6), object(5)
memory usage: 24.2+ MB


In [368]:
# Add IDs from consequence
mutation = mutation.merge(consequence)
mutation.drop(columns='consequence', inplace= True)

In [369]:
def format_snp(df, column):
    '''
    format an int snps column in a DataFrame containing -1 values.
    Returns: the snp column in str format ('rs1580653772' or 'nan')
    '''
    #a = df.column.replace(-1, 'nan')
    a = df[column]
    #a = a.apply(str)
    a = a.map(lambda x: 'rs' + str(int(x)) if not np.isnan(x) else x)
    df[column] = a

In [370]:
# Format snp_id col
format_snp(mutation, 'snp_id')

In [371]:
mutation.head()

Unnamed: 0,id_protein,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,nt_change,id_consequence
0,2290,1,rs558080743,2,61826480,61826482,709,709,E,,2121_2123AGA[1],4
1,2290,2,,2,61853886,61853888,52,52,E,,148GAG[2],4
2,233,3,rs794727435,19,36102785,36102787,1091,1091,H,,3271_3273del,4
3,233,4,rs1064797236,19,36067889,36067891,255,255,F,,761_763TCT[1],4
4,233,5,rs764610550,19,36090471,36090473,664,664,K,,1987_1989AAG[1],4


In [372]:
mutation = mutation[['id_mutation', 'snp_id', 'chromosome', 'start_genomic', 'end_genomic', 'start_aa','end_aa',
                    'from_aa', 'to_aa', 'id_protein', 'id_consequence', 'nt_change']].sort_values('id_mutation')
mutation

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence,nt_change
0,1,rs558080743,2,61826480,61826482,709,709,E,,2290,4,2121_2123AGA[1]
1,2,,2,61853886,61853888,52,52,E,,2290,4,148GAG[2]
2,3,rs794727435,19,36102785,36102787,1091,1091,H,,233,4,3271_3273del
3,4,rs1064797236,19,36067889,36067891,255,255,F,,233,4,761_763TCT[1]
4,5,rs764610550,19,36090471,36090473,664,664,K,,233,4,1987_1989AAG[1]
...,...,...,...,...,...,...,...,...,...,...,...,...
264550,264551,,1,35822917,35822917,81,81,Q,*,3658,3,241C>T
264551,264552,,6,36871796,36871796,45,45,R,*,4162,3,133C>T
264552,264553,,1,156309285,156309285,518,518,R,*,1342,3,1552C>T
264553,264554,,1,185301020,185301020,358,358,R,*,4266,3,1072C>T


In [373]:
mutation.chromosome = mutation.chromosome.apply(str)

In [374]:
mutation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264555 entries, 0 to 264554
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id_mutation     264555 non-null  int64 
 1   snp_id          209437 non-null  object
 2   chromosome      264555 non-null  object
 3   start_genomic   264555 non-null  int64 
 4   end_genomic     264555 non-null  int64 
 5   start_aa        264555 non-null  int64 
 6   end_aa          264555 non-null  int64 
 7   from_aa         264545 non-null  object
 8   to_aa           229506 non-null  object
 9   id_protein      264555 non-null  int64 
 10  id_consequence  264555 non-null  int64 
 11  nt_change       264555 non-null  object
dtypes: int64(7), object(5)
memory usage: 26.2+ MB


In [375]:
mutation.duplicated().any()

False

In [376]:
mutation.nt_change.str.len().max()

292

In [377]:
mutation.to_csv('db_tables/mutation.tsv', sep='\t', index = False)

## mutation_has_citation

---
## Para asignar los rangos debo tener:  
- Tabla de mutaciones con id_mutation, *id_protein(Chromosome), start_aa(Start), end_aa(End)*  
- Tablas de lc, idr y pfam con id unico 

---  
# Pfam Tables

In [378]:
pfam.head()

Unnamed: 0,uniprot_acc,pfam_name,start,end,pfam_acc
0,O94910,7tm_2,857,1093,PF00002
1,Q9HAR2,7tm_2,861,1097,PF00002
2,O14514,7tm_2,944,1180,PF00002
3,O75899,7tm_3,475,743,PF00003
4,Q9NZH0,7tm_3,49,291,PF00003


In [379]:
len(pfam.pfam_name.unique())

2939

In [380]:
len(pfam.pfam_acc.unique())

2939

In [381]:
pfam[['pfam_name', 'pfam_acc']].drop_duplicates()

Unnamed: 0,pfam_name,pfam_acc
0,7tm_2,PF00002
3,7tm_3,PF00003
9,ATP-synt_ab,PF00006
13,GTP_EFTU,PF00009
24,HLH,PF00010
...,...,...
10837,PhoLip_ATPase_C,PF16212
10840,HIP1_clath_bdg,PF16515
10842,DAO_C,PF16901
10843,Armet,PF10208


## pfam_domain  
cols: pfam_id, pfam_domain, por ej: PF00003 7tm_3

In [382]:
# Array with unique pfam domains
pf_domain = pfam.pfam_name.unique() # unique pfam domains (2939 for this set of proteins)

In [383]:
pfam_domain = pfam[['pfam_name', 'pfam_acc']].drop_duplicates()
pfam_domain.rename(columns={'pfam_acc': 'id_pfam', 'pfam_name': 'pfam_domain'}, inplace= True)

In [384]:
pfam_domain.to_csv('db_tables/pfam_domain.tsv', sep='\t', index= False)

## protein_has_pfam_domain  
cols: id_protein, id_pfam, start, end, length

In [385]:
protein_has_pfam_domain = pfam.merge(id_protein) # agregar col id_protein
protein_has_pfam_domain['length'] = protein_has_pfam_domain.end - protein_has_pfam_domain.start + 1 # col length

In [386]:
protein_has_pfam_domain

Unnamed: 0,uniprot_acc,pfam_name,start,end,pfam_acc,id_protein,length
0,O94910,7tm_2,857,1093,PF00002,477,237
1,O94910,GPS,800,844,PF01825,477,45
2,O94910,Gal_Lectin,48,128,PF02140,477,81
3,O94910,OLF,144,396,PF02191,477,253
4,O94910,Latrophilin,1113,1474,PF02354,477,362
...,...,...,...,...,...,...,...
10862,O95147,DSPc,34,164,PF00782,491,131
10863,Q9NRW4,DSPc,12,141,PF00782,3737,130
10864,O75319,DSPc,124,249,PF00782,363,126
10865,P51452,DSPc,37,176,PF00782,1413,140


In [387]:
protein_has_pfam_domain.drop(columns='pfam_name', inplace= True)
protein_has_pfam_domain = protein_has_pfam_domain.merge(pfam) # to add the col pfam_id

In [388]:
protein_has_pfam_domain.rename(columns={'pfam_acc': 'id_pfam'}, inplace= True)

In [389]:
protein_has_pfam_domain = protein_has_pfam_domain[['id_protein', 'id_pfam', 'start', 'end', 'length']].sort_values('id_protein')
protein_has_pfam_domain

Unnamed: 0,id_protein,id_pfam,start,end,length
3996,1,PF00293,17,144,128
7893,2,PF02320,28,91,64
2276,4,PF00076,153,220,68
2275,4,PF00076,248,312,65
2274,4,PF00076,73,141,69
...,...,...,...,...,...
531,4365,PF00008,647,678,32
536,4365,PF02210,1122,1241,120
9993,4367,PF13923,160,199,40
9994,4367,PF16207,305,369,65


In [390]:
protein_has_pfam_domain.duplicated().any()

False

In [391]:
protein_has_pfam_domain.to_csv('db_tables/protein_has_pfam_domain.tsv', sep='\t', index= False)

## mutation_has_pfam_domain  
cols: id_mutation, id_protein, id_pfam, start, end

### Pyranges  
columnas obligatorias: *Chromosome	 Start	End*  
Chromosome: id_protein    
otras columnas con ids son opcionales y cualquier nombre  
  
por ejemplo df seria tabla de mutaciones  
df = pr.PyRanges(df.rename(columns={'chromosome':'Chromosome','start_position':'Start','end_position':'End'}))  
  
df = pyrange de mutaciones (columnas: Chromosome, Start, End, id_mutacion)  
low_c = pyrange de low complexity(columnas: Chromosome, Start, End, id_low, id_proteina)  
data = df.join(low_c, strandedness=False, slack=1).drop(like="_b") # mutaciones lo junto con low_complex  
strandedness=False no tener en cuenta el Strand  
slack=1 coincidir los extremos. Importante  
drop(like="_b") eliminar el Chromosome, Start, End de low_c (en pfam no hacer el drop)  
data = data.df[[Chromosome, Start, End, id_mutacion, id_low, id_proteina]] # pasa de pyrange a dataframe

In [392]:
mutation.head()

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence,nt_change
0,1,rs558080743,2,61826480,61826482,709,709,E,,2290,4,2121_2123AGA[1]
1,2,,2,61853886,61853888,52,52,E,,2290,4,148GAG[2]
2,3,rs794727435,19,36102785,36102787,1091,1091,H,,233,4,3271_3273del
3,4,rs1064797236,19,36067889,36067891,255,255,F,,233,4,761_763TCT[1]
4,5,rs764610550,19,36090471,36090473,664,664,K,,233,4,1987_1989AAG[1]


In [394]:
pfam.columns

Index(['uniprot_acc', 'pfam_name', 'start', 'end', 'pfam_acc'], dtype='object')

In [395]:
# df has pfam domains data
df = pfam.rename(columns={'pfam_name': 'pfam_domain'}).merge(pfam_domain)
df = df.merge(id_protein)                      # mapping uniprot_acc - id_protein
df.drop(columns='uniprot_acc', inplace= True)

In [396]:
df.head()

Unnamed: 0,pfam_domain,start,end,pfam_acc,id_pfam,id_protein
0,7tm_2,857,1093,PF00002,PF00002,477
1,GPS,800,844,PF01825,PF01825,477
2,Gal_Lectin,48,128,PF02140,PF02140,477
3,OLF,144,396,PF02191,PF02191,477
4,Latrophilin,1113,1474,PF02354,PF02354,477


In [398]:
df.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)

In [399]:
df.head()

Unnamed: 0,pfam_domain,Start,End,pfam_acc,id_pfam,Chromosome
0,7tm_2,857,1093,PF00002,PF00002,477
1,GPS,800,844,PF01825,PF01825,477
2,Gal_Lectin,48,128,PF02140,PF02140,477
3,OLF,144,396,PF02191,PF02191,477
4,Latrophilin,1113,1474,PF02354,PF02354,477


In [400]:
# Create the pyranges object of pfam domains
df_py = pr.PyRanges(df)

In [401]:
aux = mutation[['start_aa', 'end_aa', 'id_mutation', 'id_protein']].copy()
aux.rename(columns={'id_protein': 'Chromosome', 'start_aa': 'Start', 'end_aa': 'End'}, inplace= True)

In [402]:
aux.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,709,709,1,2290
1,52,52,2,2290
2,1091,1091,3,233
3,255,255,4,233
4,664,664,5,233


In [403]:
# Pyranges object of mutations
aux_py = pr.PyRanges(aux)

In [404]:
# Join both pyranges object: this assings mutations to pfam domains
pfam_py = df_py.join(aux_py, strandedness= False, slack= 1)  # strandedness= False doesnt take count of the chain strand; slack= 1 include bounds

In [405]:
pfam_py.head() # Start and End are from the pfam domain in that protein (a protein may have the same pfam domain repeated at different positions along its sequence).
                # Start_b and End_b are from the mutation in this case

Unnamed: 0,pfam_domain,Start,End,pfam_acc,id_pfam,Chromosome,Start_b,End_b,id_mutation
0,UCR_hinge,28,91,PF02320,PF02320,2,53,53,246674
1,An_peroxidase,727,1272,PF03098,PF03098,9,981,981,244469
2,An_peroxidase,727,1272,PF03098,PF03098,9,1039,1039,244475
3,An_peroxidase,727,1272,PF03098,PF03098,9,1133,1133,244470
4,An_peroxidase,727,1272,PF03098,PF03098,9,1207,1207,244474
5,LRR_8,50,110,PF13855,PF13855,9,65,65,244477
6,Ig_3,329,402,PF13927,PF13927,9,391,391,244481
7,I-set,511,597,PF07679,PF07679,9,538,538,244478


In [406]:
# Pyranges to DataFrame
mutation_has_pfam_domain = pfam_py.df[['id_mutation', 'Chromosome', 'id_pfam', 'Start', 'End']] # cols to keep

In [407]:
mutation_has_pfam_domain.rename(columns={'Chromosome': 'id_protein', 'Start': 'start', 'End': 'end'}, inplace= True)

In [408]:
mutation_has_pfam_domain.head()

Unnamed: 0,id_mutation,id_protein,id_pfam,start,end
0,246674,2,PF02320,28,91
1,244469,9,PF03098,727,1272
2,244475,9,PF03098,727,1272
3,244470,9,PF03098,727,1272
4,244474,9,PF03098,727,1272


In [409]:
# control
mutation[mutation.id_mutation == 23987]

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence,nt_change
23986,23987,rs1562846257,7,92493045,92493045,1039,1039,T,,281,2,3115del


In [410]:
# control
pfam_domain[pfam_domain.id_pfam == 814] 

Unnamed: 0,pfam_domain,id_pfam


In [411]:
mutation_has_pfam_domain[mutation_has_pfam_domain.id_pfam == 814] # ok!

Unnamed: 0,id_mutation,id_protein,id_pfam,start,end


In [412]:
mutation_has_pfam_domain.to_csv('db_tables/mutation_has_pfam_domain.tsv', sep='\t', index= False)

---  
# low-complexity Tables

## low_complexity  
cols: id_lc, start, end, length, id_protein

In [413]:
low_complexity.head()

Unnamed: 0,uniprot_acc,start,end,id_lc
0,P61981,236,243,1
1,P31947,235,247,2
2,P31947,248,247,3
3,P27348,230,244,4
4,P27348,245,244,5


In [414]:
# Add length col 
low_complexity['length'] = low_complexity.end - low_complexity.start + 1 

In [415]:
# Add id_proteins
low_complexity.rename(columns={'uniprot': 'uniprot_acc'}, inplace= True)
low_complexity = low_complexity.merge(id_protein)
low_complexity.drop(columns='uniprot_acc', inplace= True)

In [416]:
low_complexity.head()

Unnamed: 0,start,end,id_lc,length,id_protein
0,236,243,1,8,1602
1,235,247,2,13,1132
2,248,247,3,0,1132
3,230,244,4,15,1049
4,245,244,5,0,1049


In [417]:
low_complexity.to_csv('db_tables/low_complexity.tsv', sep='\t', index= False)

## mutation_has_low_complexity  
cols: id_mutation, id_lc

In [418]:
# Table for LC data
lc_has = low_complexity.copy()
lc_has.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)

In [419]:
# Auxiliar table for mutations
aux_lc = mutation[['start_aa', 'end_aa', 'id_mutation', 'id_protein']].copy()
aux_lc.rename(columns={'id_protein': 'Chromosome', 'start_aa': 'Start', 'end_aa': 'End'}, inplace= True)

In [420]:
aux_lc.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,709,709,1,2290
1,52,52,2,2290
2,1091,1091,3,233
3,255,255,4,233
4,664,664,5,233


In [421]:
# Create the Pyranges objects
lc_has_py = pr.PyRanges(lc_has)
aux_lc_py = pr.PyRanges(aux_lc)

In [422]:
# Join both pyranges object: this assings mutations to low-complexity regions
lc_py = aux_lc_py.join(lc_has_py, strandedness= False, slack=1).drop(like="_b") # strandedness= False doesnt take count of the chain strand;
                                                                                # slack= 1 include bounds; drop(like="_b"): delete those cols (redudants)

In [423]:
lc_py.head()

Unnamed: 0,Start,End,id_mutation,Chromosome,id_lc,length
0,1133,1133,244470,9,5240,12
1,287,287,36058,16,9733,28
2,197,197,36059,16,9732,29
3,336,336,247177,17,5023,26
4,1038,1038,244588,18,11903,20
5,1469,1469,242383,19,10455,18
6,5,5,242248,23,5786,14
7,7,7,242256,23,5786,14


In [424]:
# Pyrange to DataFrame
mutation_has_low_complexity = lc_py.df[['id_mutation', 'id_lc']] # cols to keep

In [425]:
mutation_has_low_complexity.head()

Unnamed: 0,id_mutation,id_lc
0,244470,5240
1,36058,9733
2,36059,9732
3,247177,5023
4,244588,11903


In [426]:
# Control
low_complexity[low_complexity.id_lc == 5240]

Unnamed: 0,start,end,id_lc,length,id_protein
5239,1128,1139,5240,12,9


In [427]:
protein.iloc[8]

id_protein                                                          9
uniprot_acc                                                    A1KZ92
hgnc_id                                                    HGNC:26359
gene_id                                                      137902.0
gene_name                                                       PXDNL
length                                                           1463
sequence            MEPRLFCWTTLFLLAGWCLPGLPCPSRCLCFKSTVRCMHLMLDHIP...
disorder_content                                                  NaN
Name: 8, dtype: object

In [428]:
mutation[mutation.id_mutation == 23989] # It's allright! Mutation in aa 1133, which belongs to the low-complexity region between 1128 - 1139 in that protein

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence,nt_change
23988,23989,rs766947924,7,92501622,92501622,823,823,P,,281,2,2468del


In [429]:
mutation_has_low_complexity.to_csv('db_tables/mutation_has_low_complexity.tsv', sep='\t', index= False)

---  
# Disorder Tables

## disorder_region  
cols: id_idr, start, end, length, id_protein

In [430]:
# Add length col 
disorder['length'] = disorder.end - disorder.start + 1 

In [431]:
disorder_region = disorder.rename(columns={'uniprot': 'uniprot_acc'}).merge(id_protein).sort_values('id_protein')
disorder_region.drop(columns='uniprot_acc', inplace= True)
disorder_region.head()

Unnamed: 0,start,end,id_idr,length,id_protein
2338,1,30,2339,30,2
1426,1,68,1427,68,3
2489,1,25,2490,25,4
5782,660,754,5783,95,5
5781,1,103,5782,103,5


In [432]:
disorder_region.to_csv('db_tables/disorder_region.tsv', sep='\t', index= False)

## mutation_has_disorder_region  
cols: id_mutation, id_idr

In [433]:
# Auxiliar table for mutations from low-complexity is the same for disorder. id-protein, start and end of the mutation
aux_idr = aux_lc
aux_idr.head()

Unnamed: 0,Start,End,id_mutation,Chromosome
0,709,709,1,2290
1,52,52,2,2290
2,1091,1091,3,233
3,255,255,4,233
4,664,664,5,233


In [434]:
# Table for IDRs data
idr_has = disorder_region.copy()
idr_has.rename(columns={'id_protein': 'Chromosome', 'start': 'Start', 'end': 'End'}, inplace= True)
idr_has.head()

Unnamed: 0,Start,End,id_idr,length,Chromosome
2338,1,30,2339,30,2
1426,1,68,1427,68,3
2489,1,25,2490,25,4
5782,660,754,5783,95,5
5781,1,103,5782,103,5


In [435]:
# Create the Pyranges objects
idr_has_py = pr.PyRanges(idr_has)
aux_idr_py = pr.PyRanges(aux_idr)

In [436]:
# Join both pyranges object: this assings mutations to pfam domains
idr_py = aux_idr_py.join(idr_has_py, strandedness= False, slack=1).drop(like="_b") # strandedness= False doesnt take count of the chain strand;
                                                                                   # slack= 1 include bounds; drop(like="_b"): delete those cols (redudants)

In [437]:
idr_py.head()

Unnamed: 0,Start,End,id_mutation,Chromosome,id_idr,length
0,70,70,247855,12,4339,28
1,287,287,36058,16,4224,129
2,336,336,247177,17,2225,157
3,1014,1014,247178,17,2227,49
4,1371,1371,4949,19,4637,76
5,1180,1180,242367,19,4635,293
6,2580,2580,242369,19,4645,65
7,213,213,242374,19,4630,49


In [438]:
# Pyrange to DataFrame
mutation_has_disorder_region = idr_py.df[['id_mutation', 'id_idr']] # cols to keep
mutation_has_disorder_region.head()

Unnamed: 0,id_mutation,id_idr
0,247855,4339
1,36058,4224
2,247177,2225
3,247178,2227
4,4949,4637


In [439]:
# Control
mutation[mutation.id_mutation == 24001]

Unnamed: 0,id_mutation,snp_id,chromosome,start_genomic,end_genomic,start_aa,end_aa,from_aa,to_aa,id_protein,id_consequence,nt_change
24000,24001,,7,92522126,92522126,83,83,K,,281,2,249del


In [440]:
id_protein[id_protein.id_protein == 12]

Unnamed: 0,id_protein,uniprot_acc
11,12,A2A288


In [441]:
disorder[disorder.id_idr == 4339] # It's Ok. A point mutation in position 70 in the idr region between 48-75

Unnamed: 0,uniprot_acc,start,end,id_idr,length
4338,A2A288,48,75,4339,28


In [442]:
mutation_has_disorder_region.to_csv('db_tables/mutation_has_disorder_region.tsv', sep='\t', index= False)

---   
# Rol table  
cols: id_rol, rol

In [443]:
database_entrada.rol.unique()

array(['driver', 'unassigned', 'regulator', 'client'], dtype=object)

In [444]:
database_entrada.rol.value_counts()

client        4138
unassigned    2272
regulator     1395
driver         377
Name: rol, dtype: int64

In [445]:
rol = pd.DataFrame({'rol': database_entrada.rol.value_counts().index, 'id_rol': range(1, len(database_entrada.rol.value_counts())+1)})
rol

Unnamed: 0,rol,id_rol
0,client,1
1,unassigned,2
2,regulator,3
3,driver,4


In [446]:
rol.to_csv('db_tables/rol.tsv', sep='\t', index= False)

---  
# dataset table  
cols: id_dataset, dataset

In [447]:
database_entrada.db.value_counts()

drllps              5034
phasepdb_ht         2346
phasepdb_uniprot     384
phasepdb_rev         297
phasepro             121
Name: db, dtype: int64

In [448]:
database = pd.DataFrame({'dataset': database_entrada.db.value_counts().index, 'id_dataset': range(1, len(database_entrada.db.value_counts())+1)})
database

Unnamed: 0,dataset,id_dataset
0,drllps,1
1,phasepdb_ht,2
2,phasepdb_uniprot,3
3,phasepdb_rev,4
4,phasepro,5


In [449]:
database.to_csv('db_tables/dataset.tsv', sep='\t', index= False)

---  
# MLOs tables  

In [450]:
database_entrada.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8182 entries, 0 to 8381
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uniprot_acc  8182 non-null   object
 1   mlo          7361 non-null   object
 2   rol          8182 non-null   object
 3   db           8182 non-null   object
dtypes: object(4)
memory usage: 319.6+ KB


In [451]:
database_entrada.mlo.value_counts()

Nucleolus                       2063
Postsynaptic density            1374
Stress granule                  1334
P-body                           823
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
MORC3-NBs                          1
granular component                 1
TIS granule                        1
extracellular matrix               1
Name: mlo, Length: 113, dtype: int64

In [452]:
len(database_entrada.uniprot_acc.unique())

4368

## Deal with mlos annotations

In [453]:
database_entrada.mlo = database_entrada.mlo.str.strip()

In [454]:
len(database_entrada.mlo.unique()) # no blank spaces

114

In [455]:
database_entrada.mlo.value_counts()

Nucleolus                       2063
Postsynaptic density            1374
Stress granule                  1334
P-body                           823
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
MORC3-NBs                          1
granular component                 1
TIS granule                        1
extracellular matrix               1
Name: mlo, Length: 113, dtype: int64

### Paraspeckle

In [456]:
(database_entrada.mlo == 'Paraspeckle').sum()

101

In [457]:
# Unify paraspeckle with Paraspeckle
database_entrada.replace('paraspeckle', 'Paraspeckle', inplace= True)
(database_entrada.mlo == 'Paraspeckle').sum()

104

### Sam68

In [458]:
(database_entrada.mlo == 'Sam68 nuclear bodies').sum()

11

In [459]:
(database_entrada.mlo == 'Sam68 nuclear bodies (SNBs)').sum()

2

In [460]:
(database_entrada.mlo == 'Sam68 nuclear body').sum()

12

In [461]:
database_entrada.replace(['Sam68 nuclear bodies', 'Sam68 nuclear bodies (SNBs)'], 'Sam68 nuclear body', inplace= True)
(database_entrada.mlo == 'Sam68 nuclear body').sum()

25

### PML body  
**PhaSepDB**: The PML bodies are dynamic nuclear protein aggregates interspersed between chromatin. These punctate nuclear structures are call PML bodies because the PML gene is essential for their formation. are present in most mammalian cell nuclei and typically number 1 to 30 bodies per nucleus.  
**DrLLPS**: PML nuclear bodies are annotetad in the nucleus. They are matrix-associated domains that recruit an astonishing variety of seemingly unrelated proteins.

In [462]:
(database_entrada.mlo == 'PML nuclear body').sum()

97

In [463]:
(database_entrada.mlo == 'PML body').sum()

77

In [464]:
database_entrada.replace('PML body', 'PML nuclear body', inplace= True)
(database_entrada.mlo == 'PML nuclear body').sum()

174

### Polycomb body

In [465]:
(database_entrada.mlo == 'Polycomb bodies').sum()

2

In [466]:
database_entrada.replace('Polycomb bodies', 'Polycomb body', inplace= True)
(database_entrada.mlo == 'Polycomb body').sum()

4

### Pre and postsynaptic density

In [467]:
database_entrada.replace('Pre and postsynaptic densities', 'Pre and postsynaptic density', inplace= True)
(database_entrada.mlo == 'Pre and postsynaptic density').sum()

8

### Nuclear speckle

In [468]:
(database_entrada.mlo == 'Nucleus speckles').sum() #phasepdb

115

In [469]:
(database_entrada.mlo == 'Nuclear speckle').sum() # drllps

110

In [470]:
(database_entrada.mlo == 'Nuclear speckles').sum() #phasepdb

24

In [471]:
(database_entrada.mlo == 'nuclear speckle').sum()

3

In [472]:
database_entrada.replace(['Nucleus speckles', 'Nuclear speckles', 'nuclear speckle'], 'Nuclear speckle', inplace= True)
(database_entrada.mlo == 'Nuclear speckle').sum()

252

### Heterochromatin

In [473]:
(database_entrada.mlo == 'heterochromatin').sum()

2

In [474]:
database_entrada.replace('heterochromatin', 'Heterochromatin', inplace= True)
(database_entrada.mlo == 'Heterochromatin').sum()

3

### Cytoplasmic ribonucleoprotein granule

In [475]:
(database_entrada.mlo == 'cytoplasmic ribonucleoprotein granule').sum()


5

In [476]:
database_entrada.replace('cytoplasmic ribonucleoprotein granule', 'Cytoplasmic ribonucleoprotein granule', inplace= True)

### Membrane cluster

In [477]:
(database_entrada.mlo == 'Membrane clusters').sum()

4

In [478]:
(database_entrada.mlo == 'membrane cluster').sum()

3

In [479]:
database_entrada.replace(['Membrane clusters', 'membrane cluster'], 'Membrane cluster', inplace= True)
(database_entrada.mlo == 'Membrane cluster').sum()

7

### Nuclear body

In [480]:
database_entrada.replace('nuclear body', 'Nuclear body', inplace= True)
(database_entrada.mlo == 'Nuclear body').sum()

11

### Nucleolus

In [481]:
database_entrada.replace('nucleolus', 'Nucleolus', inplace= True)
(database_entrada.mlo == 'Nucleolus').sum()

2064

In [482]:
(database_entrada.mlo == 'Centrosome/Spindle pole body').sum() # keep this annotation

534

## OK, now mlo table  
cols: id_mlo, mlo

In [483]:
database_entrada.mlo.value_counts()

Nucleolus                       2064
Postsynaptic density            1374
Stress granule                  1334
P-body                           823
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
liquid-like DYRK3 speckles         1
MORC3-NBs                          1
granular component                 1
extracellular matrix               1
Name: mlo, Length: 99, dtype: int64

In [484]:
# EXPLODE:
# P-body, Stress granule
# P-body, GW body
# Set mlo col as list-like and explode() to separate list elements into separate rows
# before:  rows
database_entrada = database_entrada.assign(mlo= database_entrada.mlo.str.split(',')).explode('mlo')
database_entrada.mlo = database_entrada.mlo.str.strip()
database_entrada.drop_duplicates(inplace= True)
database_entrada
# after: 8187 rows

Unnamed: 0,uniprot_acc,mlo,rol,db
0,P35637,cytoplasmic stress granule,driver,phasepro
1,P35637,Cytoplasmic ribonucleoprotein granule,driver,phasepro
2,Q06787,cytoplasmic stress granule,driver,phasepro
3,Q06787,Cytoplasmic ribonucleoprotein granule,driver,phasepro
4,Q06787,synaptosome,driver,phasepro
...,...,...,...,...
8377,O95670,Nucleolus,client,drllps
8378,O95670,Postsynaptic density,client,drllps
8379,Q9H269,Postsynaptic density,client,drllps
8380,Q9Y3D7,Postsynaptic density,client,drllps


In [485]:
# GW-body
database_entrada.replace('GW body', 'GW-body', inplace= True)
(database_entrada.mlo == 'GW-body').sum()

3

In [486]:
# Postsynaptic density
database_entrada.replace('postsynaptic density', 'Postsynaptic density', inplace= True)
(database_entrada.mlo == 'Postsynaptic density').sum()

1375

In [487]:
# Cytoplasmic ribonucleoprotein granule
database_entrada.replace('cytoplasmic ribonucleoprotein granule', 'Cytoplasmic ribonucleoprotein granule', inplace= True)
(database_entrada.mlo == 'Cytoplasmic ribonucleoprotein granule').sum()

6

In [488]:
# Histone locus body
database_entrada.replace('Histone Locus body', 'Histone locus body', inplace= True)
(database_entrada.mlo == 'Histone locus body').sum()

16

In [489]:
# Stress granule
database_entrada.replace('Sress granule', 'Stress granule', inplace= True)
(database_entrada.mlo == 'Stress granule').sum()

1337

In [490]:
database_entrada.drop_duplicates(inplace= True)

In [491]:
database_entrada.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8186 entries, 0 to 8381
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uniprot_acc  8186 non-null   object
 1   mlo          7365 non-null   object
 2   rol          8186 non-null   object
 3   db           8186 non-null   object
dtypes: object(4)
memory usage: 319.8+ KB


In [492]:
database_entrada.mlo.value_counts()

Nucleolus                       2064
Postsynaptic density            1375
Stress granule                  1337
P-body                           827
Centrosome/Spindle pole body     534
                                ... 
inclusion body                     1
neuron projection                  1
MORC3-NBs                          1
TIS granule                        1
extracellular matrix               1
Name: mlo, Length: 96, dtype: int64

In [493]:
mlo = pd.DataFrame({'mlo': database_entrada.mlo.value_counts().index, 'id_mlo': range(1, len(database_entrada.mlo[database_entrada.mlo.notnull()].unique())+1)})
mlo

Unnamed: 0,mlo,id_mlo
0,Nucleolus,1
1,Postsynaptic density,2
2,Stress granule,3
3,P-body,4
4,Centrosome/Spindle pole body,5
...,...,...
91,inclusion body,92
92,neuron projection,93
93,MORC3-NBs,94
94,TIS granule,95


In [494]:
mlo.to_csv('db_tables/mlo.tsv', sep='\t', index= False)

## protein_has_mlo  
cols: id_protein, id_mlo, id_rol, id_database

In [495]:
len(database_entrada.uniprot_acc.unique())

4368

In [496]:
protein_has_mlo = database_entrada.copy()

In [497]:
protein_has_mlo.head()

Unnamed: 0,uniprot_acc,mlo,rol,db
0,P35637,cytoplasmic stress granule,driver,phasepro
1,P35637,Cytoplasmic ribonucleoprotein granule,driver,phasepro
2,Q06787,cytoplasmic stress granule,driver,phasepro
3,Q06787,Cytoplasmic ribonucleoprotein granule,driver,phasepro
4,Q06787,synaptosome,driver,phasepro


In [498]:
# Add id_protein
protein_has_mlo = protein_has_mlo.merge(id_protein)
# Add id_mlo
protein_has_mlo = protein_has_mlo.merge(mlo, how= 'left')
# Add id_rol and id_database
protein_has_mlo = protein_has_mlo.merge(rol)
protein_has_mlo = protein_has_mlo.rename(columns={'db': 'dataset'}).merge(database).sort_values('id_protein')

In [499]:
protein_has_mlo.drop(columns=['uniprot_acc', 'mlo', 'rol', 'dataset'], inplace= True)
protein_has_mlo

Unnamed: 0,id_protein,id_mlo,id_rol,id_dataset
7800,1,,3,2
918,2,3.0,3,1
1162,3,4.0,1,1
7823,3,4.0,2,3
2831,4,3.0,1,1
...,...,...,...,...
5042,4366,4.0,1,1
3074,4367,18.0,1,1
3780,4368,9.0,1,1
3779,4368,1.0,1,1


In [500]:
protein_has_mlo[protein_has_mlo.duplicated()] # OK

Unnamed: 0,id_protein,id_mlo,id_rol,id_dataset


In [526]:
protein_has_mlo['id_proteinmlo'] = range(1, len(protein_has_mlo)+1)

In [527]:
protein_has_mlo

Unnamed: 0,id_protein,id_mlo,id_rol,id_dataset,id_proteinmlo
7800,1,,3,2,1
918,2,3.0,3,1,2
1162,3,4.0,1,1,3
7823,3,4.0,2,3,4
2831,4,3.0,1,1,5
...,...,...,...,...,...
5042,4366,4.0,1,1,8182
3074,4367,18.0,1,1,8183
3780,4368,9.0,1,1,8184
3779,4368,1.0,1,1,8185


In [529]:
protein_has_mlo.duplicated().any()

False

In [528]:
protein_has_mlo.to_csv('db_tables/protein_has_mlo.tsv', sep='\t', index= False)

# source

In [503]:
source = pd.DataFrame({'id_source': [1,2,3], 'source': ['clinvar', 'disgenet', 'uniprot']})
source
source.to_csv('db_tables/source.tsv', sep='\t', index = False)

# mutation_has_source

In [504]:
mutations_with_source = pd.read_csv('../datasets/mutations_with_source.tsv.gz', sep='\t', compression='gzip')

In [505]:
mutation_has_source = mutations_with_source.copy()

In [506]:
mutation_has_source.head()

Unnamed: 0,id_mutation,variationid,source
0,1,553904,clinvar
1,2,968278,clinvar
2,3,196011,clinvar
3,4,425169,clinvar
4,5,598371,clinvar


In [507]:
mutation_has_source.rename(columns={'variationid': 'id_insource'}, inplace= True)

In [508]:
mutation_has_source.drop_duplicates(inplace= True)

In [509]:
mutation_has_source = mutation_has_source.merge(source).drop(columns='source')

In [510]:
mutation_has_source

Unnamed: 0,id_mutation,id_insource,id_source
0,1,553904,1
1,2,968278,1
2,3,196011,1
3,4,425169,1
4,5,598371,1
...,...,...,...
264550,264551,929454,1
264551,264552,929944,1
264552,264553,930642,1
264553,264554,974683,1


In [511]:
mutation_has_source.to_csv('db_tables/mutation_has_source.tsv', sep='\t', index= False)

# citation_source

In [512]:

pmid.columns = pmid.columns.str.lower().str.replace(' ',"_").str.replace("-",'_').str.replace('/','_')
pmid = pmid[['variationid', 'citation_source', 'citation_id']].copy()
pmid.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1241046 entries, 0 to 1241045
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   variationid      1241046 non-null  int64 
 1   citation_source  1241046 non-null  object
 2   citation_id      1241046 non-null  object
dtypes: int64(1), object(2)
memory usage: 28.4+ MB


In [513]:
pmid.variationid.isnull().any() # False. This is the ClinVar ID

False

In [514]:
pmid.variationid.drop_duplicates(inplace= True)

In [515]:
pmid.citation_source.value_counts()

PubMed           1232172
NCBIBookShelf       7089
PubMedCentral       1785
Name: citation_source, dtype: int64

In [516]:
citation_source = pd.DataFrame({'name': pmid.citation_source.value_counts().index, 'id_citation_source': range(1, len(pmid.citation_source.unique())+1) })

In [517]:
citation_source

Unnamed: 0,name,id_citation_source
0,PubMed,1
1,NCBIBookShelf,2
2,PubMedCentral,3


In [518]:
citation_source.to_csv('db_tables/citation_source.tsv', sep='\t', index= False)

# mutation_has_citation

In [519]:
mutation_has_citation = pmid.rename(columns={'citation_id': 'id_citation'})

In [520]:
mutation_has_citation.head()

Unnamed: 0,variationid,citation_source,id_citation
0,4,PubMed,12030328
1,4,PubMed,20531441
2,5,PubMed,25678554
3,6,PubMed,20818383
4,7,PubMed,20818383


In [521]:
citation_source

Unnamed: 0,name,id_citation_source
0,PubMed,1
1,NCBIBookShelf,2
2,PubMedCentral,3


In [522]:
mutation_has_citation = mutation_has_citation.merge(citation_source.rename(columns={'name': 'citation_source'})).drop(columns= 'citation_source')
mutation_has_citation

Unnamed: 0,variationid,id_citation,id_citation_source
0,4,12030328,1
1,4,20531441,1
2,5,25678554,1
3,6,20818383,1
4,7,20818383,1
...,...,...,...
1241041,982315,4544753,3
1241042,984966,6423636,3
1241043,984965,6423636,3
1241044,984967,6423636,3


In [523]:
mutation_has_citation = mutation_has_citation.merge(mutations_with_source).drop(columns=['source', 'variationid']).drop_duplicates()
mutation_has_citation

Unnamed: 0,id_citation,id_citation_source,id_mutation
0,10507729,1,248191
1,24651477,1,248191
2,26355662,1,248191
3,20705278,1,248191
4,20705279,1,248191
...,...,...,...
393300,4544753,3,15676
393301,4544753,3,15677
393302,4544753,3,15678
393303,4544753,3,15679


In [524]:
mutation_has_citation.to_csv('db_tables/mutation_has_citation.tsv', sep='\t', index= False)