In [None]:
import pandas as pd
import numpy as np
import re
from Bio.SeqUtils import seq1
from functools import reduce

## Cargo datasets

In [None]:
# all LLPS human proteins dataset (Orti et al.)
proteins = pd.read_csv('datasets/all_proteins_table.csv')

In [None]:
len(proteins.uniprot_acc.unique())

In [None]:
# Leo el txt de variantes humanas en uniprot
'''humsavar.txt:
Index of manually curated Human polymorphisms and disease mutations from UniProtKB/Swiss-Prot.
This file lists all missense variants annotated in UniProtKB/Swiss-Prot human
entries. It provides a variant classification which is intended for research
purposes only, not for clinical and diagnostic use.
 - The column 'Variant category' shows the classification of the variant using
   the American College of Medical Genetics and Genomics/Association for
   Molecular Pathology (ACMG/AMP) terminology (Richards et al. PubMed:25741868)
   into the following categories:
   
   LP/P = likely pathogenic or pathogenic
   LB/B = likely benign or benign
   US   = uncertain significance

   These categories are assigned based on the variant annotation in the
   corresponding UniProtKB/Swiss-Prot entries that is curated from literature
   reports. The classification may change over time and must not be considered
   as a definitive statement about the pathogenic role of a variant.

 - The column 'Disease name' shows the name of the disease or the disease sample
   in which variants have been found. Names are only provided for diseases
   catalogued in OMIM and for cancer samples.
'''
with open('humsavar.txt') as f:
    gene_name=[]
    uniprot=[]
    ft_id=[]
    change=[]
    category=[]
    snp_id=[]
    disease_name=[]
    for line in f:
        stripped_line = line.strip()
        gene_name.append(stripped_line[0:10])
        uniprot.append(stripped_line[10:21])
        ft_id.append(stripped_line[21:33])
        change.append(stripped_line[33:48])
        category.append(stripped_line[48:57])
        snp_id.append(stripped_line[57:72])
        disease_name.append(stripped_line[72:])

In [None]:
# Creo el dataframe: humsavar
humsavar = pd.DataFrame(list(zip(gene_name, uniprot, ft_id, change, category, snp_id, disease_name)), columns=['gene_name', 'uniprot', 'ft_id', 'change', 'category', 'snp_id', 'disease_name'])
humsavar = humsavar.drop([0]).reset_index(drop=True)

# Eliminar los espacios en blanco
humsavar = humsavar.applymap(lambda x: x.strip())

# agrego los mim access en otra col
humsavar['mim'] = humsavar.disease_name.map(lambda x: re.findall('\[(.*?)\]', x))
humsavar['mim'] = humsavar.mim.str[0]

'''#Lo guardo
humsavar.to_csv('humsavar.csv', index=False)
print(humsavar.head())
print('Total de entradas:', len(humsavar))'''

In [None]:
humsavar.head()

In [None]:
# agrego los mim access en otra col
humsavar['mim'] = humsavar.disease_name.map(lambda x: re.findall('\[(.*?)\]', x))
humsavar['mim'] = humsavar.mim.str[0]
humsavar.mim.head(50)

In [None]:
l= re.findall('(.*?) ?(\[.*?\])?$', 'Parietal foramina 2 (PFM2) [MIM: 05255256]')
l

In [None]:
l[0][0]

In [None]:
re.findall('(.*?) ?(\[.*?\])?$', 'A breast cancer sample')
	

In [None]:
# quiero solo el disease, o sea, todo menos el codigo MIM
humsavar['disease'] = humsavar.disease_name.map(lambda x: re.findall('(.*?) ?(\[.*?\])?$', x)) # el primer grupo atrapa cualquier cosa, luego viene un espacio, y el segundo atrapa el codigo [MIM], este o no
humsavar['disease'] = humsavar.disease.str[0].str[0]
#humsavar['disease'] = humsavar.disease.str[0]  # es el primer elemento de la tupla, strip() xq queda un espacio al final

In [None]:
humsavar[['disease_name','disease']][humsavar.disease.notnull()]

In [None]:
humsavar.disease.value_counts()[:10]

In [None]:
# replace "-" by NaNs
humsavar.disease = humsavar.disease.replace('-', np.nan)

In [None]:
# Top ten diseases
humsavar.disease.value_counts()[:10]

In [None]:
# Ojo, por que hay snps repetidos??
humsavar.snp_id.value_counts()

In [17]:
# Veo uno por ejemplo
humsavar[humsavar.snp_id == 'rs121913273']

Unnamed: 0,gene_name,uniprot,ft_id,change,category,snp_id,disease_name,mim,disease
53375,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,Breast cancer (BC) [MIM:114480],MIM:114480,Breast cancer (BC)
53376,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,CLAPO syndrome (CLAPO) [MIM:613089],MIM:613089,CLAPO syndrome (CLAPO)
53377,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,Colorectal cancer (CRC) [MIM:114500],MIM:114500,Colorectal cancer (CRC)
53378,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,"Congenital lipomatous overgrowth, vascular mal...",MIM:612918,"Congenital lipomatous overgrowth, vascular mal..."
53379,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,"Keratosis, seborrheic (KERSEB) [MIM:182000]",MIM:182000,"Keratosis, seborrheic (KERSEB)"
53380,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,Macrodactyly (MADAC) [MIM:155500],MIM:155500,Macrodactyly (MADAC)
53381,PIK3CA,P42336,VAR_026174,p.Glu542Gln,US,rs121913273,-,,


In [18]:
# Format snp_id col
humsavar.snp_id.replace('-', np.nan, inplace= True)

In [19]:
humsavar.snp_id.value_counts()

rs121913279    9
rs121918488    9
rs75076352     8
rs79781594     7
rs121913273    7
              ..
rs104893866    1
rs767748011    1
rs2271188      1
rs17884647     1
rs41299037     1
Name: snp_id, Length: 63271, dtype: int64

In [20]:
# Remove 'rs'
humsavar.snp_id = humsavar.snp_id.str.strip('rs').apply(float)  # float because there's nans

In [21]:
# Format change col
humsavar.change.str.startswith('p.').value_counts() # all entries starts with 'p.'

True    79376
Name: change, dtype: int64

In [22]:
humsavar.change = humsavar.change.str.lstrip('p.')

In [23]:
humsavar.sort_values('disease')

Unnamed: 0,gene_name,uniprot,ft_id,change,category,snp_id,disease_name,mim,disease
18949,DHTKD1,Q96HY7,VAR_069585,Gly729Arg,LP/P,117225135.0,2-aminoadipic 2-oxoadipic aciduria (AMOXAD) [M...,MIM:204750,2-aminoadipic 2-oxoadipic aciduria (AMOXAD)
31001,HADH,Q16836,VAR_024079,Ala40Thr,LP/P,137853101.0,3-alpha-hydroxyacyl-CoA dehydrogenase deficien...,MIM:231530,3-alpha-hydroxyacyl-CoA dehydrogenase deficien...
31002,HADH,Q16836,VAR_024080,Asp57Glu,LP/P,137853102.0,3-alpha-hydroxyacyl-CoA dehydrogenase deficien...,MIM:231530,3-alpha-hydroxyacyl-CoA dehydrogenase deficien...
33077,HMGCL,P35914,VAR_065453,Arg165Gln,LP/P,199587895.0,3-hydroxy-3-methylglutaryl-CoA lyase deficienc...,MIM:246450,3-hydroxy-3-methylglutaryl-CoA lyase deficienc...
33076,HMGCL,P35914,VAR_058450,Leu263Pro,LP/P,,3-hydroxy-3-methylglutaryl-CoA lyase deficienc...,MIM:246450,3-hydroxy-3-methylglutaryl-CoA lyase deficienc...
...,...,...,...,...,...,...,...,...,...
79371,-,Q8N402,VAR_042675,Thr131Pro,LB/B,6519442.0,-,,
79372,-,Q96M66,VAR_039178,Arg37His,LB/B,350229.0,-,,
79373,-,Q96M66,VAR_039179,Arg171Ser,LB/B,11648228.0,-,,
79374,-,Q9N2K0,VAR_017799,Val81Leu,LB/B,,-,,


In [24]:
humsavar.rename(columns={'uniprot': 'uniprot_acc'}, inplace= True)

In [49]:
humsavar.change[humsavar.change.notnull()] # all entries have a protein change

0         His52Arg
1        His395Arg
2        Val555Met
3        Ala558Ser
4        Arg704His
           ...    
79371    Thr131Pro
79372     Arg37His
79373    Arg171Ser
79374     Val81Leu
79375    Phe150Leu
Name: change, Length: 79376, dtype: object

## Merge with LLPS human proteins

In [25]:
proteins

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name,length,sequence
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,2,A0A096LP55,HGNC:51714,440567.0,UQCRHL,4369,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...
4,5,A0FGR8,HGNC:22211,57488.0,ESYT2,4369,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...
...,...,...,...,...,...,...,...
4364,4365,Q9UNL7,,,,4369,KLDVEEPDSANSSFYSTRSAPASQASLRATSSTQSLARLGSPDYGN...
4365,4366,Q9Y4C0,HGNC:8010,9369.0,NRXN3,4369,MSSTLHSVFFTLKVSILLGSLLGLCLGLEFMGLPNQWARYLRWDAS...
4366,4367,Q9Y649,,,,4369,MNDLICFLDNTFKNNVLSQAWWCVHLVPTIWEAEAGGSLEPRSLKL...
4367,4368,R4GMX3,,,,4369,MELSESVQKGFQMLADPRSFDSNAFTLLLRAAFQSLLDAQADEAVL...


In [26]:
uniprot_llps = proteins.merge(humsavar, left_on='uniprot_acc', right_on='uniprot_acc', how= 'left')
uniprot_llps

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name_x,length,sequence,gene_name_y,ft_id,change,category,snp_id,disease_name,mim,disease
0,1,A0A024RBG1,HGNC:18012,440672.0,NUDT4B,4369,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,,,,,,,,
1,2,A0A096LP55,HGNC:51714,440567.0,UQCRHL,4369,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...,,,,,,,,
2,3,A0A0U1RRE5,HGNC:50713,550643.0,NBDY,4369,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...,,,,,,,,
3,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,RBM47,VAR_054770,Met565Val,LB/B,278981.0,-,,
4,4,A0AV96,HGNC:30358,54502.0,RBM47,4369,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,RBM47,VAR_061832,Gly538Arg,LB/B,35529250.0,-,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22718,4365,Q9UNL7,,,,4369,KLDVEEPDSANSSFYSTRSAPASQASLRATSSTQSLARLGSPDYGN...,,,,,,,,
22719,4366,Q9Y4C0,HGNC:8010,9369.0,NRXN3,4369,MSSTLHSVFFTLKVSILLGSLLGLCLGLEFMGLPNQWARYLRWDAS...,,,,,,,,
22720,4367,Q9Y649,,,,4369,MNDLICFLDNTFKNNVLSQAWWCVHLVPTIWEAEAGGSLEPRSLKL...,,,,,,,,
22721,4368,R4GMX3,,,,4369,MELSESVQKGFQMLADPRSFDSNAFTLLLRAAFQSLLDAQADEAVL...,,,,,,,,


In [27]:
uniprot_llps.duplicated().any() # False = no duplicated

False

In [28]:
len(uniprot_llps.uniprot_acc.unique()) # ok

4369

In [29]:
uniprot_llps.gene_name_y.notnull().sum()

21246

In [30]:
uniprot_llps.gene_name_x.notnull().sum()

22634

In [31]:
uniprot_llps.gene_name_y = uniprot_llps.gene_name_y.combine_first(uniprot_llps.gene_name_x) # keep this col

In [59]:
# Keep entries with non-null change col values
uniprot_llps = uniprot_llps[uniprot_llps.change.notnull()]

In [81]:
# Same with isin() method
humsavar[humsavar.uniprot_acc.isin(proteins.uniprot_acc)]

Unnamed: 0,gene_name,uniprot_acc,ft_id,change,category,snp_id,disease_name,mim,disease
4,A2M,P01023,VAR_000012,Arg704His,LB/B,1800434.0,-,,
5,A2M,P01023,VAR_000013,Cys972Tyr,LB/B,1800433.0,-,,
6,A2M,P01023,VAR_000014,Ile1000Val,LB/B,669.0,-,,
7,A2M,P01023,VAR_026820,Asn639Asp,LB/B,226405.0,-,,
8,A2M,P01023,VAR_026821,Leu815Gln,LB/B,3180392.0,-,,
...,...,...,...,...,...,...,...,...,...
79299,ZSCAN31,Q96LW9,VAR_024209,Lys205Arg,LB/B,853684.0,-,,
79300,ZSCAN31,Q96LW9,VAR_052809,Ala128Pro,LB/B,6922302.0,-,,
79301,ZSCAN31,Q96LW9,VAR_052810,Arg222Gln,LB/B,34223404.0,-,,
79302,ZSCAN31,Q96LW9,VAR_059911,Gly365Glu,LB/B,2394051.0,-,,


In [62]:
len(uniprot_llps.uniprot_acc.unique()) # only 2892 LLPS proteins with protein change in UniProt humsavar dataset

2892

## Separate mutations and positions in different cols

In [63]:
uniprot_llps[['id_protein', 'uniprot_acc', 'gene_name_y', 'snp_id', 'change', 'category', 'disease_name', 'mim', 'disease']].sort_values('change')

Unnamed: 0,id_protein,uniprot_acc,gene_name_y,snp_id,change,category,disease_name,mim,disease
6937,823,P13569,CFTR,397508480.0,Ala1006Glu,LP/P,Cystic fibrosis (CF) [MIM:219700],MIM:219700,Cystic fibrosis (CF)
14313,1966,Q13423,NNT,387907234.0,Ala1008Pro,LP/P,Glucocorticoid deficiency 4 with or without mi...,MIM:614736,Glucocorticoid deficiency 4 with or without mi...
16749,2594,Q7Z4S6,KIF21A,,Ala1008Pro,LP/P,"Fibrosis of extraocular muscles, congenital, 1...",MIM:135700,"Fibrosis of extraocular muscles, congenital, 1..."
22643,4342,P42771,CDKN2A,,Ala100Leu,LP/P,"Melanoma, cutaneous malignant 2 (CMM2) [MIM:15...",MIM:155601,"Melanoma, cutaneous malignant 2 (CMM2)"
22644,4342,P42771,CDKN2A,,Ala100Pro,LB/B,-,,
...,...,...,...,...,...,...,...,...,...
2434,568,P00451,F8,137852382.0,Val99Asp,LP/P,Hemophilia A (HEMA) [MIM:306700],MIM:306700,Hemophilia A (HEMA)
8889,1131,P31946,YWHAB,,Val99Ile,US,-,,
13691,1871,Q08999,RBL2,,Val99Phe,LB/B,-,,
10432,1286,P43628,KIR2DL3,3810343.0,Val9Ala,LB/B,-,,


In [64]:
uniprot_llps.change

3        Met565Val
4        Gly538Arg
5        Cys210Ser
6        Ser638Gly
7        Pro246Gln
           ...    
22703    Gly116Asp
22713     Glu26Lys
22714    Ser215Asn
22715    Tyr400Asn
22716     Gly28Ala
Name: change, Length: 21246, dtype: object

In [65]:
# Subset of non-null changes in proteins
# Regex for extracting Missense mutations (as they are in uniprot's humsavar dataset)
uniprot_llps['aux'] = uniprot_llps.change[uniprot_llps.change.notnull()].map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})$', x)).str[0]
uniprot_llps.aux

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_llps['aux'] = uniprot_llps.change[uniprot_llps.change.notnull()].map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})$', x)).str[0]


3        (Met, 565, Val)
4        (Gly, 538, Arg)
5        (Cys, 210, Ser)
6        (Ser, 638, Gly)
7        (Pro, 246, Gln)
              ...       
22703    (Gly, 116, Asp)
22713     (Glu, 26, Lys)
22714    (Ser, 215, Asn)
22715    (Tyr, 400, Asn)
22716     (Gly, 28, Ala)
Name: aux, Length: 21246, dtype: object

In [66]:
uniprot_llps['from_aa'] = uniprot_llps.aux[uniprot_llps.aux.notnull()].map(lambda x: x[0])
uniprot_llps['to_aa'] = uniprot_llps.aux[uniprot_llps.aux.notnull()].map(lambda x: x[2])
uniprot_llps['start_aa'] = uniprot_llps.aux[uniprot_llps.aux.notnull()].map(lambda x: x[1])
uniprot_llps['end_aa'] = uniprot_llps.aux[uniprot_llps.aux.notnull()].map(lambda x: x[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_llps['from_aa'] = uniprot_llps.aux[uniprot_llps.aux.notnull()].map(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_llps['to_aa'] = uniprot_llps.aux[uniprot_llps.aux.notnull()].map(lambda x: x[2])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_llps['start_aa'] 

In [68]:
uniprot_llps[['change', 'from_aa', 'to_aa',	'start_aa',	'end_aa']]

Unnamed: 0,change,from_aa,to_aa,start_aa,end_aa
3,Met565Val,Met,Val,565,565
4,Gly538Arg,Gly,Arg,538,538
5,Cys210Ser,Cys,Ser,210,210
6,Ser638Gly,Ser,Gly,638,638
7,Pro246Gln,Pro,Gln,246,246
...,...,...,...,...,...
22703,Gly116Asp,Gly,Asp,116,116
22713,Glu26Lys,Glu,Lys,26,26
22714,Ser215Asn,Ser,Asn,215,215
22715,Tyr400Asn,Tyr,Asn,400,400


## Control: aa corresponding to sequence

In [69]:
#Paso los aa de tres letras a una
uniprot_llps['ctrl'] = False
uniprot_llps['aa_ctrl'] = np.nan

for i in uniprot_llps.index:
    aa1 = uniprot_llps.from_aa[i]
    aa2 = uniprot_llps.to_aa[i]
    if str(aa1) != 'nan':
        uniprot_llps['aa_ctrl'][i] = str(seq1(aa1))

        #Evaluo
        if int(uniprot_llps.start_aa[i]) <= int(uniprot_llps.length[i]): 
            if uniprot_llps.sequence[i][int(uniprot_llps.start_aa[i])-1] == uniprot_llps.aa_ctrl[i]:
                uniprot_llps.ctrl[i] = True
    #if str(aa2) != 'nan':
        #box_uniprot_variants.aa2[i] = str(seq1(aa2))

uniprot_llps.ctrl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_llps['ctrl'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_llps['aa_ctrl'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_llps['aa_ctrl'][i] = str(seq1(aa1))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

3        True
4        True
5        True
6        True
7        True
         ... 
22703    True
22713    True
22714    True
22715    True
22716    True
Name: ctrl, Length: 21246, dtype: bool

In [71]:
uniprot_llps.aa_ctrl.notnull().sum()

21246

In [73]:
uniprot_llps.ctrl.value_counts()

True     20935
False      311
Name: ctrl, dtype: int64

In [74]:
uniprot_llps.columns

Index(['id_protein', 'uniprot_acc', 'hgnc_id', 'gene_id', 'gene_name_x',
       'length', 'sequence', 'gene_name_y', 'ft_id', 'change', 'category',
       'snp_id', 'disease_name', 'mim', 'disease', 'aux', 'from_aa', 'to_aa',
       'start_aa', 'end_aa', 'ctrl', 'aa_ctrl'],
      dtype='object')

In [75]:
uniprot_llps[['uniprot_acc', 'gene_name_y', 'snp_id', 'from_aa', 'to_aa',
               'start_aa', 'end_aa', 'ctrl', 'aa_ctrl', 'disease_name', 'disease', 'mim']]

Unnamed: 0,uniprot_acc,gene_name_y,snp_id,from_aa,to_aa,start_aa,end_aa,ctrl,aa_ctrl,disease_name,disease,mim
3,A0AV96,RBM47,278981.0,Met,Val,565,565,True,M,-,,
4,A0AV96,RBM47,35529250.0,Gly,Arg,538,538,True,G,-,,
5,A0FGR8,ESYT2,13233513.0,Cys,Ser,210,210,True,C,-,,
6,A0FGR8,ESYT2,2305473.0,Ser,Gly,638,638,True,S,-,,
7,A0FGR9,ESYT3,17857138.0,Pro,Gln,246,246,True,P,-,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22703,Q8N726,CDKN2A,35741010.0,Gly,Asp,116,116,True,G,-,,
22713,Q9P0M2,AKAP7,7771473.0,Glu,Lys,26,26,True,E,-,,
22714,Q9P0M2,AKAP7,1190788.0,Ser,Asn,215,215,True,S,-,,
22715,Q9ULB1,NRXN1,17040901.0,Tyr,Asn,400,400,True,Y,-,,


In [76]:
uniprot_llps[uniprot_llps.disease.notnull()] # 11926 entries with a disease annotated

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,gene_name_x,length,sequence,gene_name_y,ft_id,change,...,disease_name,mim,disease,aux,from_aa,to_aa,start_aa,end_aa,ctrl,aa_ctrl
23,12,A2A288,HGNC:21175,340152.0,ZC3H12D,4369,MEHPSKMEFFQKLGYDREDVLRVLGKLGEGALVNDVLQELIRTGSR...,ZC3H12D,VAR_046200,Lys106Arg,...,Some sporadic lung cancer sample,,Some sporadic lung cancer sample,"(Lys, 106, Arg)",Lys,Arg,106,106,True,K
33,18,A5YKK6,HGNC:7877,23019.0,CNOT1,4369,MNLDSLSLALSQISYLVDNLTKKNYRASQQEIQHIVNRHGPEADRH...,CNOT1,VAR_083066,Arg535Cys,...,Holoprosencephaly 12 with or without pancreati...,MIM:618500,Holoprosencephaly 12 with or without pancreati...,"(Arg, 535, Cys)",Arg,Cys,535,535,True,R
56,25,A6NHR9,HGNC:29090,23347.0,SMCHD1,4369,MAAADGGGPGGASVGTEEDGGGVGHRTVYLFDRREKESELGDRPLQ...,SMCHD1,VAR_069067,Tyr353Cys,...,Facioscapulohumeral muscular dystrophy 2 (FSHD...,MIM:158901,Facioscapulohumeral muscular dystrophy 2 (FSHD2),"(Tyr, 353, Cys)",Tyr,Cys,353,353,True,Y
57,25,A6NHR9,HGNC:29090,23347.0,SMCHD1,4369,MAAADGGGPGGASVGTEEDGGGVGHRTVYLFDRREKESELGDRPLQ...,SMCHD1,VAR_069068,Arg479Pro,...,Facioscapulohumeral muscular dystrophy 2 (FSHD...,MIM:158901,Facioscapulohumeral muscular dystrophy 2 (FSHD2),"(Arg, 479, Pro)",Arg,Pro,479,479,True,R
58,25,A6NHR9,HGNC:29090,23347.0,SMCHD1,4369,MAAADGGGPGGASVGTEEDGGGVGHRTVYLFDRREKESELGDRPLQ...,SMCHD1,VAR_069069,Cys492Arg,...,Facioscapulohumeral muscular dystrophy 2 (FSHD...,MIM:158901,Facioscapulohumeral muscular dystrophy 2 (FSHD2),"(Cys, 492, Arg)",Cys,Arg,492,492,True,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22682,4342,P42771,HGNC:1787,1029.0,CDKN2A,4369,MEPAAGSSMEPSADWLATAAARGRVEEVRALLEAGALPNAPNSYGR...,CDKN2A,VAR_058556,Thr77Pro,...,"Melanoma, cutaneous malignant 2 (CMM2) [MIM:15...",MIM:155601,"Melanoma, cutaneous malignant 2 (CMM2)","(Thr, 77, Pro)",Thr,Pro,77,77,True,T
22683,4342,P42771,HGNC:1787,1029.0,CDKN2A,4369,MEPAAGSSMEPSADWLATAAARGRVEEVRALLEAGALPNAPNSYGR...,CDKN2A,VAR_058557,Arg80Pro,...,"Melanoma, cutaneous malignant 2 (CMM2) [MIM:15...",MIM:155601,"Melanoma, cutaneous malignant 2 (CMM2)","(Arg, 80, Pro)",Arg,Pro,80,80,True,R
22684,4342,P42771,HGNC:1787,1029.0,CDKN2A,4369,MEPAAGSSMEPSADWLATAAARGRVEEVRALLEAGALPNAPNSYGR...,CDKN2A,VAR_058558,Pro81Thr,...,"Melanoma, cutaneous malignant 2 (CMM2) [MIM:15...",MIM:155601,"Melanoma, cutaneous malignant 2 (CMM2)","(Pro, 81, Thr)",Pro,Thr,81,81,True,P
22690,4347,Q5JWF2,HGNC:4392,2778.0,GNAS,4369,MGVRNCLYGNNMSGQRDIPPEIGEQPEQPPLEAPGAAAPGAGPSPA...,GNAS,VAR_028777,Ala436Asp,...,GNAS hyperfunction (GNASHYP) [MIM:139320],MIM:139320,GNAS hyperfunction (GNASHYP),"(Ala, 436, Asp)",Ala,Asp,436,436,True,A


In [77]:
uniprot_llps.columns

Index(['id_protein', 'uniprot_acc', 'hgnc_id', 'gene_id', 'gene_name_x',
       'length', 'sequence', 'gene_name_y', 'ft_id', 'change', 'category',
       'snp_id', 'disease_name', 'mim', 'disease', 'aux', 'from_aa', 'to_aa',
       'start_aa', 'end_aa', 'ctrl', 'aa_ctrl'],
      dtype='object')

In [85]:
uniprot_llps[['gene_name_x', 'disease_name', 'aux', 'ctrl', 'aa_ctrl']]

Unnamed: 0,gene_name_x,disease_name,aux,ctrl,aa_ctrl
3,RBM47,-,"(Met, 565, Val)",True,M
4,RBM47,-,"(Gly, 538, Arg)",True,G
5,ESYT2,-,"(Cys, 210, Ser)",True,C
6,ESYT2,-,"(Ser, 638, Gly)",True,S
7,ESYT3,-,"(Pro, 246, Gln)",True,P
...,...,...,...,...,...
22703,CDKN2A,-,"(Gly, 116, Asp)",True,G
22713,AKAP7,-,"(Glu, 26, Lys)",True,E
22714,AKAP7,-,"(Ser, 215, Asn)",True,S
22715,NRXN1,-,"(Tyr, 400, Asn)",True,Y


In [86]:
uniprot_llps_mutations = uniprot_llps.drop(columns= ['gene_name_x', 'disease_name', 'aux', 'ctrl', 'aa_ctrl'])
uniprot_llps_mutations.rename(columns={'gene_name_y': 'gene_name'}, inplace= True)

In [88]:
uniprot_llps_mutations['consequence'] = 'missense'
uniprot_llps_mutations['source'] = 'uniprot'

In [92]:
# Final table
uniprot_llps_mutations.sort_values('disease').head(10)

Unnamed: 0,id_protein,uniprot_acc,hgnc_id,gene_id,length,sequence,gene_name,ft_id,change,category,snp_id,mim,disease,from_aa,to_aa,start_aa,end_aa,consequence,source
15525,2259,Q16836,HGNC:4799,3033.0,4369,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,HADH,VAR_024079,Ala40Thr,LP/P,137853100.0,MIM:231530,3-alpha-hydroxyacyl-CoA dehydrogenase deficien...,Ala,Thr,40,40,missense,uniprot
15526,2259,Q16836,HGNC:4799,3033.0,4369,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,HADH,VAR_024080,Asp57Glu,LP/P,137853100.0,MIM:231530,3-alpha-hydroxyacyl-CoA dehydrogenase deficien...,Asp,Glu,57,57,missense,uniprot
16331,2471,Q6NVY1,HGNC:4908,26275.0,4369,MGQREMWRLMSRFNAFKRTNTILHHLRMSKHTDAAEEVLLEKKGCT...,HIBCH,VAR_031870,Tyr122Cys,LP/P,121918300.0,MIM:250620,3-hydroxyisobutryl-CoA hydrolase deficiency (H...,Tyr,Cys,122,122,missense,uniprot
20478,3651,Q9HCC0,HGNC:6937,64087.0,4369,MWAVLRLALRPCARASPAGPRAYHGDSVASLGTQPDLGSALYQENY...,MCCC2,VAR_072518,Ile200Asn,US,140806700.0,MIM:210210,3-methylcrotonoyl-CoA carboxylase 2 deficiency...,Ile,Asn,200,200,missense,uniprot
20479,3651,Q9HCC0,HGNC:6937,64087.0,4369,MWAVLRLALRPCARASPAGPRAYHGDSVASLGTQPDLGSALYQENY...,MCCC2,VAR_072519,Ala218Val,LP/P,760420200.0,MIM:210210,3-methylcrotonoyl-CoA carboxylase 2 deficiency...,Ala,Val,218,218,missense,uniprot
20480,3651,Q9HCC0,HGNC:6937,64087.0,4369,MWAVLRLALRPCARASPAGPRAYHGDSVASLGTQPDLGSALYQENY...,MCCC2,VAR_072520,Gly220Glu,LP/P,1254750000.0,MIM:210210,3-methylcrotonoyl-CoA carboxylase 2 deficiency...,Gly,Glu,220,220,missense,uniprot
20481,3651,Q9HCC0,HGNC:6937,64087.0,4369,MWAVLRLALRPCARASPAGPRAYHGDSVASLGTQPDLGSALYQENY...,MCCC2,VAR_072521,Pro224Leu,LP/P,1195601000.0,MIM:210210,3-methylcrotonoyl-CoA carboxylase 2 deficiency...,Pro,Leu,224,224,missense,uniprot
20477,3651,Q9HCC0,HGNC:6937,64087.0,4369,MWAVLRLALRPCARASPAGPRAYHGDSVASLGTQPDLGSALYQENY...,MCCC2,VAR_072517,Arg193His,LP/P,535519600.0,MIM:210210,3-methylcrotonoyl-CoA carboxylase 2 deficiency...,Arg,His,193,193,missense,uniprot
20482,3651,Q9HCC0,HGNC:6937,64087.0,4369,MWAVLRLALRPCARASPAGPRAYHGDSVASLGTQPDLGSALYQENY...,MCCC2,VAR_072522,Gly237Asp,LP/P,,MIM:210210,3-methylcrotonoyl-CoA carboxylase 2 deficiency...,Gly,Asp,237,237,missense,uniprot
20484,3651,Q9HCC0,HGNC:6937,64087.0,4369,MWAVLRLALRPCARASPAGPRAYHGDSVASLGTQPDLGSALYQENY...,MCCC2,VAR_072525,His282Arg,LP/P,,MIM:210210,3-methylcrotonoyl-CoA carboxylase 2 deficiency...,His,Arg,282,282,missense,uniprot


In [94]:
uniprot_llps_mutations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21246 entries, 3 to 22716
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_protein   21246 non-null  int64  
 1   uniprot_acc  21246 non-null  object 
 2   hgnc_id      21242 non-null  object 
 3   gene_id      21230 non-null  float64
 4   length       21246 non-null  int64  
 5   sequence     21246 non-null  object 
 6   gene_name    21246 non-null  object 
 7   ft_id        21246 non-null  object 
 8   change       21246 non-null  object 
 9   category     21246 non-null  object 
 10  snp_id       16770 non-null  float64
 11  mim          9565 non-null   object 
 12  disease      11926 non-null  object 
 13  from_aa      21246 non-null  object 
 14  to_aa        21246 non-null  object 
 15  start_aa     21246 non-null  object 
 16  end_aa       21246 non-null  object 
 17  consequence  21246 non-null  object 
 18  source       21246 non-null  object 
dtypes: f

In [107]:
(uniprot_llps_mutations[uniprot_llps_mutations.disease.notnull()].disease.value_counts() > 10).sum()

225

In [109]:
uniprot_llps_mutations[uniprot_llps_mutations.disease.notnull()].disease.value_counts()[:20]

Sporadic cancers                                                                852
Hemophilia A (HEMA)                                                             474
A sporadic cancer                                                               367
A breast cancer sample                                                          221
Breast cancer (BC)                                                              198
A colorectal cancer sample                                                      153
Carbamoyl phosphate synthetase 1 deficiency (CPS1D)                             145
Cystic fibrosis (CF)                                                            137
Alzheimer disease 3 (AD3)                                                       123
Polycystic kidney disease 1 with or without polycystic liver disease (PKD1)     107
Hereditary non-polyposis colorectal cancer 2 (HNPCC2)                           107
Spastic paraplegia 4, autosomal dominant (SPG4)                             

## Save

In [93]:
uniprot_llps_mutations.to_csv('datasets/uniprot_all_proteins_mutations.csv', index= False)