In [5]:
import pandas as pd
import numpy as np
import re
from Bio.SeqUtils import seq1
from functools import reduce

## Cargo datasets

In [6]:
# Box1 database
box = pd.read_csv('box1_proteins.csv')
box.head()

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence
0,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...
1,O43663,Homo sapiens,pcg body,HGNC:9341,620,MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...
2,O75494,Homo sapiens,nuclear speckle,HGNC:16713,262,MSRYLRPPNTSLFVRNVADDTRSEDLRREFGRYGPIVDVYVPLDFY...
3,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...
4,P08047,Homo sapiens,centrosome/spindle pole body,HGNC:11205,785,MSDQDHSMDEMTAVVKIEKGVGGNNGGNGNGGGAFSQARSSSTGSS...


In [7]:
# Dominios PFAM
pfam_box = pd.read_csv('pfam_box1.csv')
pfam_box = pfam_box.rename(columns={'pfam_acc': 'pfam_id', 'domain':'pfam_domain','start':'pfam_start', 'end':'pfam_end'})

# LC zones
lc_box = pd.read_csv('lc_zones_box1.csv')
lc_box = lc_box.rename(columns={'start': 'lc_start', 'end':'lc_end', 'seq':'lc_seq', 'largo':'lc_length'})

# Regiones desordenadas
idr_box = pd.read_csv('idrs_box1_mobidb.csv')
idr_box = idr_box.rename(columns={'tipo':'idr_tipo', 'start':'idr_start', 'end':'idr_end'})

In [8]:
# Leo el txt de variantes humanas en uniprot
'''humsavar.txt:
Index of manually curated Human polymorphisms and disease mutations from UniProtKB/Swiss-Prot.
This file lists all missense variants annotated in UniProtKB/Swiss-Prot human
entries. It provides a variant classification which is intended for research
purposes only, not for clinical and diagnostic use.
 - The column 'Variant category' shows the classification of the variant using
   the American College of Medical Genetics and Genomics/Association for
   Molecular Pathology (ACMG/AMP) terminology (Richards et al. PubMed:25741868)
   into the following categories:
   
   LP/P = likely pathogenic or pathogenic
   LB/B = likely benign or benign
   US   = uncertain significance

   These categories are assigned based on the variant annotation in the
   corresponding UniProtKB/Swiss-Prot entries that is curated from literature
   reports. The classification may change over time and must not be considered
   as a definitive statement about the pathogenic role of a variant.

 - The column 'Disease name' shows the name of the disease or the disease sample
   in which variants have been found. Names are only provided for diseases
   catalogued in OMIM and for cancer samples.
'''
with open('G:\My Drive\FIL\project\humsavar.txt') as f:
    gene_name=[]
    uniprot=[]
    ft_id=[]
    change=[]
    category=[]
    snp_id=[]
    disease_name=[]
    for line in f:
        stripped_line = line.strip()
        gene_name.append(stripped_line[0:10])
        uniprot.append(stripped_line[10:21])
        ft_id.append(stripped_line[21:33])
        change.append(stripped_line[33:48])
        category.append(stripped_line[48:57])
        snp_id.append(stripped_line[57:72])
        disease_name.append(stripped_line[72:])

In [9]:
# Creo el dataframe: humsavar
humsavar = pd.DataFrame(list(zip(gene_name, uniprot, ft_id, change, category, snp_id, disease_name)), columns=['gene_name', 'uniprot', 'ft_id', 'change', 'category', 'snp_id', 'disease_name'])
humsavar = humsavar.drop([0]).reset_index(drop=True)

# Eliminar los espacios en blanco
humsavar = humsavar.applymap(lambda x: x.strip())

# agrego los mim access en otra col
humsavar['mim'] = humsavar.disease_name.map(lambda x: re.findall('\[(.*?)\]', x))
humsavar['mim'] = humsavar.mim.str[0]

'''#Lo guardo
humsavar.to_csv('humsavar.csv', index=False)
print(humsavar.head())
print('Total de entradas:', len(humsavar))'''

"#Lo guardo\nhumsavar.to_csv('humsavar.csv', index=False)\nprint(humsavar.head())\nprint('Total de entradas:', len(humsavar))"

In [10]:
#agrego los mim access en otra col
humsavar['mim'] = humsavar.disease_name.map(lambda x: re.findall('\[(.*?)\]', x))
humsavar['mim'] = humsavar.mim.str[0]
humsavar.mim.head(50)

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
5            NaN
6            NaN
7            NaN
8            NaN
9            NaN
10           NaN
11           NaN
12           NaN
13           NaN
14           NaN
15           NaN
16           NaN
17           NaN
18           NaN
19           NaN
20           NaN
21           NaN
22           NaN
23           NaN
24           NaN
25           NaN
26           NaN
27           NaN
28           NaN
29    MIM:231550
30    MIM:231550
31    MIM:231550
32           NaN
33           NaN
34           NaN
35           NaN
36           NaN
37           NaN
38           NaN
39           NaN
40           NaN
41           NaN
42           NaN
43           NaN
44           NaN
45           NaN
46           NaN
47           NaN
48           NaN
49           NaN
Name: mim, dtype: object

In [11]:
# Ojo, por que hay snps repetidos??
humsavar.snp_id.value_counts()

-              13701
rs121913279        9
rs121918488        9
rs75076352         8
rs79781594         7
               ...  
rs55710741         1
rs1052030          1
rs35371077         1
rs394732           1
rs119482084        1
Name: snp_id, Length: 63272, dtype: int64

In [12]:
# Veo uno por ejemplo
humsavar[humsavar.snp_id == 'rs121913273']

Unnamed: 0,gene_name,uniprot,ft_id,change,category,snp_id,disease_name,mim
53375,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,Breast cancer (BC) [MIM:114480],MIM:114480
53376,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,CLAPO syndrome (CLAPO) [MIM:613089],MIM:613089
53377,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,Colorectal cancer (CRC) [MIM:114500],MIM:114500
53378,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,"Congenital lipomatous overgrowth, vascular mal...",MIM:612918
53379,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,"Keratosis, seborrheic (KERSEB) [MIM:182000]",MIM:182000
53380,PIK3CA,P42336,VAR_026173,p.Glu542Lys,US,rs121913273,Macrodactyly (MADAC) [MIM:155500],MIM:155500
53381,PIK3CA,P42336,VAR_026174,p.Glu542Gln,US,rs121913273,-,


## Merge con Box1

In [13]:
# Merge con box1
box_uniprot_variants = box.merge(humsavar)
box_uniprot_variants

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,snp_id,disease_name,mim
0,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,rs104893751,-,
1,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,rs56053615,-,
2,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009521,p.Ser326Cys,LB/B,rs1052133,-,
3,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014487,p.Arg229Gln,LB/B,rs1805373,-,
4,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014488,p.Ser320Thr,LB/B,rs1801128,-,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4089,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_017818,p.Trp274Arg,LP/P,rs104894163,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255
4090,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_019202,p.Gly242Ser,LB/B,rs11567901,-,
4091,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_033025,p.Arg366Leu,US,-,A breast cancer sample,
4092,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,-,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255


In [14]:
box_uniprot_variants.uniprot.value_counts()

P04637    1338
Q9P2D1     147
P02545     145
P40337     140
P21359     106
          ... 
Q9NWZ8       1
Q96SQ7       1
Q96HA1       1
O14641       1
Q96MU7       1
Name: uniprot, Length: 391, dtype: int64

In [15]:
box_uniprot_variants['aa1'] = box_uniprot_variants.change.str[2:5]
box_uniprot_variants['pos'] = box_uniprot_variants.change.str[5:-3]
box_uniprot_variants['aa2'] = box_uniprot_variants.change.str[-3:]
box_uniprot_variants

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,snp_id,disease_name,mim,aa1,pos,aa2
0,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,rs104893751,-,,Arg,46,Gln
1,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,rs56053615,-,,Arg,154,His
2,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009521,p.Ser326Cys,LB/B,rs1052133,-,,Ser,326,Cys
3,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014487,p.Arg229Gln,LB/B,rs1805373,-,,Arg,229,Gln
4,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014488,p.Ser320Thr,LB/B,rs1801128,-,,Ser,320,Thr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4089,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_017818,p.Trp274Arg,LP/P,rs104894163,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255,Trp,274,Arg
4090,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_019202,p.Gly242Ser,LB/B,rs11567901,-,,Gly,242,Ser
4091,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_033025,p.Arg366Leu,US,-,A breast cancer sample,,Arg,366,Leu
4092,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,-,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255,Arg,298,Gln


In [16]:
#Paso los aa de tres letras a una
box_uniprot_variants['ctrl'] = False
for i in box_uniprot_variants.index:
    aa1 = box_uniprot_variants.aa1[i]
    aa2 = box_uniprot_variants.aa2[i]
    if str(aa1) != 'nan':
        box_uniprot_variants.aa1[i] = str(seq1(aa1))
        #Evaluo
        if int(box_uniprot_variants.pos[i]) <= int(box_uniprot_variants.length[i]): 
            if box_uniprot_variants.sequence[i][int(box_uniprot_variants.pos[i])-1] == box_uniprot_variants.aa1[i]:
                box_uniprot_variants.ctrl[i] = True
    if str(aa2) != 'nan':
        box_uniprot_variants.aa2[i] = str(seq1(aa2))

box_uniprot_variants

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  box_uniprot_variants.aa1[i] = str(seq1(aa1))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  box_uniprot_variants.ctrl[i] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  box_uniprot_variants.aa2[i] = str(seq1(aa2))


Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,snp_id,disease_name,mim,aa1,pos,aa2,ctrl
0,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,rs104893751,-,,R,46,Q,True
1,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,rs56053615,-,,R,154,H,True
2,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009521,p.Ser326Cys,LB/B,rs1052133,-,,S,326,C,True
3,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014487,p.Arg229Gln,LB/B,rs1805373,-,,R,229,Q,True
4,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014488,p.Ser320Thr,LB/B,rs1801128,-,,S,320,T,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4089,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_017818,p.Trp274Arg,LP/P,rs104894163,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255,W,274,R,True
4090,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_019202,p.Gly242Ser,LB/B,rs11567901,-,,G,242,S,True
4091,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_033025,p.Arg366Leu,US,-,A breast cancer sample,,R,366,L,True
4092,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,-,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255,R,298,Q,True


In [17]:
box_uniprot_variants.ctrl.value_counts()
# todas los aa que cambian coinciden con la secuencia canonica

True    4094
Name: ctrl, dtype: int64

In [21]:
box_uniprot_variants[['uniprot', 'gene_name', 'mlo', 'snp_id', 'aa1', 'pos', 'aa2', 'disease_name', 'mim']]

Unnamed: 0,uniprot,gene_name,mlo,snp_id,aa1,pos,aa2,disease_name,mim
0,O15527,OGG1,nuclear speckle,rs104893751,R,46,Q,-,
1,O15527,OGG1,nuclear speckle,rs56053615,R,154,H,-,
2,O15527,OGG1,nuclear speckle,rs1052133,S,326,C,-,
3,O15527,OGG1,nuclear speckle,rs1805373,R,229,Q,-,
4,O15527,OGG1,nuclear speckle,rs1801128,S,320,T,-,
...,...,...,...,...,...,...,...,...,...
4089,P23771,GATA3,enhanceosome,rs104894163,W,274,R,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255
4090,P23771,GATA3,enhanceosome,rs11567901,G,242,S,-,
4091,P23771,GATA3,enhanceosome,-,R,366,L,A breast cancer sample,
4092,P23771,GATA3,enhanceosome,-,R,298,Q,"Hypoparathyroidism, sensorineural deafness, an...",MIM:146255


## Guardo

In [70]:
import os
os.getcwd()

'g:\\My Drive\\FIL\\project'

In [73]:
path = 'g:\\My Drive\\FIL\\project\\datasets\\uniprot_box1_variants.csv'
box_uniprot_variants.to_csv(path, index=False)

In [37]:
box_uniprot_variants.snp_id.value_counts()

-               1254
rs121909329        4
rs5030808          4
rs1057520007       3
rs1057519998       3
                ... 
rs1555532483       1
rs886037953        1
rs397509428        1
rs796052231        1
rs148489044        1
Name: snp_id, Length: 2554, dtype: int64

In [38]:
box_uniprot_variants[box_uniprot_variants.snp_id == '-']

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,snp_id,disease_name,mim,aa1,pos,aa2,ctrl
10,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_024834,p.Ser232Thr,US,-,-,,S,232,T,True
58,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,ATRX,VAR_001227,p.Leu192Phe,LP/P,-,"Alpha-thalassemia mental retardation syndrome,...",MIM:301040,L,192,F,True
59,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,ATRX,VAR_001228,p.Cys200Ser,LP/P,-,"Alpha-thalassemia mental retardation syndrome,...",MIM:301040,C,200,S,True
60,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,ATRX,VAR_001229,p.Cys220Arg,LP/P,-,"Alpha-thalassemia mental retardation syndrome,...",MIM:301040,C,220,R,True
61,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,ATRX,VAR_001230,p.Trp222Ser,LP/P,-,"Alpha-thalassemia mental retardation syndrome,...",MIM:301040,W,222,S,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4080,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_008748,p.Thr188Lys,US,-,-,,T,188,K,True
4081,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_008749,p.Glu196Lys,LP/P,-,Creutzfeldt-Jakob disease (CJD) [MIM:123400],MIM:123400,E,196,K,True
4086,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_008754,p.Pro238Ser,LB/B,-,-,,P,238,S,True
4091,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_033025,p.Arg366Leu,US,-,A breast cancer sample,,R,366,L,True


In [19]:
pfam_box

Unnamed: 0,uniprot,pfam_id,pfam_domain,pfam_start,pfam_end
0,P55072,PF00004,AAA,241,371
1,P55072,PF00004,AAA,514,647
2,Q5HY92,PF00004,AAA,522,652
3,O15381,PF00004,AAA,618,748
4,O15381,PF00004,AAA,301,434
...,...,...,...,...,...
1422,Q6PJT7,PF14608,zf-CCCH_2,681,698
1423,Q6PJT7,PF14608,zf-CCCH_2,701,717
1424,Q6PJT7,PF14608,zf-CCCH_2,640,656
1425,P61129,PF14608,zf-CCCH_2,329,349


In [22]:
# Merge con pfam
pfam = box_uniprot_variants.merge(pfam_box, on='uniprot')
# Variantes que caen en dominios
pfam['in_dom'] = False
for i in pfam.index:
    if str(pfam.pos[i]) != 'nan':
        if (int(pfam.pos[i]) >= pfam.pfam_start[i]) & (int(pfam.pos[i]) <= pfam.pfam_end[i]):
            pfam.in_dom[i] = True
        else:
            pfam.in_dom[i] = False

#pfam = pfam[pfam.in_dom == True]
pfam

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pfam.in_dom[i] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pfam.in_dom[i] = True


Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,...,mim,aa1,pos,aa2,ctrl,pfam_id,pfam_domain,pfam_start,pfam_end,in_dom
0,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,...,,R,46,Q,True,PF00730,HhH-GPD,142,301,False
1,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,...,,R,46,Q,True,PF07934,OGG_N,25,141,True
2,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,...,,R,154,H,True,PF00730,HhH-GPD,142,301,True
3,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,...,,R,154,H,True,PF07934,OGG_N,25,141,False
4,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009521,p.Ser326Cys,LB/B,...,,S,326,C,True,PF00730,HhH-GPD,142,301,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14706,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_033025,p.Arg366Leu,US,...,,R,366,L,True,PF00320,GATA,317,351,False
14707,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,...,MIM:146255,R,298,Q,True,PF00320,GATA,263,297,False
14708,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,...,MIM:146255,R,298,Q,True,PF00320,GATA,317,351,False
14709,Q8NE35,Homo sapiens,p-body,HGNC:21746,698,MQDDLLMDKSKTQPQPQQQQRQQQQPQPESSVSEAPSTPLSSETPK...,CPEB3,VAR_029776,p.Arg324Trp,LB/B,...,,R,324,W,True,PF16366,CEBP_ZZ,622,685,False


In [36]:
pfam = pfam[pfam.in_dom == True]
pfam[['uniprot', 'gene_name', 'mlo', 'snp_id', 'aa1', 'pos', 'aa2', 'pfam_id', 'pfam_domain', 'pfam_start', 'pfam_end', 'disease_name', 'mim']]

Unnamed: 0,uniprot,gene_name,mlo,snp_id,aa1,pos,aa2,pfam_id,pfam_domain,pfam_start,pfam_end,disease_name,mim
1,O15527,OGG1,nuclear speckle,rs104893751,R,46,Q,PF07934,OGG_N,25,141,-,
2,O15527,OGG1,nuclear speckle,rs56053615,R,154,H,PF00730,HhH-GPD,142,301,-,
6,O15527,OGG1,nuclear speckle,rs1805373,R,229,Q,PF00730,HhH-GPD,142,301,-,
10,O15527,OGG1,nuclear speckle,rs3219012,A,288,V,PF00730,HhH-GPD,142,301,-,
17,O15527,OGG1,nuclear speckle,rs17050550,A,85,S,PF07934,OGG_N,25,141,-,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14689,P04156,PRNP,cytoplasmic protein granule,rs776593792,V,203,I,PF00377,Prion,134,252,Creutzfeldt-Jakob disease (CJD) [MIM:123400],MIM:123400
14691,P04156,PRNP,cytoplasmic protein granule,rs398122370,E,211,Q,PF00377,Prion,134,252,Creutzfeldt-Jakob disease (CJD) [MIM:123400],MIM:123400
14693,P04156,PRNP,cytoplasmic protein granule,rs751882709,Q,212,P,PF00377,Prion,134,252,Gerstmann-Straussler disease (GSD) [MIM:137440],MIM:137440
14695,P04156,PRNP,cytoplasmic protein granule,-,P,238,S,PF00377,Prion,134,252,-,


In [29]:
# Merge con Low Complexity
lc = box_uniprot_variants.merge(lc_box, on='uniprot')
lc['in_lc'] = False
for i in lc.index:
    if str(lc.pos[i]) != 'nan':
        if (int(lc.pos[i]) >= lc.lc_start[i]) & (int(lc.pos[i]) <= lc.lc_end[i]):
            lc.in_lc[i] = True
        else:
            lc.in_lc[i] = False

#lc = lc[lc.in_lc == True]
lc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lc.in_lc[i] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lc.in_lc[i] = True


Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,...,mim,aa1,pos,aa2,ctrl,lc_start,lc_end,lc_seq,lc_length,in_lc
0,P08047,Homo sapiens,centrosome/spindle pole body,HGNC:11205,785,MSDQDHSMDEMTAVVKIEKGVGGNNGGNGNGGGAFSQARSSSTGSS...,SP1,VAR_019971,p.Thr737Ala,LB/B,...,,T,737,A,True,19,53,KGVGGNNGGNGNGGGAFSQARSSSTGSSSSTGGGG,35,False
1,P08047,Homo sapiens,centrosome/spindle pole body,HGNC:11205,785,MSDQDHSMDEMTAVVKIEKGVGGNNGGNGNGGGAFSQARSSSTGSS...,SP1,VAR_019971,p.Thr737Ala,LB/B,...,,T,737,A,True,305,318,ISSASLVSSQASSS,14,False
2,P08047,Homo sapiens,centrosome/spindle pole body,HGNC:11205,785,MSDQDHSMDEMTAVVKIEKGVGGNNGGNGNGGGAFSQARSSSTGSS...,SP1,VAR_019971,p.Thr737Ala,LB/B,...,,T,737,A,True,380,409,LQAGQQKEGEQNQQTQQQQILIQPQLVQGG,30,False
3,P42858,Homo sapiens,centrosome/spindle pole body,HGNC:4851,3142,MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPP...,HTT,VAR_054017,p.Glu1382Ala,LB/B,...,,E,1382,A,True,17,75,FQQQQQQQQQQQQQQQQQQQQQPPPPPPPPPPPQLPQPPPQAQPLL...,59,False
4,P42858,Homo sapiens,centrosome/spindle pole body,HGNC:4851,3142,MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPP...,HTT,VAR_054017,p.Glu1382Ala,LB/B,...,,E,1382,A,True,2632,2655,REEEWDEEEEEEADAPAPSSPPTS,24,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4185,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_073722,p.Gly127Val,LB/B,...,,G,127,V,True,111,132,HMAGAAAAGAVVGGLGGYMLGS,22,True
4186,Q8NE35,Homo sapiens,p-body,HGNC:21746,698,MQDDLLMDKSKTQPQPQQQQRQQQQPQPESSVSEAPSTPLSSETPK...,CPEB3,VAR_029776,p.Arg324Trp,LB/B,...,,R,324,W,True,12,27,TQPQPQQQQRQQQQPQ,16,False
4187,Q8NE35,Homo sapiens,p-body,HGNC:21746,698,MQDDLLMDKSKTQPQPQQQQRQQQQPQPESSVSEAPSTPLSSETPK...,CPEB3,VAR_029776,p.Arg324Trp,LB/B,...,,R,324,W,True,75,105,SPLLPGLSFHQPPQQPPPPQEPAAPGASLSP,31,False
4188,Q8NE35,Homo sapiens,p-body,HGNC:21746,698,MQDDLLMDKSKTQPQPQQQQRQQQQPQPESSVSEAPSTPLSSETPK...,CPEB3,VAR_029776,p.Arg324Trp,LB/B,...,,R,324,W,True,163,199,HQQPPPPAPAPQPAQPAQPPQAQPPQQRRSPASPSQA,37,False


In [30]:
lc[lc.in_lc == True]

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,...,mim,aa1,pos,aa2,ctrl,lc_start,lc_end,lc_seq,lc_length,in_lc
229,Q07955,Homo sapiens,nuclear speckle,HGNC:10780,248,MSGGGVIRGPAGNNDCRIYVGNLPPDIRTKDIEDVFYKYGAIRDID...,SRSF1,VAR_035488,p.Pro89Ser,US,...,,P,89,S,True,88,118,FPRSGRGTGRGGGGGGGGGAPRGRYGPPSRR,31,True
231,Q08170,Homo sapiens,nuclear speckle,HGNC:10786,494,MPRVYIGRLSYQARERDVERFFKGYGKILEVDLKNGYGFVEFDDLR...,SRSF4,VAR_052230,p.Glu253Asp,LB/B,...,,E,253,D,True,178,260,GSRRRRSYSRSRSHSRSRSRSRHSRKSRSRSGSSKSSHSKSRSRSR...,83,True
235,Q08170,Homo sapiens,nuclear speckle,HGNC:10786,494,MPRVYIGRLSYQARERDVERFFKGYGKILEVDLKNGYGFVEFDDLR...,SRSF4,VAR_052231,p.Gly338Ala,LB/B,...,,G,338,A,True,317,386,VSRGRSQEKSLRQSRSRSRSKGGSRSRSRSRSKSKDKRKGRKRSRE...,70,True
238,Q08170,Homo sapiens,nuclear speckle,HGNC:10786,494,MPRVYIGRLSYQARERDVERFFKGYGKILEVDLKNGYGFVEFDDLR...,SRSF4,VAR_052232,p.Gly356Ser,LB/B,...,,G,356,S,True,317,386,VSRGRSQEKSLRQSRSRSRSKGGSRSRSRSRSKSKDKRKGRKRSRE...,70,True
244,Q66PJ3,Homo sapiens,nuclear speckle,HGNC:18076,421,MPRCTYQLEQNPGFLPDGPGVHARAHCQDLSGPYGHEFATSESLGG...,ARL6IP4,VAR_058333,p.Lys286Arg,US,...,,K,286,R,True,246,311,ERSKQKARRRTRSSSSSSSSSSSSSSSSSSSSSSSSSDGRKKRGKY...,66,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4098,Q16637,Homo sapiens,cajal body,HGNC:11118,294,MAMSSGGSGGGVPEQEDSVLFRRGTGQSDDSDIWDDTALIKAYDKA...,SMN1,VAR_010051,p.Pro245Leu,LP/P,...,MIM:253400,P,245,L,True,193,250,FLPPPPPMPGPRLGPGKPGLKFNGPPPPPPPPPPHLLSCWLPPFPS...,58,True
4137,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_006466,p.Ala117Val,LB/B,...,,A,117,V,True,111,132,HMAGAAAAGAVVGGLGGYMLGS,22,True
4139,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_006467,p.Met129Val,LB/B,...,,M,129,V,True,111,132,HMAGAAAAGAVVGGLGGYMLGS,22,True
4183,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_014264,p.Gly131Val,LP/P,...,MIM:137440,G,131,V,True,111,132,HMAGAAAAGAVVGGLGGYMLGS,22,True


In [34]:
lc.sequence[229][87:118] == lc.lc_seq[229]

True

In [27]:
# Merge con IDRs
idr= box_uniprot_variants.merge(idr_box, on='uniprot')
idr['in_idr'] = False
for i in idr.index:
    if str(idr.pos[i]) != 'nan':
        if (int(idr.pos[i]) >= idr.idr_start[i]) & (int(idr.pos[i]) <= idr.idr_end[i]):
            idr.in_idr[i] = True
        else:
            idr.in_idr[i] = False

#idr = idr[idr.in_idr == True]
idr

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  idr.in_idr[i] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  idr.in_idr[i] = True


Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,...,mim,aa1,pos,aa2,ctrl,org,idr_tipo,idr_start,idr_end,in_idr
0,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,...,,R,46,Q,True,Homo sapiens,D_PA,324,345,False
1,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,...,,R,154,H,True,Homo sapiens,D_PA,324,345,False
2,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009521,p.Ser326Cys,LB/B,...,,S,326,C,True,Homo sapiens,D_PA,324,345,True
3,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014487,p.Arg229Gln,LB/B,...,,R,229,Q,True,Homo sapiens,D_PA,324,345,False
4,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_014488,p.Ser320Thr,LB/B,...,,S,320,T,True,Homo sapiens,D_PA,324,345,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13301,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,...,MIM:146255,R,298,Q,True,Homo sapiens,D_WC,194,216,False
13302,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,...,MIM:146255,R,298,Q,True,Homo sapiens,D_PA,361,389,False
13303,P23771,Homo sapiens,enhanceosome,HGNC:4172,443,MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLSHSYMDAAQYPLPEE...,GATA3,VAR_075427,p.Arg298Gln,LP/P,...,MIM:146255,R,298,Q,True,Homo sapiens,D_WC,414,443,False
13304,Q8NE35,Homo sapiens,p-body,HGNC:21746,698,MQDDLLMDKSKTQPQPQQQQRQQQQPQPESSVSEAPSTPLSSETPK...,CPEB3,VAR_029776,p.Arg324Trp,LB/B,...,,R,324,W,True,Homo sapiens,D_WC,1,114,False


In [28]:
idr[idr.in_idr == True]

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,...,mim,aa1,pos,aa2,ctrl,org,idr_tipo,idr_start,idr_end,in_idr
2,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009521,p.Ser326Cys,LB/B,...,,S,326,C,True,Homo sapiens,D_PA,324,345,True
111,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,PCNT,VAR_043887,p.Arg1960Gln,LB/B,...,,R,1960,Q,True,Homo sapiens,D_WC,1954,1974,True
142,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,PCNT,VAR_043890,p.Met2188Arg,LB/B,...,,M,2188,R,True,Homo sapiens,D_WC,2168,2214,True
152,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,PCNT,VAR_043891,p.Ser2191Pro,LB/B,...,,S,2191,P,True,Homo sapiens,D_WC,2168,2214,True
183,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,PCNT,VAR_056963,p.Pro2329Arg,LB/B,...,,P,2329,R,True,Homo sapiens,D_WC,2318,2374,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13236,P03372,Homo sapiens,enhanceosome,HGNC:3467,595,MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDS...,ESR1,VAR_004671,p.Gly160Cys,LB/B,...,,G,160,C,True,Homo sapiens,D_WC,146,174,True
13249,P03372,Homo sapiens,enhanceosome,HGNC:3467,595,MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDS...,ESR1,VAR_033029,p.Met264Ile,US,...,,M,264,I,True,Homo sapiens,D_PA,259,286,True
13257,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_006464,p.Pro102Leu,LP/P,...,MIM:137440,P,102,L,True,Homo sapiens,D_WC,26,108,True
13258,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,PRNP,VAR_006465,p.Pro105Leu,LP/P,...,MIM:137440,P,105,L,True,Homo sapiens,D_WC,26,108,True


In [57]:
#MERGE de las tres tablas
#df_list = [pfam, lc, idr]
#dom_merged = reduce(lambda left, right: pd.merge(left, right, on=list(box_uniprot_variants.columns), how='outer'), df_list)
#dom_merged = dom_merged.astype({"pos": np.int64})
#dom_merged

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,gene_name,ft_id,change,category,...,lc_start,lc_end,lc_seq,lc_length,in_lc,org,idr_tipo,idr_start,idr_end,in_idr
0,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,...,,,,,,Homo sapiens,D_PA,324.0,345.0,False
1,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009519,p.Arg46Gln,LB/B,...,,,,,,Homo sapiens,D_PA,324.0,345.0,False
2,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,...,,,,,,Homo sapiens,D_PA,324.0,345.0,False
3,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009520,p.Arg154His,LB/B,...,,,,,,Homo sapiens,D_PA,324.0,345.0,False
4,O15527,Homo sapiens,nuclear speckle,HGNC:8125,345,MPARALLPRRMGHRTLASTPALWASIPCPRSELRLDLVLPSGQSFR...,OGG1,VAR_009521,p.Ser326Cys,LB/B,...,,,,,,Homo sapiens,D_PA,324.0,345.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109393,O00165,Homo sapiens,p-body,HGNC:16915,279,MSLFDLFRGFFGFPGPRSHRDPFFGGMTRDEDDDEEEEEEGGSWGR...,HAX1,VAR_064514,p.Leu130Arg,LP/P,...,,,,,,Homo sapiens,D_WC,99.0,262.0,True
109394,O00165,Homo sapiens,p-body,HGNC:16915,279,MSLFDLFRGFFGFPGPRSHRDPFFGGMTRDEDDDEEEEEEGGSWGR...,HAX1,VAR_064515,p.Val172Ile,LB/B,...,,,,,,Homo sapiens,D_PA,16.0,65.0,False
109395,O00165,Homo sapiens,p-body,HGNC:16915,279,MSLFDLFRGFFGFPGPRSHRDPFFGGMTRDEDDDEEEEEEGGSWGR...,HAX1,VAR_064515,p.Val172Ile,LB/B,...,,,,,,Homo sapiens,D_WC,99.0,262.0,True
109396,Q9NRX1,Homo sapiens,nucleolus,HGNC:32790,252,MESEMETQSARAEEGFTQVTRKGGRRAKKRQAEQLSAAGEGGDAGR...,PNO1,VAR_029814,p.Arg11Gly,LB/B,...,,,,,,Homo sapiens,D_WC,1.0,71.0,True


In [58]:
len(dom_merged.uniprot.unique())

391

In [59]:
dom_merged.snp_id.value_counts()

-               29706
rs281875187       686
rs200806228       396
rs1554602465      396
rs200898742       396
                ...  
rs17850813          1
rs55813244          1
rs6742946           1
rs730882216         1
rs610913            1
Name: snp_id, Length: 2554, dtype: int64

In [None]:
dom_merged.pfam_id.fillna(df.)