In [447]:
import pandas as pd
import numpy as np
import re
from Bio.SeqUtils import seq3

In [339]:
# dataset: Variant Summary de ClinVar (es la db completa)
vs = pd.read_csv('G:\My Drive\FIL\project\datasets\\vs.csv.gz')

In [340]:
# Box1 database
box = pd.read_csv('box1_proteins.csv')

# Merge box1 con clinvar dataset

In [341]:
# Mergeo con el dataset completo
box1_clinvar_total = box.merge(vs)

In [342]:
len(box1_clinvar_total.uniprot.unique())

396

In [343]:
# Para generar una col con los codigos NM... estos son los id de los transcriptos
box1_clinvar_total['nuccore_id'] = box1_clinvar_total.name.map(lambda x: re.findall('[A-Z]{2}\_[0-9]+\.[0-9]*', x))
box1_clinvar_total['nuccore_id'] = box1_clinvar_total.nuccore_id.str[0]
box1_clinvar_total.nuccore_id

0        NM_002542.5
1        NM_003656.5
2        NM_002542.5
3        NM_002542.5
4        NM_003981.4
            ...     
40991    NM_007363.5
40992    NM_007363.5
40993    NM_007363.5
40994    NM_005066.3
40995    NM_005066.3
Name: nuccore_id, Length: 40996, dtype: object

## Primero: entradas que tienen cambio en proteinas (p. ...)

In [344]:
box1_clinvar_total['cambio'] = box1_clinvar_total.name.map(lambda x: re.findall('\(p\..*\)$', x))
box1_clinvar_total['cambio'] = box1_clinvar_total.cambio.str[0]
box1_clinvar_total.cambio = box1_clinvar_total.cambio.str.strip('()')  # para quitar los parentesis
box1_clinvar_total.cambio = box1_clinvar_total.cambio.str.lstrip('p.')                             # se usa lstrip xq strip tambien saca las p del final 
box1_clinvar_total.cambio

0         Arg46Gln
1              NaN
2              NaN
3              NaN
4              NaN
           ...    
40991          NaN
40992    Thr424Ser
40993      Pro83fs
40994      Thr411=
40995      Gln564=
Name: cambio, Length: 40996, dtype: object

## Posibilidades de cambio
------------------------------- 
- *=*      es un sinonimo, el mismo aa  
- *aa*       cambia a otro aa
- *Ter*      se inserta un aa de terminacion: nonsense mutation
- *fs*       frameshift, corrimiento del marco de lectura
- *ins*      insercion de uno o mas aa en esa posicion
- *del*      delecion de uno o mas aa en esa posicion
- *delins*   delecion de uno o un segmento de aa e insercion de uno o mas aa  

Las nomenclaturas para cada tipo de variante se pueden consultar en Sequence Variant Nomenclature (https://varnomen.hgvs.org/)

In [345]:
box1_clinvar_total.cambio[500:550]

500    Glu1828Ter
501      Ala1861=
502    Thr1869Met
503      Ala1893=
504    Val1894Ile
505    Leu1919Pro
506    Arg1939Trp
507      Val1994=
508    Gln1998Ter
509      Gln2031=
510      Ser2032=
511      Leu2034=
512    Asp2065Glu
513      Ala2092=
514      Asp2116=
515    Gly2117Arg
516      Ala2149=
517    Pro2193Leu
518      Pro2296=
519    Arg2400Cys
520      Asp2427=
521    Glu2428Lys
522    His2435Arg
523    Gly2437Arg
524    Asp2448Asn
525    Asp2452Tyr
526    Arg2514Gln
527      Thr2531=
528    Ala2545Val
529      Ser2548=
530    Arg2663His
531      Cys2705=
532    His2710Tyr
533      His2710=
534    Arg2855Gln
535      Leu2857=
536      Glu2874=
537    His2929Pro
538    Ala2977Val
539    Ser2984Cys
540    Ala2986Thr
541    Thr2993Ala
542      Arg2999=
543    Val3012Ala
544    His3017Gln
545    Asn3052Ser
546    Glu3102Lys
547    Arg3111Cys
548           NaN
549    Leu3137Val
Name: cambio, dtype: object

In [346]:
# saco los nans
box1_clinvar_total = box1_clinvar_total[box1_clinvar_total.cambio.notnull()]
box1_clinvar_total.cambio

0         Arg46Gln
6           Thr94=
17       Tyr511Cys
27       Glu220Ter
28        Ala630fs
           ...    
40990    Arg153Ter
40992    Thr424Ser
40993      Pro83fs
40994      Thr411=
40995      Gln564=
Name: cambio, Length: 31563, dtype: object

In [347]:
len(box1_clinvar_total)

31563

## synonyms: son mutaciones silent. No las tenremos en cuenta

In [348]:
# separo las mutaciones sinonimas, no las tenemos en cuenta xq no surgen cambio en la proteina
syn = box1_clinvar_total[box1_clinvar_total.cambio.str.endswith('=')]

In [349]:
print(len(syn))
syn.head()

8432


Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,start,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio
6,O43663,Homo sapiens,pcg body,HGNC:9341,620,MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...,9055,PRC1,2301826,167481,...,90981967,90981967,single nucleotide variant,NM_003981.4(PRC1):c.282G>A (p.Thr94=),germline,"MONDO:MONDO:0016419,MedGen:C0006142,OMIM:11448...",Familial cancer of breast,ClinGen:CA273794,NM_003981.4,Thr94=
38,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,16979162,101236,...,46411704,46411704,single nucleotide variant,NM_006031.6(PCNT):c.5631C>T (p.Ile1877=),germline,"MedGen:CN169374|MONDO:MONDO:0008872,MedGen:C04...",not specified|Microcephalic osteodysplastic pr...,ClinGen:CA148443,NM_006031.6,Ile1877=
39,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,61735811,101237,...,46411707,46411707,single nucleotide variant,NM_006031.6(PCNT):c.5634C>T (p.Asp1878=),germline,"MedGen:CN169374|MONDO:MONDO:0008872,MedGen:C04...",not specified|Microcephalic osteodysplastic pr...,ClinGen:CA148445,NM_006031.6,Asp1878=
44,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,17371795,101242,...,46435963,46435963,single nucleotide variant,NM_006031.6(PCNT):c.8811A>G (p.Thr2937=),germline,"MedGen:CN169374|MONDO:MONDO:0008872,MedGen:C04...",not specified|Microcephalic osteodysplastic pr...,ClinGen:CA148453,NM_006031.6,Thr2937=
45,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,61735826,101243,...,46436140,46436140,single nucleotide variant,NM_006031.6(PCNT):c.8988C>T (p.Ala2996=),germline,"MedGen:CN169374|MONDO:MONDO:0008872,MedGen:C04...",not specified|Microcephalic osteodysplastic pr...,ClinGen:CA148455,NM_006031.6,Ala2996=


### Voy eliminando del df original por el indice

In [350]:
cond = box1_clinvar_total.index.isin(syn.index) # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index) # drop esas filas
len(box1_clinvar_total)

23131

# Subset delins

In [351]:
((box1_clinvar_total.cambio.map(lambda x: re.findall('(delins)', x))).str[0]).notnull().value_counts() # hay 94 delins

False    23037
True        94
Name: cambio, dtype: int64

In [352]:
box1_clinvar_total.cambio.map(lambda x: re.findall('.*delins.*', x))

0        []
17       []
27       []
28       []
29       []
         ..
40988    []
40989    []
40990    []
40992    []
40993    []
Name: cambio, Length: 23131, dtype: object

## escribir explicacion de la regex

In [353]:
# prueba
a = re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?delins(.*)$', 'Lys1409_Ala1410delinsAsn')
a = a[0]
a

('Lys', '1409', 'Ala', '1410', 'Asn')

In [354]:
# capturo 'delins' en el medio de un cambio, .* es cualquier cosa 0 o mas veces
delins = box1_clinvar_total.copy()
delins['consequence'] = delins.cambio.map(lambda x: re.findall('.*delins.*', x))
delins['consequence'] = delins.consequence.str[0]
delins = delins[delins.consequence.notnull()]
delins

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio,consequence
673,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,1601759740,800183,...,46334559,Insertion,NM_006031.6(PCNT):c.429_430insTGGGATGTTCACAGTC...,germline,"MONDO:MONDO:0008872,MedGen:C0432246,OMIM:21072...",Microcephalic osteodysplastic primordial dwarf...,-,NM_006031.6,Gly144delinsTrpAspValHisSerGlnTer,Gly144delinsTrpAspValHisSerGlnTer
815,P42858,Homo sapiens,centrosome/spindle pole body,HGNC:4851,3142,MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPP...,3064,HTT,71180116,40570,...,3074877,Microsatellite,NM_002111.8(HTT):c.52CAG[(27_35)] (p.Gln18_Gln...,not provided,"MONDO:MONDO:0007739,MedGen:C0020179,OMIM:14310...",Huntington disease,-,NM_002111.8,Gln18_Gln38delinsGlnGlnGlnGlnGlnGlnGlnGlnGlnGl...,Gln18_Gln38delinsGlnGlnGlnGlnGlnGlnGlnGlnGlnGl...
1153,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,546,ATRX,1603095546,650237,...,77654188,Indel,NM_000489.5(ATRX):c.4227_4230delinsT (p.Lys140...,germline,"MONDO:MONDO:0010519,MedGen:C1845055,OMIM:30104...",Alpha thalassemia-X-linked intellectual disabi...,-,NM_000489.5,Lys1409_Ala1410delinsAsn,Lys1409_Ala1410delinsAsn
1501,Q99996,Homo sapiens,centrosome/spindle pole body,HGNC:379,3907,MEDEERQKKLEAGKAKLAQFRQRKAQSDGQSPSKKQKKKRKTSSSK...,10142,AKAP9,786205709,188452,...,92079169,Deletion,NM_005751.4(AKAP9):c.7034_7036del (p.Arg2345_G...,germline,"MONDO:MONDO:0020745,MedGen:C0003811,OMIM:11500...",Cardiac arrhythmia|Long QT syndrome,ClinGen:CA300621,NM_005751.4,Arg2345_Glu2346delinsLys,Arg2345_Glu2346delinsLys
2093,Q99996,Homo sapiens,centrosome/spindle pole body,HGNC:379,3907,MEDEERQKKLEAGKAKLAQFRQRKAQSDGQSPSKKQKKKRKTSSSK...,10142,AKAP9,1584041217,796083,...,92002132,Deletion,NM_005751.4(AKAP9):c.2213_2215del (p.Lys738_Gl...,germline,MedGen:CN517202,not provided,-,NM_005751.4,Lys738_Gly739delinsSer,Lys738_Gly739delinsSer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38252,Q9P2D1,Homo sapiens,nucleolus,HGNC:20626,2997,MADPGMMSLFGEDGNIFSEGLEGLGECGYPENPVNPMGQQMPIDQG...,55636,CHD7,1563625272,579625,...,60816409,Indel,NM_017780.4(CHD7):c.2520_2521delinsTC (p.Trp84...,germline,MedGen:C2711754,History of neurodevelopmental disorder,-,NM_017780.4,Trp840_Ala841delinsCysPro,Trp840_Ala841delinsCysPro
38494,Q9P2D1,Homo sapiens,nucleolus,HGNC:20626,2997,MADPGMMSLFGEDGNIFSEGLEGLGECGYPENPVNPMGQQMPIDQG...,55636,CHD7,1586317482,799557,...,60781388,Insertion,NM_017780.4(CHD7):c.2053_2054insGCAAAA (p.Ala6...,germline,MedGen:CN169374,not specified,-,NM_017780.4,Ala685delinsGlyLysThr,Ala685delinsGlyLysThr
38692,Q9P2D1,Homo sapiens,nucleolus,HGNC:20626,2997,MADPGMMSLFGEDGNIFSEGLEGLGECGYPENPVNPMGQQMPIDQG...,55636,CHD7,-1,946147,...,60824011,Indel,NM_017780.4(CHD7):c.3372_3373delinsAT (p.Met11...,germline,"MONDO:MONDO:0008965,MedGen:C0265354,OMIM:21480...",CHARGE association,-,NM_017780.4,Met1124_Asp1125delinsIleTyr,Met1124_Asp1125delinsIleTyr
38702,Q9P2D1,Homo sapiens,nucleolus,HGNC:20626,2997,MADPGMMSLFGEDGNIFSEGLEGLGECGYPENPVNPMGQQMPIDQG...,55636,CHD7,-1,955469,...,60741832,Indel,NM_017780.4(CHD7):c.395_400delinsCAA (p.Glu132...,germline,"MONDO:MONDO:0008965,MedGen:C0265354,OMIM:21480...",CHARGE association,-,NM_017780.4,Glu132_His134delinsAlaAsn,Glu132_His134delinsAlaAsn


In [355]:
delins['aux'] = delins.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?delins(.*)$', x))
delins['aux'] = delins.aux.str[0]
delins['aux']

673                  (Gly, 144, , , TrpAspValHisSerGlnTer)
815      (Gln, 18, Gln, 38, GlnGlnGlnGlnGlnGlnGlnGlnGln...
1153                           (Lys, 1409, Ala, 1410, Asn)
1501                           (Arg, 2345, Glu, 2346, Lys)
2093                             (Lys, 738, Gly, 739, Ser)
                               ...                        
38252                         (Trp, 840, Ala, 841, CysPro)
38494                            (Ala, 685, , , GlyLysThr)
38692                       (Met, 1124, Asp, 1125, IleTyr)
38702                         (Glu, 132, His, 134, AlaAsn)
39989                               (Arg, 208, , , AsnTer)
Name: aux, Length: 94, dtype: object

In [356]:
# las posiciones
delins['start_aa'] = delins.aux.map(lambda x: x[1])
delins.start_aa = delins.start_aa.apply(int)
delins.start_aa

673       144
815        18
1153     1409
1501     2345
2093      738
         ... 
38252     840
38494     685
38692    1124
38702     132
39989     208
Name: start_aa, Length: 94, dtype: int64

In [357]:
type(delins.start_aa[673])

numpy.int64

In [358]:
# end
delins['end_aa'] = delins.aux.map(lambda x: int(x[3]) if x[3] != '' else np.nan)
#delins.end_aa = delins.end_aa.astype('Int64')
delins.end_aa

673         NaN
815        38.0
1153     1410.0
1501     2346.0
2093      739.0
          ...  
38252     841.0
38494       NaN
38692    1125.0
38702     134.0
39989       NaN
Name: end_aa, Length: 94, dtype: float64

In [359]:
type(delins.end_aa[815])

numpy.float64

In [360]:
# from (el/los aa que cambian)
delins['from'] = delins.aux.map(lambda x: x[0] + x[2]) # concateno si existe mas de un aa que cambia (o sea, si es un rango)
delins['from']

673         Gly
815      GlnGln
1153     LysAla
1501     ArgGlu
2093     LysGly
          ...  
38252    TrpAla
38494       Ala
38692    MetAsp
38702    GluHis
39989       Arg
Name: from, Length: 94, dtype: object

In [361]:
delins['to'] = delins.aux.map(lambda x: x[4])
delins.to

673                                  TrpAspValHisSerGlnTer
815      GlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnG...
1153                                                   Asn
1501                                                   Lys
2093                                                   Ser
                               ...                        
38252                                               CysPro
38494                                            GlyLysThr
38692                                               IleTyr
38702                                               AlaAsn
39989                                               AsnTer
Name: to, Length: 94, dtype: object

In [362]:
delins['consequence'] = 'delins'

In [363]:
delins[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
673,Gly144delinsTrpAspValHisSerGlnTer,144,,Gly,TrpAspValHisSerGlnTer,delins
815,Gln18_Gln38delinsGlnGlnGlnGlnGlnGlnGlnGlnGlnGl...,18,38.0,GlnGln,GlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnG...,delins
1153,Lys1409_Ala1410delinsAsn,1409,1410.0,LysAla,Asn,delins
1501,Arg2345_Glu2346delinsLys,2345,2346.0,ArgGlu,Lys,delins
2093,Lys738_Gly739delinsSer,738,739.0,LysGly,Ser,delins
...,...,...,...,...,...,...
38252,Trp840_Ala841delinsCysPro,840,841.0,TrpAla,CysPro,delins
38494,Ala685delinsGlyLysThr,685,,Ala,GlyLysThr,delins
38692,Met1124_Asp1125delinsIleTyr,1124,1125.0,MetAsp,IleTyr,delins
38702,Glu132_His134delinsAlaAsn,132,134.0,GluHis,AlaAsn,delins


In [364]:
# elimino la col auxiliar
delins = delins.drop(columns=['aux'])

## elimino estas rows del df original

In [365]:
cond = box1_clinvar_total.index.isin(delins.index)                           # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index) # drop esas filas
len(box1_clinvar_total)

23037

# Subset deleciones

In [366]:
# capturo 'del' en el medio de un cambio, .* es cualquier cosa 0 o mas veces
deletions = box1_clinvar_total.copy()
deletions['consequence'] = deletions.cambio.map(lambda x: re.findall('.*del.*', x))
deletions['consequence'] = deletions.consequence.str[0]
deletions = deletions[deletions.consequence.notnull()]
deletions

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio,consequence
86,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,587784306,169616,...,46334596,Deletion,NM_006031.6(PCNT):c.467_505del (p.His156_Gln16...,germline,"MedGen:CN169374|MONDO:MONDO:0008872,MedGen:C04...",not specified|Microcephalic osteodysplastic pr...,ClinGen:CA173014,NM_006031.6,His156_Gln168del,His156_Gln168del
233,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,1555945478,208731,...,46334613,Deletion,NM_006031.6(PCNT):c.442_519del (p.Val148_Thr17...,germline,-,-,ClinGen:CA205246,NM_006031.6,Val148_Thr173del,Val148_Thr173del
404,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,562568796,351988,...,46435983,Deletion,NM_006031.6(PCNT):c.8830_8832del (p.Lys2944del),germline,"MONDO:MONDO:0000060,MedGen:CN239360|MedGen:CN5...",Microcephalic Osteodysplastic Primordial Dwarf...,ClinGen:CA10081094,NM_006031.6,Lys2944del,Lys2944del
603,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,759470656,578581,...,46385938,Microsatellite,NM_006031.6(PCNT):c.3418_3420CTC[1] (p.Leu1141...,unknown,"MONDO:MONDO:0008872,MedGen:C0432246,OMIM:21072...",Microcephalic osteodysplastic primordial dwarf...,-,NM_006031.6,Leu1141del,Leu1141del
869,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,546,ATRX,398123423,99044,...,77652294,Microsatellite,NM_000489.5(ATRX):c.4365_4367GGA[4] (p.Glu1464...,germline,"MedGen:CN169374|MONDO:MONDO:0010519,MedGen:C18...",not specified|Alpha thalassemia-X-linked intel...,ClinGen:CA175123,NM_000489.5,Glu1464del,Glu1464del
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40325,Q99700,Homo sapiens,stress granule,HGNC:10555,1313,MRSAAAAPRSPAVATESRRFAAARWPGWRSLQRPARRSGRGGGGAA...,6311,ATXN2,778119853,513609,...,111598859,Deletion,NM_002973.4(ATXN2):c.176_190del (p.Val59_Ser63...,unknown,"MONDO:MONDO:0012189,MedGen:C1836824,OMIM:60905...",Salt and pepper developmental regression syndrome,ClinGen:CA243622139,NM_002973.4,Val59_Ser63del,Val59_Ser63del
40348,P62913,Homo sapiens,nucleolus,HGNC:10301,178,MAQDQGEKENPMRELRIRKLCLNICVGESGDRLTRAAKVLEQLTGQ...,6135,RPL11,1570569083,20792,...,23695881,Microsatellite,NM_000975.5(RPL11):c.479_481AGG[1] (p.Glu161del),germline,"MONDO:MONDO:0012938,MedGen:C2675512,OMIM:61256...",Diamond-Blackfan anemia 7,OMIM:604175.0003,NM_000975.5,Glu161del,Glu161del
40367,P62913,Homo sapiens,nucleolus,HGNC:10301,178,MAQDQGEKENPMRELRIRKLCLNICVGESGDRLTRAAKVLEQLTGQ...,6135,RPL11,1553121852,427731,...,23694691,Deletion,NM_000975.5(RPL11):c.296_298del (p.Phe99del),germline,-,-,ClinGen:CA645372340,NM_000975.5,Phe99del,Phe99del
40600,Q14207,Homo sapiens,histone locus body,HGNC:7896,1427,MLLPSDVARLVLGYLQQENLISTCQTFILESSDLKEYAEHCTDEGF...,4863,NPAT,141989202,701546,...,108172811,Deletion,NM_002519.3(NPAT):c.2173_2175del (p.Ser726del),germline,MedGen:CN517202,not provided,-,NM_002519.3,Ser726del,Ser726del


In [367]:
deletions['aux'] = deletions.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?del(.*)$', x))
deletions['aux'] = deletions.aux.str[0]
deletions['aux']

86       (His, 156, Gln, 168, )
233      (Val, 148, Thr, 173, )
404           (Lys, 2944, , , )
603           (Leu, 1141, , , )
869           (Glu, 1464, , , )
                  ...          
40325      (Val, 59, Ser, 63, )
40348          (Glu, 161, , , )
40367           (Phe, 99, , , )
40600          (Ser, 726, , , )
40745      (Pro, 84, Gln, 91, )
Name: aux, Length: 396, dtype: object

In [368]:
def separar_en_cols(df, column, conseq):
    '''
    recibe un DataFrame, el nombre de una columna auxiliar (column)
    y un string con el tipo de consecuencia (conseq). La col. auxiliar
    es una tupla con los elementos implicados en una mutacion
    como la siguiente (aa1, start_pos, aa2, end_pos, aa/s_nuevos).
    Devuelve el DataFrame df con estas 5 nuevas columnas
    '''
    # start position
    df['start_aa'] = df[column].map(lambda x: x[1])
    df.start_aa = df.start_aa.apply(int)

    # end position
    df['end_aa'] = df[column].map(lambda x: int(x[3]) if x[3] != '' else np.nan)

    # from: es el/los aa que cambian
    df['from'] = df[column].map(lambda x: x[0] + x[2]) # concateno si existe mas de un aa que cambia (o sea, si es un rango)
    
    # to: aa/s nuevos
    df['to'] = df[column].map(lambda x: x[4] if x[4] != '' else np.nan)

    # consecuencia de la mutacion
    df['consequence'] = conseq


    return df[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]

In [369]:
separar_en_cols(deletions, 'aux', 'deletion')

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
86,His156_Gln168del,156,168.0,HisGln,,deletion
233,Val148_Thr173del,148,173.0,ValThr,,deletion
404,Lys2944del,2944,,Lys,,deletion
603,Leu1141del,1141,,Leu,,deletion
869,Glu1464del,1464,,Glu,,deletion
...,...,...,...,...,...,...
40325,Val59_Ser63del,59,63.0,ValSer,,deletion
40348,Glu161del,161,,Glu,,deletion
40367,Phe99del,99,,Phe,,deletion
40600,Ser726del,726,,Ser,,deletion


In [370]:
# elimino la col auxiliar
deletions = deletions.drop(columns=['aux'])

### elimino este subset del df original

In [371]:
cond = box1_clinvar_total.index.isin(deletions.index)                           # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)    # drop esas filas
len(box1_clinvar_total)

22641

# Subset inserciones

In [372]:
# capturo 'ins' en el medio de un cambio, .* es cualquier cosa 0 o mas veces
insertions = box1_clinvar_total.copy()
insertions['consequence'] = insertions.cambio.map(lambda x: re.findall('.*ins.*', x))
insertions['consequence'] = insertions.consequence.str[0]
insertions = insertions[insertions.consequence.notnull()]
insertions

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio,consequence
1002,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,546,ATRX,1557142576,430943,...,77684327,Duplication,NM_000489.5(ATRX):c.927_929dup (p.Glu310_His31...,germline,MedGen:CN169374,not specified,ClinGen:CA645373309,NM_000489.5,Glu310_His311insAsp,Glu310_His311insAsp
1275,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,546,ATRX,-1,850300,...,77663386,Deletion,NM_000489.6(ATRX):c.4116_4119del (p.Arg1372_Ly...,germline,"MONDO:MONDO:0010519,MedGen:C1845055,OMIM:30104...",Alpha thalassemia-X-linked intellectual disabi...,-,NM_000489.6,Arg1372_Lys1373insTer,Arg1372_Lys1373insTer
1491,Q99996,Homo sapiens,centrosome/spindle pole body,HGNC:379,3907,MEDEERQKKLEAGKAKLAQFRQRKAQSDGQSPSKKQKKKRKTSSSK...,10142,AKAP9,10644111,188441,...,92022865,Duplication,NM_147185.3(AKAP9):c.4004_4006dup (p.Leu1336_G...,germline,MedGen:CN169374|MedGen:CN230736|MedGen:C003582...,not specified|Cardiovascular phenotype|Romano-...,ClinGen:CA199739,NM_147185.3,Leu1336_Glu1337insGln,Leu1336_Glu1337insGln
2443,P49711,Homo sapiens,centrosome/spindle pole body,HGNC:13723,727,MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...,10664,CTCF,886041901,264964,...,67628522,Deletion,NM_006565.4(CTCF):c.1670_1674del (p.Val556_Cys...,germline,MedGen:CN517202,not provided,ClinGen:CA10603544,NM_006565.4,Val556_Cys557insTer,Val556_Cys557insTer
2606,P54132,Homo sapiens,pml nuclear body,HGNC:1058,1417,MAAVPQNNLQEQLERHSARTLNNKLSLSKPKFSGFTFKKKTSSDNN...,641,BLM,367543026,186946,...,90749846,Deletion,NM_000057.4(BLM):c.581_582del (p.Phe193_Phe194...,germline,"MONDO:MONDO:0008876,MedGen:C0005859,OMIM:21090...",Bloom syndrome,ClinGen:CA274291,NM_000057.4,Phe193_Phe194insTer,Phe193_Phe194insTer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39692,Q13428,Homo sapiens,nucleolus,HGNC:11654,1488,MAEARKRRELLPLIYHHLLRAGYVRAAREVKEQSGQKCFLAQPVTL...,6949,TCOF1,1554081168,455619,...,150398417,Deletion,NM_000356.4(TCOF1):c.4175_4176del (p.Asp1391_S...,germline,"MONDO:MONDO:0007944,MedGen:CN119605,OMIM:15450...",Treacher Collins syndrome 1,ClinGen:CA658657564,NM_000356.4,Asp1391_Ser1392insTer,Asp1391_Ser1392insTer
39723,Q13428,Homo sapiens,nucleolus,HGNC:11654,1488,MAEARKRRELLPLIYHHLLRAGYVRAAREVKEQSGQKCFLAQPVTL...,6949,TCOF1,1581136492,633648,...,150378917,Deletion,NM_001371623.1(TCOF1):c.2353del (p.Ser784_Val7...,germline,"MONDO:MONDO:0007944,MedGen:CN119605,OMIM:15450...",Treacher Collins syndrome 1,-,NM_001371623.1,Ser784_Val785insTer,Ser784_Val785insTer
39782,Q13428,Homo sapiens,nucleolus,HGNC:11654,1488,MAEARKRRELLPLIYHHLLRAGYVRAAREVKEQSGQKCFLAQPVTL...,6949,TCOF1,1581075276,790527,...,150369594,Insertion,NM_001371623.1(TCOF1):c.630_631insCTG (p.Asp21...,unknown,"MONDO:MONDO:0007944,MedGen:CN119605,OMIM:15450...",Treacher Collins syndrome 1,-,NM_001371623.1,Asp211_Val212insLeu,Asp211_Val212insLeu
39792,Q13428,Homo sapiens,nucleolus,HGNC:11654,1488,MAEARKRRELLPLIYHHLLRAGYVRAAREVKEQSGQKCFLAQPVTL...,6949,TCOF1,-1,830545,...,150389930,Deletion,NM_001371623.1(TCOF1):c.3091del (p.Arg1030_Ile...,germline,"MONDO:MONDO:0007944,MedGen:CN119605,OMIM:15450...",Treacher Collins syndrome 1,-,NM_001371623.1,Arg1030_Ile1031insTer,Arg1030_Ile1031insTer


In [373]:
# genero la col con las tuplas
insertions['aux'] = insertions.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?ins(.*)$', x))
insertions['aux'] = insertions.aux.str[0]
insertions['aux']

1002                             (Glu, 310, His, 311, Asp)
1275                           (Arg, 1372, Lys, 1373, Ter)
1491                           (Leu, 1336, Glu, 1337, Gln)
2443                             (Val, 556, Cys, 557, Ter)
2606                             (Phe, 193, Phe, 194, Ter)
                               ...                        
39692                          (Asp, 1391, Ser, 1392, Ter)
39723                            (Ser, 784, Val, 785, Ter)
39782                            (Asp, 211, Val, 212, Leu)
39792                          (Arg, 1030, Ile, 1031, Ter)
40721    (Gln, 59, Pro, 60, GlnGlyGlyGlyGlyTrpGlyGlnGln...
Name: aux, Length: 143, dtype: object

In [374]:
separar_en_cols(insertions, 'aux', 'insertion')

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
1002,Glu310_His311insAsp,310,311,GluHis,Asp,insertion
1275,Arg1372_Lys1373insTer,1372,1373,ArgLys,Ter,insertion
1491,Leu1336_Glu1337insGln,1336,1337,LeuGlu,Gln,insertion
2443,Val556_Cys557insTer,556,557,ValCys,Ter,insertion
2606,Phe193_Phe194insTer,193,194,PhePhe,Ter,insertion
...,...,...,...,...,...,...
39692,Asp1391_Ser1392insTer,1391,1392,AspSer,Ter,insertion
39723,Ser784_Val785insTer,784,785,SerVal,Ter,insertion
39782,Asp211_Val212insLeu,211,212,AspVal,Leu,insertion
39792,Arg1030_Ile1031insTer,1030,1031,ArgIle,Ter,insertion


In [375]:
# elimino la col auxiliar
insertions = insertions.drop(columns=['aux'])

In [376]:
cond = box1_clinvar_total.index.isin(insertions.index)                           # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)     # drop esas filas
len(box1_clinvar_total)

22498

# Subset frameshifts

In [377]:
((box1_clinvar_total.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?fs(.*)$', x))).str[0]).notnull().value_counts() # da 2759 y con endswith 2757

False    19739
True      2759
Name: cambio, dtype: int64

In [378]:
box1_clinvar_total[box1_clinvar_total.cambio.str.endswith('fs')] # 2757 filas

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,start,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio
28,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,397509366,19743,...,46355577,46355577,Deletion,NM_006031.6(PCNT):c.1887del (p.Ala630fs),germline,"MONDO:MONDO:0008872,MedGen:C0432246,OMIM:21072...",Microcephalic osteodysplastic primordial dwarf...,"ClinGen:CA250508,OMIM:605925.0002",NM_006031.6,Ala630fs
29,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,397514033,19744,...,46388844,46388845,Duplication,NM_006031.6(PCNT):c.3568dup (p.Cys1190fs),germline,"MONDO:MONDO:0008872,MedGen:C0432246,OMIM:21072...",Microcephalic osteodysplastic primordial dwarf...,"ClinGen:CA250509,OMIM:605925.0003",NM_006031.6,Cys1190fs
31,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,1601795448,19746,...,46346863,46346864,Duplication,NM_006031.6(PCNT):c.844dup (p.Glu282fs),germline,"MONDO:MONDO:0008872,MedGen:C0432246,OMIM:21072...",Microcephalic osteodysplastic primordial dwarf...,OMIM:605925.0005,NM_006031.6,Glu282fs
36,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,1369869782,48589,...,46353169,46353170,Duplication,NM_006031.6(PCNT):c.1528dup (p.Thr510fs),germline,"MONDO:MONDO:0008872,MedGen:C0432246,OMIM:21072...",Microcephalic osteodysplastic primordial dwarf...,OMIM:605925.0012,NM_006031.6,Thr510fs
109,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,587784302,169639,...,46366955,46366965,Deletion,NM_006031.6(PCNT):c.2984_2994del (p.Ala995fs),germline,"MONDO:MONDO:0008872,MedGen:C0432246,OMIM:21072...",Microcephalic osteodysplastic primordial dwarf...,ClinGen:CA251043,NM_006031.6,Ala995fs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40971,Q15233,Homo sapiens,paraspeckle,HGNC:7871,471,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,4841,NONO,1555950374,493310,...,71298727,71298728,Microsatellite,NM_007363.5(NONO):c.1192_1193AG[1] (p.Gly399fs),germline,MedGen:CN517202,not provided,ClinGen:CA658799797,NM_007363.5,Gly399fs
40974,Q15233,Homo sapiens,paraspeckle,HGNC:7871,471,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,4841,NONO,1602386427,650198,...,71291868,71291868,Deletion,NM_007363.5(NONO):c.245del (p.Pro82fs),germline,MedGen:CN517202,not provided,-,NM_007363.5,Pro82fs
40988,Q15233,Homo sapiens,paraspeckle,HGNC:7871,471,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,4841,NONO,1602385860,798822,...,71290743,71290743,Deletion,NM_007363.5(NONO):c.107del (p.Pro36fs),germline,"MONDO:MONDO:0010501,MedGen:C4225417,OMIM:30096...","Mental retardation, X-linked, syndromic 34",-,NM_007363.5,Pro36fs
40989,Q15233,Homo sapiens,paraspeckle,HGNC:7871,471,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,4841,NONO,-1,800680,...,71298725,71298726,Deletion,NM_007363.5(NONO):c.1191_1192del (p.Asn397fs),germline,"MONDO:MONDO:0010501,MedGen:C4225417,OMIM:30096...","Mental retardation, X-linked, syndromic 34",-,NM_007363.5,Asn397fs


In [379]:
# capturo 'fs' 
frameshift = box1_clinvar_total.copy()
frameshift['aux'] = frameshift.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?fs(.*)$', x))
frameshift['aux'] = frameshift.aux.str[0]
frameshift = frameshift[frameshift.aux.notnull()]
frameshift.aux

28        (Ala, 630, , , )
29       (Cys, 1190, , , )
31        (Glu, 282, , , )
36        (Thr, 510, , , )
109       (Ala, 995, , , )
               ...        
40971     (Gly, 399, , , )
40974      (Pro, 82, , , )
40988      (Pro, 36, , , )
40989     (Asn, 397, , , )
40993      (Pro, 83, , , )
Name: aux, Length: 2759, dtype: object

In [380]:
separar_en_cols(frameshift, 'aux', 'frameshift')

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
28,Ala630fs,630,,Ala,,frameshift
29,Cys1190fs,1190,,Cys,,frameshift
31,Glu282fs,282,,Glu,,frameshift
36,Thr510fs,510,,Thr,,frameshift
109,Ala995fs,995,,Ala,,frameshift
...,...,...,...,...,...,...
40971,Gly399fs,399,,Gly,,frameshift
40974,Pro82fs,82,,Pro,,frameshift
40988,Pro36fs,36,,Pro,,frameshift
40989,Asn397fs,397,,Asn,,frameshift


In [381]:
frameshift = frameshift.drop(columns=['aux'])

In [382]:
cond = box1_clinvar_total.index.isin(frameshift.index)                           # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)     # drop esas filas
len(box1_clinvar_total)

19739

# subset nonsense mutations:  
### son cuando aparecen un condon de terminacion (Ter) (definicion: https://www.genome.gov/genetics-glossary/Nonsense-Mutation)

In [383]:
((box1_clinvar_total.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?Ter(.*)$', x))).str[0]).notnull().value_counts() # dan lo mismo, 621

False    18118
True      1621
Name: cambio, dtype: int64

In [384]:
nonsense = box1_clinvar_total[box1_clinvar_total.cambio.str.endswith('Ter')]

In [385]:
nonsense['aux'] = nonsense.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?Ter(.*)$', x))
nonsense['aux'] = nonsense.aux.str[0]
nonsense = nonsense[nonsense.aux.notnull()]
nonsense.aux

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonsense['aux'] = nonsense.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?Ter(.*)$', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonsense['aux'] = nonsense.aux.str[0]


27        (Glu, 220, , , )
30       (Arg, 1923, , , )
32       (Glu, 1037, , , )
33       (Arg, 2918, , , )
35       (Glu, 1154, , , )
               ...        
40961      (Gln, 35, , , )
40963     (Arg, 337, , , )
40965      (Arg, 73, , , )
40976     (Arg, 184, , , )
40990     (Arg, 153, , , )
Name: aux, Length: 1621, dtype: object

In [386]:
separar_en_cols(nonsense, 'aux', 'nonsense')

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
27,Glu220Ter,220,,Glu,,nonsense
30,Arg1923Ter,1923,,Arg,,nonsense
32,Glu1037Ter,1037,,Glu,,nonsense
33,Arg2918Ter,2918,,Arg,,nonsense
35,Glu1154Ter,1154,,Glu,,nonsense
...,...,...,...,...,...,...
40961,Gln35Ter,35,,Gln,,nonsense
40963,Arg337Ter,337,,Arg,,nonsense
40965,Arg73Ter,73,,Arg,,nonsense
40976,Arg184Ter,184,,Arg,,nonsense


In [387]:
nonsense['to'] = 'Ter'

In [388]:
nonsense = nonsense.drop(columns=['aux'])

In [389]:
nonsense[['cambio','start_aa','end_aa','from','to','consequence']]

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
27,Glu220Ter,220,,Glu,Ter,nonsense
30,Arg1923Ter,1923,,Arg,Ter,nonsense
32,Glu1037Ter,1037,,Glu,Ter,nonsense
33,Arg2918Ter,2918,,Arg,Ter,nonsense
35,Glu1154Ter,1154,,Glu,Ter,nonsense
...,...,...,...,...,...,...
40961,Gln35Ter,35,,Gln,Ter,nonsense
40963,Arg337Ter,337,,Arg,Ter,nonsense
40965,Arg73Ter,73,,Arg,Ter,nonsense
40976,Arg184Ter,184,,Arg,Ter,nonsense


In [390]:
cond = box1_clinvar_total.index.isin(nonsense.index)                           # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)   # drop esas filas
len(box1_clinvar_total)

18118

# Subset: Missense mutations  
A missense mutation is a substitution where the altered codon corresponds to a different amino acid (ver: https://www.genome.gov/genetics-glossary/Point-Mutation)

In [391]:
((box1_clinvar_total.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})$', x))).str[0]).notnull().value_counts() # 16968

True     17953
False      165
Name: cambio, dtype: int64

In [392]:
missense = box1_clinvar_total.copy()
missense['aux'] = missense.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})$', x))
missense['aux'] = missense.aux.str[0]
missense = missense[missense.aux.notnull()]
missense.aux

0          (Arg, 46, Gln)
17        (Tyr, 511, Cys)
34       (Gln, 1280, His)
37       (Asn, 1841, Ser)
40       (Ala, 1924, Val)
               ...       
40972     (Arg, 184, Pro)
40973      (Ala, 48, Ser)
40975     (Arg, 270, Cys)
40977     (Met, 393, Ile)
40992     (Thr, 424, Ser)
Name: aux, Length: 17953, dtype: object

In [393]:
missense['start_aa'] = missense.aux.map(lambda x: x[1])
missense['end_aa'] = np.nan
missense['from'] = missense.aux.map(lambda x: x[0])
missense['to'] = missense.aux.map(lambda x: x[2])
missense['consequence'] = 'missense'

In [394]:
missense[['start_aa',	'end_aa',	'from',	'to',	'consequence']]

Unnamed: 0,start_aa,end_aa,from,to,consequence
0,46,,Arg,Gln,missense
17,511,,Tyr,Cys,missense
34,1280,,Gln,His,missense
37,1841,,Asn,Ser,missense
40,1924,,Ala,Val,missense
...,...,...,...,...,...
40972,184,,Arg,Pro,missense
40973,48,,Ala,Ser,missense
40975,270,,Arg,Cys,missense
40977,393,,Met,Ile,missense


In [395]:
missense = missense.drop(columns=['aux'])

In [396]:
cond = box1_clinvar_total.index.isin(missense.index)                           # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)   # drop esas filas
len(box1_clinvar_total)

165

In [397]:
box1_clinvar_total

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,start,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio
234,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,587784306,208732,...,46334557,46334558,Duplication,NM_006031.6(PCNT):c.467_505dup (p.His156_Gln16...,germline,-,-,ClinGen:CA209336,NM_006031.6,His156_Gln168dup
293,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,886042352,266471,...,46334596,46334597,Duplication,NM_006031.6(PCNT):c.481_519dup (p.Val161_Thr17...,germline,MedGen:CN517202,not provided,ClinGen:CA10604116,NM_006031.6,Val161_Thr173dup
558,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,1555945598,446345,...,46334583,46334584,Duplication,NM_006031.6(PCNT):c.467_583dup (p.His156_Gln19...,germline,MedGen:CN517202,not provided,ClinGen:CA658658902,NM_006031.6,His156_Gln194dup
641,O95613,Homo sapiens,centrosome/spindle pole body,HGNC:16068,3336,MEVEQEQRRRKVEAGRTKLAHFRQRKTKGDSSHSEKKTAKRKGSAV...,5116,PCNT,769893798,717272,...,46334627,46334628,Insertion,NM_006031.6(PCNT):c.519_520insGTCAGTGACCACCCAC...,germline,MedGen:CN517202,not provided,-,NM_006031.6,Val161_Thr173dup
828,P42858,Homo sapiens,centrosome/spindle pole body,HGNC:4851,3142,MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPP...,3064,HTT,772429544,590772,...,3074945,3074946,Microsatellite,NM_002111.8(HTT):c.129_131GCC[8] (p.Pro51dup),germline,MedGen:CN169374,not specified,-,NM_002111.8,Pro51dup
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40331,Q99700,Homo sapiens,stress granule,HGNC:10555,1313,MRSAAAAPRSPAVATESRRFAAARWPGWRSLQRPARRSGRGGGGAA...,6311,ATXN2,-1,964787,...,111598978,111598979,Microsatellite,NM_001372574.1(ATXN2):c.18GCA[28] (p.Gln14_Gln...,unknown,"Human Phenotype Ontology:HP:0007354,MedGen:C00...",Amyotrophic lateral sclerosis,-,NM_001372574.1,Gln14_Gln28dup
40793,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,5621,PRNP,138688873,534134,...,4699449,4699472,Deletion,NM_000311.5(PRNP):c.246_269del (p.60_67PHGGGWG...,germline,"MONDO:MONDO:0011299,MedGen:C1864112,OMIM:60321...",Huntington disease-like 1,ClinGen:CA9752036,NM_000311.5,60_67PHGGGWGQ[3]
40795,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,5621,PRNP,193922906,677337,...,4699379,4699380,Insertion,NM_000311.5(PRNP):c.227_228insTCATGGTGGTGGCTGG...,unknown,-,PRNP-associated condition,-,NM_000311.5,60_67PHGGGWGQ[12]
40801,P04156,Homo sapiens,cytoplasmic protein granule,HGNC:9449,253,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,5621,PRNP,-1,848464,...,4699380,4699427,Deletion,NM_000311.5(PRNP):c.180_227del (p.60_67PHGGGWG...,germline,"MONDO:MONDO:0011299,MedGen:C1864112,OMIM:60321...",Huntington disease-like 1,-,NM_000311.5,60_67PHGGGWGQ[2]


# Subset duplications

In [398]:
((box1_clinvar_total.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?dup(.*)$', x))).str[0]).notnull().value_counts()

True     108
False     57
Name: cambio, dtype: int64

In [399]:
duplications = box1_clinvar_total[box1_clinvar_total.cambio.str.endswith('dup')] # subsetting por la terminacion 'dup'

In [400]:
duplications.cambio

234      His156_Gln168dup
293      Val161_Thr173dup
558      His156_Gln194dup
641      Val161_Thr173dup
828              Pro51dup
               ...       
38776      Glu40_Gly41dup
38785      Glu39_Glu40dup
39187    Glu100_Pro106dup
40331      Gln14_Gln28dup
40970           Ala439dup
Name: cambio, Length: 108, dtype: object

In [401]:
duplications['aux'] = duplications.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?dup(.*)$', x))
duplications['aux'] = duplications.aux.str[0]
duplications = duplications[duplications.aux.notnull()]
duplications.aux

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplications['aux'] = duplications.cambio.map(lambda x: re.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?dup(.*)$', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplications['aux'] = duplications.aux.str[0]


234      (His, 156, Gln, 168, )
293      (Val, 161, Thr, 173, )
558      (His, 156, Gln, 194, )
641      (Val, 161, Thr, 173, )
828             (Pro, 51, , , )
                  ...          
38776      (Glu, 40, Gly, 41, )
38785      (Glu, 39, Glu, 40, )
39187    (Glu, 100, Pro, 106, )
40331      (Gln, 14, Gln, 28, )
40970          (Ala, 439, , , )
Name: aux, Length: 108, dtype: object

In [402]:
separar_en_cols(duplications, 'aux', 'duplication')

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
234,His156_Gln168dup,156,168.0,HisGln,,duplication
293,Val161_Thr173dup,161,173.0,ValThr,,duplication
558,His156_Gln194dup,156,194.0,HisGln,,duplication
641,Val161_Thr173dup,161,173.0,ValThr,,duplication
828,Pro51dup,51,,Pro,,duplication
...,...,...,...,...,...,...
38776,Glu40_Gly41dup,40,41.0,GluGly,,duplication
38785,Glu39_Glu40dup,39,40.0,GluGlu,,duplication
39187,Glu100_Pro106dup,100,106.0,GluPro,,duplication
40331,Gln14_Gln28dup,14,28.0,GlnGln,,duplication


In [403]:
duplications = duplications.drop(columns=['aux'])

In [404]:
# elimino estas entradas de la tabla original
cond = box1_clinvar_total.index.isin(duplications.index)                       # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)   # drop esas filas
len(box1_clinvar_total)

57

# Casos raros

In [406]:
# elimino los '?'
box1_clinvar_total = box1_clinvar_total[(box1_clinvar_total.cambio != '?') & (box1_clinvar_total.cambio != '(?')]

In [407]:
box1_clinvar_total.cambio

926          1540_1541DE[1]
1865           604_605EK[1]
5387           113_114PQ[1]
5911           513_514LV[1]
8786          329_331SGG[1]
9075       138_143GQQQSY[1]
9259           178_179RS[6]
9829           622_623TL[1]
10043      1053_1056SGGG[1]
10181            1118SGG[1]
11175    454_461GYGGDRGG[3]
11178    454_461GYGGDRGG[3]
14922         219_221QGS[1]
15031         772_774KNP[3]
17057         646_648GLG[3]
17108       648_652GGLGV[1]
17179      479_484VAPGVG[2]
17217          501VGVAPG[1]
24725         38_42SGPEE[1]
25114         38_42SGPEE[3]
25454         38_42SGPEE[3]
25975          191_192HP[4]
25997            72_73HP[2]
26004              191HP[5]
26092              191HP[1]
26223          237_238PA[3]
27095          229_230GP[6]
27327          229_230GP[9]
27328         229_230GP[12]
27488          229_230GP[6]
27676          229_230GP[9]
27733         229_230GP[10]
28139          229_230GP[7]
28238          229_230GP[9]
28561          229_230GP[2]
28562          229_2

In [420]:
# elimino los parentesis con el nro del final usando una regex
box1_clinvar_total.cambio = box1_clinvar_total.cambio.map(lambda x: re.findall('^(\d+_?\d+?[A-Za-z][A-Za-z]*[A-Za-z])', x))
box1_clinvar_total.cambio = box1_clinvar_total.cambio.str[0]


In [421]:
box1_clinvar_total[['type', 'name', 'cambio']]

Unnamed: 0,type,name,cambio
926,Microsatellite,NM_000489.5(ATRX):c.4620_4625TGAAGA[1] (p.1540...,1540_1541DE
1865,Microsatellite,NM_147185.3(AKAP9):c.1809_1814AGAAAA[1] (p.604...,604_605EK
5387,Deletion,NM_014494.4(TNRC6A):c.339_350del (p.113_114PQ[1]),113_114PQ
5911,Microsatellite,NM_004646.3(NPHS1):c.1536_1541GCTGGT[1] (p.513...,513_514LV
8786,Deletion,NM_002137.4(HNRNPA2B1):c.984_992del (p.329_331...,329_331SGG
9075,Microsatellite,NM_004960.4(FUS):c.412_429GGACAGCAGCAAAGCTAT[1...,138_143GQQQSY
9259,Deletion,NM_001195427.2(SRSF2):c.550_555del (p.178_179R...,178_179RS
9829,Microsatellite,NM_006772.3(SYNGAP1):c.1865_1870CCCTCA[1] (p.6...,622_623TL
10043,Deletion,NM_006772.3(SYNGAP1):c.3168_3179del (p.1053_10...,1053_1056SGGG
10181,Microsatellite,NM_006772.3(SYNGAP1):c.3348GGGCAGCGG[1] (p.111...,1118SGG


# Raros: duplicaciones

In [449]:
dup = box1_clinvar_total[box1_clinvar_total.type == 'Duplication']
dup.head()

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,start,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio
15031,O94906,Homo sapiens,cajal body,HGNC:15860,941,MNKKKKPFLGMPAPLGYVPGLGRGATGFTTRSDIGPARDANDPVDD...,24148,PRPF6,-1,856967,...,64027709,64027710,Duplication,NM_012469.4(PRPF6):c.2313_2321dup (p.772_774KN...,germline,"Human Phenotype Ontology:HP:0000556,Human Phen...",Retinal dystrophy,-,NM_012469.4,772_774KNP
17057,P15502,Homo sapiens,null_phasepdb_rev,HGNC:3327,786,MAGLTAAAPRPGVLLLLLSILHPSRPGGVPGAIPGGVPGGVFYPGA...,2006,ELN,374813147,494051,...,74063640,74063641,Duplication,NM_000501.4(ELN):c.1946_1954dup (p.646_648GLG[3]),germline,MedGen:CN169374|MedGen:CN517202|Human Phenotyp...,not specified|not provided|Supravalvar aortic ...,-,NM_000501.4,646_648GLG
25114,P40337,Homo sapiens,stress granule,HGNC:12687,213,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,7428,VHL,863224839,451603,...,10141955,10141956,Duplication,NM_000551.3(VHL):c.123_137dup (p.38_42SGPEE[3]),germline,"MONDO:MONDO:0009892,MedGen:C1837915,OMIM:26340...","Erythrocytosis, familial, 2;Von Hippel-Lindau ...",ClinGen:CA541213522,NM_000551.3,38_42SGPEE
25454,P40337,Homo sapiens,stress granule,HGNC:12687,213,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,7428,VHL,1340164546,807479,...,10141952,10141953,Duplication,NM_000551.4(VHL):c.108_122dup (p.38_42SGPEE[3]),germline,"MONDO:MONDO:0015356,MedGen:C0027672,Orphanet:1...",Hereditary cancer-predisposing syndrome,-,NM_000551.4,38_42SGPEE
27328,P51532,Homo sapiens,nucleus speckles,HGNC:11100,1647,MSTPDPPLGGTPRPGPSPGPGPSPGAMLGPSPGPSPGSAHSMMGPS...,6597,SMARCA4,1555753690,402888,...,10986535,10986536,Duplication,NM_001128849.2(SMARCA4):c.708_731dup (p.229_23...,germline,"MONDO:MONDO:0013224,MedGen:C2750074,OMIM:61332...",Rhabdoid tumor predisposition syndrome 2,ClinGen:CA16615870,NM_001128849.2,229_230GP


In [452]:
dup['aux'] = dup.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
dup['aux'] = dup.aux.str[0]
dup.aux

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup['aux'] = dup.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup['aux'] = dup.aux.str[0]


15031    (772, 774, KNP)
17057    (646, 648, GLG)
25114    (38, 42, SGPEE)
25454    (38, 42, SGPEE)
27328     (229, 230, GP)
27733     (229, 230, GP)
28238     (229, 230, GP)
28565     (229, 230, GP)
28955     (229, 230, GP)
29204     (229, 230, GP)
29343        (229, , GP)
38278     (164, 165, QP)
Name: aux, dtype: object

In [455]:
def separar_en_cols_raros(df, column, conseq):
    '''
    recibe un DataFrame, el nombre de una columna auxiliar (column)
    y un string con el tipo de consecuencia (conseq). La col. auxiliar
    es una tupla con los elementos implicados en una mutacion
    como la siguiente (start_pos, end_pos, aminoacidos).
    Devuelve el DataFrame df con estas 5 nuevas columnas
    '''
    # start position
    df['start_aa'] = df[column].map(lambda x: x[0])
    df.start_aa = df.start_aa.apply(int)

    # end position
    df['end_aa'] = df[column].map(lambda x: int(x[1]) if x[1] != '' else np.nan)

    # from: es el/los aa que cambian
    df['from'] = df[column].map(lambda x: x[2])
    df['from'] = df['from'].map(lambda x: seq3(x))

    # to: aa/s nuevos
    df['to'] = np.nan

    # consecuencia de la mutacion
    df['consequence'] = conseq

    return df[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]

In [456]:
separar_en_cols_raros(dup, 'aux', 'duplication')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_aa'] = df[column].map(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_aa'] = df[column].map(lambda x: int(x[1]) if x[1] != '' else np.nan)
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
15031,772_774KNP,772,774.0,LysAsnPro,,duplication
17057,646_648GLG,646,648.0,GlyLeuGly,,duplication
25114,38_42SGPEE,38,42.0,SerGlyProGluGlu,,duplication
25454,38_42SGPEE,38,42.0,SerGlyProGluGlu,,duplication
27328,229_230GP,229,230.0,GlyPro,,duplication
27733,229_230GP,229,230.0,GlyPro,,duplication
28238,229_230GP,229,230.0,GlyPro,,duplication
28565,229_230GP,229,230.0,GlyPro,,duplication
28955,229_230GP,229,230.0,GlyPro,,duplication
29204,229_230GP,229,230.0,GlyPro,,duplication


In [457]:
dup = dup.drop(columns=['aux'])

In [460]:
# agrego estas entradas a la tabla de duplications
duplications = pd.concat([duplications, dup])

In [461]:
# elimino estas entradas de la tabla original
cond = box1_clinvar_total.index.isin(dup.index)                                # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)   # drop esas filas
len(box1_clinvar_total)

40

# Raros: deleciones  
Evaluar el len(rango), si es igual al len(letras) entonces en una delecion, si es distinto es delins

In [465]:
delet = box1_clinvar_total[box1_clinvar_total.type == 'Deletion']

In [467]:
delet['aux'] = delet.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
delet['aux'] = delet.aux.str[0]
delet.aux

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  delet['aux'] = delet.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  delet['aux'] = delet.aux.str[0]


5387           (113, 114, PQ)
8786          (329, 331, SGG)
9259           (178, 179, RS)
10043      (1053, 1056, SGGG)
11175    (454, 461, GYGGDRGG)
11178    (454, 461, GYGGDRGG)
14922         (219, 221, QGS)
17108       (648, 652, GGLGV)
17179      (479, 484, VAPGVG)
17217         (501, , VGVAPG)
24725         (38, 42, SGPEE)
27095          (229, 230, GP)
28561          (229, 230, GP)
28562          (229, 230, GP)
28563          (229, 230, GP)
28564          (229, 230, GP)
28958          (229, 230, GP)
40793      (60, 67, PHGGGWGQ)
40801      (60, 67, PHGGGWGQ)
Name: aux, dtype: object

In [480]:
# evaluo si el rango de de las posiciones coincide con el nro de letras
is_del = []
for i in delet.index:
    start = int(delet.aux[i][0])
    try:
        end = int(delet.aux[i][1])
    except:
        pass
    length = end - start + 1
    aa = len(delet.aux[i][2])
    is_del.append(length == aa)

delet['is_del'] = is_del
delet[['cambio', 'aux', 'is_del']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  delet['is_del'] = is_del


Unnamed: 0,cambio,aux,is_del
5387,113_114PQ,"(113, 114, PQ)",True
8786,329_331SGG,"(329, 331, SGG)",True
9259,178_179RS,"(178, 179, RS)",True
10043,1053_1056SGGG,"(1053, 1056, SGGG)",True
11175,454_461GYGGDRGG,"(454, 461, GYGGDRGG)",True
11178,454_461GYGGDRGG,"(454, 461, GYGGDRGG)",True
14922,219_221QGS,"(219, 221, QGS)",True
17108,648_652GGLGV,"(648, 652, GGLGV)",True
17179,479_484VAPGVG,"(479, 484, VAPGVG)",True
17217,501VGVAPG,"(501, , VGVAPG)",False


In [481]:
separar_en_cols_raros(delet, 'aux', 'deletion')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_aa'] = df[column].map(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_aa'] = df[column].map(lambda x: int(x[1]) if x[1] != '' else np.nan)
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
5387,113_114PQ,113,114.0,ProGln,,deletion
8786,329_331SGG,329,331.0,SerGlyGly,,deletion
9259,178_179RS,178,179.0,ArgSer,,deletion
10043,1053_1056SGGG,1053,1056.0,SerGlyGlyGly,,deletion
11175,454_461GYGGDRGG,454,461.0,GlyTyrGlyGlyAspArgGlyGly,,deletion
11178,454_461GYGGDRGG,454,461.0,GlyTyrGlyGlyAspArgGlyGly,,deletion
14922,219_221QGS,219,221.0,GlnGlySer,,deletion
17108,648_652GGLGV,648,652.0,GlyGlyLeuGlyVal,,deletion
17179,479_484VAPGVG,479,484.0,ValAlaProGlyValGly,,deletion
17217,501VGVAPG,501,,ValGlyValAlaProGly,,deletion


In [487]:
# agrego la posicion de fin faltante
delet.end_aa[17217] = delet.start_aa[17217] + len(delet['from'][17217]) / 3 - 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  delet.end_aa[17217] = delet.start_aa[17217] + len(delet['from'][17217]) / 3 - 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [489]:
delet = delet.drop(columns=['aux', 'is_del'])

In [488]:
# agrego estas entradas a la tabla de deletions
deletions = pd.concat([deletions, delet])

In [491]:
# elimino estas entradas de la tabla original
cond = box1_clinvar_total.index.isin(delet.index)                                # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)     # drop esas filas
len(box1_clinvar_total)

21

# Raros: inserciones

In [492]:
inser = box1_clinvar_total[box1_clinvar_total.type == 'Insertion']

In [497]:
# para inserciones
inser['aux'] = inser.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
inser['aux'] = inser.aux.str[0]
inser.aux

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inser['aux'] = inser.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inser['aux'] = inser.aux.str[0]


27676        (229, 230, GP)
29342           (229, , GP)
40795    (60, 67, PHGGGWGQ)
Name: aux, dtype: object

In [508]:
separar_en_cols_raros(inser, 'aux', 'insertion')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_aa'] = df[column].map(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_aa'] = df[column].map(lambda x: int(x[1]) if x[1] != '' else np.nan)
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
27676,229_230GP,229,230.0,GlyPro,,insertion
29342,229GP,229,,GlyPro,,insertion
40795,60_67PHGGGWGQ,60,67.0,ProHisGlyGlyGlyTrpGlyGln,,insertion


In [505]:
inser.sequence[27676][228:230]

'GP'

In [507]:
inser.sequence[40795][59:67]

'PHGGGWGQ'

In [509]:
inser.sequence[29342][228]

'G'

In [329]:
# para inserciones
re.findall('^(\d+)_?(\d+)?([A-Za-z])[A-Za-z]*([A-Za-z])$', '229_230GP')

[('229', '230', 'G', 'P')]

In [518]:
inser = inser.drop(columns = ['aux'])

In [520]:
# agrego estas entradas a la tabla de insertions
insertions = pd.concat([insertions, inser])

In [515]:
# elimino estas entradas de la tabla original
cond = box1_clinvar_total.index.isin(inser.index)                                # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index)     # drop esas filas
len(box1_clinvar_total)

18

# Clasificar lo que queda  
--------------------------

In [521]:
box1_clinvar_total

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,start,stop,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio
926,P46100,Homo sapiens,pml nuclear body,HGNC:886,2492,MTAEPMSESKLNTLVQKLHDFLAHSSEESEETSSPPRLAMNQNTDK...,546,ATRX,797045406,209199,...,77635983,77635988,Microsatellite,NM_000489.5(ATRX):c.4620_4625TGAAGA[1] (p.1540...,germline,"MedGen:CN263314|MONDO:MONDO:0010519,MedGen:C18...",Alpha-thalassemia/mental retardation syndrome|...,ClinGen:CA277098,NM_000489.5,1540_1541DE
1865,Q99996,Homo sapiens,centrosome/spindle pole body,HGNC:379,3907,MEDEERQKKLEAGKAKLAQFRQRKAQSDGQSPSKKQKKKRKTSSSK...,10142,AKAP9,1455029822,522879,...,92001726,92001731,Microsatellite,NM_147185.3(AKAP9):c.1809_1814AGAAAA[1] (p.604...,germline,"MONDO:MONDO:0002442,MeSH:D008133,MedGen:C00239...",Long QT syndrome,ClinGen:CA576705339,NM_147185.3,604_605EK
5911,O60500,Homo sapiens,others,HGNC:7908,1241,MALGTTLRASLLLLGLLTEGLAQLAIPASVPRGFWALPENLTVVEG...,4868,NPHS1,1555762721,549138,...,35846088,35846093,Microsatellite,NM_004646.3(NPHS1):c.1536_1541GCTGGT[1] (p.513...,unknown,"MONDO:MONDO:0009732,MedGen:C0403399,OMIM:25630...",Finnish congenital nephrotic syndrome,-,NM_004646.3,513_514LV
9075,P35637,Homo sapiens,nuclear speckle,HGNC:4010,526,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,2521,FUS,-1,861409,...,31184284,31184301,Microsatellite,NM_004960.4(FUS):c.412_429GGACAGCAGCAAAGCTAT[1...,germline,"MONDO:MONDO:0011951,MedGen:C1842675,OMIM:60803...","Amyotrophic lateral sclerosis 6, with or witho...",-,NM_004960.4,138_143GQQQSY
9829,Q96PV0,Homo sapiens,postsynaptic density,HGNC:11497,1343,MSRSRASIHRGSIPAMSYAPFRDVRGPSMHRTQYVHSPYDRPGWNP...,8831,SYNGAP1,1554121722,428620,...,33440916,33440921,Microsatellite,NM_006772.3(SYNGAP1):c.1865_1870CCCTCA[1] (p.6...,germline,-,-,ClinGen:CA645372819,NM_006772.3,622_623TL
10181,Q96PV0,Homo sapiens,postsynaptic density,HGNC:11497,1343,MSRSRASIHRGSIPAMSYAPFRDVRGPSMHRTQYVHSPYDRPGWNP...,8831,SYNGAP1,-1,924340,...,33443898,33443906,Microsatellite,NM_006772.3(SYNGAP1):c.3348GGGCAGCGG[1] (p.111...,germline,"MONDO:MONDO:0012960,MedGen:C2675473,OMIM:612621","Mental retardation, autosomal dominant 5",-,NM_006772.3,1118SGG
25975,P49715,Homo sapiens,nucleolus,HGNC:1833,358,MESADFYEAEPRPPMSSHLQSPPHAPSSAAFGFPRGAGPAQPPAPP...,1050,CEBPA,762459325,208566,...,33301825,33301826,Microsatellite,NM_004364.5(CEBPA):c.572_577ACCCGC[4] (p.191_1...,germline,-,-,ClinGen:CA209407,NM_004364.5,191_192HP
25997,P49715,Homo sapiens,nucleolus,HGNC:1833,358,MESADFYEAEPRPPMSSHLQSPPHAPSSAAFGFPRGAGPAQPPAPP...,1050,CEBPA,762459325,403264,...,33301826,33301831,Microsatellite,NM_001285829.1(CEBPA):c.215_220ACCCGC[2] (p.72...,germline,"Human Phenotype Ontology:HP:0001914,Human Phen...",Acute myeloid leukemia,ClinGen:CA16616054,NM_001285829.1,72_73HP
26004,P49715,Homo sapiens,nucleolus,HGNC:1833,358,MESADFYEAEPRPPMSSHLQSPPHAPSSAAFGFPRGAGPAQPPAPP...,1050,CEBPA,762459325,403676,...,33301825,33301826,Microsatellite,NM_004364.5(CEBPA):c.572ACCCGC[5] (p.191HP[5]),germline,"Human Phenotype Ontology:HP:0001914,Human Phen...",Acute myeloid leukemia,ClinGen:CA16616245,NM_004364.5,191HP
26092,P49715,Homo sapiens,nucleolus,HGNC:1833,358,MESADFYEAEPRPPMSSHLQSPPHAPSSAAFGFPRGAGPAQPPAPP...,1050,CEBPA,762459325,533261,...,33301826,33301837,Microsatellite,NM_004364.5(CEBPA):c.572ACCCGC[1] (p.191HP[1]),germline,"Human Phenotype Ontology:HP:0001914,Human Phen...",Acute myeloid leukemia,ClinGen:CA645612672,NM_004364.5,191HP


In [524]:
# separo en una col auxiliar
box1_clinvar_total['aux'] = box1_clinvar_total.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
box1_clinvar_total['aux'] = box1_clinvar_total.aux.str[0]
box1_clinvar_total['aux']

926        (1540, 1541, DE)
1865         (604, 605, EK)
5911         (513, 514, LV)
9075     (138, 143, GQQQSY)
9829         (622, 623, TL)
10181         (1118, , SGG)
25975        (191, 192, HP)
25997          (72, 73, HP)
26004           (191, , HP)
26092           (191, , HP)
26223        (237, 238, PA)
27327        (229, 230, GP)
27488        (229, 230, GP)
28139        (229, 230, GP)
28956        (229, 230, GP)
33836       (343, 345, DFS)
38213     (2760, 2762, NLQ)
40137       (596, 598, DDE)
Name: aux, dtype: object

In [526]:
# si el rango de de las posiciones coincide con el nro de letras: es delecion
l = []
for i in box1_clinvar_total.index:
    start = int(box1_clinvar_total.aux[i][0])
    try:
        end = int(box1_clinvar_total.aux[i][1])
    except:
        pass
    length = end - start + 1
    aa = len(box1_clinvar_total.aux[i][2])
    l.append(length == aa)

box1_clinvar_total['is_del'] = l
box1_clinvar_total[['cambio', 'aux', 'is_del']]

Unnamed: 0,cambio,aux,is_del
926,1540_1541DE,"(1540, 1541, DE)",True
1865,604_605EK,"(604, 605, EK)",True
5911,513_514LV,"(513, 514, LV)",True
9075,138_143GQQQSY,"(138, 143, GQQQSY)",True
9829,622_623TL,"(622, 623, TL)",True
10181,1118SGG,"(1118, , SGG)",False
25975,191_192HP,"(191, 192, HP)",True
25997,72_73HP,"(72, 73, HP)",True
26004,191HP,"(191, , HP)",False
26092,191HP,"(191, , HP)",False


In [527]:
delet2 = box1_clinvar_total[box1_clinvar_total.is_del == True]

In [528]:
separar_en_cols_raros(delet2, 'aux', 'deletion')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_aa'] = df[column].map(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_aa'] = df[column].map(lambda x: int(x[1]) if x[1] != '' else np.nan)
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
926,1540_1541DE,1540,1541,AspGlu,,deletion
1865,604_605EK,604,605,GluLys,,deletion
5911,513_514LV,513,514,LeuVal,,deletion
9075,138_143GQQQSY,138,143,GlyGlnGlnGlnSerTyr,,deletion
9829,622_623TL,622,623,ThrLeu,,deletion
25975,191_192HP,191,192,HisPro,,deletion
25997,72_73HP,72,73,HisPro,,deletion
26223,237_238PA,237,238,ProAla,,deletion
27327,229_230GP,229,230,GlyPro,,deletion
27488,229_230GP,229,230,GlyPro,,deletion


In [529]:
delet2 = delet2.drop(columns=['is_del', 'aux'])

In [531]:
# agrego al dataset de deleciones
deletions = pd.concat([deletions, delet2])

## lo que queda son delins xq lo que se inserta es de distinta longitud al rango que se corta

In [532]:
box1_clinvar_total = box1_clinvar_total[box1_clinvar_total.is_del == False]
box1_clinvar_total

Unnamed: 0,uniprot,organism,mlo,hgnc_id,length,sequence,geneid,genesymbol,snpid,alleleid,...,type,name,origin,phenotypeids,phenotypelist,otherids,nuccore_id,cambio,aux,is_del
10181,Q96PV0,Homo sapiens,postsynaptic density,HGNC:11497,1343,MSRSRASIHRGSIPAMSYAPFRDVRGPSMHRTQYVHSPYDRPGWNP...,8831,SYNGAP1,-1,924340,...,Microsatellite,NM_006772.3(SYNGAP1):c.3348GGGCAGCGG[1] (p.111...,germline,"MONDO:MONDO:0012960,MedGen:C2675473,OMIM:612621","Mental retardation, autosomal dominant 5",-,NM_006772.3,1118SGG,"(1118, , SGG)",False
26004,P49715,Homo sapiens,nucleolus,HGNC:1833,358,MESADFYEAEPRPPMSSHLQSPPHAPSSAAFGFPRGAGPAQPPAPP...,1050,CEBPA,762459325,403676,...,Microsatellite,NM_004364.5(CEBPA):c.572ACCCGC[5] (p.191HP[5]),germline,"Human Phenotype Ontology:HP:0001914,Human Phen...",Acute myeloid leukemia,ClinGen:CA16616245,NM_004364.5,191HP,"(191, , HP)",False
26092,P49715,Homo sapiens,nucleolus,HGNC:1833,358,MESADFYEAEPRPPMSSHLQSPPHAPSSAAFGFPRGAGPAQPPAPP...,1050,CEBPA,762459325,533261,...,Microsatellite,NM_004364.5(CEBPA):c.572ACCCGC[1] (p.191HP[1]),germline,"Human Phenotype Ontology:HP:0001914,Human Phen...",Acute myeloid leukemia,ClinGen:CA645612672,NM_004364.5,191HP,"(191, , HP)",False


In [533]:
separar_en_cols_raros(box1_clinvar_total, 'aux', 'delins')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_aa'] = df[column].map(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_aa'] = df[column].map(lambda x: int(x[1]) if x[1] != '' else np.nan)
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
10181,1118SGG,1118,,SerGlyGly,,delins
26004,191HP,191,,HisPro,,delins
26092,191HP,191,,HisPro,,delins


In [534]:
box1_clinvar_total = box1_clinvar_total.drop(columns=['is_del', 'aux'])

In [535]:
delins = pd.concat([delins, box1_clinvar_total])

# Concatenar todos los subseteos  
--------------------------------

In [536]:
tables = [deletions, delins, duplications, frameshift, insertions, missense, nonsense]

In [537]:
# nro de entradas totales revisadas
l = 0
for i in tables:
    l += len(i)
l

23126

In [538]:
del(mutations)

In [539]:
# tabla final
mutations = pd.concat(tables)
len(mutations)

23126

In [540]:
# etiqueta
mutations['source'] = 'clinvar'

In [541]:
mutations.columns

Index(['uniprot', 'organism', 'mlo', 'hgnc_id', 'length', 'sequence', 'geneid',
       'genesymbol', 'snpid', 'alleleid', 'chromosomeaccession', 'chromosome',
       'start', 'stop', 'type', 'name', 'origin', 'phenotypeids',
       'phenotypelist', 'otherids', 'nuccore_id', 'cambio', 'consequence',
       'start_aa', 'end_aa', 'from', 'to', 'source'],
      dtype='object')

In [542]:
mutations[['cambio', 'consequence',	'start_aa',	'end_aa',	'from',	'to', 'source']]

Unnamed: 0,cambio,consequence,start_aa,end_aa,from,to,source
86,His156_Gln168del,deletion,156,168.0,HisGln,,clinvar
233,Val148_Thr173del,deletion,148,173.0,ValThr,,clinvar
404,Lys2944del,deletion,2944,,Lys,,clinvar
603,Leu1141del,deletion,1141,,Leu,,clinvar
869,Glu1464del,deletion,1464,,Glu,,clinvar
...,...,...,...,...,...,...,...
40961,Gln35Ter,nonsense,35,,Gln,Ter,clinvar
40963,Arg337Ter,nonsense,337,,Arg,Ter,clinvar
40965,Arg73Ter,nonsense,73,,Arg,Ter,clinvar
40976,Arg184Ter,nonsense,184,,Arg,Ter,clinvar


In [543]:
mutations.to_csv('datasets/clinvar_box1_mutations.csv.gz', compression='gzip')