In [1]:
import pandas as pd
import numpy as np
import re
# from Bio.SeqUtils import seq3

def seq3(seq):
    
    protein_letters_1to3 = {
        "A": "Ala",
        "C": "Cys",
        "D": "Asp",
        "E": "Glu",
        "F": "Phe",
        "G": "Gly",
        "H": "His",
        "I": "Ile",
        "K": "Lys",
        "L": "Leu",
        "M": "Met",
        "N": "Asn",
        "P": "Pro",
        "Q": "Gln",
        "R": "Arg",
        "S": "Ser",
        "T": "Thr",
        "V": "Val",
        "W": "Trp",
        "Y": "Tyr",
        "B": "Asx",
        "X": "Xaa",
        "Z": "Glx",
        "J": "Xle",
        "U": "Sec",
        "O": "Pyl"
    }
    
    return "".join(protein_letters_1to3.get(aa, "Xaa") for aa in seq)

In [78]:
[x for x in "string"]

['s', 't', 'r', 'i', 'n', 'g']

In [2]:
vs = pd.read_csv('../../data/vs_orig.csv.gz')
box = pd.read_csv('../box1_proteins.csv')

# Mergeo con el dataset completo
box1_clinvar_total = box.merge(vs)

# Para generar una col con los codigos NM... estos son los id de los transcriptos
box1_clinvar_total['nuccore_id'] = box1_clinvar_total.name.map(lambda x: re.findall('[A-Z]{2}\_[0-9]+\.[0-9]*', x))
box1_clinvar_total['nuccore_id'] = box1_clinvar_total.nuccore_id.str[0]

# Subset mutations with "p." only
box1_clinvar_total['cambio'] = box1_clinvar_total.name.map(lambda x: re.findall('\(p\..*\)$', x))
box1_clinvar_total['cambio'] = box1_clinvar_total.cambio.str[0]
box1_clinvar_total.cambio = box1_clinvar_total.cambio.str.strip('()')  # para quitar los parentesis
box1_clinvar_total.cambio = box1_clinvar_total.cambio.str.lstrip('p.')                             # se usa lstrip xq strip tambien saca las p del final 

# saco los nans -- > Qué son los nan? 9433 NaNs probably intronic mutations?
box1_clinvar_total = box1_clinvar_total[box1_clinvar_total.cambio.notnull()]

# separo las mutaciones sinonimas, no las tenemos en cuenta xq no surgen cambio en la proteina
syn = box1_clinvar_total[box1_clinvar_total.cambio.str.endswith('=')]
# y las borro
cond = box1_clinvar_total.index.isin(syn.index) # es un array de bool
box1_clinvar_total = box1_clinvar_total.drop(box1_clinvar_total[cond].index) # drop esas filas

In [21]:
def separar_en_cols(df, column, conseq, conseq_regex, override=False):
    '''
    recibe un DataFrame, el nombre de una columna auxiliar (column)
    y un string con el tipo de consecuencia (conseq). La col. auxiliar
    es una tupla con los elementos implicados en una mutacion
    como la siguiente (aa1, start_pos, aa2, end_pos, aa/s_nuevos).
    Devuelve el DataFrame df con estas 5 nuevas columnas
    '''
    df_crop = df[df[column].str.contains(conseq_regex)].copy()
      
    if override:
        df_crop['aux'] = df_crop[column].str.findall(conseq_regex).str[0]
    else:
        df_crop['aux'] = df_crop[column].str.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?'+conseq_regex+'(.*)$').str[0]
        
    if conseq == "missense":
        df_crop['start_aa'] = df_crop['aux'].map(lambda x: x[1])
        df_crop['end_aa'] = np.nan
        df_crop['from'] = df_crop['aux'].map(lambda x: x[0])        
        df_crop['to'] = df_crop['aux'].map(lambda x: x[2])
    else:
    
        # start position
        df_crop['start_aa'] = df_crop['aux'].map(lambda x: x[1])

        # end position
        df_crop['end_aa'] = df_crop['aux'].map(lambda x: int(x[3]) if x[3] != '' else np.nan)

        # from: es el/los aa que cambian
        df_crop['from'] = df_crop['aux'].map(lambda x: x[0] + x[2]) # concateno si existe mas de un aa que cambia (o sea, si es un rango)

        # to: aa/s nuevos
        if conseq == "nonsense":
            df_crop['to'] = "Ter"
        else:
            df_crop['to'] = df_crop['aux'].map(lambda x: x[4] if x[4] != '' else np.nan)

    # consecuencia de la mutacion
    df_crop['consequence'] = conseq

    df_crop = df_crop.drop(columns=['aux'])

    return df_crop[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]

In [4]:
delins = separar_en_cols(box1_clinvar_total, "cambio", "delins", "delins")
print(delins.shape)

(94, 6)


In [5]:
deletions = separar_en_cols(box1_clinvar_total, "cambio", "deletion", "del$") # finish with 'del'
print(deletions.shape)

(396, 6)


In [6]:
insertions = separar_en_cols(box1_clinvar_total, "cambio", "insertion", "(?<!del)ins") # Negative lookbehind search!
print(insertions.shape)

(143, 6)


In [281]:
frameshift = separar_en_cols(box1_clinvar_total, "cambio", "frameshit", "fs$")
print(frameshift.shape)

(2757, 6)


In [31]:
nonsense = separar_en_cols(box1_clinvar_total, "cambio", "nonsense", "(?<=\d)Ter") # positiv lookbehind search! must have a number before, some delins insert a Ter
print(nonsense.shape)

(1621, 6)


In [30]:
missense = separar_en_cols(box1_clinvar_total, "cambio", "missense", '^([A-Z][a-z]{2})(\d+)(?!Ter)([A-Z][a-z]{2})$', override=True)
print(missense.shape)


(17953, 6)


  return func(self, *args, **kwargs)


In [209]:
duplications = separar_en_cols(box1_clinvar_total, "cambio", "duplication", "dup")
duplications.shape

(108, 6)

In [234]:
ix_targets_lists = [list(x) for x in [delins.index, deletions.index, insertions.index, frameshift.index, nonsense.index, missense.index, duplications.index]]
ix_targets = [y for x in ix_targets_lists for y in x]

In [235]:
cond = box1_clinvar_total.index.isin(ix_targets)                       # es un array de bool
box1_clinvar_left = box1_clinvar_total.drop(box1_clinvar_total[cond].index)   # drop esas filas
len(box1_clinvar_left)

59

In [236]:
print(box1_clinvar_total.shape)
print(box1_clinvar_left.shape)

(23131, 22)
(59, 22)


## Take care of weird cases here

In [237]:
# elimino los '?' y los Nan
box1_clinvar_left = box1_clinvar_left[(box1_clinvar_left.cambio != '?') & (box1_clinvar_left.cambio != '(?') & (box1_clinvar_left.cambio.notnull())]

In [265]:
def delete_brackets(x):
    if re.search('\[\d+\]', x):
        z = re.search('(\[\d+\])', x)
        mlen = len(z.groups()[0])
        return x[:-mlen]
    else:
        return x

box1_clinvar_left.cambio = box1_clinvar_left.cambio.apply(delete_brackets)

In [228]:
# elimino los parentesis con el nro del final usando una regex
# box1_clinvar_left.cambio = box1_clinvar_left.cambio.map(lambda x: re.findall('^(\d+_?\d+?[A-Za-z][A-Za-z]*[A-Za-z])', x))
# box1_clinvar_left.cambio = box1_clinvar_left.cambio.str[0]

In [266]:
def separar_en_cols_raros(df, column, conseq, conseq_regex, override=False):
    '''
    recibe un DataFrame, el nombre de una columna auxiliar (column)
    y un string con el tipo de consecuencia (conseq). La col. auxiliar
    es una tupla con los elementos implicados en una mutacion
    como la siguiente (aa1, start_pos, aa2, end_pos, aa/s_nuevos).
    Devuelve el DataFrame df con estas 5 nuevas columnas
    '''
    df_crop = df[df[column].str.contains(conseq_regex)].copy()
      
    if override:
        df_crop['aux'] = df_crop[column].str.findall(conseq_regex).str[0]
    else:
        df_crop['aux'] = df_crop[column].str.findall('^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?'+conseq_regex+'(.*)$').str[0]
            
    # start position
    df_crop['start_aa'] = df_crop['aux'].map(lambda x: x[0])
    df_crop.start_aa = df_crop.start_aa.apply(int)
    
    # end position
    df_crop['end_aa'] = df_crop['aux'].map(lambda x: int(x[1]) if x[1] != '' else np.nan)
    
    # from: es el/los aa que cambian
    df_crop['from'] = df_crop['aux'].map(lambda x: x[2]) # concateno si existe mas de un aa que cambia (o sea, si es un rango)
    df_crop['from'] = df_crop['from'].map(lambda x: seq3(x))
    df_crop['from'] = df_crop['from'].apply(str)
    
    df_crop['to'] = np.nan
        
    # consecuencia de la mutacion
    df_crop['consequence'] = conseq

    df_crop = df_crop.drop(columns=['aux'])

    return df_crop[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]

In [267]:
box1_clinvar_left.cambio

926          1540_1541DE
1865           604_605EK
5387           113_114PQ
5911           513_514LV
8786          329_331SGG
9075       138_143GQQQSY
9259           178_179RS
9829           622_623TL
10043      1053_1056SGGG
10181            1118SGG
11175    454_461GYGGDRGG
11178    454_461GYGGDRGG
14922         219_221QGS
15031         772_774KNP
17057         646_648GLG
17108       648_652GGLGV
17179      479_484VAPGVG
17217          501VGVAPG
24725         38_42SGPEE
25114         38_42SGPEE
25454         38_42SGPEE
25967      Ile68Leufs*41
25969     Asp107Valfs*54
25975          191_192HP
25997            72_73HP
26004              191HP
26092              191HP
26223          237_238PA
27095          229_230GP
27327          229_230GP
27328          229_230GP
27488          229_230GP
27676          229_230GP
27733          229_230GP
28139          229_230GP
28238          229_230GP
28561          229_230GP
28562          229_230GP
28563          229_230GP
28564          229_230GP


In [268]:
dup = box1_clinvar_left[box1_clinvar_left.type == 'Duplication']
dup_raros = separar_en_cols_raros(dup, "cambio", "duplication", '^(\d+)_?(\d+)?([A-Za-z]*)', override=True)
dup_raros.shape

(12, 6)

In [269]:
# agrego estas entradas a la tabla de duplications
duplications2 = pd.concat([duplications, dup_raros])

In [270]:
delet = box1_clinvar_left[box1_clinvar_left.type == 'Deletion']

delet2 = separar_en_cols_raros(delet, "cambio", "deletion", '^(\d+)_?(\d+)?([A-Za-z]*)', override=True)
delet2

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
5387,113_114PQ,113,114.0,ProGln,,deletion
8786,329_331SGG,329,331.0,SerGlyGly,,deletion
9259,178_179RS,178,179.0,ArgSer,,deletion
10043,1053_1056SGGG,1053,1056.0,SerGlyGlyGly,,deletion
11175,454_461GYGGDRGG,454,461.0,GlyTyrGlyGlyAspArgGlyGly,,deletion
11178,454_461GYGGDRGG,454,461.0,GlyTyrGlyGlyAspArgGlyGly,,deletion
14922,219_221QGS,219,221.0,GlnGlySer,,deletion
17108,648_652GGLGV,648,652.0,GlyGlyLeuGlyVal,,deletion
17179,479_484VAPGVG,479,484.0,ValAlaProGlyValGly,,deletion
17217,501VGVAPG,501,,ValGlyValAlaProGly,,deletion


In [271]:
# agrego la posicion de fin faltante (o más, busca Nan en el end_aa)
ix = delet2.end_aa.isna()
for i in np.where(ix)[0]:
    delet2['end_aa'].iloc[i] = delet2.start_aa.iloc[i] + len(delet2['from'].iloc[i]) / 3 - 1

In [272]:
# agrego estas entradas a la tabla de deletions
deletions2 = pd.concat([deletions, delet2])

In [273]:
inser = box1_clinvar_left[box1_clinvar_left.type == 'Insertion']
inser.shape

(3, 22)

In [274]:
insert2 = separar_en_cols_raros(inser, "cambio", "insertion", '^(\d+)_?(\d+)?([A-Za-z]*)', override=True)
insert2

  return func(self, *args, **kwargs)


Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
27676,229_230GP,229,230.0,GlyPro,,insertion
29342,229GP,229,,GlyPro,,insertion
40795,60_67PHGGGWGQ,60,67.0,ProHisGlyGlyGlyTrpGlyGln,,insertion


In [275]:
# agrego estas entradas a la tabla de insertions
insertions2 = pd.concat([insertions, insert2])
insertions2.shape

(146, 6)

In [339]:

ix_targets_lists = [list(x) for x in [delins.index, deletions2.index, insertions2.index, frameshift.index, nonsense.index, missense.index, duplications2.index]]
ix_targets = [y for x in ix_targets_lists for y in x]
cond = box1_clinvar_total.index.isin(ix_targets)                       # es un array de bool
box1_clinvar_leftovers = box1_clinvar_total.drop(box1_clinvar_total[cond].index)   # drop esas filas
box1_clinvar_leftovers = box1_clinvar_leftovers[(box1_clinvar_leftovers.cambio != '?') & (box1_clinvar_leftovers.cambio != '(?') & (box1_clinvar_leftovers.cambio.notnull())]
box1_clinvar_leftovers.cambio = box1_clinvar_leftovers.cambio.apply(delete_brackets)
box1_clinvar_leftovers.shape

(20, 22)

In [340]:
box1_clinvar_leftovers.cambio

926         1540_1541DE
1865          604_605EK
5911          513_514LV
9075      138_143GQQQSY
9829          622_623TL
10181           1118SGG
25967     Ile68Leufs*41
25969    Asp107Valfs*54
25975         191_192HP
25997           72_73HP
26004             191HP
26092             191HP
26223         237_238PA
27327         229_230GP
27488         229_230GP
28139         229_230GP
28956         229_230GP
33836        343_345DFS
38213      2760_2762NLQ
40137        596_598DDE
Name: cambio, dtype: object

In [341]:
leftovers2check = separar_en_cols_raros(box1_clinvar_leftovers, "cambio", "to_check", '^(\d+)_?(\d+)?([A-Za-z]*)', override=True )
leftovers2check

  return func(self, *args, **kwargs)


Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence
926,1540_1541DE,1540,1541.0,AspGlu,,to_check
1865,604_605EK,604,605.0,GluLys,,to_check
5911,513_514LV,513,514.0,LeuVal,,to_check
9075,138_143GQQQSY,138,143.0,GlyGlnGlnGlnSerTyr,,to_check
9829,622_623TL,622,623.0,ThrLeu,,to_check
10181,1118SGG,1118,,SerGlyGly,,to_check
25975,191_192HP,191,192.0,HisPro,,to_check
25997,72_73HP,72,73.0,HisPro,,to_check
26004,191HP,191,,HisPro,,to_check
26092,191HP,191,,HisPro,,to_check


In [342]:
# # separo en una col auxiliar y borro los Nan (son 2 frameshift extraños! preguntar a Alvaro como lo quiere solucionar)
# box1_clinvar_leftovers['aux'] = box1_clinvar_leftovers.cambio.map(lambda x: re.findall('^(\d+)_?(\d+)?([A-Za-z]*)', x))
# box1_clinvar_leftovers['aux'] = box1_clinvar_leftovers.aux.str[0]
# box1_clinvar_leftovers = box1_clinvar_leftovers[box1_clinvar_leftovers['aux'].notnull()]


In [343]:
# # si el rango de de las posiciones coincide con el nro de letras: es delecion
# l = []
# for i in box1_clinvar_leftovers.index:
#     if box1_clinvar_leftovers.loc[i]["aux"] != np.nan:
#         start = int(box1_clinvar_leftovers.loc[i]["aux"][0])
#         try:
#             end = int(box1_clinvar_leftovers.loc[i]["aux"][1])
#         except:
#             pass
#         length = end - start + 1
#         aa = len(box1_clinvar_leftovers.loc[i]["aux"][2])
#         l.append(length == aa)
#     else:
#         print("flop")

# box1_clinvar_leftovers['is_del'] = l
# box1_clinvar_leftovers[['cambio', 'aux', 'is_del']]

# si el rango de de las posiciones coincide con el nro de letras: es delecion
l = []
for i in leftovers2check.index:
    start = int(leftovers2check.loc[i]["start_aa"])
    try:
        end = int(leftovers2check.loc[i]["end_aa"])
    except:
        pass
    length = end - start + 1
    aa = len(leftovers2check.loc[i]["from"]) / 3
    l.append(length == aa)


leftovers2check['is_del'] = l
leftovers2check

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence,is_del
926,1540_1541DE,1540,1541.0,AspGlu,,to_check,True
1865,604_605EK,604,605.0,GluLys,,to_check,True
5911,513_514LV,513,514.0,LeuVal,,to_check,True
9075,138_143GQQQSY,138,143.0,GlyGlnGlnGlnSerTyr,,to_check,True
9829,622_623TL,622,623.0,ThrLeu,,to_check,True
10181,1118SGG,1118,,SerGlyGly,,to_check,False
25975,191_192HP,191,192.0,HisPro,,to_check,True
25997,72_73HP,72,73.0,HisPro,,to_check,True
26004,191HP,191,,HisPro,,to_check,False
26092,191HP,191,,HisPro,,to_check,False


In [372]:
delet3 = leftovers2check[leftovers2check.is_del == True]
delet3["consequence"] = "deletion"
delet3 = delet3.drop(columns=["is_del"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [361]:
deletions3 = pd.concat([deletions2, delet3])


In [363]:
delins2 = leftovers2check[leftovers2check.is_del == False]
delins2.consequence = "delins"
delins2

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence,is_del
10181,1118SGG,1118,,SerGlyGly,,delins,False
26004,191HP,191,,HisPro,,delins,False
26092,191HP,191,,HisPro,,delins,False


In [364]:
delins3 = pd.concat([delins, delins2])
delins3

Unnamed: 0,cambio,start_aa,end_aa,from,to,consequence,is_del
673,Gly144delinsTrpAspValHisSerGlnTer,144,,Gly,TrpAspValHisSerGlnTer,delins,
815,Gln18_Gln38delinsGlnGlnGlnGlnGlnGlnGlnGlnGlnGl...,18,38.0,GlnGln,GlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnG...,delins,
1153,Lys1409_Ala1410delinsAsn,1409,1410.0,LysAla,Asn,delins,
1501,Arg2345_Glu2346delinsLys,2345,2346.0,ArgGlu,Lys,delins,
2093,Lys738_Gly739delinsSer,738,739.0,LysGly,Ser,delins,
...,...,...,...,...,...,...,...
38702,Glu132_His134delinsAlaAsn,132,134.0,GluHis,AlaAsn,delins,
39989,Arg208delinsAsnTer,208,,Arg,AsnTer,delins,
10181,1118SGG,1118,,SerGlyGly,,delins,False
26004,191HP,191,,HisPro,,delins,False


## Concatenate everything

In [370]:
tables = [deletions3, delins3, duplications2, frameshift, insertions2, missense, nonsense]

print(f"Found {deletions3.shape[0]} deletions")
print(f"Found {delins3.shape[0]} delins")
print(f"Found {duplications2.shape[0]} duplications")
print(f"Found {frameshift.shape[0]} frameshift")
print(f"Found {insertions2.shape[0]} inserions")
print(f"Found {missense.shape[0]} missense")
print(f"Found {nonsense.shape[0]} nonsense")

Found 430 deletions
Found 97 delins
Found 120 duplications
Found 2757 frameshift
Found 146 inserions
Found 17953 missense
Found 1621 nonsense


In [366]:
# tabla final
mutations = pd.concat(tables)
len(mutations)

23124

In [367]:
# etiqueta
mutations['source'] = 'clinvar'

In [368]:
mutations[['cambio', 'consequence','start_aa','end_aa','from','to', 'source']]

Unnamed: 0,cambio,consequence,start_aa,end_aa,from,to,source
86,His156_Gln168del,deletion,156,168.0,HisGln,,clinvar
233,Val148_Thr173del,deletion,148,173.0,ValThr,,clinvar
404,Lys2944del,deletion,2944,,Lys,,clinvar
603,Leu1141del,deletion,1141,,Leu,,clinvar
869,Glu1464del,deletion,1464,,Glu,,clinvar
...,...,...,...,...,...,...,...
40961,Gln35Ter,nonsense,35,,Gln,Ter,clinvar
40963,Arg337Ter,nonsense,337,,Arg,Ter,clinvar
40965,Arg73Ter,nonsense,73,,Arg,Ter,clinvar
40976,Arg184Ter,nonsense,184,,Arg,Ter,clinvar


In [369]:
mutations.to_csv('clinvar_box1_mutations_processed.csv.gz', index= False, compression='gzip')