In [None]:
import pandas as pd
import numpy as np
import sys, os
from collections import defaultdict

# Input locations, given as relative paths (resolved against the working directory).
llpsprots = "../../llps_human_all_proteins.csv"  # NOTE(review): not referenced in the cells visible here
# Gzip-compressed, tab-separated COSMIC export; it is streamed line by line further down.
datafile  = "../../cosmic/CosmicMutantExport.tsv.gz"

# Tab-separated mapping files; load_mapping() reads the identifier in the second column.
# Presumably UniProt -> Ensembl protein / transcript IDs, judging by the file names.
mapfile_ENSP = "../llps_uniprot2ENSP.tab.txt"
mapfile_ENST = "../llps_uniprot2ENST.tab.txt"

def load_mapping(mapfile):
    """Collect the identifiers found in the second column of a mapping file.

    The file is read as tab-separated text and its first line is treated as
    a header and skipped. Blank lines and lines without a (non-empty) second
    column are ignored instead of raising ``IndexError``.

    Parameters
    ----------
    mapfile : str
        Path to the tab-separated mapping file.

    Returns
    -------
    collections.defaultdict
        Maps each identifier seen in the second column to ``True``. Looking
        up an unknown key with ``d[key]`` returns ``False`` (and, being a
        defaultdict, stores that key).

    A warning is printed for every identifier that appears more than once.
    """
    mapdict = defaultdict(lambda: False)
    with open(mapfile) as infmt:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(infmt, None)
        for line in infmt:
            # Strip only the line terminator so an empty first column cannot
            # shift the identifier into position 0.
            arr = line.rstrip("\r\n").split("\t")
            if len(arr) < 2:
                continue
            key = arr[1].strip()
            if not key:
                continue
            if key in mapdict:
                print(f"WARNING: {key} already in dict")
            else:
                mapdict[key] = True
    return mapdict
                
# Membership tables built from the two mapping files: identifier -> True,
# with False returned for identifiers that are not listed.
ENSP_dict = load_mapping(mapfile_ENSP)
ENST_dict = load_mapping(mapfile_ENST)

In [None]:
# Read the COSMIC file line by line; it is too heavy to load entirely in memory.
# Rows whose HGVSP protein ID or HGVSC transcript ID is present in the LLPS
# mapping tables are written, restricted to the `select` columns, to COSMIC_crop.txt.

import gzip

select = ["Gene name", "Accession Number", "HGNC ID", "Primary site", 
          "Primary histology", "Genome-wide screen", "GENOMIC_MUTATION_ID", "LEGACY_MUTATION_ID",
          "MUTATION_ID", "Mutation CDS", "Mutation AA", "Mutation Description", "GRCh", 
          "Mutation genome position", "SNP", "Mutation somatic status", "Pubmed_PMID", "Age",
          "HGVSP", "HGVSC", "HGVSG"]

all_cols = list()
with gzip.open(datafile) as ifile, open("COSMIC_crop.txt", 'w') as ofile:
    headers = next(ifile).decode().rstrip('\r\n').split("\t")
    # Position of every selected column in the full COSMIC header
    # (raises ValueError if a selected column is missing from the export).
    ix = [headers.index(x) for x in select]
    # Positions inside `cols`, looked up by name instead of hard-coded 18 / 19.
    hgvsp_pos = select.index("HGVSP")
    hgvsc_pos = select.index("HGVSC")
    ofile.write("\t".join(select)+"\n")
    for i, line in enumerate(ifile):
        # Only malformed input lines are skipped; anything else (e.g. a failed
        # write) is allowed to surface instead of being silently swallowed.
        try:
            arr = line.decode().rstrip("\r\n").split("\t")
            cols = [arr[j] for j in ix]
        except (UnicodeDecodeError, IndexError):
            print(f"Error at line {i}: {line}")
            continue

        # Reset per row: otherwise a row with an empty HGVSP/HGVSC would be
        # filtered using the identifiers left over from the previous row.
        ENSP_id = None
        ENST_id = None
        if cols[hgvsp_pos] != "p.?" and cols[hgvsp_pos] != "":
            ENSP_id = cols[hgvsp_pos].split(".")[0]
        if cols[hgvsc_pos] != "":
            ENST_id = cols[hgvsc_pos].split(".")[0]

        # Filter only mutations in our LLPS dataset.
        # .get() avoids inserting every missed ID into the defaultdicts.
        if ENSP_dict.get(ENSP_id, False) or ENST_dict.get(ENST_id, False):
            ofile.write("\t".join(cols)+"\n")

In [None]:
from collections import defaultdict

# Bucket every row of the cropped COSMIC table by the value of its field 11
# ("Mutation Description" in the column order the crop file was written with).
mut_dict = defaultdict(list)

with open("COSMIC_crop.txt") as crop_file:
    headers = next(crop_file)  # header line, kept as the raw string
    split_rows = (record.rstrip("\n").split("\t") for record in crop_file)
    for fields in split_rows:
        mut_dict[fields[11]].append(fields)
        

In [None]:
# Report how many rows fell into each bucket of mut_dict.
for description, rows in mut_dict.items():
    print(f"{description}: {len(rows)} mutations")

In [None]:
import pandas as pd
import re 
import numpy as np

# Column names used when turning the rows stored in mut_dict into DataFrames.
# The list repeats, in the same order, the `select` list used to write COSMIC_crop.txt;
# the two must be kept in sync by hand.
headers = ["Gene name", "Accession Number", "HGNC ID", "Primary site", 
          "Primary histology", "Genome-wide screen", "GENOMIC_MUTATION_ID", "LEGACY_MUTATION_ID",
          "MUTATION_ID", "Mutation CDS", "Mutation AA", "Mutation Description", "GRCh", 
          "Mutation genome position", "SNP", "Mutation somatic status", "Pubmed_PMID", "Age",
          "HGVSP", "HGVSC", "HGVSG"]



In [None]:
def separar_en_cols(df, column, conseq, conseq_regex, override=False):
    '''
    Split protein-change strings into position / residue columns.

    Rows of ``df`` whose ``column`` value matches ``conseq_regex`` are kept and
    parsed with ``str.findall`` into a tuple. With ``override=False`` the tuple
    has the form (aa1, start_pos, aa2, end_pos, new_aa/s): ``conseq_regex`` is
    wrapped between a three-letter residue/position prefix and a trailing
    ``(.*)``. With ``override=True`` ``conseq_regex`` is used as the complete
    pattern and must supply the capture groups itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a column named 'cambio' (the returned
        frame always selects 'cambio', whatever ``column`` is).
    column : str
        Name of the column holding the change strings.
    conseq : str
        Label stored in the 'consequence' column. "missense" and "nonsense"
        read the tuple as (from, position, ...) with end == start; "deletion"
        gets an empty 'to'; any other label reads 'to' from the fifth group.
    conseq_regex : str
        Regular expression identifying (and, with ``override``, parsing) the rows.
    override : bool
        Use ``conseq_regex`` as the full parsing pattern.

    Returns
    -------
    pandas.DataFrame
        Columns 'cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence',
        indexed like the matching rows of ``df``. 'start_aa' is an integer,
        consistent with separar_en_cols_missense.

    NOTE(review): a later cell of this notebook defines a function with this
    same name again; only one definition should be kept.
    '''
    prefix = r'^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?'
    pattern = conseq_regex if override else prefix + conseq_regex + r'(.*)$'

    # na=False: rows with a missing change string are simply not selected.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    mask = df[column].str.contains(conseq_regex, na=False).astype(bool)
    df_crop = df[mask].copy()

    df_crop['aux'] = df_crop[column].str.findall(pattern).str[0]

    # A row can match conseq_regex yet fail the full parsing pattern; such
    # rows have no tuple to read from and are reported and dropped.
    unparsed = df_crop['aux'].isna()
    if unparsed.any():
        print(f"WARNING: {int(unparsed.sum())} '{conseq}' rows could not be parsed and were dropped")
        df_crop = df_crop[~unparsed].copy()

    # start position (as an integer)
    df_crop['start_aa'] = df_crop['aux'].map(lambda x: int(x[1]))

    if conseq == "missense" or conseq == "nonsense":
        df_crop['end_aa'] = df_crop['start_aa']
        df_crop['from'] = df_crop['aux'].map(lambda x: x[0])
        if conseq == "nonsense":
            # With the default pattern the third group is empty for strings
            # such as "Arg213Ter", so the new residue is set explicitly.
            df_crop['to'] = "Ter"
        else:
            df_crop['to'] = df_crop['aux'].map(lambda x: x[2])
    else:
        # end position (NaN when the change is not a range)
        df_crop['end_aa'] = df_crop['aux'].map(lambda x: int(x[3]) if x[3] != '' else np.nan)

        # from: residue(s) that change; both ends are concatenated for a range
        df_crop['from'] = df_crop['aux'].map(lambda x: x[0] + x[2])

        # to: new residue(s)
        if conseq == "deletion":
            df_crop['to'] = ""
        else:
            df_crop['to'] = df_crop['aux'].map(lambda x: x[4] if x[4] != '' else np.nan)

    # consequence of the mutation
    df_crop['consequence'] = conseq

    return df_crop[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]

def seq3(seq):
    """Spell out a one-letter amino-acid sequence with three-letter codes.

    Every symbol of ``seq`` is looked up in the table below and the codes are
    concatenated. Symbols that are not in the table (lower-case letters
    included) become "Xaa"; an empty input gives an empty string.
    """
    three_letter_code = {
        "A": "Ala", "C": "Cys", "D": "Asp", "E": "Glu", "F": "Phe",
        "G": "Gly", "H": "His", "I": "Ile", "K": "Lys", "L": "Leu",
        "M": "Met", "N": "Asn", "P": "Pro", "Q": "Gln", "R": "Arg",
        "S": "Ser", "T": "Thr", "V": "Val", "W": "Trp", "Y": "Tyr",
        # ambiguity codes, rare residues and the stop symbol
        "B": "Asx", "X": "Xaa", "Z": "Glx", "J": "Xle",
        "U": "Sec", "O": "Pyl", "*": "Ter",
    }

    pieces = []
    for symbol in seq:
        if symbol in three_letter_code:
            pieces.append(three_letter_code[symbol])
        else:
            pieces.append("Xaa")
    return "".join(pieces)

In [None]:

def separar_en_cols_missense(df, column, conseq, conseq_regex, override=False):
    """Parse single-position changes written with one-letter residue codes.

    Rows of ``df`` whose ``column`` value matches ``conseq_regex`` are kept and
    parsed with ``str.findall``. The resulting tuple is read as
    (from_residue, position, to_residue); both residues are passed through
    ``seq3`` to obtain three-letter codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a column named 'cambio' (the returned
        frame always selects 'cambio').
    column : str
        Name of the column holding the change strings.
    conseq : str
        Label stored in the 'consequence' column.
    conseq_regex : str
        Regular expression selecting the rows; with ``override=True`` it is
        also the parsing pattern and must provide the three capture groups.
    override : bool
        Use ``conseq_regex`` as the full parsing pattern. With ``False`` it is
        wrapped in a three-letter-code prefix; NOTE(review): that prefix does
        not look compatible with the one-letter ``seq3`` conversion applied
        below - confirm before calling with ``override=False``.

    Returns
    -------
    pandas.DataFrame
        Columns 'cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence';
        'start_aa' and 'end_aa' hold the same integer position.
    """
    prefix = r'^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?'
    pattern = conseq_regex if override else prefix + conseq_regex + r'(.*)$'

    # na=False: rows with a missing change string are simply not selected.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    mask = df[column].str.contains(conseq_regex, na=False).astype(bool)
    df_crop = df[mask].copy()

    df_crop['aux'] = df_crop[column].str.findall(pattern).str[0]

    # start position
    df_crop['start_aa'] = df_crop['aux'].map(lambda x: x[1])
    df_crop['start_aa'] = df_crop['start_aa'].apply(int)

    # end position: a single residue is affected, so it equals the start
    df_crop['end_aa'] = df_crop['start_aa']

    # from: original residue, converted to its three-letter code
    df_crop['from'] = df_crop['aux'].map(lambda x: seq3(x[0]))
    df_crop['from'] = df_crop['from'].apply(str)

    # to: new residue, converted to its three-letter code
    df_crop['to'] = df_crop['aux'].map(lambda x: seq3(x[2]))
    df_crop['to'] = df_crop['to'].apply(str)

    # consequence of the mutation
    df_crop['consequence'] = conseq

    df_crop = df_crop.drop(columns=['aux'])

    return df_crop[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]

## Process MISSENSE

In [None]:
df_tmp = pd.DataFrame(mut_dict["Substitution - Missense"], columns=headers)

# Keep only what follows "p." in HGVSP; rows without a "p." part become NaN.
# (str.lstrip('p.') strips a set of characters rather than a prefix, so the
# prefix is removed through the regular expression instead.)
df_tmp['cambio'] = df_tmp["HGVSP"].str.extract(r'p\.(.*)$', expand=False)

# separate those that don't have HGVSP
ix_nulls = df_tmp["cambio"].isnull()
df_nulls = df_tmp[ix_nulls].copy()

# classic missense processing (three-letter codes, "Ter" excluded as target)
missense = separar_en_cols(df_tmp[~ix_nulls], "cambio", "missense", r'^([A-Z][a-z]{2})(\d+)(?!Ter)([A-Z][a-z]{2})$', override=True)
df_tmp = df_tmp.drop(columns=["cambio"])

# Now process those that have no HGVSP but carry some information in "Mutation AA".
# Small check in case something is not a missense mutation: `check` is True where
# "Mutation AA" looks like p.<letter><digits><letter>. (re.findall returns a list,
# never null, so testing its result with isnull() could not detect anything.)
check = df_nulls["Mutation AA"].str.contains(r'p\.[A-Z]\d+[A-Z]$', na=False).astype(bool)
if (~check).any():
    print("Warning MISSENSE! some nulls or errors here")

# continue with mutation processing
ix_X = df_nulls["Mutation AA"].str.contains(r'p\.[A-Z]\d+X$', na=False).astype(bool)  # discard mutations to X
df_pass = df_nulls[~ix_X].copy()
df_pass["cambio"] = df_pass["Mutation AA"].str.replace(r'^p\.', '', regex=True)
df_done = separar_en_cols_missense(df_pass, "cambio", "missense", r'^([A-Z])(\d+)([A-Z])$', override=True)

In [None]:
# Stack the rows parsed from HGVSP and from "Mutation AA", then attach the
# parsed columns to the source rows by index.
all_missense = pd.concat([missense, df_done])
missense_df = df_tmp.merge(all_missense, left_index=True, right_index=True)
for frame in (missense, df_done, all_missense, missense_df):
    print(frame.shape)



In [None]:
# Write the merged missense table as a gzip-compressed, comma-separated file (index not written).
missense_df.to_csv("llps_missense_cosmic.csv.gz", sep=",", header=True, index=False, compression='gzip')

In [None]:
# Drop the references to the missense tables now that they are written to disk.
del missense_df, all_missense

## Process NONSENSE

In [None]:
### Process Nonsense

df_tmp = pd.DataFrame(mut_dict["Substitution - Nonsense"], columns=headers)

# Keep only what follows "p." in HGVSP; rows without a "p." part become NaN.
# (str.lstrip('p.') strips a set of characters rather than a prefix, so the
# prefix is removed through the regular expression instead.)
df_tmp['cambio'] = df_tmp["HGVSP"].str.extract(r'p\.(.*)$', expand=False)

# separate those that don't have HGVSP
ix_nulls = df_tmp["cambio"].isnull()
df_nulls = df_tmp[ix_nulls].copy()

# classic nonsense processing: "Ter" directly after a position
nonsense = separar_en_cols(df_tmp[~ix_nulls], "cambio", "nonsense", r"(?<=\d)Ter", override=False)
df_tmp = df_tmp.drop(columns=["cambio"])

# Now process those that have no HGVSP but carry some information in "Mutation AA".
# Small check in case something is not a nonsense mutation: `check` is True where
# "Mutation AA" looks like p.<letter><digits>*. (re.findall returns a list,
# never null, so testing its result with isnull() could not detect anything.)
check = df_nulls["Mutation AA"].str.contains(r'p\.[A-Z]\d+\*$', na=False).astype(bool)
if (~check).any():
    print("Warning NONSENSE! some nulls or errors here")

# continue with mutation processing
df_nulls["cambio"] = df_nulls["Mutation AA"].str.replace(r'^p\.', '', regex=True)
df_done = separar_en_cols_missense(df_nulls, "cambio", "nonsense", r'^([A-Z])(\d+)(\*)$', override=True)

In [None]:
# Row/column counts of the two nonsense parsing routes.
for frame in (nonsense, df_done):
    print(frame.shape)

In [None]:
# Stack both parsing routes and attach the parsed columns to the source rows by index.
all_nonsense = pd.concat([nonsense, df_done])
nonsense_df = df_tmp.merge(all_nonsense, left_index=True, right_index=True)

In [None]:
# Write the merged nonsense table as a gzip-compressed, comma-separated file (index not written).
nonsense_df.to_csv("llps_nonsense_cosmic.csv.gz", sep=",", header=True, index=False, compression='gzip')

In [None]:
# Size of the merged nonsense table (rows, columns).
print(nonsense_df.shape)

In [None]:
# Drop the references to the nonsense tables now that they are written to disk.
del nonsense_df, all_nonsense

## Process DELETIONS

In [None]:
### Process Deletions

df_tmp = pd.DataFrame(mut_dict["Deletion - In frame"], columns=headers)

# Keep only what follows "p." in HGVSP; rows without a "p." part become NaN.
# (str.lstrip('p.') strips a set of characters rather than a prefix, so the
# prefix is removed through the regular expression instead.)
# 'cambio' is left on df_tmp: the inspection cells further down read it.
df_tmp['cambio'] = df_tmp["HGVSP"].str.extract(r'p\.(.*)$', expand=False)

# separate those that don't have HGVSP
ix_nulls = df_tmp["cambio"].isnull()
df_nulls = df_tmp[ix_nulls].copy()
df_notnulls = df_tmp[~ix_nulls].copy()

# <Aaa><start>[_<Aaa><end>]del : a single residue or a range, nothing after "del"
deletion_regex = r'^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?del$'
ix_right = df_notnulls["cambio"].str.match(deletion_regex, na=False).astype(bool)
df_weird = df_notnulls[~ix_right].copy()

# parse the well-formed deletions
deletions = separar_en_cols(df_notnulls[ix_right], "cambio", "deletion", deletion_regex, override=True)

# TODO: rows without HGVSP (df_nulls) and rows that do not fit the plain
# deletion pattern (df_weird) are only set aside here; this cell does not
# parse them.

In [None]:
# Preview the first 50 parsed deletions.
deletions.head(50)

In [None]:
def separar_en_cols(df, column, conseq, conseq_regex, override=False):
    '''
    Split protein-change strings into position / residue columns.

    Rows of ``df`` whose ``column`` value matches ``conseq_regex`` are kept and
    parsed with ``str.findall`` into a tuple. With ``override=False`` the tuple
    has the form (aa1, start_pos, aa2, end_pos, new_aa/s): ``conseq_regex`` is
    wrapped between a three-letter residue/position prefix and a trailing
    ``(.*)``. With ``override=True`` ``conseq_regex`` is used as the complete
    pattern and must supply the capture groups itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a column named 'cambio' (the returned
        frame always selects 'cambio', whatever ``column`` is).
    column : str
        Name of the column holding the change strings.
    conseq : str
        Label stored in the 'consequence' column. "missense" and "nonsense"
        read the tuple as (from, position, ...) with end == start; "deletion"
        gets an empty 'to'; any other label reads 'to' from the fifth group.
    conseq_regex : str
        Regular expression identifying (and, with ``override``, parsing) the rows.
    override : bool
        Use ``conseq_regex`` as the full parsing pattern.

    Returns
    -------
    pandas.DataFrame
        Columns 'cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence',
        indexed like the matching rows of ``df``. 'start_aa' is an integer,
        consistent with separar_en_cols_missense.

    NOTE(review): an earlier cell of this notebook already defines a function
    with this same name; only one definition should be kept.
    '''
    prefix = r'^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?'
    pattern = conseq_regex if override else prefix + conseq_regex + r'(.*)$'

    # na=False: rows with a missing change string are simply not selected.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    mask = df[column].str.contains(conseq_regex, na=False).astype(bool)
    df_crop = df[mask].copy()

    df_crop['aux'] = df_crop[column].str.findall(pattern).str[0]

    # A row can match conseq_regex yet fail the full parsing pattern; such
    # rows have no tuple to read from and are reported and dropped.
    unparsed = df_crop['aux'].isna()
    if unparsed.any():
        print(f"WARNING: {int(unparsed.sum())} '{conseq}' rows could not be parsed and were dropped")
        df_crop = df_crop[~unparsed].copy()

    # start position (as an integer)
    df_crop['start_aa'] = df_crop['aux'].map(lambda x: int(x[1]))

    if conseq == "missense" or conseq == "nonsense":
        df_crop['end_aa'] = df_crop['start_aa']
        df_crop['from'] = df_crop['aux'].map(lambda x: x[0])
        if conseq == "nonsense":
            # With the default pattern the third group is empty for strings
            # such as "Arg213Ter", so the new residue is set explicitly.
            df_crop['to'] = "Ter"
        else:
            df_crop['to'] = df_crop['aux'].map(lambda x: x[2])
    else:
        # end position (NaN when the change is not a range)
        df_crop['end_aa'] = df_crop['aux'].map(lambda x: int(x[3]) if x[3] != '' else np.nan)

        # from: residue(s) that change; both ends are concatenated for a range
        df_crop['from'] = df_crop['aux'].map(lambda x: x[0] + x[2])

        # to: new residue(s)
        if conseq == "deletion":
            df_crop['to'] = ""
        else:
            df_crop['to'] = df_crop['aux'].map(lambda x: x[4] if x[4] != '' else np.nan)

    # consequence of the mutation
    df_crop['consequence'] = conseq

    return df_crop[['cambio', 'start_aa', 'end_aa', 'from', 'to', 'consequence']]


In [None]:
# Print the four captured groups (aa1, start, aa2, end) of every well-formed deletion.
conseq_regex = 'del'
# Raw string (no invalid "\d" escape), compiled once outside the loop.
deletion_pattern = re.compile(r'^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?del$')
for e in df_notnulls[ix_right]["cambio"]:
    m = deletion_pattern.search(e)
    if m is not None:
        print(m.group(1), m.group(2), m.group(3), m.group(4))
    else:
        print(m)

In [None]:
# First 14 deletion rows that have an HGVSP-derived change string.
df_tmp.loc[~ix_nulls].head(14)

In [None]:
# First 40 change strings among the rows that have one.
df_tmp.loc[~ix_nulls, "cambio"].head(40)

In [None]:
# Show the tuples that str.findall yields for the first 100 well-formed deletions.
# The deletion pattern has four capture groups (aa1, start, aa2, end), so there is
# no fifth element: indexing x[4] raised IndexError. The whole tuple is printed instead.
aux = df_notnulls[ix_right]["cambio"].str.findall(r'^([A-Z][a-z]{2})(\d+)_?([A-Z][a-z]{2})?(\d+)?del$').str[0].iloc[0:100]
for x in aux:
    print(x)