In [1]:
import pandas as pd
import numpy as np
import re

def _load_id_mapping(path, target_column, proteins):
    """Read a tab-separated From/To mapping file and key it by id_protein.

    `From` is renamed to `uniprot_acc` and `To` to `target_column`; the table
    is then merged with `proteins` (pandas default: inner join on the columns
    both frames share) and the `uniprot_acc` column is dropped.
    """
    mapping = pd.read_csv(path, sep="\t")
    mapping = mapping.rename(columns={'From': 'uniprot_acc', 'To': target_column})
    mapping = mapping.merge(proteins[['id_protein', 'uniprot_acc']])
    return mapping.drop(columns=['uniprot_acc'])


# Protein table of the database: only the internal id and the UniProt accession are needed.
p_prot = pd.read_csv("../db_tables/protein.tsv", sep="\t")[['id_protein', 'uniprot_acc']]

# id_protein <-> Ensembl transcript id
protein_enst = _load_id_mapping("../raw_data/llps_uniprot2ENST.tab.txt", 'ensembl_enst', p_prot)

# id_protein <-> Ensembl protein id
protein_ensp = _load_id_mapping("../raw_data/llps_uniprot2ENSP.tab.txt", 'ensembl_ensp', p_prot)

# COSMIC v94 input files.
datafile = "../raw_data/cosmic_v94/CosmicMutantExport_sel.tsv"
# For each sample_id: the cancer-type columns and the cosmic_phenotype_id.
samples_data = "../raw_data/cosmic_v94/CosmicSample.tsv"
# For each cosmic_phenotype_id: the NCI_CODE and other codes such as EFO, MONDO, ...
class_data = "../raw_data/cosmic_v94/classification.csv"

In [2]:
def _pubmed_to_text(value):
    """Return a numeric PubMed id as a string of digits; NaN is returned unchanged."""
    if np.isnan(value):
        return value
    return str(int(value))


mutation = pd.read_csv(datafile, sep="\t")
# Turn the numeric pubmed ids into strings (e.g. 12345.0 -> '12345'), keeping NaN as is.
mutation['pubmed'] = mutation['pubmed'].map(_pubmed_to_text)
print(mutation.columns.tolist())
print(mutation.head())
print(f'cosmic rows {len(mutation)}')

['id_sample', 'GENOMIC_MUTATION_ID', 'LEGACY_MUTATION_ID', 'MUTATION_ID', 'CDS', 'AA', 'consequence', 'genomic', 'somatic_status', 'pubmed', 'ensembl_ensp', 'ensembl_enst']
   id_sample GENOMIC_MUTATION_ID LEGACY_MUTATION_ID  MUTATION_ID  \
0    2198328        COSV57322711        COSM1628357     98293492   
1    2869867        COSV57321510        COSM1706236     98292376   
2    2549254        COSV57321745        COSM5796874     98292486   
3    2749203        COSV57324149        COSM7003726     98292523   
4    2839416       COSV104562760        COSM9463514     98293844   

                 CDS       AA              consequence  \
0           c.609G>T  p.M203I  Substitution - Missense   
1           c.781G>A  p.E261K  Substitution - Missense   
2           c.385G>C  p.G129R  Substitution - Missense   
3            c.20G>C    p.G7A  Substitution - Missense   
4  c.559_560delinsAA  p.G187K  Substitution - Missense   

                  genomic             somatic_status    pubmed  \
0  

In [3]:
# Keep only the mutations that belong to proteins of our LLPS dataset.

# Tag every input row so rows matched in the first pass can be excluded from the second.
mutation['internalid'] = range(1, len(mutation) + 1)

# Match in order: first on 'ensembl_enst', then on 'ensembl_ensp' for the
# remaining rows, so that a row is not duplicated by matching both ways.
protein_enst = mutation.merge(protein_enst)

# Report transcripts that are linked to more than one id_protein.
y = protein_enst[['ensembl_enst', 'id_protein']].drop_duplicates()
dd = y.groupby(['ensembl_enst']).size().reset_index(name='counts')
print(dd.loc[dd['counts'] > 1])
# This is an unusual case: the transcript sequence is the union of two UniProt
# entries, P35544 + P62861 (id_protein 4340 and 4342).

# Second pass: only the rows that were not matched by transcript id.
mapped_internal = protein_enst['internalid'].unique().tolist()
mutation = mutation.loc[~mutation['internalid'].isin(mapped_internal)]
protein_ensp = mutation.merge(protein_ensp)

# Same multiplicity report for the Ensembl protein ids.
y = protein_ensp[['ensembl_ensp', 'id_protein']].drop_duplicates()
dd = y.groupby(['ensembl_ensp']).size().reset_index(name='counts')
print(dd.loc[dd['counts'] > 1])

print(f'rows mapped by ENSP_name {len(protein_ensp)}')
print(f'rows mapped by ENST_name {len(protein_enst)}')

# Stack both passes and remove exact duplicate rows.
mutation = pd.concat([protein_enst, protein_ensp], ignore_index=True).drop_duplicates()
print(f'cosmic without duplicates {len(mutation)}')
print(f'cosmic proteins {len(mutation["id_protein"].unique())}')


         ensembl_enst  counts
3692  ENST00000529639       2
Empty DataFrame
Columns: [ensembl_ensp, counts]
Index: []
rows mapped by ENSP_name 0
rows mapped by ENST_name 1724388
cosmic without duplicates 1724388
cosmic proteins 4018


In [4]:
# Check whether a protein has multiple transcripts, so they can be removed.
# A LEGACY_MUTATION_ID groups all the mutated transcripts that correspond to
# the same position.
y = mutation[['LEGACY_MUTATION_ID', 'id_protein']].drop_duplicates()
print(len(y))
mut_trans = y.groupby(['LEGACY_MUTATION_ID']).size().reset_index(name='counts')
mut_trans = mut_trans.loc[mut_trans['counts'] > 1]
print(mut_trans)

# A mutation id can have several id_protein values because the associated
# UniProt entries are different splice variants.
y = mutation[['LEGACY_MUTATION_ID', 'id_protein', 'ensembl_enst']].drop_duplicates()
mut_trans = y.groupby(['LEGACY_MUTATION_ID', 'id_protein']).size().reset_index(name='counts')
mut_trans = mut_trans.loc[mut_trans['counts'] > 1]
print(mut_trans)
# An empty result means: only one transcript per protein id.


1144756
        LEGACY_MUTATION_ID  counts
8381           COSM1049194       2
8382           COSM1049196       2
33704          COSM1206683       2
59817          COSM1355890       2
85748           COSM160375       2
...                    ...     ...
1063915        COSM9462306       2
1071035        COSM9508187       2
1084998        COSM9589314       2
1090340        COSM9621560       2
1099092         COSM967422       2

[81 rows x 2 columns]
Empty DataFrame
Columns: [LEGACY_MUTATION_ID, id_protein, counts]
Index: []


In [5]:
# Check whether (mutation id, id_protein) is unique in the table. If so, keep
# one table with just the mutation id and the mutation data, and a second
# table with the per-sample fields ("ID_sample", "GENOMIC_MUTATION_ID",
# "LEGACY_MUTATION_ID", "MUTATION_ID", "Mutation somatic status", "Pubmed_PMID").

# Sample-level table: one row per distinct combination of these columns.
sample_columns = ["id_sample", "LEGACY_MUTATION_ID", 'id_protein', "somatic_status", "pubmed"]
samples_mut = mutation[sample_columns].drop_duplicates()
print(f'sample-mutation {len(samples_mut)}')
#samples_mut.to_csv('samples_mut.tsv', sep="\t", index= False)

# Mutation-level table: "LEGACY_MUTATION_ID" + 'id_protein' is the mutation id.
mutation_columns = ["LEGACY_MUTATION_ID", 'id_protein', 'CDS', 'AA', 'consequence', 'genomic']
mutation = mutation[mutation_columns].drop_duplicates()
print(f'mutation-idprot-transcr {len(mutation)}')

sample-mutation 1723613
mutation-idprot-transcr 1144756


In [6]:
# Split 'genomic' (formatted as x:y-z) into chromosome, start_genomic and end_genomic.
# str.extract returns one column per capture group and NaN for rows whose
# 'genomic' value is missing or does not match, so such rows no longer make
# the cell fail with a TypeError (the previous tuple indexing did).
genomic_parts = mutation['genomic'].str.extract(r'^(.+):(.+)-(.+)$')
mutation['chromosome'] = genomic_parts[0]
mutation['start_genomic'] = genomic_parts[1]
mutation['end_genomic'] = genomic_parts[2]
mutation = mutation.drop(columns=['genomic'])
print(mutation['chromosome'].unique().tolist())

['10', '2', '15', '11', '19', '7', '17', '1', '3', '12', '14', '5', '9', '4', '16', '6', '22', '21', '18', '8', '20', '23', '13', '24', '25']


In [7]:
print(mutation.shape[0])
mutation = mutation.rename(columns={'consequence': 'consequence_cosmic'})
# Row tag used later to track which mutations have already been classified.
mutation['indexinternal'] = range(1, len(mutation) + 1)

# Remove the leading 'p.' from AA. Values that are missing or do not start
# with 'p.' become NaN and are dropped. The .copy() makes the filtered frame
# independent, so the column assignment below does not write into a slice.
mutation['AA'] = mutation['AA'].str.extract(r'^p\.(.*)$', expand=False)
mutation = mutation[mutation['AA'].notnull()].copy()

# Same treatment for the leading 'c.' of CDS.
mutation['CDS'] = mutation['CDS'].str.extract(r'^c\.(.*)$', expand=False)
mutation = mutation[mutation['CDS'].notnull()].copy()

print(mutation.shape[0])
print(mutation)



1144756
1144756
        LEGACY_MUTATION_ID  id_protein  \
0               COSM166059        1011   
2              COSM6982134        1011   
3               COSM428012        1011   
5               COSM427996        1011   
6              COSM5703736        1011   
...                    ...         ...   
1724382        COSM7772448        1844   
1724383        COSM4622984        1844   
1724384        COSM9330544        1844   
1724386        COSM8832530        1844   
1724387        COSM3944131        1844   

                                                       CDS  \
0                                                  1224dup   
2        661_662delinsCCTGCCTGTTGTGAGCTGCTCTACGTGCCCTAC...   
3                                             1302_1303dup   
5                                                  1203dup   
6                                                   633del   
...                                                    ...   
1724382                                      

In [8]:
# Look at the COSMIC consequence labels: the list of distinct values, then a
# few example rows for the missing label and for each named label.
cosmic_label = mutation['consequence_cosmic']
print(cosmic_label.unique().tolist())
print(mutation[cosmic_label.isnull()].head())
for label in (
    'Insertion - Frameshift',
    'Deletion - Frameshift',
    'Substitution - coding silent',
    'Substitution - Missense',
    'Deletion - In frame',
    'Substitution - Nonsense',
    'Complex - frameshift',
    'Insertion - In frame',
    'Nonstop extension',
    'Complex - deletion inframe',
    'Frameshift',
    'Complex - insertion inframe',
):
    print(mutation[cosmic_label == label].head())

['Insertion - Frameshift', nan, 'Deletion - Frameshift', 'Substitution - coding silent', 'Substitution - Missense', 'Deletion - In frame', 'Substitution - Nonsense', 'Complex - frameshift', 'Insertion - In frame', 'Nonstop extension', 'Complex - deletion inframe', 'Frameshift', 'Complex - insertion inframe']
     LEGACY_MUTATION_ID  id_protein  \
2           COSM6982134        1011   
1566        COSM4603688        1011   
1920        COSM4856816         243   
3817        COSM5351343        2188   
4192        COSM5816736        1865   

                                                    CDS  \
2     661_662delinsCCTGCCTGTTGTGAGCTGCTCTACGTGCCCTAC...   
1566                                929_940delinsTCCGAC   
1920                                             689G>A   
3817                                            1430G>A   
4192                                            2139G>A   

                           AA consequence_cosmic chromosome start_genomic  \
2     T221delinsPACCELL

       LEGACY_MUTATION_ID  id_protein   CDS     AA consequence_cosmic  \
146151        COSM8367386        2056  3G>T    M1?         Frameshift   
149442         COSM333041        1635  1A>G  M1_?4         Frameshift   
149518        COSM5679162        1635  3G>A  M1_?4         Frameshift   
149521         COSM320833        1635  3G>T  M1_?4         Frameshift   
208646        COSM6498374        1873  3G>A    M1?         Frameshift   

       chromosome start_genomic end_genomic  indexinternal  
146151         21      17609142    17609142         109272  
149442          6      27873510    27873510         111804  
149518          6      27873508    27873508         111863  
149521          6      27873508    27873508         111866  
208646         16      70800898    70800898         156163  
       LEGACY_MUTATION_ID  id_protein                              CDS  \
307044        COSM9840441         558              849delinsACCTCCTGCC   
421307          COSM12389         572          

In [9]:
def separar_en_cols(df, column, conseq, conseq_regex, override=False):
    '''
    Select the rows of `df` whose `column` matches a mutation pattern and
    split the matched text into position / residue columns.

    Parameters
    ----------
    df : DataFrame
        Table holding the mutation strings. It is not modified.
    column : str
        Name of the column with the mutation strings (e.g. "AA").
    conseq : str
        Label written to the 'consequence' column of the result. The values
        "missense" and "nonsense" also select special handling (see below).
    conseq_regex : str
        Regular expression used to pre-select rows. With override=False it
        is placed between a generic prefix (residue(s), position, optional
        second residue(s), optional second position) and a trailing `(.*)`
        group, giving five capture groups in total. With override=True it
        is used as the complete pattern and must provide its own capture
        groups in that same order (three groups -- from, position, to --
        when conseq is "missense").
    override : bool
        Whether `conseq_regex` is the complete pattern.

    Returns
    -------
    DataFrame
        A copy of the matching rows with these added columns: 'start_aa',
        'end_aa' (equal to 'start_aa' when the string has no end position),
        'from' (the residue(s) that change), 'to' (the new residue(s);
        always "*" for "nonsense") and 'consequence'. Rows whose `column`
        is missing, or that do not match the complete pattern, are left out.
    '''
    # na=False: a missing value in `column` counts as "no match" instead of
    # producing an NA in the mask, which boolean indexing rejects.
    df_crop = df[df[column].str.contains(conseq_regex, na=False)].copy()

    if override:
        pattern = conseq_regex
    else:
        pattern = r'^([A-Z\*]+)(\d+)_?([A-Z\*]+)?(\d+)?' + conseq_regex + r'(.*)$'

    # findall gives a list of group tuples per row; .str[0] keeps the first
    # tuple, or NaN when the complete pattern did not match.
    df_crop['aux'] = df_crop[column].str.findall(pattern).str[0]

    # .copy() so the new columns are written to an independent frame.
    df_crop = df_crop[~df_crop['aux'].isnull()].copy()

    if conseq == "missense":
        df_crop['start_aa'] = df_crop['aux'].map(lambda x: str(x[1]))
        df_crop['end_aa'] = df_crop['aux'].map(lambda x: str(x[1]))  # same start and end: there is no end position
        df_crop['from'] = df_crop['aux'].map(lambda x: x[0])
        df_crop['to'] = df_crop['aux'].map(lambda x: x[2])
    else:
        # start position
        df_crop['start_aa'] = df_crop['aux'].map(lambda x: x[1])

        # end position: fall back to the start when the string has no end
        df_crop['end_aa'] = df_crop['aux'].map(lambda x: x[3] if x[3] != '' else x[1])

        # from: the residue(s) that change; both are concatenated when the
        # mutation spans a range
        df_crop['from'] = df_crop['aux'].map(lambda x: str(x[0]) + str(x[2]))

        # to: the new residue(s)
        if conseq == "nonsense":
            df_crop['to'] = "*"
        else:
            df_crop['to'] = df_crop['aux'].map(lambda x: x[4])

    # consequence of the mutation
    df_crop['consequence'] = conseq

    df_crop = df_crop.drop(columns=['aux'])

    return df_crop

In [14]:
# A DataFrame for each molecular consequence.
# The classes are extracted one after another: rows claimed by a class are
# removed from `mutation_used` before the next pattern is tried, so the order
# of the steps below matters.

mutation_used = mutation.copy()
print(f"all mutations {mutation.shape[0]}")

synonym = separar_en_cols(mutation_used, "AA", "synonym", "=$")
print(f"Found {synonym.shape[0]} synonym")

mutation_used = mutation_used[~mutation_used['indexinternal'].isin(synonym['indexinternal'])]
delins = separar_en_cols(mutation_used, "AA", "delins", "delins")
print(f"Found {delins.shape[0]} delins")

mutation_used = mutation_used[~mutation_used['indexinternal'].isin(delins['indexinternal'])]
deletions = separar_en_cols(mutation_used, "AA", "deletion", "del$") # finish with 'del'
print(f"Found {deletions.shape[0]} deletions")

mutation_used = mutation_used[~mutation_used['indexinternal'].isin(deletions['indexinternal'])]
insertions = separar_en_cols(mutation_used, "AA", "insertion", "(?<!del)ins") # Negative lookbehind search!
print(f"Found {insertions.shape[0]} insertions")

mutation_used = mutation_used[~mutation_used['indexinternal'].isin(insertions['indexinternal'])]
# The label was previously misspelled ("frameshit"); it ends up in the
# 'consequence' column, so it is written correctly here.
# expressions as 'Lys1254Terfs' are nonsense, not frameshift
frameshift = separar_en_cols(mutation_used, "AA", "frameshift", r'^([A-Z])(\d+)_?([A-Z])?(\d+)?fs\*(.*)$', override=True)
print(f"Found {frameshift.shape[0]} frameshift")

mutation_used = mutation_used[~mutation_used['indexinternal'].isin(frameshift['indexinternal'])]
nonsense = separar_en_cols(mutation_used, "AA", "nonsense", r"\*$")
print(f"Found {nonsense.shape[0]} nonsense")

mutation_used = mutation_used[~mutation_used['indexinternal'].isin(nonsense['indexinternal'])]
missense = separar_en_cols(mutation_used, "AA", "missense", r'^([A-Z])(\d+)(?!\*)([A-Z])$', override=True)
print(f"Found {missense.shape[0]} missense")

mutation_used = mutation_used[~mutation_used['indexinternal'].isin(missense['indexinternal'])]
duplications = separar_en_cols(mutation_used, "AA", "duplication", "dup")
print(f"Found {duplications.shape[0]} duplications")

# TODO: still unhandled -- the nonstop extensions (\*\d+[A-Za-z]ext*\d+)
# and the M1?, M1_?\d* forms (labelled Frameshift by COSMIC).

# Row tags of everything that was classified.
idexall = synonym['indexinternal'].tolist() + deletions['indexinternal'].tolist() + delins['indexinternal'].tolist() + duplications['indexinternal'].tolist() + frameshift['indexinternal'].tolist() + insertions['indexinternal'].tolist() + missense['indexinternal'].tolist() + nonsense['indexinternal'].tolist()
print(f"mapped mutations {len(idexall)}")

# Whatever is left after the last class could not be classified.
mutation_used = mutation_used[~mutation_used['indexinternal'].isin(duplications['indexinternal'])]
print(f"Missing mutations: {mutation_used.shape[0]}")

print(mutation_used[['AA', 'consequence_cosmic']])

# TODO: concatenate the per-class frames into a single `mutations` table with
# pd.concat and print its size. The earlier draft of this step referenced an
# undefined name `tables` and was left disabled inside a string literal.


all mutations 1144756
Found 264334 synonym
Found 1606 delins
Found 5403 deletions
Found 884 inserions
Found 38272 frameshift
Found 60124 nonsense
Found 771186 missense
Found 1031 duplications
mapped mutations 1142840
Missing mutations: 1916
                        AA      consequence_cosmic
79              *445Lfs*63  Insertion - Frameshift
148             *445Rfs*31   Deletion - Frameshift
300             *445Lfs*63  Insertion - Frameshift
371             *445Yfs*63  Insertion - Frameshift
1337     TAMG*441delext*23     Deletion - In frame
...                    ...                     ...
1723332                M1?                     NaN
1724189                M1?                     NaN
1724253        *182Yext*26       Nonstop extension
1724273        *104Yext*17       Nonstop extension
1724298                M1?                     NaN

[1916 rows x 2 columns]


'\nmutations = pd.concat(tables)\nprint(f"Total mutations: {mutations.shape[0]}")\n'