# Create database of mutations

In [1]:
import os
from collections import defaultdict

import bgreference
import pandas as pd

#### Input files

In [2]:
boostdm_dir = './../boostDM/boostDM-intogen-prediction/'
intogen_drivers_f = './../intogen/IntOGen-Drivers-20200201_Compendium_Cancer_Genes.tsv'
intogen_negative_f = './../intogen/intOGen-20191022_Negative_gene_set.tsv'
cgi_biomarkers_f = './../CGI/selected_biomarkers.tsv'

#### Output file

In [8]:
output_file = './mutations_db.tsv'

In [9]:
# FIXME update names
header = [
    'cancer_type',
    'mutation_id',
    'gene', 
    'aa_change', 
    'dna_change', 
    'driver_passenger', 
    'og_tsg', 
    'targeted_therapy'
]

#### Other input data

In [10]:
cancer_types = {
    'skin': ['CM', 'SCCC', 'SBCC'], 
    'lung': ['NSCLC', 'LUSC', 'LUAD', 'SCLC']
}

In [11]:
cancer_types = dict([(ct_specific, ct_general) for ct_general, v in cancer_types.items() for ct_specific in list(v)])

In [8]:
cancer_types

{'CM': 'skin',
 'SCCC': 'skin',
 'SBCC': 'skin',
 'NSCLC': 'lung',
 'LUSC': 'lung',
 'LUAD': 'lung',
 'SCLC': 'lung'}

## Mutations DB

In [12]:
# Object to load
mutations_info = defaultdict(lambda: defaultdict(dict))

#### 1) Load CGI biomarkers

In [13]:
biomarkers_df = pd.read_csv(cgi_biomarkers_f, sep='\t', header=0)

In [14]:
biomarkers_df['cancer_type_general'] = biomarkers_df.apply(lambda x: cancer_types.get(x['Primary Tumor type'], 'NAN'), axis=1)

In [15]:
biomarkers_df.head()

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved,cancer_type_general
0,ALK_E1408V,ALK,E1408V,Brigatinib,LUAD,Pan-TK inhibitor,False,lung
1,ALK_L1196M,ALK,L1196M,Brigatinib,LUAD,Pan-TK inhibitor,False,lung
2,ALK_S1206Y,ALK,S1206Y,Ceritinib,LUAD,ALK inhibitor,False,lung
3,ALK_G1269A,ALK,G1269A,Ceritinib,LUAD,ALK inhibitor,False,lung
4,ALK_I1171T,ALK,I1171T,Ceritinib,LUAD,ALK inhibitor,False,lung


#### 2) Get driver mutations from boostDM

Comments: 
    
- Truncating mutations are skipped
- Mutations with AA change not available (nan) are skipped
- Biomarkers with unkwown drug are labeled as therapy 'None'

In [16]:
# Identify cohorts with the cancer_types of interest
for file in os.scandir(boostdm_dir): 
    if file.name.split('.')[1] in set(cancer_types.keys()): 
        
        # Cancer type in data 
        cancer_type_specific = file.name.split('.')[1]
        cancer_type_general = cancer_types[cancer_type_specific]
        
        # Read df
        df = pd.read_csv(file.path, sep='\t', header=0)
        
        # Subset driver mutations
        df_drivers = df.loc[df['boostDM_class'] == True].copy()
        median_boostdm_score = df_drivers['boostDM_score'].describe()['50%']
        
        # Save information
        for _, row in df_drivers.iterrows(): 
            
            # If missense mutation 
            # If BoostDM score above the median (higher driver evidence)
            if row['csqn_type_missense'] and row['boostDM_score'] >= median_boostdm_score: 
                
                # If AA change is not nan
                if row['aachange'] == row['aachange']: 
                                                                      
                    # Mutation id (BRAF_V600E)
                    mutation_id = f"{row['gene']}_{row['aachange']}"
                    
                    # Gene
                    mutations_info[cancer_type_general][mutation_id]['gene'] = row['gene']
                    
                    # Cancer type
                    mutations_info[cancer_type_general][mutation_id]['cancer_type_specific'] = cancer_type_specific
                    mutations_info[cancer_type_general][mutation_id]['cancer_type_general'] = cancer_type_general
                    
                    # AA change
                    mutations_info[cancer_type_general][mutation_id]['aachange'] = row['aachange']
                   
                    # Nucleotide change
                    chrom = row['chr']
                    pos = row['pos']
                    ref = bgreference.refseq('hg38', chrom, pos, 1)
                    alt = row['alt']
                    mutations_info[cancer_type_general][mutation_id]['nuchange'] = f'{chrom}:{pos}_{ref}>{alt}'
                    
                    # Driver or passenger
                    mutations_info[cancer_type_general][mutation_id]['driver_passenger'] = 'driver'
                    
                    # Oncogene or tumor suppresor gene
                    if row['role_Act'] == 1: 
                        role = 'og'
                    elif row['role_LoF'] == 1: 
                        role = 'tsg'
                    else:
                        role = 'unknown'
                    mutations_info[cancer_type_general][mutation_id]['og_tsg'] = role
                    
                    # Targeted therapy
                    # Select mutation in the cancer type
                    biomarker_info_df = biomarkers_df.loc[
                        (biomarkers_df['Biomarker'] == mutation_id) & 
                        (biomarkers_df['cancer_type_general'] == cancer_type_general)
                    ]
                    # If mutation is biomarker
                    if len(biomarker_info_df) > 0: 
                        # If the drug field (should be unique) is not nan
                        if biomarker_info_df['Drug'].iloc[0] == biomarker_info_df['Drug'].iloc[0]: 
                            mutations_info[cancer_type_general][mutation_id]['targeted_therapy'] = biomarker_info_df['Drug'].iloc[0]
                    else: 
                        mutations_info[cancer_type_general][mutation_id]['targeted_therapy'] = 'None'
                    # Drug is approved or not
                    # TODO add

In [33]:
mutations_info

defaultdict(<function __main__.<lambda>>,
            {'skin': defaultdict(dict,
                         {'CTNNB1_S45Y': {'gene': 'CTNNB1',
                           'cancer_type_specific': 'CM',
                           'cancer_type_general': 'skin',
                           'aachange': 'S45Y',
                           'nuchange': '3:41224646_C>A',
                           'driver_passenger': 'driver',
                           'og_tsg': 'og',
                           'targeted_therapy': 'None'},
                          'PIK3CA_E545K': {'gene': 'PIK3CA',
                           'cancer_type_specific': 'CM',
                           'cancer_type_general': 'skin',
                           'aachange': 'E545K',
                           'nuchange': '3:179218303_G>A',
                           'driver_passenger': 'driver',
                           'og_tsg': 'og',
                           'targeted_therapy': 'None'},
                          'FAM135B_G1169E': {'

In [18]:
len(mutations_info['lung']) # 5334

529

In [19]:
len(mutations_info['skin']) #3642

515

#### 3) Get passenger mutations from negative genes in intOGen

Comments: 
    
- 

In [20]:
negatives_df = pd.read_csv(intogen_negative_f, sep='\t', header=None)
negatives_df

Unnamed: 0,0,1
0,ACC,"OR2F1,CNGB3,TNFSF18,IQCF3,USP50,SNORD114-2,LOC..."
1,BLCA,"OR2F1,VGLL2,IQCF3,USP50,SNORD114-2,C1orf141,HE..."
2,BRCA,"OR2F1,CNGB3,IL13,VGLL2,IQCF3,USP50,SNORD114-2,..."
3,CESC,"OR2F1,VGLL2,IQCF3,USP50,SNORD114-2,C1orf141,HE..."
4,CHOL,"OR2F1,LINGO2,CNGB3,IL13,VGLL2,IQCF3,USP50,SNOR..."
5,COAD,"OR2F1,VGLL2,IQCF3,USP50,SNORD114-2,C1orf141,SC..."
6,COADREAD,"OR2F1,VGLL2,IQCF3,USP50,SNORD114-2,C1orf141,HE..."
7,DLBC,"OR2F1,LINGO2,CNGB3,KCNG3,VGLL2,IQCF3,USP50,SNO..."
8,ESCA,"OR2F1,IL13,VGLL2,IQCF3,USP50,SNORD114-2,C1orf1..."
9,GBM,"OR2F1,CNGB3,IQCF3,USP50,SNORD114-2,SCGB2A2,TTT..."


### Save mutations_info dict

In [31]:
type(mutations_info)

collections.defaultdict

In [28]:
mutations_info_df = pd.DataFrame(mutations_info)
mutations_info_df

Unnamed: 0,skin,lung
CTNNB1_S45Y,"{'gene': 'CTNNB1', 'cancer_type_specific': 'CM...",
PIK3CA_E545K,"{'gene': 'PIK3CA', 'cancer_type_specific': 'CM...","{'gene': 'PIK3CA', 'cancer_type_specific': 'LU..."
FAM135B_G1169E,"{'gene': 'FAM135B', 'cancer_type_specific': 'S...",
GNAQ_Q209P,"{'gene': 'GNAQ', 'cancer_type_specific': 'CM',...",
SF3B1_R625H,"{'gene': 'SF3B1', 'cancer_type_specific': 'CM'...",
...,...,...
TP53_E258Q,,"{'gene': 'TP53', 'cancer_type_specific': 'LUAD..."
EGFR_I759N,,"{'gene': 'EGFR', 'cancer_type_specific': 'LUAD..."
EGFR_L833F,,"{'gene': 'EGFR', 'cancer_type_specific': 'LUAD..."
EGFR_E866K,,"{'gene': 'EGFR', 'cancer_type_specific': 'LUAD..."


In [29]:
dict(mutations_info_df)

{'skin': CTNNB1_S45Y       {'gene': 'CTNNB1', 'cancer_type_specific': 'CM...
 PIK3CA_E545K      {'gene': 'PIK3CA', 'cancer_type_specific': 'CM...
 FAM135B_G1169E    {'gene': 'FAM135B', 'cancer_type_specific': 'S...
 GNAQ_Q209P        {'gene': 'GNAQ', 'cancer_type_specific': 'CM',...
 SF3B1_R625H       {'gene': 'SF3B1', 'cancer_type_specific': 'CM'...
                                         ...                        
 TP53_E258Q                                                      NaN
 EGFR_I759N                                                      NaN
 EGFR_L833F                                                      NaN
 EGFR_E866K                                                      NaN
 EGFR_K754E                                                      NaN
 Name: skin, Length: 987, dtype: object,
 'lung': CTNNB1_S45Y                                                     NaN
 PIK3CA_E545K      {'gene': 'PIK3CA', 'cancer_type_specific': 'LU...
 FAM135B_G1169E                               

In [21]:
import pickle

In [24]:
with open('mutations_info.dict','wb')as f:
    pickle.dump(mutations_info, f)

PicklingError: Can't pickle <function <lambda> at 0x7f9943697b90>: attribute lookup <lambda> on __main__ failed