# Create database of mutations

In [8]:
import os
from collections import defaultdict

import bgreference
import pandas as pd

#### Input files

In [2]:
# Directory scanned for boostDM prediction tables (tab-separated, one file per cohort)
boostdm_dir = './../boostDM/boostDM-intogen-prediction/'
# IntOGen compendium of cancer genes (TSV)
intogen_drivers_f = './../intogen/IntOGen-Drivers-20200201_Compendium_Cancer_Genes.tsv'
# IntOGen negative gene set; read below as a tab-separated table without a header row
intogen_negative_f = './../intogen/intOGen-20191022_Negative_gene_set.tsv'
# CGI biomarkers table; read below as tab-separated despite the .csv extension
cgi_biomarkers_f = './../CGI/selected_biomarkers.csv'

#### Output file

In [3]:
# Destination of the mutations table; presumably written with the columns in `header` — TODO confirm
output_file = './mutations_db.tsv'

In [32]:
# FIXME update names
# Column names for the mutations table.
# NOTE(review): several names differ from the keys stored in `mutations_info`
# ('aachange', 'nuchange', 'cancer_type_general', 'cancer_type_specific');
# reconcile them before using this list to select or order fields.
header = [
    'cancer_type', 'mutation_id', 'gene',
    'aa_change', 'dna_change',
    'driver_passenger', 'og_tsg', 'targeted_therapy',
]

#### Other input data

In [4]:
# General cancer type -> list of specific cancer type codes that belong to it
cancer_types = dict(
    skin=['CM', 'SCCC', 'SBCC'],
    lung=['NSCLC', 'LUSC', 'LUAD', 'SCLC'],
)

In [5]:
# Invert the mapping to: specific cancer type code -> general cancer type.
# The name `cancer_types` is rebound here, so guard against re-running this
# cell on its own: once inverted, the values are strings, and iterating them
# again would map single characters instead of cancer type codes.
if not any(isinstance(value, str) for value in cancer_types.values()):
    cancer_types = {
        ct_specific: ct_general
        for ct_general, ct_specifics in cancer_types.items()
        for ct_specific in ct_specifics
    }

In [6]:
cancer_types

{'CM': 'skin',
 'SCCC': 'skin',
 'SBCC': 'skin',
 'NSCLC': 'lung',
 'LUSC': 'lung',
 'LUAD': 'lung',
 'SCLC': 'lung'}

## Mutations DB

In [57]:
# Object to load
# Nested mapping filled below as
#   mutations_info[cancer_type_general][mutation_id][field] = value
# Both outer levels are defaultdicts, so merely indexing a missing key creates it.
mutations_info = defaultdict(lambda: defaultdict(dict))

#### 1) Load CGI biomarkers

In [58]:
# Load the CGI biomarkers table (first row is the header).
# NOTE(review): the file has a .csv extension but is parsed as tab-separated — confirm.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, so the file
# presumably carries a saved index column — confirm whether it should be dropped.
biomarkers_df = pd.read_csv(cgi_biomarkers_f, sep='\t', header=0)

In [59]:
# Map each biomarker's specific tumor type to its general cancer type.
# Tumor types outside `cancer_types` (including missing values) get the
# placeholder string 'NAN'.
# A column-wise map is used instead of a row-wise apply: it only needs this
# one column and still returns a Series when the frame has no rows.
biomarkers_df['cancer_type_general'] = biomarkers_df['Primary Tumor type'].map(
    lambda tumor_type: cancer_types.get(tumor_type, 'NAN')
)

In [60]:
biomarkers_df.head()

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,cancer_type_general
0,ALK_F1174L,ALK,F1174L,,LUAD,novel ALK inhibitors,lung
1,ALK_E1408V,ALK,E1408V,Brigatinib,LUAD,Pan-TK inhibitor,lung
2,ALK_L1196M,ALK,L1196M,Brigatinib,LUAD,Pan-TK inhibitor,lung
3,ALK_L1196M,ALK,L1196M,Ceritinib,LUAD,ALK inhibitor,lung
4,ALK_S1206Y,ALK,S1206Y,Ceritinib,LUAD,ALK inhibitor,lung


#### 2) Get driver mutations from boostDM

Comments: 
    
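- Only missense mutations are kept (truncating mutations are skipped)
- Only driver mutations with a boostDM score at or above the cohort median are kept
- Mutations with AA change not available (nan) are skipped
- Biomarkers with unknown drug are labeled as therapy 'None'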

In [68]:
def _targeted_therapy(biomarkers, mutation_id, cancer_type_general):
    """Return the drug annotated for a mutation in a general cancer type.

    Parameters
    ----------
    biomarkers : pandas.DataFrame
        Biomarkers table with 'Biomarker', 'cancer_type_general' and 'Drug' columns.
    mutation_id : str
        Mutation identifier in the '<gene>_<aachange>' form (e.g. BRAF_V600E).
    cancer_type_general : str
        General cancer type the biomarker must be annotated for.

    Returns
    -------
    The 'Drug' value of the first matching row, or the string 'None' when the
    mutation is not a biomarker in that cancer type or its drug is missing.
    """
    drugs = biomarkers.loc[
        (biomarkers['Biomarker'] == mutation_id)
        & (biomarkers['cancer_type_general'] == cancer_type_general),
        'Drug',
    ]
    # NOTE(review): a biomarker can have several rows (one per drug); only the
    # first row is used. Confirm whether all drugs should be reported.
    if drugs.empty or pd.isna(drugs.iloc[0]):
        return 'None'
    return drugs.iloc[0]


# Identify cohorts with the cancer_types of interest
for file in os.scandir(boostdm_dir):

    # The specific cancer type is taken from the second dot-separated field of
    # the file name; skip directories and names without that field or with a
    # cancer type that is not of interest
    name_fields = file.name.split('.')
    if not file.is_file() or len(name_fields) < 2 or name_fields[1] not in cancer_types:
        continue

    # Cancer type in data
    cancer_type_specific = name_fields[1]
    cancer_type_general = cancer_types[cancer_type_specific]

    # Read df
    df = pd.read_csv(file.path, sep='\t', header=0)

    # Subset driver mutations
    df_drivers = df.loc[df['boostDM_class'] == True].copy()
    median_boostdm_score = df_drivers['boostDM_score'].median()

    # Save information
    for _, row in df_drivers.iterrows():

        # Keep only missense mutations with a boostDM score at or above the
        # median of the cohort's drivers (higher driver evidence)
        if not (row['csqn_type_missense'] and row['boostDM_score'] >= median_boostdm_score):
            continue

        # Skip mutations whose AA change is not available
        if pd.isna(row['aachange']):
            continue

        # Mutation id (BRAF_V600E)
        mutation_id = f"{row['gene']}_{row['aachange']}"

        # Nucleotide change; the reference base is looked up in hg38
        chrom = row['chr']
        pos = row['pos']
        ref = bgreference.refseq('hg38', chrom, pos, 1)
        alt = row['alt']

        # Oncogene or tumor suppressor gene
        if row['role_Act'] == 1:
            role = 'og'
        elif row['role_LoF'] == 1:
            role = 'tsg'
        else:
            role = 'unknown'

        # A mutation seen in several cohorts of the same general cancer type
        # shares one record; the fields below are overwritten by the cohort
        # processed last
        mutation_record = mutations_info[cancer_type_general][mutation_id]
        mutation_record['gene'] = row['gene']
        mutation_record['cancer_type_specific'] = cancer_type_specific
        mutation_record['cancer_type_general'] = cancer_type_general
        mutation_record['aachange'] = row['aachange']
        mutation_record['nuchange'] = f'{chrom}:{pos}_{ref}>{alt}'
        mutation_record['driver_passenger'] = 'driver'
        mutation_record['og_tsg'] = role

        # Targeted therapy: always set, 'None' when the mutation is not a
        # biomarker in this cancer type or the biomarker has no known drug
        mutation_record['targeted_therapy'] = _targeted_therapy(
            biomarkers_df, mutation_id, cancer_type_general
        )
        # Drug is approved or not
        # TODO add

In [69]:
len(mutations_info['lung']) # NOTE(review): previously annotated as 5334, but the saved output is 529 — confirm which run is current

529

In [70]:
len(mutations_info['skin']) # NOTE(review): previously annotated as 3642, but the saved output is 515 — confirm which run is current

515

#### 3) Get passenger mutations from negative genes in intOGen

Comments: 
    
- 

In [84]:
# Load the intOGen negative gene set as a tab-separated table.
# header=None: the first line is treated as data and columns get integer labels.
negatives_df = pd.read_csv(intogen_negative_f, sep='\t', header=None)