# Create database of mutations

In [1]:
import os

import bgreference
import pandas as pd

#### Input files

In [2]:
boostdm_dir = './../boostDM/boostDM-intogen-prediction/'
intogen_drivers_f = './../intogen/IntOGen-Drivers-20200201_Compendium_Cancer_Genes.tsv'
intogen_negative_f = './../intogen/intOGen-20191022_Negative_gene_set.tsv'
cgi_biomarkers_f = './../CGI/selected_biomarkers.csv'

#### Output file

In [3]:
output_file = './mutations_db.tsv'

In [32]:
# FIXME update names
header = [
    'cancer_type',
    'mutation_id',
    'gene', 
    'aa_change', 
    'dna_change', 
    'driver_passenger', 
    'og_tsg', 
    'targeted_therapy'
]

#### Other input data

In [5]:
cancer_types = {
    'skin': ['CM', 'SCCC', 'SBCC'], 
    'lung': ['NSCLC', 'LUSC', 'LUAD', 'SCLC']
}

In [6]:
cancer_types = dict([(ct_specific, ct_general) for ct_general, v in cancer_types.items() for ct_specific in list(v)])

In [7]:
cancer_types

{'CM': 'skin',
 'SCCC': 'skin',
 'SBCC': 'skin',
 'NSCLC': 'lung',
 'LUSC': 'lung',
 'LUAD': 'lung',
 'SCLC': 'lung'}

## Mutations DB

In [63]:
# Object to load
mutations_info = dict()
mutations_info['skin'] = dict()
mutations_info['lung'] = dict()

1) Load CGI biomarkers

In [64]:
biomarkers_df = pd.read_csv(cgi_biomarkers_f, sep='\t', header=0)

In [65]:
biomarkers_df['cancer_type_general'] = biomarkers_df.apply(lambda x: cancer_types.get(x['Primary Tumor type'], 'NAN'), axis=1)

In [66]:
biomarkers_df.head()

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,cancer_type_general
0,ALK_F1174L,ALK,F1174L,,LUAD,novel ALK inhibitors,lung
1,ALK_E1408V,ALK,E1408V,Brigatinib,LUAD,Pan-TK inhibitor,lung
2,ALK_L1196M,ALK,L1196M,Brigatinib,LUAD,Pan-TK inhibitor,lung
3,ALK_L1196M,ALK,L1196M,Ceritinib,LUAD,ALK inhibitor,lung
4,ALK_S1206Y,ALK,S1206Y,Ceritinib,LUAD,ALK inhibitor,lung


2) Get driver mutations from boostDM

In [67]:
# Identify cohorts with the cancer_types of interest
for file in os.scandir(boostdm_dir): 
    if file.name.split('.')[1] in set(cancer_types.keys()): 
        
        # Cancer type in data 
        cancer_type_specific = file.name.split('.')[1]
        cancer_type_general = cancer_types[cancer_type_specific]
        
        # Read df
        df = pd.read_csv(file.path, sep='\t', header=0)
        
        # Subset driver mutations
        df_drivers = df.loc[df['boostDM_class'] == True].copy()
        
        # Save information
        for _, row in df_drivers.iterrows(): 
            # Mutation id (BRAF_V600E)
            mutation_id = f"{row['gene']}_{row['aachange']}"
            mutations_info[cancer_type_general][mutation_id] = {}
            # Gene
            mutations_info[cancer_type_general][mutation_id]['gene'] = row['gene']
            # Cancer type
            mutations_info[cancer_type_general][mutation_id]['cancer_type_specific'] = cancer_type_specific
            mutations_info[cancer_type_general][mutation_id]['cancer_type_general'] = cancer_type_general
            # AA change
            mutations_info[cancer_type_general][mutation_id]['aachange'] = row['aachange']
            # Nucleotide change
            chrom = row['chr']
            pos = row['pos']
            ref = bgreference.refseq('hg38', chrom, pos, 1)
            alt = row['alt']
            mutations_info[cancer_type_general][mutation_id]['nuchange'] = f'{chrom}:{pos}_{ref}>{alt}'
            # Driver or passenger
            mutations_info[cancer_type_general][mutation_id]['driver_passenger'] = 'driver'
            # Oncogene or tumor suppresor gene
            if row['role_Act'] == 1: 
                role = 'og'
            elif row['role_LoF'] == 1: 
                role = 'tsg'
            else:
                role = 'unknown'
            mutations_info[cancer_type_general][mutation_id]['og_tsg'] = role
            # Targeted therapy
            biomarker_info_df = biomarkers_df.loc[
                (biomarkers_df['Biomarker'] == mutation_id) & (biomarkers_df['cancer_type_general'] == cancer_type_general)
            ]
            if len(biomarker_info_df) > 0: 
                mutations_info[cancer_type_general][mutation_id]['targeted_therapy'] = biomarker_info_df['Drug'].iloc[0]
#                 if biomarker_info_df['Drug'].iloc[0] == biomarker_info_df['Drug'].iloc[0]: 
#                     print(mutations_info[mutation_id])
                
            else: 
                mutations_info[cancer_type_general][mutation_id]['targeted_therapy'] = 'None'    

Comments: 
    
- BRD7_Q589* is truncating? Then biomarkers add also nonsense mutations?
- Need to remove nan aachange: RPS6KA3_nan
- Some drugs have more than one therapy per cancer type
- Add LUSC, NSCLC in biomarkers
- Add drug approved or not (binary info, like approved or under research?)
- Add example of drug name in those cases where Inhibitor type is present but drug is missing

In [69]:
len(mutations_info['lung'])

5334

In [70]:
len(mutations_info['skin'])

3642