This notebook builds a single list of the NOTCH1 mutations found in the TALL cohorts (ADULT TALL AECC PROJECT, PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS), PEDIATRIC ALL (Li et al., 2019, Blood)) so that the mutations can be mapped to the exons. Afterwards, the needle plot figure can be generated with NOTCH1_needle_plot.ipynb in ../notebook_figures

This piece of code relies on a workspace directory structure such as 
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
 patientID, DxTumorID etc can be found in ../ext_files/all_cohort_clinical_groups.tsv
 
 Be aware that the maf files used to get the NOTCH1 mutations are the ones resulting from step 3 of the filtering steps. In this piece of code they are called:
 ```
 - 'Strelka_'+TumorID_vs_normalID+'_somatic_snvs_sh_checked.maf'
 - 'Strelka_'+TumorID_vs_normalID+'_somatic_indels_sh.maf'
 - 'Strelka_'+TumorID_vs_normalID+'_somatic_mnvs_sh.maf'
 ```

In [None]:
import sys, os
# Prepend the directory that contains the running Python interpreter to PATH.
# NOTE(review): presumably so that command-line tools installed next to the
# interpreter (e.g. the bedtools binary pybedtools shells out to) are found -- confirm.
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pybedtools
from aux_functions import read_vcf,get_three_subsets
from aux_data_in_pyvar import PATS_DIRS

# Display every column and every row, and never truncate cell contents, when a
# DataFrame is shown in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
try:
    # pandas >= 1.0 spells "unlimited width" as None; the old -1 sentinel is
    # rejected by pandas >= 2.0.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts an integer here and uses -1 for "unlimited".
    pd.set_option('display.max_colwidth', -1)

In [None]:
## FUNCTIONS
def map_muts(df_muts, df_gene):
    """Keep the mutations that fall inside the gene interval.

    The mutations are turned into BED-like intervals (``POS`` is used as both
    start and end) and intersected with the gene coordinates through
    pybedtools using a left outer join (``loj=True``). Only the rows whose
    joined ``start_gene`` equals the ``START`` of the first row of *df_gene*
    are returned.

    Parameters
    ----------
    df_muts : pandas.DataFrame
        Mutations with at least the columns ``#CHROM``, ``POS``, ``REF`` and
        ``ALT``. These columns are cast in place (str / int), as before.
    df_gene : pandas.DataFrame
        Gene coordinates with the columns ``#CHROM``, ``START`` and ``END``;
        also cast in place. Only its first row is used for the final filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``#CHROM``, ``POS``, ``POS2``, ``REF``, ``ALT``, ``chrom``,
        ``start_gene``, ``end_gene``. Empty when *df_muts* has no rows or no
        mutation overlaps the gene.
    """
    out_cols = ['#CHROM', 'POS', 'POS2', 'REF', 'ALT', 'chrom', 'start_gene', 'end_gene']

    # dtype normalisation is done on the caller's frames, exactly as before,
    # because callers merge on these columns afterwards
    df_muts[['#CHROM', 'REF', 'ALT']] = df_muts[['#CHROM', 'REF', 'ALT']].astype(str)
    df_muts['POS'] = df_muts['POS'].astype(int)

    df_gene['#CHROM'] = df_gene['#CHROM'].astype(str)
    df_gene[['START', 'END']] = df_gene[['START', 'END']].astype(int)

    # Nothing to intersect: skip bedtools altogether. An empty intersection
    # file cannot be parsed by pd.read_table, so this also avoids that error.
    if df_muts.empty:
        return pd.DataFrame(columns=out_cols).astype({'POS': int, 'start_gene': int})

    muts = pybedtools.BedTool.from_dataframe(df_muts[['#CHROM', 'POS', 'POS', 'REF', 'ALT']])
    gene_coords = pybedtools.BedTool.from_dataframe(df_gene[['#CHROM', 'START', 'END']])

    result = muts.intersect(gene_coords, loj=True)
    merged = pd.read_table(result.fn, names=out_cols)

    # With a left outer join every mutation is reported; those without an
    # overlap carry placeholder gene fields and are discarded here.
    # .iloc[0] selects the first gene row regardless of the index labels.
    gene_start = df_gene['START'].iloc[0]
    merged = merged[merged['start_gene'] == gene_start].copy()
    merged[['#CHROM', 'REF', 'ALT']] = merged[['#CHROM', 'REF', 'ALT']].astype(str)
    merged[['POS', 'start_gene']] = merged[['POS', 'start_gene']].astype(int)

    return merged


def filter_vep_results(df):
    """Reduce VEP output to a single NOTCH1 annotation per variant.

    For every ``#Uploaded_variation`` the duplicated rows are removed and only
    the rows with ``SYMBOL == 'NOTCH1'`` are kept. When more than one row is
    left, only the ``CANONICAL == 'YES'`` rows are kept; if that still leaves
    more than one row, the variant is set aside for manual inspection (printed,
    not returned).

    Parameters
    ----------
    df : pandas.DataFrame
        VEP annotations with at least the columns ``#Uploaded_variation``,
        ``SYMBOL`` and ``CANONICAL``.

    Returns
    -------
    pandas.DataFrame
        The accepted annotation rows with a fresh 0..n-1 index; an empty frame
        with the columns of *df* when nothing is accepted.
    """
    accepted = []
    needs_inspection = []

    for _, df_var in df.groupby("#Uploaded_variation"):
        df_var = df_var.drop_duplicates()
        df_var = df_var[df_var['SYMBOL'] == 'NOTCH1']
        if len(df_var) > 1:
            df_var = df_var[df_var['CANONICAL'] == 'YES']
            if len(df_var) > 1:
                # several canonical NOTCH1 annotations: ambiguous, report only
                needs_inspection.append(df_var)
                continue
        # NOTE: a variant with several NOTCH1 rows but no canonical one ends up
        # empty here and is therefore dropped, as in the original logic.
        if not df_var.empty:
            accepted.append(df_var)

    # Frames are collected in lists and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    if needs_inspection:
        df_inspection = pd.concat(needs_inspection, ignore_index=True, sort=False)
    else:
        df_inspection = pd.DataFrame()
    print(df_inspection)

    if not accepted:
        return df.iloc[0:0].copy()
    return pd.concat(accepted, ignore_index=True, sort=False)


def get_variant_columns(rw):
    """Split the variant id of a row into columns and flag polymorphisms.

    ``#Uploaded_variation`` is expected to be ``<chrom>_<pos>_<ref>_<alt>``;
    its four parts are stored under ``#CHROM``, ``REF``, ``ALT`` and ``POS``
    (``POS`` as int). ``snp_remove`` is set to True when ``gnomADg_AF`` parses
    as a number >= 0.01, and to False otherwise (including when the value
    cannot be parsed as a float).

    The row is modified in place and returned.
    """
    chrom, position, ref_allele, alt_allele = rw['#Uploaded_variation'].split("_")
    rw['#CHROM'] = chrom
    rw['REF'] = ref_allele
    rw['ALT'] = alt_allele
    rw['POS'] = int(position)

    # population allele frequency of 1% or more => treated as a polymorphism
    try:
        allele_freq = float(rw['gnomADg_AF'])
    except ValueError:
        is_polymorphism = False
    else:
        is_polymorphism = allele_freq >= 0.01
    rw['snp_remove'] = is_polymorphism
    return rw


def process_vep_results(path, muts):
    """Join the VEP annotation of one mutation class onto its input table.

    Reads ``notch1_<muts>_anno_vep92.tab`` (through ``read_vcf``) and
    ``notch1_<muts>.tsv`` from *path*, keeps one NOTCH1 annotation per variant
    (``filter_vep_results``), discards the variants flagged as polymorphisms
    (``get_variant_columns``) and left-joins the annotation onto the input
    rows on ``#CHROM``, ``POS``, ``REF`` and ``ALT``.

    Parameters
    ----------
    path : str
        Directory holding both files.
    muts : str
        Mutation class used in the file names ('snvs', 'indels' or 'mnvs' in
        this notebook).

    Returns
    -------
    pandas.DataFrame
        Annotated mutations. Empty when one of the files does not exist (a
        ``no <muts>`` message is printed) or when no NOTCH1 annotation is left
        after filtering.
    """
    try:
        # read annotated vep variants
        df = read_vcf(os.path.join(path, "notch1_"+muts+"_anno_vep92.tab"))
        # get canonical
        df = filter_vep_results(df)
        if df.empty:
            # Without rows, apply() below would not create 'snp_remove' and
            # the filter on it would raise KeyError; there is nothing to merge.
            print("no NOTCH1 annotation for {}".format(muts))
            return pd.DataFrame()
        # remove snps
        df = df.apply(get_variant_columns, axis=1)
        df = df[df['snp_remove'] == False]
        # read original input
        df_original = pd.read_csv(os.path.join(path, "notch1_"+muts+".tsv"), sep='\t')
        df_original[['#CHROM', 'REF', 'ALT']] = df_original[['#CHROM', 'REF', 'ALT']].astype(str)
        df_original[['POS']] = df_original[['POS']].astype(int)
        # merge
        notch1_muts = df_original.merge(df, how='left', on=['#CHROM', 'POS', 'REF', 'ALT'])
        # remove unmapped mutations: a row is dropped when ANY of these
        # annotation columns is missing, which is the case for input rows the
        # left join could not match.
        # NOTE(review): assumes read_vcf keeps VEP's '-' placeholders as strings
        # rather than NaN, otherwise annotated rows would be dropped too -- confirm.
        notch1_muts.dropna(subset=['Existing_variation', 'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS', 'SYMBOL',
           'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL', 'ENSP', 'SOURCE', 'EXON',
           'INTRON', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'CLIN_SIG',
           'SOMATIC', 'PHENO', 'gnomADg', 'gnomADg_AF', 'gnomADg_NFE'], inplace=True)
    except FileNotFoundError:
        print("no {}".format(muts))
        notch1_muts = pd.DataFrame()
    return notch1_muts

### PRE-VEP ONLY NOTCH1

In [None]:
# Clinical data
# `cohort` selects the cohort whose maf files are scanned in the PRE-VEP cells.
cohort = 'ADULT TALL AECC PROJECT'

info_clinical = pd.read_csv("../ext_files/all_cohort_clinical_groups.tsv", sep='\t')
# Patient ids of the Oshima cohort get the 'oshima_pat_' prefix.
info_clinical['PATIENT'] = info_clinical.apply(lambda x: 'oshima_pat_'+x['PATIENT'] if x['COHORT']=='PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)' else x['PATIENT'], axis=1)
info_clinical_cohort = info_clinical[info_clinical['COHORT'] == cohort]

# The combined table keeps the adult and Oshima cohorts, minus patient oshima_pat_28.
info_clinical = info_clinical[info_clinical['COHORT'].isin(['ADULT TALL AECC PROJECT',  'PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)'])]
info_clinical = info_clinical[info_clinical['PATIENT'] != 'oshima_pat_28']

# Go to the supplementary materials of Li et al., 2020 PMID: 31697823 and make a dataframe with the clinical information
# like the one in Additional file 2 Table S2 of the paper
# The path below is a placeholder and must be filled in before running.
info_paper_blood = pd.read_csv("", sep='\t')
info_paper_blood = info_paper_blood[info_paper_blood['TYPE'] == 'TALL'] 

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
info_clinical = pd.concat([info_clinical, info_paper_blood], ignore_index=True, sort=False)

In [None]:
# Display the distinct cohort names present in the combined clinical table.
info_clinical['COHORT'].unique()

In [None]:
# Directory where the maf files of the mutations are. Paths of adult patients
# are taken from the PATS_DIRS dictionary instead, since each batch of
# sequenced patients is in a different folder.
# Both values are placeholders and must be filled in before running.
in_dir = "" 
out_dir = "" # output directory, one per cohort

In [None]:
# NOTCH1 gene interval (chromosome, start, end) used to select the mutations.
# NOTE(review): the VEP commands in this notebook use --assembly GRCh37, so
# these are presumably GRCh37 coordinates -- confirm.
# Built directly with integer columns instead of appending a dict to an empty
# frame (DataFrame.append was removed in pandas 2.0).
notch1_gene = pd.DataFrame({
    "#CHROM": ['9'],
    "START": [139388896],
    "END": [139440314],
})

In [None]:
# Suffix of the Strelka maf that holds each class of mutation.
# NOTE(review): the notebook header lists the mnvs file as
# '..._somatic_mnvs_sh.maf'; the name used here is kept from the code -- confirm
# which one exists on disk.
maf_suffixes = {
    'snvs': '_somatic_snvs_sh_checked.maf',
    'indels': '_somatic_indels_sh.maf',
    'mnvs': '_somatic_mnvs_sh_checked.maf',
}
variant_keys = ['#CHROM', 'POS', 'REF', 'ALT']


def _notch1_muts_from_maf(maf_path, comparison, gene):
    """Return the mutations of one maf that map to *gene*, or None.

    None is returned when the file does not exist (a message is printed), when
    it has no chromosome 9 mutation or when none of them maps to the gene.
    The returned rows carry an ``ID`` of the form chrom_pos_ref_alt and the
    ``COMPARISON`` they come from.
    """
    try:
        df = pd.read_csv(maf_path, sep='\t')
    except FileNotFoundError:
        # Report and skip. Silently passing here made the loop reuse the
        # tables of the previous sample (or fail with NameError on the first).
        print("no {}".format(maf_path))
        return None

    # get muts in chrom 9
    df['#CHROM'] = df['#CHROM'].astype(str)
    df = df[df['#CHROM'] == '9'].copy()
    if df.empty:
        return None

    # map muts to the gene and recover the full maf rows of the mapped ones
    mapped = map_muts(df, gene)
    mapped = mapped[variant_keys].merge(df, how='inner', on=variant_keys)
    if mapped.empty:
        return None

    mapped['ID'] = mapped.apply(lambda x: x['#CHROM']+'_'+str(x['POS'])+'_'+x['REF']+'_'+x['ALT'], axis=1)
    mapped['COMPARISON'] = comparison
    return mapped


# One list of per-sample tables for each class of mutation; concatenated once
# at the end (DataFrame.append was removed in pandas 2.0).
notch1_frames = {mut_class: [] for mut_class in maf_suffixes}

for _, rw in info_clinical_cohort.iterrows():

    # adult patients are spread over several batch folders
    if rw['COHORT'] == "ADULT TALL AECC PROJECT":
        in_dir = PATS_DIRS[rw['PATIENT']]

    comparison_dir = os.path.join(in_dir, rw['PATIENT'], rw['COMPARISON'])
    for mut_class, suffix in maf_suffixes.items():
        maf_path = os.path.join(comparison_dir, 'Strelka_'+rw['COMPARISON']+suffix)
        sample_muts = _notch1_muts_from_maf(maf_path, rw['COMPARISON'], notch1_gene)
        if sample_muts is not None:
            notch1_frames[mut_class].append(sample_muts)

df_snvs_all, df_indels_all, df_mnvs_all = (
    pd.concat(notch1_frames[mut_class], ignore_index=True, sort=False)
    if notch1_frames[mut_class] else pd.DataFrame()
    for mut_class in ('snvs', 'indels', 'mnvs')
)
   

In [None]:
# Write the NOTCH1 mutations of each class to out_dir (the files later given
# to VEP). A class without mutations produces no file.
export_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
                  'NORMAL', 'TUMOR', 'DP_tumor', 't_alt_reads', 't_ref_reads',
                  'DP_normal', 'n_alt_reads', 'n_ref_reads', 'mut_type', 'GT_normal',
                  'GT_tumor', 'COMPARISON']

for muts_table, file_name in ((df_snvs_all, "notch1_snvs.tsv"),
                              (df_indels_all, "notch1_indels.tsv"),
                              (df_mnvs_all, "notch1_mnvs.tsv")):
    if not muts_table.empty:
        muts_table[export_columns].to_csv(os.path.join(out_dir, file_name), sep='\t', index=False)

### RUN VEP


```
source activate vep92


vep -i notch1_snvs.tsv -o STDOUT -tab --assembly GRCh37 --no_stats --cache --symbol --protein --numbers --canonical --offline --af_1kg --dir /workspace/datasets/vep --custom /workspace/datasets/gnomad/gnomad.genomes.r2.0.1.sites.noVEP.vcf.gz,gnomADg,vcf,exact,0,AF,NFE > notch1_snvs_anno_vep92.tab
 
 
vep -i notch1_indels.tsv -o STDOUT -tab --assembly GRCh37 --no_stats --cache --symbol --protein --numbers --canonical --offline --af_1kg --dir /workspace/datasets/vep --custom /workspace/datasets/gnomad/gnomad.genomes.r2.0.1.sites.noVEP.vcf.gz,gnomADg,vcf,exact,0,AF,NFE > notch1_indels_anno_vep92.tab
```

### POST VEP ONLY NOTCH1

In [None]:
## read NOTCH1 annotated mutations

mutation_types = ['snvs', 'indels', 'mnvs']

in_dir = "../intermediate_files/notch1_needle_muts/"

# Strelka bookkeeping columns removed from the annotated tables.
maf_only_columns = ['FILTER','FORMAT','GT_normal','GT_tumor','INFO','NORMAL','QUAL','TUMOR']

# (folder, drop the Strelka columns?). The columns are not dropped for
# li_blood_2020, as in the original cell.
# NOTE(review): presumably because that input does not have them -- confirm.
cohort_folders = [('adult_TALL', True),
                  ('oshima_WXS_2016', True),
                  ('li_blood_2020', False)]

# Tables are collected and concatenated once (DataFrame.append was removed in
# pandas 2.0); empty results contribute no rows and are skipped.
candidate_frames = []
for folder, drop_maf_columns in cohort_folders:
    path = os.path.join(in_dir, folder)
    for muts in mutation_types:
        result = process_vep_results(path, muts)
        if result.empty:
            continue
        if drop_maf_columns:
            result = result.drop(columns=maf_only_columns)
        candidate_frames.append(result)

if candidate_frames:
    notch1_candidates = pd.concat(candidate_frames, ignore_index=True, sort=False)
else:
    notch1_candidates = pd.DataFrame()

In [None]:
## muts protein affecting
# Discard the rows annotated exactly as 'synonymous_variant' and the rows
# without a protein position ('-').
is_synonymous = notch1_candidates['Consequence'] == 'synonymous_variant'
lacks_protein_position = notch1_candidates['Protein_position'] == '-'
notch1_candidates = notch1_candidates[~is_synonymous & ~lacks_protein_position]

In [None]:
## annotate mutations with subset
# Every NOTCH1 mutation of a patient is labelled 'shared', 'private_primary'
# or 'private_relapse' from the subsets returned by get_three_subsets.
subset_columns = ['#CHROM', 'POS', 'REF', 'ALT', 'Consequence',
                  'SYMBOL', 'mut_type', 'EXON', 'Amino_acids',
                  'Protein_position', 'subset', 'Variant', 'PATIENT', 'COHORT']


def _stage_candidates(pat_info, stage):
    """Return a copy of the NOTCH1 candidates of the patient's *stage* sample.

    The first COMPARISON listed for the stage is used. A patient without a
    sample of that stage gets an empty table with the same columns (indexing
    the first element used to raise IndexError in that case).
    """
    comparisons = pat_info.loc[pat_info['STAGE'] == stage, 'COMPARISON'].unique()
    if len(comparisons) == 0:
        return notch1_candidates.iloc[0:0].copy()
    return notch1_candidates[notch1_candidates['COMPARISON'] == comparisons[0]].copy()


subset_frames = []

for g, pat_info in info_clinical.groupby("PATIENT"):

    # copies, so the columns added below do not write into notch1_candidates slices
    pry_notch1 = _stage_candidates(pat_info, 'primary')
    rel_notch1 = _stage_candidates(pat_info, 'relapse')

    if pry_notch1.empty and rel_notch1.empty:
        continue

    pry_notch1['Variant'] = pry_notch1['ID']
    rel_notch1['Variant'] = rel_notch1['ID']

    trunk, private_pry, private_rel = get_three_subsets(pry_notch1, rel_notch1)

    cohort_name = pat_info['COHORT'].unique()[0]
    for stage_muts in (pry_notch1, rel_notch1):
        stage_muts['PATIENT'] = g
        stage_muts['COHORT'] = cohort_name

    if not pry_notch1.empty:
        # shared mutations are reported once, from the primary sample
        pry_notch1['subset'] = pry_notch1.apply(lambda x: 'shared' if x['Variant'] in trunk else 'private_primary', axis=1)
        subset_frames.append(pry_notch1[subset_columns])
    if not rel_notch1.empty:
        print(g)
        rel_notch1_subset = rel_notch1[rel_notch1['Variant'].isin(private_rel)].copy()
        rel_notch1_subset['subset'] = 'private_relapse'
        if not rel_notch1_subset.empty:
            subset_frames.append(rel_notch1_subset[subset_columns])

# Concatenate once (DataFrame.append was removed in pandas 2.0); keep the
# expected columns even when no mutation was found.
if subset_frames:
    notch1_muts_subsets = pd.concat(subset_frames, ignore_index=True, sort=False)
else:
    notch1_muts_subsets = pd.DataFrame(columns=subset_columns)
    

In [None]:
# Number of distinct patients with at least one listed NOTCH1 mutation, per cohort.
notch1_muts_subsets[['COHORT', 'PATIENT']].drop_duplicates().groupby('COHORT').count()

In [None]:
# Preview the first rows of the annotated mutation table.
notch1_muts_subsets.head()

In [None]:
# Save the annotated NOTCH1 mutations as a tab-separated file in in_dir.
notch1_muts_subsets.to_csv(os.path.join(in_dir, "candidate_muts_notch1.tsv"), sep='\t', index=False)