This notebook calculates the mutational burden of the cohorts used in the first part of the analysis. The resulting sigmoid plot corresponds to Figure 1a of the paper.

This piece of code relies on a workspace directory structure such as:
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
patientID, DxTumorID, etc. can be found in ../ext_files/all_cohort_clinical_groups.tsv
 
Be aware that the filtered MAFs with clonal classification and joined mutations, produced by running the scripts in ```filter/```, have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf```.
This file name is used in the following code.

In [None]:
import os
import pandas as pd
import numpy as np
import glob
from io import StringIO
import math

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from aux_data_in_pyvar import config_rcparams, PATS_DIRS, TOTAL_LEN_GENOME


# Show every column and row, and never truncate cell contents, when displaying frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None is the supported "no limit" value; the legacy -1 was deprecated in pandas 1.0
# and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Project helper imported from aux_data_in_pyvar (implementation not shown in this notebook);
# presumably applies the shared matplotlib rcParams used for the figures -- confirm in that module.
config_rcparams()

In [None]:
# Load the tab-separated clinical table of all cohorts and
# relabel the 'PHALL' subtype as 'Ph positive'.
clinical_table_path = "../ext_files/all_cohort_clinical_groups.tsv"
info_cohorts = pd.read_csv(clinical_table_path, sep='\t')
info_cohorts = info_cohorts.assign(
    SUBTYPE_LABEL=info_cohorts['SUBTYPE_LABEL'].str.replace("PHALL", 'Ph positive')
)
info_cohorts.head()

In [None]:
# Keep only primary samples sequenced by WGS that have an age range,
# and leave out patients PAT3 and PAT4.
is_primary_wgs = info_cohorts['STAGE'].eq('primary') & info_cohorts['SEQUENCING'].eq('WGS')
has_age_range = info_cohorts['AGE_RANGE'].notnull()
is_excluded_patient = info_cohorts['PATIENT'].isin(['PAT3', 'PAT4'])
info_cohorts = info_cohorts[is_primary_wgs & has_age_range & ~is_excluded_patient]

In [None]:
# read pediatric data of public repositories

pediatric_maf_path = "" # add here the path to the pediatric cohorts


def count_clonal_mutations(maf):
    """Count the clonal mutations of one filtered MAF table.

    Parameters
    ----------
    maf : pandas.DataFrame
        Table with at least the columns ``clonal_classification`` and ``mut_type``.

    Returns
    -------
    dict
        ``MUTS``: number of rows classified as 'clonal';
        ``SNVS``: clonal rows whose ``mut_type`` is 'snv';
        ``INDELS``: clonal rows whose ``mut_type`` is 'indels'.
    """
    clonal = maf[maf['clonal_classification'] == 'clonal']
    # NOTE(review): the indel label is matched as 'indels' (plural), as in the original
    # code -- confirm this is the value written by the filter/ scripts.
    return {
        'MUTS': len(clonal),
        'SNVS': int((clonal['mut_type'] == 'snv').sum()),
        'INDELS': int((clonal['mut_type'] == 'indels').sum()),
    }


maf_pattern = os.path.join(
    pediatric_maf_path,
    "*/SJ*/SJ*_vs_SJ*/*_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf",
)

# Rows are collected in a list and turned into a frame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
pediatric_records = []

# sorted() makes the row order reproducible (glob returns files in arbitrary order)
for f in sorted(glob.glob(maf_pattern)):

    # the comparison folder (TumorID_vs_normalID) is the parent directory of the maf
    com = os.path.basename(os.path.dirname(f))
    info_sam = info_cohorts[info_cohorts['COMPARISON'] == com]

    # skip samples that are absent from the (already filtered) clinical table
    if info_sam.empty:
        continue

    clinical = info_sam.iloc[0]
    record = {col: clinical[col] for col in ('COHORT', 'TYPE', 'SUBTYPE_LABEL', 'AGE_RANGE', 'STAGE')}
    record.update(count_clonal_mutations(pd.read_csv(f, sep='\t')))
    record['SAMPLE'] = com
    pediatric_records.append(record)

dff_count_cohorts = pd.DataFrame(pediatric_records)


In [None]:
# read adult patient's data
#
# For every sample of the 'ADULT TALL AECC PROJECT' cohort, read its filtered maf
# (located through PATS_DIRS[patient]/patient/comparison/) and count the clonal mutations.
# The new rows are added to dff_count_cohorts, which must already exist (previous cell).

adult_cohort = info_cohorts[info_cohorts['COHORT'] == 'ADULT TALL AECC PROJECT']

# Rows are collected in a list and concatenated once: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every iteration.
adult_records = []

for _, rw in adult_cohort.iterrows():
    pat = rw['PATIENT']
    com = rw['COMPARISON']

    maf_file = os.path.join(PATS_DIRS[pat], pat, com, com+"_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf")
    df = pd.read_csv(maf_file, sep='\t')
    muts_clonal = df[df['clonal_classification'] == 'clonal']

    adult_records.append({'COHORT':rw['COHORT'], 'TYPE':rw['TYPE'],
                          'SUBTYPE_LABEL':rw['SUBTYPE_LABEL'],
                          'AGE_RANGE':rw['AGE_RANGE'], 'STAGE':rw['STAGE'],
                          'MUTS':len(muts_clonal),
                          'SNVS':len(muts_clonal[muts_clonal['mut_type'] == 'snv']),
                          'INDELS':len(muts_clonal[muts_clonal['mut_type'] == 'indels']),
                          'SAMPLE':com})

adult_counts = pd.DataFrame(adult_records)

# Re-running this cell must not duplicate the adult samples: drop the rows of any
# sample that is about to be added again.
if adult_records and 'SAMPLE' in dff_count_cohorts.columns:
    already_loaded = dff_count_cohorts['SAMPLE'].isin(adult_counts['SAMPLE'])
    dff_count_cohorts = dff_count_cohorts[~already_loaded]

# Empty frames are left out of the concat so they cannot influence the column dtypes.
frames = [frame for frame in (dff_count_cohorts, adult_counts) if not frame.empty]
if frames:
    dff_count_cohorts = pd.concat(frames, ignore_index=True, sort=False)

In [None]:
# Mutation burden per sample: clonal mutations per megabase and its log2.

def muts_per_megabase(n_muts):
    """Scale a mutation count to mutations per megabase of TOTAL_LEN_GENOME."""
    return (n_muts/TOTAL_LEN_GENOME)*(1*10**6)


burden_per_mb = dff_count_cohorts['MUTS'].map(muts_per_megabase)
dff_count_cohorts['MUTS/MB'] = burden_per_mb
dff_count_cohorts['log2(MUTS/MB)'] = burden_per_mb.map(math.log2)

# number of samples per subtype
dff_count_cohorts.groupby('SUBTYPE_LABEL')[['SAMPLE']].count()

In [None]:
# sort cohorts by ascending median of number of mutations per patient 
#
# Samples labelled 'Other' are left out, so that subtype never appears in order_subtype.

known_subtypes = dff_count_cohorts[dff_count_cohorts['SUBTYPE_LABEL'] != 'Other']

# One median of log2(MUTS/MB) per subtype, computed in a single groupby instead of
# growing a frame row by row (DataFrame.append was removed in pandas 2.0).
# mergesort is stable, so subtypes with equal medians keep a deterministic order.
sorter = (
    known_subtypes
    .groupby('SUBTYPE_LABEL')['log2(MUTS/MB)']
    .median()
    .rename('median')
    .reset_index()
    .sort_values('median', ascending=True, kind='mergesort')
)

order_subtype = sorter['SUBTYPE_LABEL'].tolist()

In [None]:
# show the subtype labels ordered by ascending median log2(MUTS/MB)
order_subtype

In [None]:
## make plot
#
# One panel per subtype (in the order given by order_subtype). Inside a panel the samples
# are sorted by ascending log2(MUTS/MB) and drawn as dots; the red segment marks the
# median of the subtype. The figure is saved as mutational_burden.svg inside dire_plot.

dire_plot = "" # path for the figure

grps = dff_count_cohorts.groupby("SUBTYPE_LABEL")

# squeeze=False always returns a 2-D array of axes, so indexing also works
# when there is a single subtype (plt.subplots would otherwise return a bare Axes).
fig, axarr = plt.subplots(1, len(order_subtype), figsize=(15, 3),
                          gridspec_kw={'wspace':0.1, 'hspace':0.5}, sharey=True, squeeze=False)
axarr = axarr[0]

# the title does not depend on the subtype, so it is set once
fig.suptitle("MUTATIONAL BURDEN", fontsize=16)

for i, g in enumerate(order_subtype):

    # cohort: samples of this subtype ranked by burden (new 0..n-1 index = rank)
    df_group = grps.get_group(g).sort_values('log2(MUTS/MB)', ascending=True).reset_index(drop=True)
    median = df_group['log2(MUTS/MB)'].median()
    print("{} {}".format(g, median))

    # scatter: rank on the x axis, burden on the y axis
    ax = axarr[i]
    ax.scatter(x=df_group.index.tolist(), y=df_group['log2(MUTS/MB)'].tolist(), color='#2a2a2a')
    ax.set_xlabel(g, fontsize=14)

    # median marker: horizontal segment of 10 x-units centred on the middle of the panel
    middle = len(df_group)/2
    ax.plot([middle-5, middle+5], [median, median], color='r')

    # the y axis is shared, so only the first panel gets the label
    if i == 0:
        ax.set_ylabel('log2(muts/mb)', fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=12)

plt.tight_layout()
fig.savefig(os.path.join(dire_plot, "mutational_burden.svg"), dpi=300,bbox_inches='tight')
plt.show()