This notebook performs correlations between mutations attributable to clock-like signatures and the age of the patients. 

The plots and the numbers of the linear regressions correspond to Figure 4a and Additional file 1: Figure S7 of the paper.

This piece of code relies on a workspace directory structure such as:
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
 patientID, DxTumorID etc can be found in ../ext_files/all_cohort_clinical_groups.tsv
 
Be aware that the filtered MAFs with clonal classification and joined mutations, produced by running the scripts in ```filter/```, have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf```.
This file name is used in the code below.

PATS_DIRS is a dictionary that maps each patient ID to the cohort directory containing that patient's folder, where the MAF files are.

In [None]:
import sys, os
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
import seaborn as sns
import collections

import numpy as np
import statsmodels.formula.api as smf
from scipy import stats
import glob

from aux_functions import stage_mapping, get_context_rev,add_pyrimidine_type, df_to_dict,count_variant_type, get_muts_x_signature
from aux_data_in_pyvar import PATS_DIRS, COLORS_SIGNATURES, COLORS_SUBTYPES, config_rcparams

# Display every column and every row when a DataFrame is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Never truncate cell contents. pandas >= 1.0 spells "no limit" as None and
# current releases reject the legacy value -1; older releases only accept an
# integer here, so fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

%load_ext autoreload
%autoreload 2

In [None]:
# Apply the project-wide plotting configuration defined in aux_data_in_pyvar
# (presumably matplotlib rcParams, judging by the helper's name — TODO confirm).
config_rcparams()

In [None]:
def get_clonal_x_signature_patient(path_fitting, clinical):
    """Count clonal SNVs attributed to the clock-like signatures per patient and stage.

    For every ``sample_id`` listed in ``<path_fitting>/signatures_weight.csv`` the
    primary and relapse MAF files of that patient are read, restricted to clonal
    SNVs and split into variants shared by both samples and variants private to
    each one. ``get_muts_x_signature`` is then called for SBS1 and SBS5, and the
    counts are combined as private + shared for each stage.

    Parameters
    ----------
    path_fitting : str
        Folder with the signature fitting results. It must contain
        ``signatures_weight.csv`` and is also forwarded to
        ``get_muts_x_signature`` as ``prob_file_path``.
    clinical : pandas.DataFrame
        Clinical table with at least the columns ``PATIENT``, ``STAGE``
        (``'primary'`` / ``'relapse'``) and ``COMPARISON``.

    Returns
    -------
    pandas.DataFrame
        Two rows per patient (primary, then relapse) with the columns
        ``PATIENT``, ``STAGE``, ``MUTS_CLONAL_SBS1`` and ``MUTS_CLONAL_SBS5``.

    Raises
    ------
    IndexError
        If a patient has no primary or no relapse row in ``clinical``.
    """
    fitting_results = pd.read_csv(os.path.join(path_fitting, "signatures_weight.csv"), sep='\t')

    # Rows are collected as plain dicts and turned into a frame once at the end.
    # DataFrame.set_value and DataFrame.append no longer exist in current pandas.
    records = []

    for pat in fitting_results['sample_id']:

        clinical_pat = clinical[clinical['PATIENT'] == pat]
        com_pry = _stage_comparison(clinical_pat, pat, 'primary')
        com_rel = _stage_comparison(clinical_pat, pat, 'relapse')

        # GET CLONAL SNVS of each sample
        df_pry = _read_clonal_snvs(pat, com_pry)
        df_rel = _read_clonal_snvs(pat, com_rel)

        # GET SUBSETS: shared by both samples / private to one of them
        all_pry_variants = set(df_pry['Variant'].unique())
        all_rel_variants = set(df_rel['Variant'].unique())

        shared_variants = all_pry_variants & all_rel_variants
        private_pry_variants = all_pry_variants - shared_variants
        private_rel_variants = all_rel_variants - shared_variants

        df_shared = df_pry[df_pry['Variant'].isin(shared_variants)]
        df_private_pry = df_pry[df_pry['Variant'].isin(private_pry_variants)]
        df_private_rel = df_rel[df_rel['Variant'].isin(private_rel_variants)]

        row_primary = {'PATIENT': pat, 'STAGE': 'primary'}
        row_relapse = {'PATIENT': pat, 'STAGE': 'relapse'}

        for signature in ['SBS1', 'SBS5']: # clock like

            # presumably returns the counts for private-primary, private-relapse
            # and shared variants, in that order (helper lives in aux_functions)
            count_pp, count_pr, count_sh = get_muts_x_signature(sh=df_shared, pp=df_private_pry,
                                                                pr=df_private_rel, pat=pat, sig=signature,
                                                                prob_file_path=path_fitting)

            # every sample carries its private mutations plus the shared ones
            row_primary['MUTS_CLONAL_' + signature] = count_pp + count_sh
            row_relapse['MUTS_CLONAL_' + signature] = count_pr + count_sh

        records.extend([row_primary, row_relapse])

    return pd.DataFrame(records, columns=['PATIENT', 'STAGE', 'MUTS_CLONAL_SBS1', 'MUTS_CLONAL_SBS5'])


def _stage_comparison(clinical_pat, pat, stage):
    """Return the first COMPARISON id of ``pat`` for ``stage``; IndexError if there is none."""
    comparisons = clinical_pat.loc[clinical_pat['STAGE'] == stage, 'COMPARISON'].tolist()
    if not comparisons:
        raise IndexError("patient {} has no '{}' sample in the clinical table".format(pat, stage))
    return comparisons[0]


def _read_clonal_snvs(pat, comparison):
    """Read the filtered MAF of one tumour/normal comparison and keep only its clonal SNVs."""
    maf_path = os.path.join(PATS_DIRS[pat], pat, comparison,
                            comparison + '_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf')
    maf = pd.read_csv(maf_path, sep='\t', low_memory=False)
    return maf[(maf['mut_type'] == 'snv') & (maf['clonal_classification'] == 'clonal')]

In [None]:
## FUNCTIONS

def get_clonal_x_signature_sample(path_fitting, inputt):
    """Estimate, per sample, the number of clonal SNVs attributable to each fitted signature.

    For every ``sample_id`` in ``<path_fitting>/signatures_weight.csv`` the
    matching MAF file is located with ``glob``, restricted to clonal SNVs and
    summarised with ``df_to_dict``. For each column of the weights table whose
    name contains ``'SBS'``, the per-context counts are multiplied by the
    probabilities in ``<path_fitting>/mutation_sign_prob.tsv`` and summed.

    Parameters
    ----------
    path_fitting : str
        Folder containing ``signatures_weight.csv`` and ``mutation_sign_prob.tsv``.
    inputt : str or dict
        Either one base directory holding the MAF folders, or a mapping
        ``sample_id -> base directory`` (any ``dict``, including ``OrderedDict``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sample_id``, one ``MUTS_CLONAL_<signature>`` column per signature.

    Raises
    ------
    IndexError
        If no MAF file matches the search pattern built for a sample.
    """
    maf_suffix = "_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf"

    df_sig = pd.read_csv(os.path.join(path_fitting, 'signatures_weight.csv'), sep='\t')
    prob = pd.read_csv(os.path.join(path_fitting, "mutation_sign_prob.tsv"), sep='\t')

    # The signature columns are the same for every row, so list them once.
    signatures = [col for col in df_sig.columns if 'SBS' in col]

    counts_by_sample = collections.OrderedDict()
    for ids in df_sig['sample_id']:

        # Resolve the base directory before branching: the "_vs_" branch used
        # to read `in_maf` before anything had been assigned to it.
        # NOTE(review): for a dict input the key is assumed to be the sample_id
        # in both branches — confirm for "_vs_" identifiers.
        in_maf = inputt[ids] if isinstance(inputt, dict) else inputt

        if "_vs_" in ids:
            # ids is already a tumour_vs_normal comparison folder
            pattern = os.path.join(in_maf, '*/' + ids, ids + maf_suffix)
        else:
            # ids is a patient folder; look inside its comparison sub-folders
            pattern = os.path.join(in_maf, ids, "*_vs_*/*" + maf_suffix)

        # glob returns matches in arbitrary order; sort so the same file is
        # picked on every run when more than one matches.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise IndexError("no MAF file matches '{}' for sample {}".format(pattern, ids))
        maf = pd.read_csv(matches[0], sep='\t', low_memory=False)

        # filter: keep clonal SNVs only
        maf = maf[(maf['mut_type'] == 'snv') & (maf['clonal_classification'] == 'clonal')]

        # get signature probabilities by context
        prob_pat = prob[prob['Sample'] == ids].set_index('Mutation_type')

        # df_to_dict is iterated below as {context: count}; it does not depend
        # on the signature, so compute it once per sample.
        dicc_muts = df_to_dict(maf)

        sample_counts = collections.OrderedDict()
        for sig in signatures:
            sample_counts['MUTS_CLONAL_' + sig] = sum(
                count * prob_pat.loc[cntxt, sig] for cntxt, count in dicc_muts.items()
            )
        if sample_counts:
            counts_by_sample[ids] = sample_counts

    return pd.DataFrame(list(counts_by_sample.values()), index=list(counts_by_sample.keys()))

In [None]:
## PLOT FUNCTIONS

def corr_plt(df_plot, out_file, sig, title):
    """Scatter the clonal mutation counts of one signature against age with a linear fit.

    Prints the statsmodels OLS summary of ``MUTS_CLONAL_<sig> ~ AGE`` and the
    Pearson p-value, draws the points (one marker per ``STAGE`` when exactly two
    stages are present), overlays the regression line, annotates the legend with
    Pearson r, its p-value, R-squared and the fitted equation, and saves the
    figure to ``out_file``.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Must contain ``AGE``, ``STAGE`` and ``MUTS_CLONAL_<sig>``.
    out_file : str
        Path of the figure to write.
    sig : str
        ``'SIG1'``/``'SBS1'`` or ``'SIG5'``/``'SBS5'`` (coloured with
        ``COLORS_SIGNATURES``), or ``'healthy'``, which plots the
        ``MUTS_CLONAL_SBS5`` column in grey with different markers.
    title : str
        Axes title.

    Raises
    ------
    ValueError
        If ``sig`` is none of the accepted values.
    """
    if sig in ('SIG1', 'SBS1'):
        colorin = COLORS_SIGNATURES['SBS1']
        markers = ['o', 'x']
    elif sig in ('SIG5', 'SBS5'):
        colorin = COLORS_SIGNATURES['SBS5']
        markers = ['o', 'x']
    elif sig == 'healthy':
        colorin = "#bdbdbd"
        sig = 'SBS5'
        markers = ["P", '^']
    else:
        # Previously this only printed the message and then failed further
        # down because no colour had been chosen.
        raise ValueError("another signature! write SIG1 or SIG5")

    y_col = 'MUTS_CLONAL_' + sig

    # with scipy
    coef_corr, pval_corr = stats.pearsonr(df_plot['AGE'], df_plot[y_col])

    est = smf.ols(formula=y_col + ' ~ AGE', data=df_plot).fit()
    # Three decimals, as shown in the summary table; read from the fitted model
    # instead of scraping the summary text.
    coef_det = '{:.3f}'.format(est.rsquared)
    print(est.summary())

    # `height` is the current name of lmplot's former `size` argument.
    if len(df_plot['STAGE'].unique()) == 2:
        fgrid = sns.lmplot(y=y_col, x='AGE', data=df_plot, hue='STAGE', height=3, aspect=2, legend=False, fit_reg=False,
                           palette=[colorin], scatter_kws={'linewidths': 1, 'edgecolor': 'k'}, markers=markers)
    else:
        fgrid = sns.lmplot(y=y_col, x='AGE', data=df_plot, height=3, aspect=2, legend=True, fit_reg=False,
                           palette=colorin, scatter_kws={'linewidths': 1, 'edgecolor': 'k', 'facecolors': colorin}, markers=['o'])
    ax = fgrid.axes[0, 0]
    # fixed axis ranges
    ax.set_xlim(0, 70)
    ax.set_ylim(0, 2500)

    handles, labels = ax.get_legend_handles_labels()

    handles.append(mpatches.Patch(color='none', label='r-Pearson:'+str(round(coef_corr, 2))+' p-value:'+'{:0.2e}'.format(pval_corr)+'\n'+'R-squared:'+coef_det))
    # slope and intercept shown in the legend equation
    slope, intercept, r_value, p_value, slope_std_error = stats.linregress(df_plot['AGE'], df_plot[y_col])
    handles.append(mpatches.Patch(color=colorin, label='$y=%3.7s*x+%3.7s$'%(slope, intercept)))
    ax.legend(handles=handles, bbox_to_anchor=(1, 0.5), prop={'size': 10})

    print('{:0.2e}'.format(pval_corr))

    sns.regplot(x="AGE", y=y_col, data=df_plot, scatter=False, ax=ax, line_kws={"color": colorin})

    ax.set_ylabel("Clonal Mutations \n of "+sig[0:3]+" "+sig[-1])
    ax.set_xlabel("Age")
    ax.set_title(title)

    # `dpi` was misspelled as `doi`, so the resolution never reached matplotlib.
    fgrid.savefig(out_file, dpi=200, bbox_inches='tight', pad_inches=0.1)
    plt.show()
    
    
def corr_tall(df_plot, out_file, sig, title):
    """Plot clonal mutations of one signature against age for all cohorts together.

    Points are coloured and shaped by ``SUBTYPE_LABEL``. One regression line is
    drawn per ``SUBTYPE PLOT`` group ('TALL Adult', 'TALL Pediatric',
    'Progenitor cells', the last one dashed), and the legend lists the Pearson r
    of every group. The figure is saved to ``out_file``.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Must contain ``AGE``, ``SUBTYPE_LABEL``, ``SUBTYPE PLOT`` and
        ``MUTS_CLONAL_<sig>``.
    out_file : str
        Path of the figure to write.
    sig : str
        Signature name used to build the ``MUTS_CLONAL_<sig>`` column name.
    title : str
        Axes title.

    Raises
    ------
    ValueError
        If ``SUBTYPE_LABEL`` contains a label with no marker/colour defined here.
    """
    y_col = 'MUTS_CLONAL_' + sig

    # (label, marker, colour) of every SUBTYPE_LABEL this figure knows, in
    # legend order. Markers are bound to labels explicitly via `hue_order`
    # instead of relying on the order in which labels appear in the data.
    label_styles = [
        ('TALL Adult', 'o', '#d50402'),
        ('TALL Adult (relapse)', 'x', '#d50402'),
        ('TALL Pediatric', 'o', '#ff8080'),
        ('Hematopoietic stem cells (HSC)', 'P', '#737373'),
        ('Multipotent progenitor cells (MPP)', '^', '#737373'),
    ]
    present = set(df_plot['SUBTYPE_LABEL'].dropna().unique())
    unknown = present - set(label for label, _, _ in label_styles)
    if unknown:
        raise ValueError("no marker/colour defined for SUBTYPE_LABEL value(s): {}".format(sorted(unknown)))
    styles = [style for style in label_styles if style[0] in present]

    # `height` is the current name of lmplot's former `size` argument.
    fgrid = sns.lmplot(y=y_col, x='AGE', data=df_plot, hue='SUBTYPE_LABEL',
                       hue_order=[label for label, _, _ in styles],
                       height=3, aspect=2, legend=False, fit_reg=False,
                       palette={label: color for label, _, color in styles},
                       scatter_kws={'linewidths': 1, 'edgecolor': 'k'},
                       markers=[marker for _, marker, _ in styles])

    ax = fgrid.axes[0, 0]
    ax.set_xlim(0, 70)

    # Local copy: add the progenitor colour without modifying the imported dict.
    subtype_colors = dict(COLORS_SUBTYPES)
    subtype_colors['Progenitor cells'] = '#737373'

    # One regression line per group; groups with fewer than two points are skipped.
    for subtype, extra_line_kws in [('TALL Adult', {}),
                                    ('TALL Pediatric', {}),
                                    ('Progenitor cells', {'linestyle': "--"})]:
        subset = df_plot[df_plot['SUBTYPE PLOT'] == subtype]
        if len(subset) < 2:
            continue
        line_kws = {"color": subtype_colors[subtype]}
        line_kws.update(extra_line_kws)
        sns.regplot(x="AGE", y=y_col, data=subset, scatter=False, ax=ax, line_kws=line_kws)

    ax.set_ylabel("Clonal Mutations\nof Signature "+sig)
    ax.set_xlabel("Age")

    handles, labels = ax.get_legend_handles_labels()

    for reg in df_plot['SUBTYPE PLOT'].dropna().unique():
        dff = df_plot[df_plot['SUBTYPE PLOT'] == reg]
        if len(dff) < 2:
            # Pearson r is undefined for fewer than two points
            continue
        coef_corr, pval_corr = stats.pearsonr(dff['AGE'], dff[y_col])
        handles.append(mpatches.Patch(color=subtype_colors[reg], label='r-Pearson:'+str(round(coef_corr, 2))))

    # build the legend once, after all handles have been collected
    ax.legend(handles=handles, bbox_to_anchor=(1, 0.5), prop={'size': 10})

    ax.set_title(title)

    # `dpi` was misspelled as `doi`, so the resolution never reached matplotlib.
    fgrid.savefig(out_file, dpi=200, bbox_inches='tight', pad_inches=0.1)
    plt.show()
    
    
def scatter(df, col_x, cols_y, filename):
    """Scatter ``cols_y`` against ``col_x`` on the unit square, coloured by the ``stage`` column.

    A grey regression line is overlaid, the Pearson correlation of the two
    columns is added to the legend, the top and right spines are hidden, and the
    figure is written to ``filename + ".svg"``.
    """
    fig, ax = plt.subplots()
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    # Points first, then the fitted line on the same axes.
    ax = sns.scatterplot(data=df, x=col_x, y=cols_y, hue="stage", s=100, ax=ax)
    sns.regplot(data=df, x=col_x, y=cols_y, scatter=False, ax=ax,
                line_kws={"color": "#bdbdbd"})

    # Pearson r between the two plotted columns, shown as an extra legend entry.
    pearson_r = df[[col_x, cols_y]].corr('pearson').loc[col_x, cols_y]
    legend_handles, _ = ax.get_legend_handles_labels()
    legend_handles.append(
        mpatches.Patch(color='none', label='r-Pearson:' + str(round(pearson_r, 2)))
    )
    ax.legend(handles=legend_handles, bbox_to_anchor=(1, 0.5), prop={'size': 10})

    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)

    fig.savefig(filename + ".svg", dpi=300, bbox_inches='tight', format='svg')
    plt.show()

In [None]:
# Load the cohort clinical table and leave out two tumour/normal comparisons.
clinical = pd.read_csv("../ext_files/all_cohort_clinical_groups.tsv", sep='\t')
is_excluded = clinical['COMPARISON'].isin(['AE6518_vs_AE6519', 'AE6521_vs_AE6522'])
clinical = clinical[~is_excluded]

In [None]:
# get only patients with known numerical age
# The checks run on a string view of AGE, so the cell can be re-executed after
# AGE has already been converted to float (the `.str` accessor fails on a float
# column). The explicit copy avoids assigning into a filtered slice.
age_text = clinical['AGE'].astype(str)
has_numeric_age = (
    clinical['AGE'].notnull()
    & ~age_text.isin(['Childhood SR', 'Childhood HR'])  # categorical age groups
    & ~age_text.str.contains("-")                       # entries containing a dash
)
clinical = clinical[has_numeric_age].copy()
clinical['AGE'] = clinical['AGE'].astype(float)

In [None]:
# Output directory for the figures; fill in before running. It is joined with
# each figure file name via os.path.join in the plotting cells below.
path_out = "" # path for the figures

In [None]:
# Empty placeholder; dff_plot is reassigned with the merged adult data further down.
dff_plot = pd.DataFrame()

In [None]:
# Path to the deconstructSigs signature-fitting results. The results of all samples
# (primary and relapse) fitted together are expected here. According to
# ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb this corresponds to the
# signatures_weight.csv file of the run stored in the folder named run_subsets_together/.
# Fill in the path before running.
dire_fitsig = ""
df_adult = get_clonal_x_signature_patient(dire_fitsig, clinical)

In [None]:
# Attach age, comparison id and subtype label to every (patient, stage) row.
clinical_columns = ['AGE', 'COMPARISON', 'PATIENT','STAGE', 'SUBTYPE_LABEL']
dff_plot = pd.merge(df_adult, clinical[clinical_columns], how='left', on=['PATIENT', 'STAGE'])
dff_plot

In [None]:
# Drop rows with any missing value (in case there is no information) and tag
# the remaining rows as the adult cohort.
dff_plot = dff_plot.dropna().assign(**{'SUBTYPE PLOT': 'TALL Adult'})

In [None]:
# SBS5 clonal mutations vs age, in-house adult cohort.
adult_rows = dff_plot[dff_plot['SUBTYPE PLOT'] == 'TALL Adult']
plot_file = os.path.join(path_out, "sig5_corr_age_adult_TALL_sarek.svg")
corr_plt(adult_rows, plot_file, 'SBS5', 'Adult TALL (in-house) cohort')

In [None]:
# Distances from the estimate to the upper bound (first number) and to the
# lower bound (second number) of the interval, for the intercept and the slope.
# The values are hard-coded — presumably copied from the OLS summary printed by
# the SBS5 plot above (TODO confirm); update them by hand if the data change.
print("CI intercept +/- {} {}".format(str(round(649.215-397.3998, 3)), str(round(397.3998-145.585, 3))))
print("CI slope +/- {} {}".format(str(round(27.195-20.6128, 3)), str(round(20.6128-14.031, 3))))

In [None]:
# SBS1 clonal mutations vs age, in-house adult cohort.
adult_rows = dff_plot[dff_plot['SUBTYPE PLOT'] == 'TALL Adult']
plot_file = os.path.join(path_out, "sig1_corr_age_adult_TALL_sarek.svg")
corr_plt(adult_rows, plot_file, 'SBS1', 'Adult TALL (in-house) cohort')

In [None]:
# Distances from the estimate to the upper bound (first number) and to the
# lower bound (second number) of the interval, for the intercept and the slope.
# The values are hard-coded — presumably copied from the OLS summary printed by
# the SBS1 plot above (TODO confirm); update them by hand if the data change.
print("CI intercept +/- {} {}".format(str(round(343.738-172.1058, 3)), str(round(172.1058-0.473, 3))))
print("CI slope +/- {} {}".format(str(round(11.780-7.2936, 3)), str(round(7.2936-2.808, 3))))

In [None]:
## Add pediatric TALL dataframe Zhang et al., 2012; Nature Genetics

dire_maf = "" # path to the folder of MAF files from Zhang et al., 2012; Nature Genetics

# path to the folder with the run of deconstructSigs for primary samples of the cohort Zhang et al., 2012; Nature Genetics
# NOTE: this reuses the name of the adult fitting path defined in an earlier cell.
dire_fitsig = ""

# Per-sample signature counts, annotated with the clinical data; samples
# without a usable age are dropped.
pry_pedia = (
    get_clonal_x_signature_sample(os.path.join(dire_fitsig, 'TALL_Pediatric_pry'), dire_maf)
    .reset_index()
    .rename(columns={'index': 'PATIENT'})
    .merge(clinical[['AGE', 'COMPARISON', 'PATIENT', 'STAGE', 'SUBTYPE_LABEL']], how='left', on='PATIENT')
    .assign(**{'SUBTYPE PLOT': 'TALL Pediatric'})
    .dropna(subset=['AGE'])
)

# Remove pediatric rows left by a previous execution so that re-running this
# cell does not duplicate them.
if 'SUBTYPE PLOT' in dff_plot.columns:
    dff_plot = dff_plot[dff_plot['SUBTYPE PLOT'] != 'TALL Pediatric']

# pd.concat replaces DataFrame.append, which no longer exists in current pandas.
dff_plot = pd.concat([dff_plot, pry_pedia], sort=False, ignore_index=True)

In [None]:
## Add healthy tissue data from Osorio et al., 2018; Cell Reports publication

# Dataframe with the signature weights from fitting signatures of data from Osorio et al., 2018; Cell Reports publication
df_healthy = pd.read_csv("", sep='\t')

# Dataframe of clinical data on each sample provided by the authors of Osorio et al., 2018; Cell Reports publication
info_samples_healthy = pd.read_csv("", sep='\t')

# Columns shared by the T-ALL table and the healthy-cell table in the joint plot.
plot_columns = ['AGE', 'MUTS_CLONAL_SBS5', 'STAGE', 'SUBTYPE PLOT', 'SUBTYPE_LABEL']

# Mutations attributed to SBS5 = mutation count of the sample x SBS5 weight.
df_healthy['MUTS_CLONAL_SBS5'] = df_healthy['mutation_count']*df_healthy['SBS5']
df_healthy = df_healthy.merge(info_samples_healthy[['Identifier', 'Age (years)', 'Cell type']], how='left',
                              left_on='sample_id', right_on='Identifier')
df_healthy = df_healthy.rename(columns={'Age (years)':'AGE', 'Cell type':'STAGE'})
# Every cell type other than 'HSC' is labelled as MPP.
df_healthy['STAGE'] = df_healthy['STAGE'].apply(lambda x: 'Hematopoietic stem cells (HSC)' if x=='HSC' else 'Multipotent progenitor cells (MPP)')
df_healthy['SUBTYPE PLOT'] = 'Progenitor cells'
df_healthy['SUBTYPE_LABEL'] = df_healthy['STAGE']
df_healthy = df_healthy.sort_values('SUBTYPE_LABEL')

dff_plot_TALL = dff_plot[plot_columns]
# pd.concat replaces DataFrame.append, which no longer exists in current pandas.
df_plot = pd.concat([dff_plot_TALL, df_healthy[plot_columns]], ignore_index=True)

In [None]:
#df_plot.to_csv("../intermediate_files/data_points_figure4a.tsv", sep='\t', index=False)

In [None]:
# Joint SBS5-vs-age figure for all cohorts (title typo "acumulation" corrected).
plot_file = os.path.join(path_out, "sig5_corr_age_TALL_join.svg")
corr_tall(df_plot, plot_file, 'SBS5', "Age-related accumulation of mutations of TALL cohorts (Signature 5)")

In [None]:
# Same correlation plot for the healthy haematopoietic cells only. Passing
# 'healthy' makes corr_plt plot the MUTS_CLONAL_SBS5 column in grey.
plot_file = os.path.join(path_out, "sig5_healthy.svg")
corr_plt(df_healthy, plot_file, 'healthy','SBS5 from Hematopoietic Cells\n(Osorio et al., 2018; Cell Reports)')

In [None]:
# Distances from the estimate to the upper bound (first number) and to the
# lower bound (second number) of the interval, for the intercept and the slope.
# `22.3521--23.181` subtracts a negative lower bound (-23.181).
# The values are hard-coded — presumably copied from the OLS summary printed by
# the healthy-cells plot above (TODO confirm); update them by hand if the data change.
print("CI intercept +/- {} {}".format(str(round(67.885-22.3521, 3)), str(round(22.3521--23.181, 3))))
print("CI slope +/- {} {}".format(str(round(13.453-12.2119, 3)), str(round(12.2119-10.970, 3))))

### check HSCP and SIG5 correlation

HSCP profile looks similar to signature 5.

We compared the fitting of signature 5 in our data with and without the HSCP signature recently described
in Osorio et al., 2018 (Cell Reports).

In [None]:
# First check ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb. According to the instructions in
# that notebook, the run results stored in the old_weigths_* dataframes should correspond to the ones in folder
# run_samples/, whereas the hscp_weigths_* dataframes should hold the weights from folder run_samples_hemato/.
# Fill in the four empty paths below; the files are read as tab-separated tables.

old_weigths_pry = pd.read_csv("", sep='\t')
old_weigths_rel = pd.read_csv("", sep='\t')

hscp_weigths_pry = pd.read_csv("", sep='\t')
hscp_weigths_rel = pd.read_csv("", sep='\t')

In [None]:
# Tag every weights table with the stage label of its samples.
for weights_table, stage_label in [(old_weigths_pry, 'primary'),
                                   (old_weigths_rel, 'relapse'),
                                   (hscp_weigths_pry, 'primary'),
                                   (hscp_weigths_rel, 'relapse')]:
    weights_table['stage'] = stage_label

In [None]:
# Stack the primary and relapse weights of each fitting run into one table.
# pd.concat builds a new frame (no explicit copy needed) and replaces
# DataFrame.append, which no longer exists in current pandas.
weights_old = pd.concat([old_weigths_pry, old_weigths_rel], ignore_index=True, sort=False)
weights_hscp = pd.concat([hscp_weigths_pry, hscp_weigths_rel], ignore_index=True, sort=False)

In [None]:
# Put the two fitting runs side by side, one row per (sample_id, stage). Columns
# present in both tables get the suffix _old or _hscp.
weights_hscp.rename(columns={'SBS_hscp':'HSCP_signature'}, inplace=True)
weights = weights_old.merge(weights_hscp, how='outer', on=['sample_id', 'stage'], suffixes=['_old', '_hscp'])
# SBS5 and the HSCP signature, both taken from the run that includes the HSCP signature
weights['SBS5_hscp + HSCP_signature'] = weights['SBS5_hscp']+weights['HSCP_signature']
# NOTE(review): this adds the 'unknown' weight of the HSCP run to the SBS5 weight
# of the old run — confirm that 'unknown_old' was not intended.
weights['SBS5_old + unknown'] = weights['SBS5_old']+weights['unknown_hscp']

In [None]:
# Compare the SBS5 weight of the old run (x) with SBS5 + HSCP signature of the
# run that includes the HSCP signature (y); scatter() appends ".svg" to the file name.
scatter(weights, "SBS5_old", 'SBS5_hscp + HSCP_signature', os.path.join(path_out,"contrib_sig5_old_vs_sig5_plus_sig_HSCP"))