This notebook creates barplots of the signature contributions that correspond to Figure 3a and Additional File 1 Figure S6 of the paper.

This piece of code relies on a workspace directory structure such as 
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
The patientID, DxTumorID, etc. values can be found in `../ext_files/all_cohort_clinical_groups.tsv`.
 
Be aware that the filtered MAFs (with clonal classification and joined mutations) produced by the scripts in ```filter/``` have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf```.
This file name is used in the following code.

In [None]:
import pandas as pd
import numpy as np
import glob
import os

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from collections import OrderedDict
from collections import defaultdict
import seaborn as sns
from aux_functions import stage_mapping, get_context_rev,add_pyrimidine_type, df_to_dict,count_variant_type, get_muts_x_signature
from aux_data_in_pyvar import COLORS_SIGNATURES, PATS_DIRS, config_rcparams

# Do not truncate DataFrames when they are displayed in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit" for the cell width. The legacy value -1 has been
# deprecated since pandas 1.0; None requires pandas >= 1.0.
pd.set_option('display.max_colwidth', None)

# Reload imported modules automatically before each cell runs, so edits to the
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Patients left out of these figures: drop their entries from the
# patient -> directory mapping (absent keys are silently ignored).
entriesToRemove = ('PAT3', 'PAT4')
for k in entriesToRemove:
    if k in PATS_DIRS:
        del PATS_DIRS[k]

In [None]:
# Run the project helper imported from aux_data_in_pyvar.
# NOTE(review): presumably sets the matplotlib rcParams used by the figures below — confirm in aux_data_in_pyvar.
config_rcparams()

In [None]:
## FUNCTIONS

def read_probability_muts(deconstruct_run, clonal, clinical, signatures_found):
    """Count, per patient and signature, the SNVs of the three phylogenetic subsets.

    For every patient in ``PATS_DIRS`` the primary and relapse MAFs are read,
    restricted to SNVs (optionally only clonal ones) and split by their
    ``Variant`` identifier into shared (trunk), private-primary and
    private-relapse mutations. ``get_muts_x_signature`` is then called once
    per signature to obtain the count of each subset.

    Parameters
    ----------
    deconstruct_run : str
        Forwarded unchanged to ``get_muts_x_signature`` as ``prob_file_path``.
    clonal : bool
        If true, only rows with ``clonal_classification == 'clonal'`` are kept.
    clinical : pandas.DataFrame
        Must provide the columns ``Patient_id``, ``Primary_seq_id``,
        ``Relapse_seq_id``, ``Remission_seq_id``,
        ``Primary_immunoclassification`` and ``Sex``.
    signatures_found : iterable of str
        Signature names to count.

    Returns
    -------
    pandas.DataFrame
        One row per patient (patient id as index) with the columns
        ``PRIVATE_PRY_<sig>``, ``PRIVATE_REL_<sig>`` and ``TRUNK_<sig>`` for
        every signature, plus ``IMMUNO_CLASS`` and ``SEX``.

    Raises
    ------
    KeyError
        If a patient of ``PATS_DIRS`` has no row in ``clinical``.
    """
    maf_suffix = '_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf'

    # One record per patient; the frame is assembled once at the end.
    # (DataFrame.set_value, used previously, was removed in pandas 1.0.)
    records = []
    patients = []

    for pat in PATS_DIRS.keys():

        # GET INFO
        pat_clinical = clinical[clinical['Patient_id'] == pat].reset_index()
        if pat_clinical.empty:
            raise KeyError("patient {!r} not found in the clinical table".format(pat))

        normal_id = pat_clinical.loc[0, 'Remission_seq_id']
        com_pry = pat_clinical.loc[0, 'Primary_seq_id'] + '_vs_' + normal_id
        com_rel = pat_clinical.loc[0, 'Relapse_seq_id'] + '_vs_' + normal_id

        dire_maf = PATS_DIRS[pat]
        print(pat)

        # READ MUTATION DATASET
        df_pry = pd.read_csv(os.path.join(dire_maf, pat, com_pry, com_pry + maf_suffix),
                             sep='\t', low_memory=False)
        df_rel = pd.read_csv(os.path.join(dire_maf, pat, com_rel, com_rel + maf_suffix),
                             sep='\t', low_memory=False)

        # KEEP SNVS ONLY
        df_pry = df_pry[df_pry['mut_type'] == 'snv']
        df_rel = df_rel[df_rel['mut_type'] == 'snv']

        # OPTIONALLY KEEP CLONAL MUTATIONS ONLY
        if clonal:
            df_pry = df_pry[df_pry['clonal_classification'] == 'clonal']
            df_rel = df_rel[df_rel['clonal_classification'] == 'clonal']

        # GET SUBSETS
        all_pry_variants = set(df_pry['Variant'].unique())
        all_rel_variants = set(df_rel['Variant'].unique())

        print(len(all_pry_variants))
        print(len(all_rel_variants))

        shared_variants = all_pry_variants & all_rel_variants
        private_pry_variants = all_pry_variants - shared_variants
        private_rel_variants = all_rel_variants - shared_variants

        # Shared mutations are taken from the primary table
        df_shared = df_pry[df_pry['Variant'].isin(shared_variants)]
        df_private_pry = df_pry[df_pry['Variant'].isin(private_pry_variants)]
        df_private_rel = df_rel[df_rel['Variant'].isin(private_rel_variants)]

        record = {}
        for signature in signatures_found:
            count_pp, count_pr, count_sh = get_muts_x_signature(sh=df_shared, pp=df_private_pry,
                                                                pr=df_private_rel, pat=pat, sig=signature,
                                                                prob_file_path=deconstruct_run)
            record['PRIVATE_PRY_' + signature] = count_pp
            record['PRIVATE_REL_' + signature] = count_pr
            record['TRUNK_' + signature] = count_sh

        # ADD info for phylotree
        record['IMMUNO_CLASS'] = pat_clinical.loc[0, 'Primary_immunoclassification']
        record['SEX'] = pat_clinical.loc[0, 'Sex']

        patients.append(pat)
        records.append(record)

    return pd.DataFrame(records, index=patients)

In [None]:
def _subset_from_counts(df_counts, prefix, signatures):
    """Build one subset table (``<sig>_muts`` counts and ``<sig>`` fractions) from prefixed count columns."""
    total = df_counts[[prefix + sig for sig in signatures]].sum(axis=1)
    # Explicit copy: columns are added below and must not touch df_counts
    subset = df_counts[['sample_id']].copy()
    for sig in signatures:
        subset[sig + '_muts'] = df_counts[prefix + sig]
        subset[sig] = subset[sig + '_muts'] / total
    return subset


def _subset_from_weights(weights_file, signatures):
    """Read a ``signatures_weight.csv`` file and add the ``<sig>_muts`` columns (weight * mutation_count)."""
    subset = pd.read_csv(weights_file, sep='\t')
    for sig in signatures:
        subset[sig + '_muts'] = subset[sig] * subset['mutation_count']
    return subset


def _stack_subset_bars(ax_counts, ax_fracs, position, subset_row, signatures, sign, bar_width):
    """Stack the signatures of one subset at ``position`` on both axes; return the signatures drawn."""
    drawn = []
    bottom_counts = 0
    bottom_fracs = 0
    for sig in signatures:
        color = COLORS_SIGNATURES[sig]
        n_muts = subset_row.loc[0, sig + '_muts']
        fraction = subset_row.loc[0, sig]
        # Signatures without mutations are not drawn (and get no legend entry)
        if n_muts == 0:
            continue
        ax_counts.bar(position, sign * n_muts, bottom=bottom_counts, color=color,
                      edgecolor='white', width=bar_width, label=sig)
        ax_fracs.bar(position, fraction, bottom=bottom_fracs, color=color,
                     edgecolor='white', width=bar_width, label=sig)
        bottom_counts = bottom_counts + sign * n_muts
        bottom_fracs = bottom_fracs + fraction
        drawn.append(sig)
    return drawn


def bar_plot_signatures(input_path, signatures_found, output_file, mutations, clinical,version=1):
    """Plot, for every patient, the signature contributions of the three mutation subsets.

    Each patient gets one column with two panels: the upper one shows mutation
    counts (private subsets drawn downwards, shared mutations upwards), the
    lower one shows the relative contribution of each signature. The figure
    is saved as ``output_file + '.svg'`` and ``output_file + '.png'``.

    Parameters
    ----------
    input_path : str
        ``version == 2``: passed to ``read_probability_muts`` as ``deconstruct_run``.
        Otherwise: directory containing one sub-directory per subset, each
        with a tab-separated ``signatures_weight.csv``.
    signatures_found : list of str
        Signatures to plot. The list is not modified; when the weight files
        are used (``version != 2``) an ``'unknown'`` category is plotted in
        addition.
    output_file : str
        Output path without extension.
    mutations : str
        ``version == 2``: clonal mutations only unless the text contains
        ``'subclonal'``. Otherwise it must be ``'Clonal and subclonal mutations'``
        or ``'Clonal mutations'``.
    clinical : pandas.DataFrame
        Its ``Patient_id`` column defines which patients are plotted and in
        which order.
    version : int, optional
        ``2`` counts mutations with ``read_probability_muts``; any other value
        reads the fitted weights from ``input_path``.

    Raises
    ------
    ValueError
        If ``mutations`` is not recognised (``version != 2``) or ``clinical``
        contains no patient.
    KeyError
        If a patient has no row in one of the subset tables.
    """
    bar_width = 1

    # Work on a copy so the caller's list is never altered
    signatures = list(signatures_found)

    if version == 2:
        clonal = 'subclonal' not in mutations
        df_subsets = read_probability_muts(input_path, clonal, clinical, signatures)
        df_subsets = df_subsets.reset_index().rename(columns={'index': 'sample_id'})

        print(df_subsets.head())

        df_private_pry = _subset_from_counts(df_subsets, 'PRIVATE_PRY_', signatures)
        df_private_rel = _subset_from_counts(df_subsets, 'PRIVATE_REL_', signatures)
        df_shared = _subset_from_counts(df_subsets, 'TRUNK_', signatures)

    else:
        subset_dirs = {
            'Clonal and subclonal mutations': ('primary_private', 'relapse_private', 'shared'),
            'Clonal mutations': ('primary_private_clonals', 'relapse_private_clonals', 'shared_clonals'),
        }
        if mutations not in subset_dirs:
            raise ValueError("unknown mutations value {!r}; expected one of {}".format(
                mutations, sorted(subset_dirs)))
        pp_dir, pr_dir, sh_dir = subset_dirs[mutations]

        if 'unknown' not in signatures:
            signatures.append('unknown')

        # read files
        df_private_pry = _subset_from_weights(os.path.join(input_path, pp_dir, 'signatures_weight.csv'), signatures)
        df_private_rel = _subset_from_weights(os.path.join(input_path, pr_dir, 'signatures_weight.csv'), signatures)
        df_shared = _subset_from_weights(os.path.join(input_path, sh_dir, 'signatures_weight.csv'), signatures)

    patients = clinical['Patient_id'].tolist()
    if not patients:
        raise ValueError("clinical contains no patient to plot")

    # One column per plotted patient; squeeze=False keeps axs two-dimensional
    # even when there is a single patient.
    fig, axs = plt.subplots(2, len(patients), figsize=(18, 6), frameon=False, squeeze=False,
                            gridspec_kw={'wspace':0.05, 'hspace':0.2,  'height_ratios':[3, 1]})

    # (x position, subset table, direction of the count bars)
    subsets = ((0, df_private_pry, -1), (1, df_shared, 1), (2, df_private_rel, -1))
    drawn_signatures = set()

    for n, pat in enumerate(patients):

        ax1 = axs[0, n]
        ax2 = axs[1, n]

        if n == 0:
            ax1.set_ylabel("Mutational process contribution", rotation=90, y=0.3)

        for position, df_subset, sign in subsets:
            subset_row = df_subset[df_subset['sample_id'] == pat].reset_index(drop=True)
            if subset_row.empty:
                raise KeyError("patient {!r} not found in the signature tables".format(pat))
            drawn_signatures.update(
                _stack_subset_bars(ax1, ax2, position, subset_row, signatures, sign, bar_width))

        # Custom axis
        for ax in (ax1, ax2):
            ax.set_xticks([0, 1, 2])
            ax.set_xticklabels(labels=['P.Pry','Sh','P.Rel'])
            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)
            if n != 0:
                # Only the first column keeps its y axis
                ax.set_yticks([])
                ax.spines["left"].set_visible(False)

        # Title
        ax1.set_title(pat, {'fontsize':14}, pad=25)

        ax1.set_ylim((-2100,3000))
        ax2.set_ylim((0,1))

    # Legend entries follow the order of the signature list, so the legend is
    # identical between runs.
    patchList = []
    for sig in signatures:
        if sig in drawn_signatures:
            patchList.append(mpatches.Patch(color=COLORS_SIGNATURES[sig.replace(" ", '.')], label=sig))
    for text in ('Sh: Shared SNVs', 'P. Pry: Private Primary SNVs', 'P. Rel: Private Relapse SNVs'):
        patchList.append(mpatches.Patch(label=text, color='none'))

    # The legend is attached to the bottom-right panel
    axs[-1, -1].legend(handles=patchList,bbox_to_anchor=(1.2, -0.2),prop={'size': 10},ncol=2)

    plt.tight_layout()

    fig.savefig(output_file+'.svg', dpi=300, bbox_inches='tight', format='svg')
    fig.savefig(output_file+'.png', dpi=25, bbox_inches='tight', format='png')
    plt.show()
    
    
def plot_x_sample_subset(in_dir, out_file, clinical, run_signatures, mutations):
    """Plot the signature contributions of every primary and relapse sample.

    The upper panel shows mutation counts per signature (weight multiplied by
    ``mutation_count``), the lower panel the fitted weights. Each patient
    contributes two bars, ``<patient>_primary`` and ``<patient>_relapse``.
    The figure is saved as ``out_file + '.svg'``.

    Parameters
    ----------
    in_dir : str
        Directory containing one sub-directory for the primary and one for the
        relapse samples, each with a tab-separated ``signatures_weight.csv``.
    out_file : str
        Output path without extension.
    clinical : pandas.DataFrame
        Table with a ``PATIENT`` column; patients are plotted in order of
        first appearance.
    run_signatures : iterable of str
        Signatures to stack, in plotting order.
    mutations : str
        ``'Clonal and subclonal mutations'`` or ``'Clonal mutations'``; also
        used as the title of the upper panel.

    Raises
    ------
    ValueError
        If ``mutations`` is not one of the two supported values.
    KeyError
        If a patient is missing from a weights file or a requested signature
        is not a column of the primary weights file.
    """
    sample_dirs = {
        'Clonal and subclonal mutations': ('primary', 'relapse'),
        'Clonal mutations': ('primary_clonals', 'relapse_clonals'),
    }
    if mutations not in sample_dirs:
        raise ValueError("unknown mutations value {!r}; expected one of {}".format(
            mutations, sorted(sample_dirs)))
    pry_dir, rel_dir = sample_dirs[mutations]

    # read files
    df_pry = pd.read_csv(os.path.join(in_dir, pry_dir, 'signatures_weight.csv'), sep='\t')
    df_rel = pd.read_csv(os.path.join(in_dir, rel_dir,'signatures_weight.csv'), sep='\t')

    # sample label -> {signature: value}
    dd_relative = {}
    dd_absolute = {}
    sample_labels = []

    grps_clinical = clinical.groupby('PATIENT', sort=False)

    for pat in grps_clinical.groups:
        print(pat)
        # The weights files identify samples by the patient id (the group key)
        df_pry_pat = df_pry[df_pry['sample_id'] == pat].reset_index(drop=True)
        df_rel_pat = df_rel[df_rel['sample_id'] == pat].reset_index(drop=True)
        if df_pry_pat.empty or df_rel_pat.empty:
            raise KeyError("patient {!r} not found in the signature weights".format(pat))

        # Signature columns are taken from the primary file and looked up in both
        sig_columns = [col for col in df_pry_pat.columns
                       if col.startswith('SBS') or col == 'unknown']

        for label, row in ((pat+'_primary', df_pry_pat), (pat+'_relapse', df_rel_pat)):
            sample_labels.append(label)
            dd_relative[label] = {sig: row.loc[0, sig] for sig in sig_columns}
            dd_absolute[label] = {sig: row.loc[0, sig]*row.loc[0, 'mutation_count']
                                  for sig in sig_columns}

    # Create the figure only once all the data could be read
    fig, axs = plt.subplots(2,1, figsize=(15,8),frameon=False,
                            gridspec_kw={'wspace':0.05, 'hspace':0.5, 'height_ratios':[3, 1]})
    barWidth = 1
    ax1 = axs[0]
    ax2 = axs[1]

    for i,sam in enumerate(sample_labels):
        suma_tree = 0
        suma_bar = 0
        for sig in run_signatures:
            if sig not in dd_relative[sam]:
                raise KeyError("signature {!r} not found in the signature weights".format(sig))
            ax1.bar(i, dd_absolute[sam][sig], color=COLORS_SIGNATURES[sig], edgecolor='white',
                    width=barWidth, label=sig, bottom=suma_tree)
            ax2.bar(i, dd_relative[sam][sig], color=COLORS_SIGNATURES[sig], edgecolor='white',
                    width=barWidth, label=sig, bottom=suma_bar)
            suma_tree = suma_tree+dd_absolute[sam][sig]
            suma_bar = suma_bar+dd_relative[sam][sig]

    # Custom axis
    tick_positions = list(range(len(sample_labels)))
    for ax in (ax1, ax2):
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(labels=sample_labels,ha='right')
        ax.tick_params(axis='x', which='major', rotation=45)
        ax.set_xlim((-0.5,(len(sample_labels)-0.5)))
    ax1.set_title(mutations, {'fontsize':14}, pad=25)

    ax1.set_ylim((0,4900))
    ax2.set_ylim(0,1)

    # Legend: every bar carries a label, keep one handle per signature
    handles, labels = ax2.get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    ax2.legend(by_label.values(), by_label.keys(),prop={'size': 10},bbox_to_anchor=(1, 5))

    plt.tight_layout()
    fig.savefig(out_file+'.svg', dpi=300, bbox_inches='tight')

    plt.show()

In [None]:
# Clinical table (Additional file 2 Table S1) without PAT3 and PAT4, sorted
# from the oldest to the youngest patient at primary diagnosis.
df_clinical_adults = (
    pd.read_csv("", sep='\t')  ## Additional file 2 Table S1
    .loc[lambda table: ~table['Patient_id'].isin(['PAT3', 'PAT4'])]
    .sort_values('Primary_diagnosis_age', ascending=False)
)

# Alternative ordering by batch (patient number) instead of age:
#df_clinical_adults['num_pat'] = df_clinical_adults['Patient_id'].apply(lambda x: int(x.replace("PAT", "")))
#df_clinical_adults.sort_values("num_pat", inplace=True)

df_clinical = df_clinical_adults.copy()

In [None]:
# Results of the deconstructSigs signature fitting with all samples (primary
# and relapse) fitted together. In ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb
# this corresponds to a run whose folder is named run_subsets_together/
input_path = ""

# Output path and base name of the figure; the .svg and .png extensions are appended by the function.
out_file = ""

# figure
bar_plot_signatures(
    input_path=input_path,
    signatures_found=['SBS1', 'SBS5', 'SBS18'],
    output_file=out_file,
    mutations='Clonal and subclonal mutations',
    clinical=df_clinical,
    version=2,
)

In [None]:
# Results of the deconstructSigs signature fitting with all samples (primary
# and relapse) fitted together. In ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb
# this corresponds to a run whose folder is named run_subsets_together/
input_path = ""

# Output path and base name of the figure; the .svg and .png extensions are appended by the function.
out_file = ""

bar_plot_signatures(
    input_path=input_path,
    signatures_found=['SBS1', 'SBS5', 'SBS18'],
    output_file=out_file,
    mutations='Clonal mutations',
    clinical=df_clinical,
    version=2,
)

In [None]:
# Pass the clinical table through the project helper stage_mapping; the result
# is grouped by its 'PATIENT' column in plot_x_sample_subset.
# NOTE(review): presumably yields one row per patient and stage (primary/relapse) — confirm in aux_functions.
clinical = stage_mapping(df_clinical)

In [None]:
# Results of the deconstructSigs signature fitting with primary and relapse
# samples fitted separately. In ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb
# this corresponds to a run whose folder is named run_samples_treatment/
input_path = ""

# Output path and base name of the figure; the .svg extension is appended by the function.
out_file = ""
signatures_tall_relapse = ['SBS1', 'SBS5', 'SBS18', 'SBS32', 'SBSA_new', 'SBSB_new', 'unknown']
plot_x_sample_subset(
    in_dir=input_path,
    out_file=out_file,
    clinical=clinical,
    run_signatures=signatures_tall_relapse,
    mutations='Clonal and subclonal mutations',
)