This notebook creates the input for the mutation rate models: the observable signature 5 contribution per patient for each subset of clonal SNVs (private primary, private relapse and shared).

The same data are also used to draw the phylogenetic trees, which correspond to Figure 3b of the paper.

This piece of code relies on a workspace directory structure such as:
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
 patientID, DxTumorID etc can be found in ../ext_files/all_cohort_clinical_groups.tsv
 
Be aware that the filtered MAFs with clonal classification and joined mutations, obtained after running the scripts in ```filter/```, have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf```.
This file name is used in the following code.

In [None]:
import sys, os
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib import collections as mc

from aux_functions import stage_mapping, get_context_rev,add_pyrimidine_type, df_to_dict,count_variant_type, get_muts_x_signature
from aux_data_in_pyvar import PATS_DIRS, config_rcparams

# Show every column and every row when a DataFrame is displayed in this notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Never truncate cell contents. `None` is the "unlimited" value in current pandas, which
# rejects the legacy `-1`; pandas < 1.0 only accepts integers, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

%load_ext autoreload
%autoreload 2

In [None]:
# Run the plotting configuration helper imported from aux_data_in_pyvar
# (presumably it sets matplotlib rcParams -- confirm in that module).
config_rcparams()
# Set the base font size after the helper so this value is the one in effect.
plt.rcParams.update({'font.size': 14})

In [None]:
def _scaled_branch_lengths(trunk, private_pry, private_rel):
    """Express the three branch counts as percentages of the longest root-to-tip path.

    The longest path is the trunk plus the longer of the two private branches, so
    that branch ends exactly at 100 on the x axis.

    Returns a tuple ``(line_trunk, line_pry, line_rel)``; all zeros when the row
    has no mutations at all (avoids a division by zero).
    """
    total_scale = trunk + max(private_pry, private_rel)
    if total_scale == 0:
        return 0.0, 0.0, 0.0
    return ((trunk / total_scale) * 100,
            (private_pry / total_scale) * 100,
            (private_rel / total_scale) * 100)


def figure_phylo_tree(df, sig, out_dir, clonality):
    """Draw one two-branch tree per row of ``df`` and save the stacked figure as SVG.

    Each row gets its own axes: a horizontal trunk segment followed by two
    diverging segments (relapse going up, primary going down), with lengths
    proportional to the mutation counts of signature ``sig``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per patient. Columns read: ``PATIENT``, ``DIAGNOSIS_AGE_YEARS``,
        ``SEX``, ``IMMUNO_CLASS``, ``PRIMARY_TO_RELAPSE_AGE_DAYS`` and, for the
        given ``sig``, ``TRUNK_<sig>``, ``PRIVATE_PRY_<sig>``, ``PRIVATE_REL_<sig>``.
    sig : str
        Signature name used as column suffix and in the output file name.
    out_dir : str
        Directory where the SVG is written.
    clonality : str
        Free-text tag that only goes into the output file name.

    Raises
    ------
    ValueError
        If ``df`` has no rows.

    The figure is written to ``<out_dir>/phylo_tree_muts_<clonality>_<sig>.svg``.
    """
    # One grid row per plotted row (not per unique patient), so the grid can never
    # be smaller than the number of axes drawn.
    num_pat = len(df)
    if num_pat == 0:
        raise ValueError("figure_phylo_tree needs at least one row to plot")

    fig = plt.figure(figsize=(20, 80))
    outer = gridspec.GridSpec(num_pat, 1, wspace=0, hspace=0)

    # Segment colours, in the same order as the segments: trunk, relapse, primary.
    segment_colors = ['#006837', '#fd8d3c', '#2c7fb8']

    for j, (_, rw) in enumerate(df.iterrows()):

        trunk = rw['TRUNK_' + sig]
        private_pry = rw['PRIVATE_PRY_' + sig]
        private_rel = rw['PRIVATE_REL_' + sig]

        # CREATE SEGMENTS
        # lines as proportion in percentage
        line_trunk, line_pry, line_rel = _scaled_branch_lengths(trunk, private_pry, private_rel)

        lines = [[(0, 1), (line_trunk, 1)],
                 [(line_trunk, 1), (line_trunk + line_rel, 2.4)],
                 [(line_trunk, 1), (line_trunk + line_pry, 0.2)]]

        # PLOT
        line_segments = mc.LineCollection(lines, linewidths=20,
                                          colors=segment_colors, linestyle='solid')

        ax = fig.add_subplot(outer[j, 0])
        ax.add_collection(line_segments)
        # Red dot at the point where the two private branches split from the trunk.
        ax.scatter(line_trunk, 1, s=1000, color='r', zorder=3)

        ax.set_ylabel('{} \n {} years,\n{}, {}'.format(rw['PATIENT'],
                                                       int(rw['DIAGNOSIS_AGE_YEARS']),
                                                       rw['SEX'],
                                                       rw['IMMUNO_CLASS']),
                      fontsize=60, labelpad=50)

        ax.set_ylim([-3, 5])
        ax.set_xlim([0, 100])
        ax.spines["right"].set_visible(False)

        # Mutation counts next to each segment. Text is rotated 90 degrees
        # (presumably because the final figure is viewed rotated -- confirm).
        ax.annotate('{}'.format(int(round(trunk))),
                    (int(round(line_trunk / 3)), -1), textcoords='data',
                    size=40, horizontalalignment='center', rotation=90)
        ax.annotate('{}'.format(int(round(private_pry))),
                    (int(round(line_trunk + line_pry / 2)), -1.2), textcoords='data',
                    size=40, verticalalignment='center', rotation=90, ha='right')
        # Placed at the midpoint of the relapse branch (the original reused the
        # primary branch midpoint here).
        ax.annotate('{}'.format(int(round(private_rel))),
                    (int(round(line_trunk + line_rel / 2)), 3), textcoords='data', size=40,
                    verticalalignment='center', rotation=90, ha='left')
        ax.annotate('{} days'.format(int(rw['PRIMARY_TO_RELAPSE_AGE_DAYS'])),
                    (100, 1), textcoords='data', size=40, verticalalignment='center', rotation=90)

        # No y ticks anywhere; x ticks only on the last (bottom) axes.
        ax.set_yticks([])
        if j < (num_pat - 1):
            ax.set_xticks([])
        else:
            ax.tick_params(axis='x', which='major', labelsize=50)

    fig.tight_layout()

    # `ax` is the last axes created in the loop, i.e. the bottom panel.
    ax.set_xlabel("Molecular time (%)", rotation=180, fontsize=80)
    ax.tick_params(axis='x', labelrotation=90)

    fig.savefig(os.path.join(out_dir,
        "phylo_tree_muts_"+clonality+"_"+sig+".svg"),
        dpi=500, bbox_inches='tight',
        orientation='portrait')
    plt.show()
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Path to Additional file 1 Table S1 (tab-separated); fill it in before running.
clinical_table_path = ""
# Patients left out of this analysis (reason not stated in this notebook).
excluded_patients = ['PAT3', 'PAT4']

clinical = pd.read_csv(clinical_table_path, sep='\t')
clinical = clinical.loc[~clinical['Patient_id'].isin(excluded_patients)]
clinical.head()

### Signatures contribution to each subset

In [None]:
# Path to the signature-fitting results of the deconstructSigs run. The results are expected to come from
# a run with all samples (primary and relapse) together. See ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb:
# it should be the results file signatures_weight.csv of the run whose folder is named run_subsets_together/.
# The value is passed as `prob_file_path` to get_muts_x_signature in the loop below; fill it in before running.
deconstruct_run = "" 

In [None]:
# When True, the loop below keeps only mutations whose 'clonal_classification' is 'clonal';
# when False, all SNVs are used.
clonal = True

In [None]:
# Build one record (dict of column -> value) per patient and create the DataFrame once
# at the end. This replaces DataFrame.set_value, which was removed in pandas 1.0, and
# avoids growing the frame one cell at a time.
signatures_to_count = ['SBS1', 'SBS5', 'SBS18']
maf_suffix = '_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf'

patient_ids = []
subset_records = []

for pat in clinical['Patient_id'].unique():

    # GET INFO
    pat_clinical = clinical[clinical['Patient_id'] == pat].reset_index()
    com_pry = pat_clinical.loc[0, 'Primary_seq_id']+'_vs_'+pat_clinical.loc[0, 'Remission_seq_id']
    com_rel = pat_clinical.loc[0, 'Relapse_seq_id']+'_vs_'+pat_clinical.loc[0, 'Remission_seq_id']

    print(pat)

    # READ MUTATION DATASET
    df_pry = pd.read_csv(os.path.join(PATS_DIRS[pat], pat, com_pry, com_pry + maf_suffix),
                         sep='\t', low_memory=False)

    df_rel = pd.read_csv(os.path.join(PATS_DIRS[pat], pat, com_rel, com_rel + maf_suffix),
                         sep='\t', low_memory=False)

    # GET SNVS
    df_pry = df_pry[df_pry['mut_type'] == 'snv']
    df_rel = df_rel[df_rel['mut_type'] == 'snv']

    print(len(df_pry))
    print(len(df_rel))

    # KEEP ONLY CLONALS (when requested)
    if clonal:
        df_pry = df_pry[df_pry['clonal_classification'] == 'clonal']
        df_rel = df_rel[df_rel['clonal_classification'] == 'clonal']

    # GET SUBSETS
    all_pry_variants = set(df_pry['Variant'].unique())
    all_rel_variants = set(df_rel['Variant'].unique())

    print(len(all_pry_variants))
    print(len(all_rel_variants))

    shared_variants = all_pry_variants.intersection(all_rel_variants)
    private_pry_variants = all_pry_variants.difference(shared_variants)
    private_rel_variants = all_rel_variants.difference(shared_variants)

    # Shared variants are taken from the primary table; private ones from their own sample.
    df_shared = df_pry[df_pry['Variant'].isin(shared_variants)]
    df_private_pry = df_pry[df_pry['Variant'].isin(private_pry_variants)]
    df_private_rel = df_rel[df_rel['Variant'].isin(private_rel_variants)]

    record = {}

    for signature in signatures_to_count:

        # get_muts_x_signature comes from aux_functions; its three return values are used
        # here as the counts for private primary, private relapse and shared mutations.
        count_pp, count_pr, count_sh = get_muts_x_signature(sh=df_shared, pp=df_private_pry,
                                                        pr=df_private_rel, pat=pat, sig=signature,
                                                        prob_file_path=deconstruct_run)

        record['PRIVATE_PRY_'+signature] = count_pp
        record['PRIVATE_REL_'+signature] = count_pr
        record['TRUNK_'+signature] = count_sh

    # ADD info for phylotree
    record['DIAGNOSIS_AGE_YEARS'] = pat_clinical.loc[0, 'Primary_diagnosis_age']
    record['PRIMARY_TO_RELAPSE_AGE_DAYS'] = pat_clinical.loc[0, 'days_between_pry_rel']
    record['IMMUNO_CLASS'] = pat_clinical.loc[0, 'Primary_immunoclassification']
    record['SEX'] = pat_clinical.loc[0, 'Sex']

    patient_ids.append(pat)
    subset_records.append(record)

# Explicit column order: per-signature counts first, then the clinical annotations.
subset_columns = [prefix + signature
                  for signature in signatures_to_count
                  for prefix in ('PRIVATE_PRY_', 'PRIVATE_REL_', 'TRUNK_')]
subset_columns += ['DIAGNOSIS_AGE_YEARS', 'PRIMARY_TO_RELAPSE_AGE_DAYS', 'IMMUNO_CLASS', 'SEX']

# Patients stay in the (unnamed) index, as the following cell expects.
signature_subsets = pd.DataFrame(subset_records, index=patient_ids, columns=subset_columns)

In [None]:
# Move the patient identifiers from the index into a regular PATIENT column.
# Guarded so that re-running this cell does not add a second index column.
if 'PATIENT' not in signature_subsets.columns:
    signature_subsets = signature_subsets.reset_index().rename(columns={'index': 'PATIENT'})

# Sort by age at diagnosis, youngest first.
signature_subsets = signature_subsets.sort_values('DIAGNOSIS_AGE_YEARS', ascending=True)

In [None]:
# Save the per-patient table as a tab-separated file, without the DataFrame index.
signature_subsets.to_csv("../intermediate_files/signature_counts.tsv", sep='\t', index=False)

### phylotree plot

In [None]:
# Signature to draw in the trees; must be one of those counted in the loop above
# ('SBS1', 'SBS5' or 'SBS18'), since it selects the TRUNK_/PRIVATE_PRY_/PRIVATE_REL_ columns.
signature = 'SBS5'

In [None]:
# Directory where figure_phylo_tree writes the SVG; fill it in before running.
output_path = "" # output path for the figure

In [None]:
# Draw and save the trees for the selected signature; 'clonal' is only used as a tag in the output file name.
figure_phylo_tree(signature_subsets, signature, output_path, 'clonal')

In [None]:
# Number of rows in the table (one per patient processed in the loop above).
len(signature_subsets)

In [None]:
# Display the whole table (the pandas display limits were lifted in the first code cell).
signature_subsets