This notebook draws scatter plots of the Cancer Cell Fraction (CCF) values of the primary and relapse samples. The plots are then assembled into Additional file 3, Figure S1c.

This piece of code relies on a workspace directory structure such as:
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
 patientID, DxTumorID, etc. can be found in ../ext_files/all_cohort_clinical_groups.tsv
 
Be aware that, after running the scripts in ```filter/```, the filtered MAFs with clonal classification and joined mutations have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf```.
This file name is used in the following code.

PATS_DIRS is a dictionary that maps each patient ID to the path of the directory containing that patient's folder, where the MAF files are.

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from collections import OrderedDict
from scipy.stats import gaussian_kde

from aux_data_in_pyvar import config_rcparams, PATS_DIRS
from aux_functions import get_three_subsets

In [None]:
## FUNCTIONS
def scatter_plot(dff, pat, info_clinic, out_path):
    """Draw and save the primary-vs-relapse CCF joint plot for one patient.

    The joint axes show one point per mutation (x = ``ccf_primary``,
    y = ``ccf_relapse``), coloured by the primary clonal category and shaped by
    the relapse clonal category; the marginal axes show KDEs of both CCFs.
    The figure is written to ``<out_path>/<pat>.png`` and then displayed.

    Parameters
    ----------
    dff : pandas.DataFrame
        Mutations of a single patient. Must contain the columns
        ``ccf_primary``, ``ccf_relapse``, ``clonal_classification_pry`` and
        ``clonal_classification_rel``. It is not modified.
    pat : str
        Patient identifier; used as the figure title and output file stem.
    info_clinic : object
        Not used by this function; kept so existing callers keep working.
    out_path : str
        Directory where the PNG is written.
    """
    # Work on a renamed, sorted copy: the caller usually passes a groupby slice,
    # and mutating it in place alters the caller's data / raises SettingWithCopy.
    plot_df = (
        dff.rename(columns={"clonal_classification_rel": 'Relapse Clonal Categories',
                            'clonal_classification_pry': "Primary Clonal Categories"})
        .sort_values(by=['ccf_primary', 'ccf_relapse'], ascending=False)
    )

    # Axis limits extend beyond [0, 1] so points at the CCF extremes are not clipped
    g = sns.JointGrid(x="ccf_primary", y="ccf_relapse", data=plot_df, xlim=[-.2,1.5], ylim=[-.2,1.5])
    g = g.plot_joint(sns.scatterplot, data=plot_df, hue='Primary Clonal Categories',
                     style="Relapse Clonal Categories", markers=['o', '^'], alpha=0.2, linewidth=0.8,
                     edgecolor='face', s=100, legend=False)
    g = g.plot_marginals(sns.kdeplot, shade=True, color='#4d4d4d')

    g.fig.suptitle(pat, fontsize = 22, y=1.0)
    g.fig.set_figwidth(6)
    g.fig.set_figheight(4)
    g.ax_joint.set_xlabel('CCF Primary', fontsize = 18)
    g.ax_joint.set_ylabel('CCF Relapse', fontsize = 18)
    g.savefig(os.path.join(out_path, pat+".png"), dpi=50, bbox_inches='tight')
    plt.show()
    # This function is called once per patient; release the figure so they do not pile up in memory
    plt.close(g.fig)

In [None]:
# Apply the plotting configuration provided by the project module aux_data_in_pyvar
# (presumably matplotlib rcParams, judging by the name — confirm in aux_data_in_pyvar).
config_rcparams()

In [None]:
# Directory where scatter_plot() writes one "<patient>.png" per patient.
# Left empty here as a placeholder: set it before running the plotting cell.
out_figures = "" # output to write figures

In [None]:
# Tab-separated clinical table (Additional file 1 Table S1). The path is an empty
# placeholder and must be filled in before running. The cells below read the columns
# Patient_id, Primary_seq_id, Remission_seq_id, Relapse_seq_id, Primary_sample_type
# and Relapse_sample_type from it.
df_clinical = pd.read_csv("", sep='\t') # Additional file 1 Table S1

In [None]:
# Build three cohort-wide tables from the per-patient primary and relapse MAFs:
#   shared_muts          - SNVs found in both samples (one row per variant, both CCFs)
#   private_primary_muts - SNVs found only in the primary sample
#   private_relapse_muts - SNVs found only in the relapse sample
#
# Per-patient frames are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame in a loop is quadratic.

# Columns kept from every MAF
maf_cols = ['Variant','Consequence','FILTER','t_alt_reads', 't_ref_reads', 'clonal_classification', 'ccf']
# File-name suffix of the filtered, clustered MAFs
maf_suffix = '_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf'

shared_frames = []
private_primary_frames = []
private_relapse_frames = []

for p in df_clinical['Patient_id'].unique():

    maf_path = PATS_DIRS[p]

    pat_clinical = df_clinical[df_clinical['Patient_id'] == p].reset_index()

    # comparison names: <tumor sample>_vs_<remission (normal) sample>, taken from the patient's first clinical row
    com_pry = pat_clinical.loc[0, 'Primary_seq_id']+'_vs_'+pat_clinical.loc[0, 'Remission_seq_id']
    com_rel = pat_clinical.loc[0, 'Relapse_seq_id']+'_vs_'+pat_clinical.loc[0, 'Remission_seq_id']

    # read mutations
    df_pry = pd.read_csv(os.path.join(maf_path, p, com_pry, com_pry+maf_suffix), sep='\t')
    df_rel = pd.read_csv(os.path.join(maf_path, p, com_rel, com_rel+maf_suffix), sep='\t')

    # remove indels and mnvs
    df_pry = df_pry[df_pry['mut_type'] == 'snv']
    df_rel = df_rel[df_rel['mut_type'] == 'snv']

    # make subsets
    trunk, private_pry, private_rel = get_three_subsets(df_pry, df_rel)

    # private mutations; .copy() so that adding PATIENT below does not write into a slice
    private_pry_df = df_pry[df_pry['Variant'].isin(private_pry)].copy()
    private_rel_df = df_rel[df_rel['Variant'].isin(private_rel)].copy()

    # shared mutations, with sample-specific column names so both sides survive the merge
    shared_pry = df_pry.loc[df_pry['Variant'].isin(trunk), maf_cols].rename(
        columns={'FILTER':'FILTER_pry','t_alt_reads':'t_alt_reads_pry',
                 't_ref_reads':'t_ref_reads_pry', 'clonal_classification':'clonal_classification_pry',
                 'ccf':'ccf_primary'})
    shared_rel = df_rel.loc[df_rel['Variant'].isin(trunk), maf_cols].rename(
        columns={'FILTER':'FILTER_rel','t_alt_reads':'t_alt_reads_rel',
                 't_ref_reads':'t_ref_reads_rel', 'clonal_classification':'clonal_classification_rel',
                 'ccf':'ccf_relapse'})
    shared = shared_pry.merge(shared_rel, how='outer', on='Variant')
    shared['PATIENT'] = p
    private_pry_df['PATIENT'] = p
    private_rel_df['PATIENT'] = p

    print(p)
    print(len(shared))
    print(len(private_pry_df))
    print(len(private_rel_df))

    shared_frames.append(shared)
    private_primary_frames.append(private_pry_df[maf_cols + ['PATIENT']])
    private_relapse_frames.append(private_rel_df[maf_cols + ['PATIENT']])

# pd.concat raises on an empty list; fall back to an empty frame when there are no patients
shared_muts = pd.concat(shared_frames, ignore_index=True) if shared_frames else pd.DataFrame()
private_primary_muts = pd.concat(private_primary_frames, ignore_index=True) if private_primary_frames else pd.DataFrame()
private_relapse_muts = pd.concat(private_relapse_frames, ignore_index=True) if private_relapse_frames else pd.DataFrame()

In [None]:
## prepare dataframes
# Give the private tables the same plotting columns as the shared table.
# The CCF in the sample where the mutation is absent is set to 0.

private_primary_muts = private_primary_muts.assign(
    ccf_primary=private_primary_muts['ccf'],
    ccf_relapse=0,
    clonal_classification_pry=private_primary_muts['clonal_classification'],
)

private_relapse_muts = private_relapse_muts.assign(
    ccf_relapse=private_relapse_muts['ccf'],
    ccf_primary=0,
    clonal_classification_rel=private_relapse_muts['clonal_classification'],
)

In [None]:
# Stack shared and private mutations into one table for plotting.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.

dff_plot = pd.concat(
    [
        shared_muts[['ccf_primary', 'ccf_relapse', 'PATIENT','clonal_classification_pry', 'clonal_classification_rel']],
        private_primary_muts[['ccf_primary', 'ccf_relapse', 'PATIENT','clonal_classification_pry']],
        private_relapse_muts[['ccf_primary', 'ccf_relapse', 'PATIENT','clonal_classification_rel']],
    ],
    ignore_index=True,
    sort=False,
)

# Private mutations have no classification in the other sample; label them 'subclonal'.
# A dict-based fillna returns a new frame instead of relying on in-place chained assignment.
dff_plot = dff_plot.fillna({'clonal_classification_pry': 'subclonal',
                            'clonal_classification_rel': 'subclonal'})

In [None]:
# One joint plot per patient, saved into out_figures
grps = dff_plot.groupby('PATIENT')
clinical_cols = ['Patient_id','Primary_seq_id', 'Remission_seq_id', 'Relapse_seq_id',
                 'Primary_sample_type', 'Relapse_sample_type']

for g in grps.groups:

    dff_plot_pat = grps.get_group(g)
    # patient id, then the number of mutations plotted for it
    print(g, len(dff_plot_pat), sep='\n')

    # clinical rows of this patient, with missing values shown as "unknown"
    df_clinical_subset = (
        df_clinical.loc[df_clinical['Patient_id'] == g, clinical_cols]
        .reset_index()
        .fillna("unknown")
    )

    scatter_plot(dff_plot_pat, g, df_clinical_subset, out_figures)