The output figure corresponds to Additional file 3, Figure S1a. The barplot shows the number of mutations recovered ("fished") when searching in the paired tumor sample.

This piece of code relies on a workspace directory structure such as 
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
 patientID, DxTumorID etc can be found in ../ext_files/all_cohort_clinical_groups.tsv
 
Be aware that, after running step 6 of the filtering, the filtered MAFs with joined mutations have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories_filt_snps.maf```.
This file name is used in the following code.

PATS_DIRS is a dictionary that maps each patient ID to the path of the folder where that patient's MAF files are.  

In [None]:
import pandas as pd
import numpy as np
import glob
import os

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from collections import OrderedDict

from aux_data_in_pyvar import config_rcparams, PATS_DIRS

# Show every column and every row, and never truncate cell contents, when
# DataFrames are displayed in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit". The legacy value -1 was deprecated in pandas 1.0 and
# is rejected by pandas >= 2.0.
pd.set_option('display.max_colwidth', None)

# IPython magics (only valid inside a notebook/IPython session): load the
# autoreload extension and switch it to mode 2, so that edited modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Call the plotting configuration helper imported from aux_data_in_pyvar
# (presumably it updates matplotlib's rcParams -- confirm in that module).
config_rcparams()

In [None]:
# Directory where the output figure is written. Defaults to the current
# working directory; replace with the desired output folder before running.
out_path = "."

In [None]:
# Clinical table of the cohort (Additional file 2 Table S1 from the paper).
df_clinical = pd.read_csv("", sep='\t')

# Numeric part of the patient identifier ("PAT" prefix removed), used only to
# order the patients numerically rather than alphabetically.
df_clinical['num'] = [
    int(patient_id.replace("PAT", "")) for patient_id in df_clinical['Patient_id']
]
df_clinical = df_clinical.sort_values(by='num', ascending=True)
df_clinical.head()

In [None]:
# Two stacked panels: SNVs on top, indels below.
fig, axs = plt.subplots(
    nrows=2,
    ncols=1,
    figsize=(25, 15),
    frameon=False,
    gridspec_kw=dict(wspace=0.05, hspace=0.5),
)
barWidth = 1

# Per-patient counts, one entry per patient in df_clinical order.
# Naming: bar_<pass|fished>_standard_<snvs|indels>_<pry|rel>, where "pry" is
# the primary sample and "rel" the relapse sample.
bar_pass_standard_snvs_pry, bar_fished_standard_snvs_pry = [], []
bar_pass_standard_snvs_rel, bar_fished_standard_snvs_rel = [], []
bar_pass_standard_indels_pry, bar_fished_standard_indels_pry = [], []
bar_pass_standard_indels_rel, bar_fished_standard_indels_rel = [], []

# x tick labels: three entries per patient (primary, relapse, empty spacer).
samples = []

# Suffix of the filtered MAF with joined mutations (see the notes at the top
# of the notebook).
_MAF_SUFFIX = '_strelka_uniq_all_anno_vep92_categories_filt_snps.maf'


def _read_joined_maf(maf_dir, patient, comparison):
    """Read the filtered MAF of one comparison and tag its rows.

    The file is read from
    ``maf_dir/patient/comparison/<comparison><_MAF_SUFFIX>`` as a
    tab-separated table. A constant ``subset_origin`` column with the value
    ``'tumor_vs_normal'`` is added to every row.
    """
    maf_file = os.path.join(maf_dir, patient, comparison, comparison + _MAF_SUFFIX)
    df_all = pd.read_csv(maf_file, sep='\t')
    df_all['subset_origin'] = 'tumor_vs_normal'
    return df_all


def _subset_with_filter(df_all, mut_type):
    """Return the rows of one ``mut_type`` with a two-level ``filter`` column.

    ``filter`` is ``'PASS'`` when the caller's ``FILTER`` value is ``'PASS'``
    or ``'DP'`` and ``'FISHED'`` otherwise. The subset is an explicit copy so
    that adding the column neither touches ``df_all`` nor triggers pandas'
    SettingWithCopyWarning.
    """
    subset = df_all[df_all['mut_type'] == mut_type].copy()
    subset['filter'] = np.where(subset['FILTER'].isin(['PASS', 'DP']), 'PASS', 'FISHED')
    return subset


def _count_filter(df, label):
    """Count the tumor-vs-normal rows of ``df`` whose ``filter`` equals ``label``."""
    return len(df[(df['filter'] == label) & (df['subset_origin'] == 'tumor_vs_normal')])


for pat in df_clinical['Patient_id'].tolist():

    pat_clinical = df_clinical[df_clinical['Patient_id'] == pat].reset_index()

    # comparison names (<tumor>_vs_<normal>); the remission sample fills the
    # normal slot for both the primary and the relapse comparison
    com_pry = pat_clinical.loc[0, 'Primary_seq_id']+'_vs_'+pat_clinical.loc[0, 'Remission_seq_id']
    com_rel = pat_clinical.loc[0, 'Relapse_seq_id']+'_vs_'+pat_clinical.loc[0, 'Remission_seq_id']

    maf_path = PATS_DIRS[pat]

    # read files
    df_all_pry = _read_joined_maf(maf_path, pat, com_pry)
    df_all_rel = _read_joined_maf(maf_path, pat, com_rel)

    # split by mutation type and add the PASS/FISHED category
    df_snvs_joined_pry = _subset_with_filter(df_all_pry, 'snv')
    df_indels_joined_pry = _subset_with_filter(df_all_pry, 'indels')
    df_snvs_joined_rel = _subset_with_filter(df_all_rel, 'snv')
    df_indels_joined_rel = _subset_with_filter(df_all_rel, 'indels')

    # primary
    bar_pass_standard_snvs_pry.append(_count_filter(df_snvs_joined_pry, 'PASS'))
    bar_pass_standard_indels_pry.append(_count_filter(df_indels_joined_pry, 'PASS'))
    bar_fished_standard_snvs_pry.append(_count_filter(df_snvs_joined_pry, 'FISHED'))
    bar_fished_standard_indels_pry.append(_count_filter(df_indels_joined_pry, 'FISHED'))

    samples.append(pat+'_primary')

    # relapse
    bar_pass_standard_snvs_rel.append(_count_filter(df_snvs_joined_rel, 'PASS'))
    bar_pass_standard_indels_rel.append(_count_filter(df_indels_joined_rel, 'PASS'))
    bar_fished_standard_snvs_rel.append(_count_filter(df_snvs_joined_rel, 'FISHED'))
    bar_fished_standard_indels_rel.append(_count_filter(df_indels_joined_rel, 'FISHED'))

    samples.append(pat+'_relapse')
    # empty label for the spacer slot that separates consecutive patients
    samples.append('')
    
    
ax1 = axs[0]
ax2 = axs[1]


def _draw_stacked_pairs(ax, pass_pry, fished_pry, pass_rel, fished_rel, width):
    """Draw one primary and one relapse stacked bar per patient on ``ax``.

    Each patient takes three consecutive x slots: the primary bar at ``x``,
    the relapse bar at ``x + 1`` and an empty slot at ``x + 2``. In every bar
    the PASS count sits at the bottom and the FISHED count is stacked on top.

    Returns the x position following the last patient, i.e. three times the
    number of patients drawn.
    """
    x = 0
    for n_pass_pry, n_fished_pry, n_pass_rel, n_fished_rel in zip(
            pass_pry, fished_pry, pass_rel, fished_rel):
        # primary column
        ax.bar(x, n_pass_pry, color='#b2182b', edgecolor='white',
               width=width, label='PASS')
        ax.bar(x, n_fished_pry, color='#f4a582', edgecolor='white',
               width=width, bottom=n_pass_pry, label='FISHED')

        # relapse column
        ax.bar(x + 1, n_pass_rel, color='#b2182b', edgecolor='white',
               width=width, label='PASS')
        ax.bar(x + 1, n_fished_rel, color='#f4a582', edgecolor='white',
               width=width, bottom=n_pass_rel, label='FISHED')

        x = x + 3
    return x


# SNVs on the top panel
j = _draw_stacked_pairs(ax1,
                        bar_pass_standard_snvs_pry, bar_fished_standard_snvs_pry,
                        bar_pass_standard_snvs_rel, bar_fished_standard_snvs_rel,
                        barWidth)

# indels on the bottom panel; j ends as the total number of x slots
j = _draw_stacked_pairs(ax2,
                        bar_pass_standard_indels_pry, bar_fished_standard_indels_pry,
                        bar_pass_standard_indels_rel, bar_fished_standard_indels_rel,
                        barWidth)
  
    
# Custom axis: one tick per x slot, labelled with the entries of `samples`
tick_positions = list(range(j))
for axis, y_label in ((ax1, "SNVs"), (ax2, "InDels")):
    axis.set_xticks(tick_positions)
    axis.set_xticklabels(labels=samples, ha='right')
    axis.tick_params(axis='x', which='major', labelsize=18, rotation=45)
    axis.tick_params(axis='y', which='major', labelsize=18)
    axis.set_ylabel(y_label, fontsize=18)
    axis.set_xlim((-0.5, (len(samples) - 0.5)))

ax1.set_title("Variants", {'fontsize': 25}, pad=25)

# Fixed y ranges for the two panels
ax1.set_ylim((0, 10500))
ax2.set_ylim(0, 30000)

# Legend: every bar carries a 'PASS' or 'FISHED' label, so keep a single
# handle per label to avoid repeated legend entries
handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict()
for label, handle in zip(labels, handles):
    by_label[label] = handle
plt.legend(by_label.values(), by_label.keys(), prop={'size': 20}, bbox_to_anchor=(1, 2.5))

plt.tight_layout()

# Write the figure as SVG into out_path, then display it
figure_file = os.path.join(out_path, "after_filters.svg")
fig.savefig(figure_file, dpi=150, bbox_inches='tight', format="svg")

plt.show()