This notebook generates the figure in Additional file 3, Figure S1b. For each sample, it shows the number of mutations that are kept and the number that are filtered out as potential SNPs.


This piece of code relies on a workspace directory structure such as:
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
 `patientID`, `DxTumorID`, etc. can be found in `../ext_files/all_cohort_clinical_groups.tsv`.
 
Be aware that the MAF files with the joined mutations produced by step 6 of the filtering have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories.maf```. Each of these MAF files already has the gnomAD population frequencies added, but has not yet been filtered on them.
This file name is used in the following code.

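`PATS_DIRS` is a dictionary that maps each patient ID to the directory containing that patient's folder (the `cohort/` level in the structure above); the MAF files are read from `PATS_DIRS[patientID]/patientID/TumorID_vs_normalID/`.  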

In [None]:
import sys, os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from collections import OrderedDict
from aux_data_in_pyvar import config_rcparams, PATS_DIRS
from aux_functions import stage_mapping

In [None]:
def individual_filter(in_path, pat, com, out_path):
    """Count kept mutations and potential SNPs for one tumor-vs-normal comparison.

    Reads ``<in_path>/<pat>/<com>/<com>_strelka_uniq_all_anno_vep92_categories.maf``
    (tab-separated) and counts the distinct values of the ``Variant`` column,
    split by the ``AF_less_0.01`` flag.

    Parameters
    ----------
    in_path : str
        Directory that contains the patient folder ``pat``.
    pat : str
        Patient identifier (name of the patient folder).
    com : str
        Comparison identifier (name of the comparison folder and prefix of the
        MAF file name).
    out_path : str
        Unused; kept so existing callers do not break.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns ``PATIENT``, ``COMPARISON``, ``SNPs``
        (distinct variants flagged ``'no'``) and ``Keep`` (distinct variants
        flagged ``'yes'``).
    """
    in_file = os.path.join(in_path, pat, com, com+'_strelka_uniq_all_anno_vep92_categories.maf')
    df = pd.read_csv(in_file, sep='\t')
    # The dot in the original column name is awkward to work with; use a plain name.
    df = df.rename(columns={'AF_less_0.01':'AF_less_001'})

    # dropna=False keeps the same count as len(set(series.unique())), which
    # counts a missing Variant as one distinct value.
    num_snps = df.loc[df['AF_less_001'] == 'no', 'Variant'].nunique(dropna=False)
    num_keep = df.loc[df['AF_less_001'] == 'yes', 'Variant'].nunique(dropna=False)

    # Build the one-row frame directly: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame([{
        "PATIENT": pat,
        "COMPARISON": com,
        'SNPs': int(num_snps),
        'Keep': int(num_keep),
    }])

In [None]:
# Output directory placeholder (fill in before running); it is passed to
# individual_filter() as its out_path argument.
# NOTE(review): presumably also the intended destination of the saved figure — confirm.
output_path = ""

In [None]:
# Clinical table: Additional file 2 Table S1 from the paper (fill in the path).
# The raw table is passed straight through stage_mapping.
# NOTE(review): stage_mapping presumably provides the STAGE column used later — confirm.
clinical = stage_mapping(
    pd.read_csv("", sep='\t')
)

In [None]:
# One (PATIENT, COMPARISON) pair per tumor-vs-normal comparison in the clinical table.
grps = clinical.groupby(by=['PATIENT', 'COMPARISON'])

# Collect the one-row count frames and concatenate once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic.
count_frames = [
    individual_filter(PATS_DIRS[patient], patient, comparison, output_path)
    for patient, comparison in grps.groups
]

if count_frames:
    counts = pd.concat(count_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty frame with the expected columns.
    counts = pd.DataFrame(columns=['PATIENT', 'COMPARISON', 'SNPs', 'Keep'])

In [None]:
# Attach the stage of every comparison and order the rows by patient number, then stage.
# Columns added by a previous run of this cell are dropped first, so the cell can
# be re-run: merging twice would otherwise create STAGE_x / STAGE_y and break the sort.
counts = (
    counts
    .drop(columns=['STAGE', 'SORTER'], errors='ignore')
    .merge(clinical[['COMPARISON', 'STAGE']], how='left', on='COMPARISON')
)
# Numeric part of the patient ID ("PAT12" -> 12) so patients sort numerically, not lexically.
counts['SORTER'] = counts['PATIENT'].apply(lambda x: int(x.replace("PAT", "")))
counts = counts.sort_values(by=['SORTER', 'STAGE']).reset_index(drop=True)

In [None]:
# Stacked bar plot: for every patient, one bar per stage (primary, relapse) showing
# the kept mutations (bottom) and the mutations filtered out as SNPs (top).
BAR_WIDTH = 1
KEEP_COLOR = '#678796'
SNP_COLOR = '#bababa'
STAGES = ('primary', 'relapse')
SLOTS_PER_PATIENT = len(STAGES) + 1  # one slot per stage plus an empty spacer

fig, ax = plt.subplots(figsize=(25, 10))

tick_labels = []
slot = 0
# sort=False keeps the row order established when `counts` was sorted.
for patient, patient_counts in counts.groupby("PATIENT", sort=False):
    for offset, stage in enumerate(STAGES):
        stage_rows = patient_counts[patient_counts['STAGE'] == stage]
        if stage_rows.empty:
            # Not every patient has a relapse sample: leave the slot empty
            # instead of failing on a missing row.
            tick_labels.append('')
            continue
        row = stage_rows.iloc[0]
        ax.bar(slot + offset, row['Keep'], color=KEEP_COLOR, edgecolor='white',
               width=BAR_WIDTH, label='keep')
        ax.bar(slot + offset, row['SNPs'], bottom=row['Keep'], color=SNP_COLOR,
               edgecolor='white', width=BAR_WIDTH, label='filter out SNP')
        tick_labels.append(patient + '_' + stage)
    tick_labels.append('')  # spacer between patients
    slot += SLOTS_PER_PATIENT

# Custom axis
ax.set_xticks(list(range(len(tick_labels))))
ax.set_xticklabels(tick_labels, rotation=45, fontsize=18, ha='right')
ax.set_xlabel("Patients", fontsize=22, labelpad=24)
ax.set_ylabel("Mutations", fontsize=22, labelpad=24)
ax.tick_params(axis='both', which='major', labelsize=18)

# Legend: every bar carries a label, so keep one handle per distinct label.
handles, legend_labels = ax.get_legend_handles_labels()
legend_entries = OrderedDict(zip(legend_labels, handles))
ax.legend(list(legend_entries.values()), list(legend_entries.keys()),
          prop={'size': 20}, bbox_to_anchor=(1, 1))

fig.tight_layout()

# `output_path` is the notebook-level output directory (`out_path` is not defined
# outside individual_filter and raised a NameError here).
fig.savefig(os.path.join(output_path, "barplot_snps_filter.svg"), dpi=300, bbox_inches='tight')
plt.show()