# Check sample sizes

For each cancer type and signature, check how many mutations and samples are available to compute hotspot propensity.

In [1]:
import os
from collections import defaultdict

import pandas as pd

In [2]:
# Root directory of the project data. It is empty here, so the path below
# resolves to '/propensity/data/filtered_mutations'; set it before running.
main_dir = ''
# Directory holding the per-cancer-type filtered mutation tables.
input_path_mutations = main_dir + '/propensity/data/filtered_mutations'

In [3]:
# Signatures to evaluate for each cancer type (cancer type -> list of signature names).
# The key order drives the processing order of the loops that iterate over this dict.
data_to_run = {
    'BLADDER_URI': ['SBS2', 'SBS13', 'SBS5', 'SBS1'],
    'BRCA': ['SBS5', 'SBS1', 'SBS2', 'SBS13', 'SBS3'],
    'COADREAD': [
        'SBS1', 'SBS40', 'SBS17b', 'SBS5', 'SBS18', 'SBS93', 'SBS17a',
    ],
    'ESOPHA_STOMACH': ['SBS17b', 'SBS17a', 'SBS5', 'SBS1'],
    'NSCLC': ['SBS4', 'SBS5', 'SBS40', 'SBS2', 'SBS13', 'SBS1'],
    'PROSTATE': ['SBS5', 'SBS1', 'SBS8'],
    'SKCM': ['SBS7a', 'SBS7b'],
}

## Compute number of mutations available per sample/cancer type/signature

In [4]:
# Nested mapping: cancer type -> signature -> {sample: number of rows (mutations)
# whose SIGNATURE column equals the signature and whose PROB exceeds 0.5}.
sig_samples_dict = defaultdict(dict)
for cancertype, signatures in data_to_run.items():
    print(cancertype)
    # One tab-separated mutation table per cancer type.
    mutations = os.path.join(
        input_path_mutations,
        f'{cancertype}_SBS96_total_maxprob.originalref.hg38_1000kb_autosomes_bin.tsv',
    )
    ctype_mutations_df = pd.read_csv(mutations, sep='\t', header=0, low_memory=False)
    for signature in signatures:
        is_signature = ctype_mutations_df['SIGNATURE'] == signature
        is_confident = ctype_mutations_df['PROB'] > 0.5
        signature_mutations_df = ctype_mutations_df.loc[is_signature & is_confident].copy()
        # value_counts() only lists samples with at least one qualifying mutation.
        muts_per_sample = dict(signature_mutations_df['SAMPLE'].value_counts())
        sig_samples_dict[cancertype][signature] = muts_per_sample

BLADDER_URI
BRCA
COADREAD
ESOPHA_STOMACH
NSCLC
PROSTATE
SKCM


In [5]:
# Summary table with one row per (cancer type, signature):
#   TOTAL_SAMPLES      number of samples with at least one mutation for the signature
#   {N}_MUTS_SAMPLES   number of samples with more than N mutations for the signature
mutations_cutoffs = [100, 125, 150, 175, 200, 225, 250, 275, 300]
# Single source of truth for the cutoffs, shared by the counting loop and the column
# names so the two cannot drift apart.
all_cutoffs = mutations_cutoffs + [500, 600]

lines = []
for cancertype, signatures in data_to_run.items():
    for signature in sorted(signatures):
        muts_per_sample = sig_samples_dict[cancertype][signature]
        total = len(muts_per_sample)
        subline = [cancertype, signature, total]
        for cutoff in all_cutoffs:
            # NOTE(review): the strict '>' leaves out samples with exactly `cutoff`
            # mutations -- confirm this matches the downstream propensity computation.
            available_samples = sum(1 for m in muts_per_sample.values() if m > cutoff)
            subline.append(available_samples)
        lines.append(subline)

# Build the frame once from plain rows so each row gets a unique index label
# (concatenating one-row frames left every row labelled 0).
table = pd.DataFrame(
    lines,
    columns=['CANCER_TYPE', 'SIGNATURE', 'TOTAL_SAMPLES'] + [f'{m}_MUTS_SAMPLES' for m in all_cutoffs],
)

In [6]:
# Number of (cancer type, signature) combinations in the summary table
table.shape[0]

31

In [7]:
# Combinations where at least 100 samples have more than 300 mutations
enough_samples_300 = table['300_MUTS_SAMPLES'] >= 100
print(int(enough_samples_300.sum()))
table.loc[enough_samples_300, ['CANCER_TYPE', 'SIGNATURE', '300_MUTS_SAMPLES']]

31


Unnamed: 0,CANCER_TYPE,SIGNATURE,300_MUTS_SAMPLES
0,BLADDER_URI,SBS1,117
0,BLADDER_URI,SBS13,135
0,BLADDER_URI,SBS2,131
0,BLADDER_URI,SBS5,110
0,BRCA,SBS1,402
0,BRCA,SBS13,351
0,BRCA,SBS2,378
0,BRCA,SBS3,164
0,BRCA,SBS5,698
0,COADREAD,SBS1,473


In [8]:
# Check that propensity can be computed with larger sample sizes
print(len(table.loc[table['600_MUTS_SAMPLES'] >= 100]))
table.loc[table['600_MUTS_SAMPLES'] >= 100][['CANCER_TYPE', 'SIGNATURE', '600_MUTS_SAMPLES']]

26


Unnamed: 0,CANCER_TYPE,SIGNATURE,600_MUTS_SAMPLES
0,BLADDER_URI,SBS13,132
0,BLADDER_URI,SBS2,129
0,BRCA,SBS13,231
0,BRCA,SBS2,240
0,BRCA,SBS3,160
0,BRCA,SBS5,658
0,COADREAD,SBS1,471
0,COADREAD,SBS17b,172
0,COADREAD,SBS18,255
0,COADREAD,SBS40,243


In [9]:
# Keep the combinations with at least 100 samples above the 600-mutation cutoff
# and collect them as cancer type -> list of signatures.
subset_df = table.loc[table['600_MUTS_SAMPLES'] >= 100]
run_data_dict = defaultdict(list)
for ctype, ctype_rows in subset_df.groupby('CANCER_TYPE'):
    # unique() keeps first-appearance order, so signatures stay in table order.
    run_data_dict[ctype].extend(ctype_rows['SIGNATURE'].unique())
run_data_dict

defaultdict(list,
            {'BLADDER_URI': ['SBS13', 'SBS2'],
             'BRCA': ['SBS13', 'SBS2', 'SBS3', 'SBS5'],
             'COADREAD': ['SBS1', 'SBS17b', 'SBS18', 'SBS40', 'SBS5', 'SBS93'],
             'ESOPHA_STOMACH': ['SBS1', 'SBS17a', 'SBS17b', 'SBS5'],
             'NSCLC': ['SBS13', 'SBS2', 'SBS4', 'SBS40', 'SBS5'],
             'PROSTATE': ['SBS1', 'SBS5', 'SBS8'],
             'SKCM': ['SBS7a', 'SBS7b']})