# Run subsampling

For a given cancer type and signature, generate N random subsamples, each with a fixed number of tumours and a fixed number of mutations per tumour. Only mutations attributed to the signature by maximum likelihood (with a probability of at least 0.5 of having been generated by that signature) are used.

In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd

## Compute propensity for 100 samples and 100-300 mutations/sample and signature

In [15]:
# Settings forwarded to every downsampling job in this section.
iterations = 1000                               # passed to the script as -i
sample_size = 100                               # passed to the script as -s
mutations_cutoffs = list(range(100, 301, 25))   # 100, 125, ..., 300 (passed one at a time as -mc)

In [16]:
# Root directory that every path below is built from.
# NOTE(review): left empty here, so the derived paths start at '/' — set it
# to the project root before running.
main_dir = ""

In [17]:
# Directory holding the per-cancer-type mutation tables (note the trailing slash).
input_path_mutations = "{}/data/filtered_mutations/".format(main_dir)

In [18]:
# Job map file written by this section; its name encodes the sample size.
map_file = "{}/code/3_downsampling_{}samples_100-300muts.map".format(main_dir, sample_size)

In [19]:
# Script that each generated job line invokes with `python`.
code_f = "{}/code/downsampling_signature.py".format(main_dir)

In [20]:
# Base directory under which the `subsamples/` output folders are created.
output_path = "{}/data".format(main_dir)

In [21]:
# Header lines of the job map file, one entry per written line.
# Entries that already end in '\n' produce a blank separator line, because
# the writer appends another '\n' to every entry.
info = (
    # resources requested for each job
    ["[params]", "cores=1", "memory=24G\n"]
    # shell commands placed in the [pre] section: activate the conda environment
    + [
        "[pre]",
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        "conda activate hotspots_framework\n",
    ]
    # the job command lines are appended after this marker
    + ["[jobs]"]
)

In [26]:
# Cancer type -> signatures to subsample. One job line is written per
# (cancer type, signature) pair, in this insertion order.
data_to_run_new = {
    "SKCM": ["SBS7a", "SBS7b"],
    "BLADDER_URI": ["SBS2", "SBS13", "SBS5", "SBS1"],
    "BRCA": ["SBS5", "SBS1", "SBS2", "SBS13", "SBS3"],
    "COADREAD": [
        "SBS1", "SBS40", "SBS17b", "SBS5", "SBS18", "SBS93", "SBS17a",
    ],
    "ESOPHA_STOMACH": ["SBS17b", "SBS17a", "SBS5", "SBS1"],
    "NSCLC": ["SBS4", "SBS5", "SBS40", "SBS2", "SBS13", "SBS1"],
    "PROSTATE": ["SBS5", "SBS1", "SBS8"],
}

In [27]:
# Write the job map file: first the header lines from `info`, then one
# `python downsampling_signature.py ...` command per
# (mutation cutoff, cancer type, signature) combination.
with open(map_file, 'w') as ofd:
    # Each header entry becomes its own line.
    ofd.writelines(f'{line}\n' for line in info)

    for mcutoff in mutations_cutoffs:
        # One output directory per cutoff, shared by all jobs using that cutoff.
        output_dir = f'{output_path}/subsamples/{iterations}iter_{sample_size}samples_{mcutoff}muts'
        # exist_ok=True already tolerates an existing directory, so a separate
        # (race-prone) os.path.exists() check is unnecessary.
        os.makedirs(output_dir, exist_ok=True)

        for cancertype, signatures in data_to_run_new.items():
            # Input mutation table for this cancer type.
            mutations = os.path.join(input_path_mutations, f'{cancertype}_SBS96_total_maxprob.originalref.hg38_1000kb_autosomes_bin.tsv')
            for signature in signatures:
                ofd.write(f'python {code_f} -c {cancertype} -sig {signature} -m {mutations} -o {output_dir} -i {iterations} -s {sample_size} -mc {mcutoff} \n')

## Compute propensity for 100 samples and 600 mutations/sample and signature

In [35]:
# Settings for the 600-mutation run: passed to the script as -i and -s.
iterations, sample_size = 1000, 100
# A single cutoff this time (passed as -mc).
mutations_cutoffs = [600]

In [36]:
# Root directory prefix for all paths in this section.
# NOTE(review): empty as committed, which makes the derived paths start at
# '/' — fill in the project root before running.
main_dir = ""

In [37]:
# Folder with the per-cancer-type mutation tables (keeps its trailing slash).
input_path_mutations = "{root}/data/filtered_mutations/".format(root=main_dir)

In [42]:
# Job map file for the 600-mutation run; the sample size is part of the name.
map_file = "{root}/code/3_downsampling_{n}samples_600muts.map".format(root=main_dir, n=sample_size)

In [43]:
# Downsampling script that every job line runs with `python`.
code_f = "{root}/code/downsampling_signature.py".format(root=main_dir)

In [44]:
# Parent directory of the `subsamples/` output folders.
output_path = "{root}/data".format(root=main_dir)

In [45]:
# Header of the job map file, assembled section by section. The writer adds
# '\n' after every entry, so entries that already end in '\n' leave a blank
# line after their section.
info = []
# [params]: resources requested per job
info.extend(["[params]", "cores=1", "memory=24G\n"])
# [pre]: shell commands that activate the conda environment
info.extend([
    "[pre]",
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    "conda activate hotspots_framework\n",
])
# [jobs]: the command lines follow this marker
info.append("[jobs]")

In [46]:
# Signatures to subsample for each cancer type in the 600-mutation run.
# Job lines are emitted in this insertion order.
data_to_run_new = {
    "BLADDER_URI": ["SBS13", "SBS2"],
    "BRCA": ["SBS13", "SBS2", "SBS3", "SBS5"],
    "COADREAD": ["SBS1", "SBS17b", "SBS18", "SBS40", "SBS5", "SBS93"],
    "ESOPHA_STOMACH": ["SBS1", "SBS17a", "SBS17b", "SBS5"],
    "NSCLC": ["SBS13", "SBS2", "SBS4", "SBS40", "SBS5"],
    "PROSTATE": ["SBS1", "SBS5", "SBS8"],
    "SKCM": ["SBS7a", "SBS7b"],
}

In [47]:
# Write the job map file for the 600-mutation run: the header lines from
# `info` come first, followed by one downsampling command per
# (mutation cutoff, cancer type, signature) combination.
with open(map_file, 'w') as ofd:
    # Each header entry becomes its own line.
    ofd.writelines(f'{line}\n' for line in info)

    for mcutoff in mutations_cutoffs:
        # Output directory named after the iteration count, sample size and cutoff.
        output_dir = f'{output_path}/subsamples/{iterations}iter_{sample_size}samples_{mcutoff}muts'
        # makedirs(exist_ok=True) is a no-op when the directory exists, so the
        # preceding os.path.exists() check was redundant and has been dropped.
        os.makedirs(output_dir, exist_ok=True)

        for cancertype, signatures in data_to_run_new.items():
            # Input mutation table for this cancer type.
            mutations = os.path.join(input_path_mutations, f'{cancertype}_SBS96_total_maxprob.originalref.hg38_1000kb_autosomes_bin.tsv')
            for signature in signatures:
                ofd.write(f'python {code_f} -c {cancertype} -sig {signature} -m {mutations} -o {output_dir} -i {iterations} -s {sample_size} -mc {mcutoff} \n')