# Run HotspotFinder

In [1]:
import os
from collections import defaultdict

import pandas as pd

In [2]:
# Project root from which the map-file, input and output paths are built.
# NOTE: while this is the empty string, paths such as f'{main_dir}/code/...'
# resolve to '/code/...' at the filesystem root — set it before running.
main_dir = ''

In [3]:
# Job map file written later in this notebook: the header lines from `info`
# followed by one hotspotfinder command per job.
# NOTE(review): the file name says 200_300 while mutations_cutoffs spans
# 100-300 — confirm the name is still accurate.
map_file = f'{main_dir}/code/hotspotfinder_subsampling_200_300.map'

In [4]:
# Subsampling design; these three values are combined into the run directory
# name '<iterations>iter_<sample_size>samples_<cutoff>muts'.
iterations = 1_000
sample_size = 100
# 100 to 300 inclusive, in steps of 25
mutations_cutoffs = list(range(100, 301, 25))

In [5]:
# Root of the subsampled inputs; each job reads
# <input_path>/<iterations>iter_<sample_size>samples_<cutoff>muts/<cancer type>/<file>.in.gz
input_path = f'{main_dir}/data/subsamples'
# Root of the results; the same per-run / per-cancer-type directory layout is
# created underneath it, together with one hotspot.conf per run directory.
output_path = f'{main_dir}/data/hotspots'

#### HotspotFinder configuration

In [6]:
# Interpolated verbatim into the samples_cutoff and mutations_cutoff entries
# of the HotspotFinder configuration, hence kept as a string.
samples_threshold = '2'
# Alternate-allele mode; only the value 'altsplit' switches splitting on.
alternates = 'altsplit'
# str() of the comparison yields exactly the text 'True' or 'False'.
split_alternates = str(alternates == 'altsplit')
split_alternates

'True'

In [7]:
# Configuration
# Each entry becomes one line of hotspot.conf; an entry that already ends in
# '\n' therefore leaves a blank line after it in the written file.

# General run settings
config_general = [
    'genome = "hg38"',
    'cores = 1',
    'output_format = "tsv.gz"',
]

# Annotation tracks, referenced through bgdata:// URLs
config_annotations = [
    'mappable_regions = "%(bgdata://genomemappability/hg38/gem_100bp)"',
    'blacklisted_regions = "%(bgdata://genomemappability/hg38/blacklist)"',
    'population_variants = "%(bgdata://populationvariants/hg38/gnomad_v3_AF1)"',
    'repeats = "%(bgdata://repeatmasker/repeats/hg38)"',
    'ig_tr_regions = "%(bgdata://gencode/ig_tr_regions/hg38)"',
    'genomic_elements = "%(bgdata://gencode/annotations/hg38)"\n',
]

# [finder] section
# NOTE(review): mutations_cutoff is filled from samples_threshold as well —
# confirm that sharing one value for both cutoffs is intended.
config_finder = [
    '[finder]\n',
    f'samples_cutoff = {samples_threshold}',
    f'mutations_cutoff = {samples_threshold}',
    'remove_nonannotated_hotspots = False',
    f'split_alternates = {split_alternates}',
    'annotate = False',
]

config = config_general + config_annotations + config_finder

#### Run HotspotFinder

In [8]:
# Header of the job map file, written before any job command.
# Entries ending in '\n' leave a blank line after their section.

# Resources requested for the jobs
params_section = [
    '[params]',
    'cores=1',
    'memory=24G\n',
]

# Shell set-up executed before the jobs: load conda, activate the environment
pre_section = [
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',
]

# The job command lines are appended right after the '[jobs]' marker
info = [*params_section, *pre_section, '[jobs]']

In [9]:
# Cancer types to run and, for each one, the signatures to run.
# Update this mapping if the mutation burden per sample is set to 600.
data_to_run = {
    'BLADDER_URI': ['SBS2', 'SBS13', 'SBS5', 'SBS1'],
    'BRCA': ['SBS5', 'SBS1', 'SBS2', 'SBS13', 'SBS3'],
    'COADREAD': [
        'SBS1', 'SBS40', 'SBS17b', 'SBS5', 'SBS18', 'SBS93', 'SBS17a',
    ],
    'ESOPHA_STOMACH': ['SBS17b', 'SBS17a', 'SBS5', 'SBS1'],
    'NSCLC': ['SBS4', 'SBS5', 'SBS40', 'SBS2', 'SBS13', 'SBS1'],
    'PROSTATE': ['SBS5', 'SBS1', 'SBS8'],
    'SKCM': ['SBS7a', 'SBS7b'],
}
# Live view of the keys, in insertion order
ctypes_to_run = data_to_run.keys()

In [11]:
# Write the job map file: the header lines from `info`, then one hotspotfinder
# command per (mutation cutoff, cancer type, signature, iteration).
# Side effects: creates the output directory tree and writes one hotspot.conf
# per mutation cutoff. Nothing is executed here; commands are only written.
with open(map_file, 'w') as ofd:
    ofd.writelines(f'{line}\n' for line in info)

    for mcutoff in mutations_cutoffs:
        # Inputs and outputs share the same per-run directory name
        run_name = f'{iterations}iter_{sample_size}samples_{mcutoff}muts'
        input_dir = f'{input_path}/{run_name}'
        output_dir = f'{output_path}/{run_name}'
        # exist_ok=True already tolerates an existing directory, so a separate
        # os.path.exists() check beforehand is unnecessary.
        os.makedirs(output_dir, exist_ok=True)

        # Write configuration (the same content is written for every cutoff)
        config_path = os.path.join(output_dir, 'hotspot.conf')
        with open(config_path, 'w') as ofd2:
            ofd2.writelines(f'{line}\n' for line in config)

        # Call HotspotFinder
        for cancertype in ctypes_to_run:
            subsampling_dir_output = os.path.join(output_dir, cancertype)
            os.makedirs(subsampling_dir_output, exist_ok=True)
            subsampling_dir_input = os.path.join(input_dir, cancertype)
            for signature in data_to_run[cancertype]:
                for i in range(iterations):
                    iteration_f = os.path.join(
                        subsampling_dir_input,
                        f'{cancertype}_iter_{i}_{signature}.in.gz',
                    )
                    # All iterations of a cancer type share one output
                    # directory; the command text is unchanged from the
                    # original, including the space before the newline.
                    ofd.write(f'hotspotfinder -i {iteration_f} -o {subsampling_dir_output} -conf {config_path} \n')