# Run HotspotFinder

Run HotspotFinder algorithm across cancer types, excluding mutations in driver regions. 
    
Identify hotspots of 2 or more mutated samples that are alternate-specific, that is, composed of the same mutation (altsplit). 

In [1]:
import os

In [2]:
# Root directory of the analysis; every path below is built from it.
main_dir = ''
# Directory holding one sub-directory of input files per data type.
input_path_main = os.path.join(main_dir, 'inputs', 'data')
# Sibling directories under 'hotspots': 'data' receives the per-run output
# directories and configuration files, 'code' receives the generated job files.
output_path_data, output_path_code = (
    os.path.join(main_dir, 'hotspots', leaf) for leaf in ('data', 'code')
)

In [3]:
# Parameters
# Value written to both `samples_cutoff` and `mutations_cutoff` in the generated
# configuration and embedded in the output directory name; it is only ever
# interpolated into text, hence a string.
samples_threshold = '2'
# Data types to process; each name is a sub-directory of the input and output paths.
data_types = ['cancertypes_filtered_nodrivers']
# Alternate-handling modes; 'altsplit' results in `split_alternates = True` in the
# configuration, any other value in `split_alternates = False`.
alternates = ['altsplit']

In [4]:
# Generate output directories and configuration files.
# One output directory containing one `hotspot.conf` is created for every
# (data type, alternate mode) combination.
for data_type in data_types:
    for alternate in alternates:
        # Only the 'altsplit' mode switches `split_alternates` on in the configuration
        split_alternates = 'True' if alternate == 'altsplit' else 'False'
        # Output
        output_path = os.path.join(output_path_data, f'{data_type}/hotspots_n{samples_threshold}_{alternate}')
        # exist_ok=True already tolerates an existing directory, so a separate
        # (race-prone) existence check beforehand is unnecessary
        os.makedirs(output_path, exist_ok=True)
        # Configuration
        config_path = os.path.join(output_path, 'hotspot.conf')
        config = [
            'genome = "hg38"',
            'cores = 1',
            'output_format = "tsv.gz"',

            'mappable_regions = "%(bgdata://genomemappability/hg38/gem_100bp)"',
            'blacklisted_regions = "%(bgdata://genomemappability/hg38/blacklist)"',
            'population_variants = "%(bgdata://populationvariants/hg38/gnomad_v3_AF1)"',
            'repeats = "%(bgdata://repeatmasker/repeats/hg38)"',
            'ig_tr_regions = "%(bgdata://gencode/ig_tr_regions/hg38)"',
            'genomic_elements = "%(bgdata://gencode/annotations/hg38)"\n',

            '[finder]\n',

            f'samples_cutoff = {samples_threshold}',
            f'mutations_cutoff = {samples_threshold}',
            'remove_nonannotated_hotspots = False',
            f'split_alternates = {split_alternates}',
            'annotate = True',
        ]
        # Every entry is terminated with a newline; entries that already end in
        # '\n' therefore leave a blank line after them in the file.
        with open(config_path, 'w') as ofd:
            ofd.writelines(f'{line}\n' for line in config)

### Run high-memory cancer types

In [13]:
# Cancer types whose jobs are written to the map file requesting 150G of memory;
# every other cancer type goes to the 24G map file. Names are compared against
# the input file name up to its first '.'.
memory_high_cancertypes = ['PANCANCER', 'SKCM', 'SKIN', 'LUNG', 'NSCLC']

In [14]:
# Job map file that will list the hotspotfinder commands for the memory-high cancer types.
output_file = os.path.join(output_path_code, 'hotspotfinder_memory_high.map')

In [15]:
# Header lines of the job map file, assembled section by section. Entries that
# end in '\n' leave a blank line after them once each entry is written with its
# own trailing newline.
info = []
# [params] section: one core and 150G of memory
info += ['[params]', 'cores=1', 'memory=150G\n']
# [pre] section: source conda's shell hook and activate the hotspots_framework environment
info += [
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',
]
# [jobs] section header; the individual commands are appended when the file is written
info.append('[jobs]')

In [16]:
# Write the job map file for the memory-high cancer types: the header lines in
# `info` followed by one hotspotfinder command per (input file, alternate mode).

# Make sure the directory that will hold the job file exists; otherwise open() fails
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
with open(output_file, 'w') as ofd:
    for line in info:
        ofd.write(f'{line}\n')
    # For each data type
    for data_type in data_types:
        input_path = os.path.join(input_path_main, data_type)
        # For each cancer type; the context manager closes the directory iterator when done
        with os.scandir(input_path) as entries:
            for entry in entries:
                # Only regular files named '*.in.gz' are inputs
                if not (entry.is_file() and entry.name.endswith('.in.gz')):
                    continue
                # The cancer type is the file name up to its first '.'
                if entry.name.split('.')[0] not in memory_high_cancertypes:
                    continue
                print(entry.name)
                # Write jobs
                for alternate in alternates:
                    output_path = os.path.join(output_path_data, f'{data_type}/hotspots_n{samples_threshold}_{alternate}')
                    config_path = os.path.join(output_path, 'hotspot.conf')
                    ofd.write(f'hotspotfinder -i {entry.path} -o {output_path} -conf {config_path} \n')

PANCANCER.filtered.nodrivers.in.gz
NSCLC.filtered.nodrivers.in.gz
SKIN.filtered.nodrivers.in.gz
LUNG.filtered.nodrivers.in.gz
SKCM.filtered.nodrivers.in.gz


### Run low-memory cancer types

In [18]:
# Job map file that will list the hotspotfinder commands for the remaining (memory-low) cancer types.
output_file = os.path.join(output_path_code, 'hotspotfinder_memory_low.map')

In [19]:
# Header lines of the job map file, assembled section by section. Entries that
# end in '\n' leave a blank line after them once each entry is written with its
# own trailing newline.
info = []
# [params] section: one core and 24G of memory
info += ['[params]', 'cores=1', 'memory=24G\n']
# [pre] section: source conda's shell hook and activate the hotspots_framework environment
info += [
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',
]
# [jobs] section header; the individual commands are appended when the file is written
info.append('[jobs]')

In [20]:
# Write the job map file for the memory-low cancer types: the header lines in
# `info` followed by one hotspotfinder command per (input file, alternate mode).

# Make sure the directory that will hold the job file exists; otherwise open() fails
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
with open(output_file, 'w') as ofd:
    for line in info:
        ofd.write(f'{line}\n')
    # For each data type
    for data_type in data_types:
        input_path = os.path.join(input_path_main, data_type)
        # For each cancer type; the context manager closes the directory iterator when done
        with os.scandir(input_path) as entries:
            for entry in entries:
                # Only regular files named '*.in.gz' are inputs
                if not (entry.is_file() and entry.name.endswith('.in.gz')):
                    continue
                # The cancer type is the file name up to its first '.';
                # memory-high cancer types are handled in their own job file
                if entry.name.split('.')[0] in memory_high_cancertypes:
                    continue
                print(entry.name)
                # Write jobs
                for alternate in alternates:
                    output_path = os.path.join(output_path_data, f'{data_type}/hotspots_n{samples_threshold}_{alternate}')
                    config_path = os.path.join(output_path, 'hotspot.conf')
                    ofd.write(f'hotspotfinder -i {entry.path} -o {output_path} -conf {config_path} \n')

HGGNOS.filtered.nodrivers.in.gz
BRAIN.filtered.nodrivers.in.gz
ACC.filtered.nodrivers.in.gz
NBL.filtered.nodrivers.in.gz
LNM.filtered.nodrivers.in.gz
LGGNOS.filtered.nodrivers.in.gz
EPM.filtered.nodrivers.in.gz
NHL.filtered.nodrivers.in.gz
PAST.filtered.nodrivers.in.gz
BOWEL.filtered.nodrivers.in.gz
PANCREAS.filtered.nodrivers.in.gz
BONE_SOFT_TISSUE.filtered.nodrivers.in.gz
KIDNEY.filtered.nodrivers.in.gz
LIVER.filtered.nodrivers.in.gz
LNET.filtered.nodrivers.in.gz
UCEC.filtered.nodrivers.in.gz
HEAD_NECK.filtered.nodrivers.in.gz
BILIARY_TRACT.filtered.nodrivers.in.gz
COADREAD.filtered.nodrivers.in.gz
ALL.filtered.nodrivers.in.gz
MPN.filtered.nodrivers.in.gz
AN.filtered.nodrivers.in.gz
ESOPHA_STOMACH.filtered.nodrivers.in.gz
SCLC.filtered.nodrivers.in.gz
ST.filtered.nodrivers.in.gz
RBL.filtered.nodrivers.in.gz
CERVIX.filtered.nodrivers.in.gz
VULVA.filtered.nodrivers.in.gz
PROSTATE.filtered.nodrivers.in.gz
PLMESO.filtered.nodrivers.in.gz
CSCC.filtered.nodrivers.in.gz
ODG.filtered.nodrive