# Split mutations

For each cancer type with one or more hotspots of equal alternate (alternate-specific hotspots), read the file containing somatic mutations and the file containing hotspots. Then classify each mutation as inside hotspots (overlapping a hotspot) or outside hotspots (not overlapping any hotspot).

Two files are generated per cancer type: 
* [CANCERTYPE].mutations_in_hotspots.gz       
* [CANCERTYPE].mutations_out_hotspots.gz


In [1]:
import os

In [2]:
# Base directory of the project. An empty string makes every path below relative.
main_dir = ''

# This step keeps its data and its code in sibling folders under 'split_mutations'.
output_path_data, output_path_code = (
    os.path.join(main_dir, 'split_mutations', subdir) for subdir in ('data', 'code')
)

# The script run by each job and the job map file both live in the code folder.
code_file, map_file = (
    os.path.join(output_path_code, filename)
    for filename in ('split_mutations.py', 'split_mutations.map')
)

In [3]:
# Parameters
# These strings select the input/output directories and the input file names built below.

# Sub-directory name shared by the mutations, hotspots and output trees
data_type = 'cancertypes_filtered_nodrivers'
# Appended to the cancer type to form the mutations file name
suffix = '.filtered.nodrivers.in.gz'

# Both values are embedded in the directory name 'hotspots_n<threshold>_<alternates>'.
# NOTE(review): presumably the minimum number of mutated samples per hotspot - confirm.
samples_threshold = '2'
alternates = 'altsplit'

In [4]:
## Input data

# Directory holding the mutations files, one per cancer type
# (file names are '<cancertype><suffix>', see the map-writing cell).
input_path_mutations = os.path.join(main_dir, 'inputs', 'data', data_type)

# Directory holding the hotspots results for the selected data type,
# samples threshold and alternates mode.
hotspots_subdir = f'{data_type}/hotspots_n{samples_threshold}_{alternates}'
input_path_hotspots = os.path.join(main_dir, 'hotspots', 'data', hotspots_subdir)

In [5]:
# Create the output directory (and any missing parents) for this
# data type / threshold / alternates combination.
output_path = os.path.join(output_path_data, f'{data_type}/hotspots_n{samples_threshold}_{alternates}')
# exist_ok=True already makes this a no-op when the directory is present, so no
# separate os.path.exists() check is needed (that check was also racy).
# If a regular file occupies the path, this now fails here instead of later in the jobs.
os.makedirs(output_path, exist_ok=True)

In [6]:
# Header lines of the job map file, grouped by section.
# Each entry is written followed by a newline, so the entries that already end
# in '\n' leave a blank line after their section.
# NOTE(review): the section layout presumably follows the job runner's map format - confirm.
info = (
    # Resources requested for the jobs
    ['[params]', 'cores=1', 'memory=50G\n']
    # Shell commands that activate the conda environment
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # The job commands are appended after this marker
    + ['[jobs]']
)

In [7]:
# Write the job map file: the header lines first, then one command line per
# cancer type that has a hotspots results file.

# Cancer types without hotspots; no job is written for them.
cancertypes_without_hotspots = {'RBL'}

with open(map_file, 'w') as ofd:
    ofd.writelines(f'{line}\n' for line in info)

    # os.scandir() yields entries in arbitrary order. Sorting by file name keeps
    # the map file identical between runs, and the context manager closes the
    # directory iterator even if an error occurs while listing.
    with os.scandir(input_path_hotspots) as entries:
        hotspot_files = sorted(
            (
                entry for entry in entries
                if entry.is_file() and entry.name.endswith('.results.tsv.gz')
            ),
            key=lambda entry: entry.name,
        )

    for entry in hotspot_files:
        # The cancer type is the part of the file name before the first dot.
        cancertype = entry.name.split(".")[0]
        if cancertype in cancertypes_without_hotspots:
            continue
        mutations = os.path.join(input_path_mutations, f'{cancertype}{suffix}')
        hotspots = entry.path
        ofd.write(
            f'python {code_file} -c {cancertype} -m {mutations} -h {hotspots} -o {output_path}  \n'
        )