# Run the input reformatting for SigProfilerMatrixGenerator

SigProfilerMatrixGenerator requires its input files to be in a specific format.

For the mutation files containing 1) all mutations, 2) mutations inside hotspots and 3) mutations outside hotspots, write the job files that reformat them accordingly.

In [1]:
import os
import gzip 

from collections import defaultdict

import json
import pandas as pd

### 1) Input cancer type files with all mutations (inside and outside hotspots)

In [2]:
# Root directory from which the input and output paths below are built.
# NOTE(review): left empty here - presumably meant to be set to the local
# project path before running; confirm.
main_dir = ''

In [3]:
# Directory scanned below for the per-project '*.filtered.nodrivers.in.gz'
# input files.
data_dir = os.path.join(main_dir, 'inputs', 'data', 'cancertypes_filtered_nodrivers')

In [4]:
# Base directory under which the reformatting script, the job map file and
# the output directories are located.
matrixgen_dir = os.path.join(main_dir, 'signatures', 'sigprofilermatrixgenerator')

In [5]:
# Script that every job line written to the map file invokes.
code = os.path.join(matrixgen_dir, 'code', 'reformat_input_sigprofilermatrixgenerator.py')
# Job map file generated by this section.
map_file = os.path.join(matrixgen_dir, 'code', '1_reformat_input_sigprofilermatrixgenerator.map')
# Output root; one sub-directory per project is created under it.
output_dir = os.path.join(matrixgen_dir, 'output', 'mutations_total')

In [6]:
# Header of the job map file. The entries are written one per line ahead of
# the job commands, so an entry that already ends in '\n' leaves a blank
# line after it.
info = (
    # [params] section: cores and memory settings
    ['[params]', 'cores=1', 'memory=8G\n']
    # [pre] section: source conda.sh and activate the hotspots_framework env
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # [jobs] marker: the job command lines are written after it
    + ['[jobs]']
)

In [7]:
# Write the job map file: the header lines first, then one reformatting
# command per per-project input file found in data_dir. The per-project
# output directories are created as a side effect.
with open(map_file, 'w') as ofd:

    # One header entry per line; entries ending in '\n' leave a blank line.
    ofd.writelines(f'{line}\n' for line in info)

    # Close the directory iterator promptly and sort by file name so the job
    # order in the map file is reproducible (scandir order is arbitrary).
    with os.scandir(data_dir) as entries:
        input_files = sorted(entries, key=lambda entry: entry.name)

    for entry in input_files:
        if not entry.name.endswith('.filtered.nodrivers.in.gz'):
            continue

        # The project name is the part of the file name before the first dot.
        project = entry.name.split('.')[0]
        if project == 'PANCANCER':
            continue

        project_output_directory = os.path.join(output_dir, project)
        output_file = os.path.join(project_output_directory, f'{project}.txt')
        # exist_ok=True already tolerates an existing directory, so no
        # separate existence check is needed.
        os.makedirs(project_output_directory, exist_ok=True)

        ofd.write(f'python {code} -i {entry.path} -o {output_file}\n')

### 2) Input cancer type files with mutations IN HOTSPOTS

In [8]:
# Root directory from which the paths of this section are built.
# NOTE(review): re-assigned here, presumably so the section can be run on
# its own; left empty - confirm the intended project path.
main_dir = ''

In [9]:
# data_type and hotspot_type are the two last components of the input
# directory path built below.
data_type = 'cancertypes_filtered_nodrivers'
hotspot_type = 'hotspots_n2_altsplit'
# Selects the input files ('.mutations_<location>_hotspots.gz') and is part
# of the map file name.
location = 'in'

In [10]:
# Directory holding the split mutation files for this data type and hotspot
# type. Built with os.path.join, like the other paths in this notebook, so
# that an empty main_dir yields a path relative to the working directory
# instead of one anchored at the filesystem root ('/split_mutations/...').
data_dir = os.path.join(main_dir, 'split_mutations', 'data', data_type, hotspot_type)

In [11]:
# Base directory under which the reformatting script, the job map file and
# the output directories are located.
matrixgen_dir = os.path.join(main_dir, 'signatures', 'sigprofilermatrixgenerator')

In [12]:
# Script that every job line written to the map file invokes.
code = os.path.join(matrixgen_dir, 'code', 'reformat_input_sigprofilermatrixgenerator.py')
# Job map file generated by this section; its name carries the location.
map_file = os.path.join(matrixgen_dir, 'code', f'1_reformat_input_sigprofilermatrixgenerator_{location}hotspots.map')
# Output root; one sub-directory per project is created under it. Derived
# from `location` (as map_file is) rather than hard-coding 'in', so the
# two names cannot drift apart.
output_dir = os.path.join(matrixgen_dir, 'output', f'mutations_{location}_hotspots')

In [13]:
# Header of the job map file. The entries are written one per line ahead of
# the job commands, so an entry that already ends in '\n' leaves a blank
# line after it.
info = (
    # [params] section: cores and memory settings
    ['[params]', 'cores=1', 'memory=8G\n']
    # [pre] section: source conda.sh and activate the hotspots_framework env
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # [jobs] marker: the job command lines are written after it
    + ['[jobs]']
)

In [14]:
# Write the job map file: the header lines first, then one reformatting
# command per per-project input file for the selected location found in
# data_dir. The per-project output directories are created as a side effect.
with open(map_file, 'w') as ofd:

    # One header entry per line; entries ending in '\n' leave a blank line.
    ofd.writelines(f'{line}\n' for line in info)

    # Close the directory iterator promptly and sort by file name so the job
    # order in the map file is reproducible (scandir order is arbitrary).
    with os.scandir(data_dir) as entries:
        input_files = sorted(entries, key=lambda entry: entry.name)

    for entry in input_files:
        if not entry.name.endswith(f'.mutations_{location}_hotspots.gz'):
            continue

        # The project name is the part of the file name before the first dot.
        project = entry.name.split('.')[0]
        # Skip pancancer and RBL (no hotspots)
        if project in ('PANCANCER', 'RBL'):
            continue

        project_output_directory = os.path.join(output_dir, project)
        output_file = os.path.join(project_output_directory, f'{project}.txt')
        # exist_ok=True already tolerates an existing directory, so no
        # separate existence check is needed.
        os.makedirs(project_output_directory, exist_ok=True)

        ofd.write(f'python {code} -i {entry.path} -o {output_file}\n')

### 3) Input cancer type files with mutations OUTSIDE HOTSPOTS

In [15]:
# Root directory from which the paths of this section are built.
# NOTE(review): re-assigned here, presumably so the section can be run on
# its own; left empty - confirm the intended project path.
main_dir = ''

In [16]:
# data_type and hotspot_type are the two last components of the input
# directory path built below.
data_type = 'cancertypes_filtered_nodrivers'
hotspot_type = 'hotspots_n2_altsplit'
# Selects the input files ('.mutations_<location>_hotspots.gz') and is part
# of the map file name.
location = 'out'

In [17]:
# Directory holding the split mutation files for this data type and hotspot
# type. Built with os.path.join, like the other paths in this notebook, so
# that an empty main_dir yields a path relative to the working directory
# instead of one anchored at the filesystem root ('/split_mutations/...').
data_dir = os.path.join(main_dir, 'split_mutations', 'data', data_type, hotspot_type)

In [18]:
# Base directory under which the reformatting script, the job map file and
# the output directories are located.
matrixgen_dir = os.path.join(main_dir, 'signatures', 'sigprofilermatrixgenerator')

In [19]:
# Script that every job line written to the map file invokes.
code = os.path.join(matrixgen_dir, 'code', 'reformat_input_sigprofilermatrixgenerator.py')
# Job map file generated by this section; its name carries the location.
map_file = os.path.join(matrixgen_dir, 'code', f'1_reformat_input_sigprofilermatrixgenerator_{location}hotspots.map')
# Output root; one sub-directory per project is created under it. Derived
# from `location` (as map_file is) rather than hard-coding 'out', so the
# two names cannot drift apart.
output_dir = os.path.join(matrixgen_dir, 'output', f'mutations_{location}_hotspots')

In [20]:
# Header of the job map file. The entries are written one per line ahead of
# the job commands, so an entry that already ends in '\n' leaves a blank
# line after it.
info = (
    # [params] section: cores and memory settings
    ['[params]', 'cores=1', 'memory=8G\n']
    # [pre] section: source conda.sh and activate the hotspots_framework env
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # [jobs] marker: the job command lines are written after it
    + ['[jobs]']
)

In [21]:
# Write the job map file: the header lines first, then one reformatting
# command per per-project input file for the selected location found in
# data_dir. The per-project output directories are created as a side effect.
with open(map_file, 'w') as ofd:

    # One header entry per line; entries ending in '\n' leave a blank line.
    ofd.writelines(f'{line}\n' for line in info)

    # Close the directory iterator promptly and sort by file name so the job
    # order in the map file is reproducible (scandir order is arbitrary).
    with os.scandir(data_dir) as entries:
        input_files = sorted(entries, key=lambda entry: entry.name)

    for entry in input_files:
        if not entry.name.endswith(f'.mutations_{location}_hotspots.gz'):
            continue

        # The project name is the part of the file name before the first dot.
        project = entry.name.split('.')[0]
        # Skip pancancer and RBL (no hotspots)
        if project in ('PANCANCER', 'RBL'):
            continue

        project_output_directory = os.path.join(output_dir, project)
        output_file = os.path.join(project_output_directory, f'{project}.txt')
        # exist_ok=True already tolerates an existing directory, so no
        # separate existence check is needed.
        os.makedirs(project_output_directory, exist_ok=True)

        ofd.write(f'python {code} -i {entry.path} -o {output_file}\n')