# Run SigProfilerExtractor

In [1]:
import os

import pandas as pd

In [2]:
# Root directory from which every path in this notebook is built with os.path.join.
# It is left empty here, so those paths resolve relative to the current working directory.
main_dir = ''

## Run for all mutations

Write the job map that runs SigProfilerExtractor on level B cancer types (PANCANCER excluded) with more than 100k SNVs, and on level A cancer types with 30 or more samples and at least 100k SNVs.

#### Select cancer types

In [3]:
# Read the annotated cancer-type table (tab-separated, header on the first row)
# from which the cancer types to extract signatures for are selected
ctypes_f = os.path.join(main_dir, 'inputs', 'tables', 'cancer_types_annotated.tsv')
ctypes_df = pd.read_csv(ctypes_f, sep='\t', header=0)
# Show the first row as a quick check of the available columns
ctypes_df.head(1)

Unnamed: 0,CANCER_TYPE,CANCER_TYPE_LONG,LEVEL_ONCOTREE,LEVEL_ANALYSIS,SAMPLES,COHORTS,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,ACC,Adrenocortical Carcinoma (ACC),2,A,20,D_ACC,16649,16614,0,26,9


In [4]:
# Level B cancer types, leaving out the PANCANCER aggregate and keeping only
# those with strictly more than 100k SNVs (no sample-size filter at this level)
ctypes_b = ctypes_df.loc[
    (ctypes_df['LEVEL_ANALYSIS'] == 'B')
    & (ctypes_df['CANCER_TYPE'] != 'PANCANCER')
    & (ctypes_df['MUTATIONS_SNV'] > 100000)
].copy()

In [5]:
# Keep only the cancer type names. This rebinds ctypes_b from a DataFrame to a plain
# list, so this cell cannot be re-run without re-running the selection cell first.
ctypes_b = ctypes_b['CANCER_TYPE'].to_list()

In [6]:
# Level A cancer types with at least 30 samples and at least 100k SNVs
ctypes_a = ctypes_df.loc[
    (ctypes_df['LEVEL_ANALYSIS'] == 'A')
    & (ctypes_df['SAMPLES'] >= 30)
    & (ctypes_df['MUTATIONS_SNV'] >= 100000)
].copy()

In [7]:
# Keep only the cancer type names. This rebinds ctypes_a from a DataFrame to a plain
# list, so this cell cannot be re-run without re-running the selection cell first.
ctypes_a = ctypes_a['CANCER_TYPE'].to_list()

#### Run

In [8]:
# Path of the run_sigprofiler.py script invoked by every job line, and path of the
# map file that the job lines are written to
code = os.path.join(main_dir, 'signatures', 'sigprofiler', 'code', 'run_sigprofiler.py')
map_file = os.path.join(main_dir, 'signatures', 'sigprofiler', 'code', '1_run_sigprofiler.map')

In [9]:
# Inputs are read from the sigprofilermatrixgenerator output tree and results are
# written under the sigprofiler output tree, both in their 'mutations_total' subfolder
input_dir, output_dir = (
    os.path.join(main_dir, 'signatures', tool, 'output', 'mutations_total')
    for tool in ('sigprofilermatrixgenerator', 'sigprofiler')
)

In [10]:
# Signature class and channel count; concatenated ('SBS96') they select the input
# file name and are passed to -ct in each job line
sigs = 'SBS'
channels = '96'

In [11]:
# Cores requested in the [params] section and passed to each job through -c
cores = 56

# Map-file header, assembled section by section. Each entry is written on its own
# line; an entry ending in '\n' therefore leaves a blank line after its section.
params_section = ['[params]', f'cores={cores}', 'memory=50G\n']
pre_section = [
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',
]
info = params_section + pre_section + ['[jobs]']

In [12]:
# Values forwarded to run_sigprofiler.py as --nmf-replicates and --max-sigs in each job line
nmf_replicates = 1024
max_sigs = 20

In [13]:
# Write the map file: the header lines first, then one job line per selected cancer type
with open(map_file, 'w') as ofd:

    for line in info:
        ofd.write(f'{line}\n')

    for ctype in ctypes_b + ctypes_a:
        # Input file for this cancer type inside the sigprofilermatrixgenerator output tree
        input_file = os.path.join(input_dir, ctype, 'output', sigs, f'{ctype}.{sigs}{channels}.all')

        # Per-run output directory, created up front. exist_ok=True already makes this
        # a no-op when the directory exists, so no separate existence check is needed.
        run_output_dir = os.path.join(output_dir, f'{ctype}_{sigs}{channels}')
        os.makedirs(run_output_dir, exist_ok=True)

        # One job per line; the adjacent f-strings are concatenated into a single command
        ofd.write(
            f'python {code} '
            f'-i {input_file} '
            f'-o {run_output_dir} '
            f'-ct {sigs}{channels} '
            f'--nmf-replicates {nmf_replicates} '
            f'--max-sigs {max_sigs} '
            f'-c {cores} \n'
        )

## Run for mutations INSIDE hotspots

In [14]:
# Path of the run_sigprofiler.py script invoked by every job line, and path of the
# map file that the in-hotspots job lines are written to
code = os.path.join(main_dir, 'signatures', 'sigprofiler', 'code', 'run_sigprofiler.py')
map_file = os.path.join(main_dir, 'signatures', 'sigprofiler', 'code', '1_run_sigprofiler_inhotspots.map')

In [15]:
# Inputs are read from the sigprofilermatrixgenerator output tree and results are
# written under the sigprofiler output tree, both in their 'mutations_in_hotspots' subfolder
input_dir, output_dir = (
    os.path.join(main_dir, 'signatures', tool, 'output', 'mutations_in_hotspots')
    for tool in ('sigprofilermatrixgenerator', 'sigprofiler')
)

In [16]:
# Signature class and channel count; concatenated ('SBS96') they select the input
# file name and are passed to -ct in each job line
sigs = 'SBS'
channels = '96'

In [17]:
# Values forwarded to run_sigprofiler.py as --nmf-replicates and --max-sigs in each job line
nmf_replicates = 1024
max_sigs = 15

In [18]:
# Cores requested in the [params] section and passed to each job through -c
cores = 56

# Map-file header, assembled section by section. Each entry is written on its own
# line; an entry ending in '\n' therefore leaves a blank line after its section.
params_section = ['[params]', f'cores={cores}', 'memory=50G\n']
pre_section = [
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',
]
info = params_section + pre_section + ['[jobs]']

In [19]:
# Write the in-hotspots map file: the header lines first, then one job line per cancer type
with open(map_file, 'w') as ofd:

    for line in info:
        ofd.write(f'{line}\n')

    # NOTE(review): the cancer types are hard-coded here; the criteria used to pick
    # them are not shown in this notebook
    for ctype in ['COADREAD','ESOPHA_STOMACH', 'NSCLC', 'SKCM']:
        # Input file for this cancer type inside the sigprofilermatrixgenerator output tree
        input_file = os.path.join(input_dir, ctype, 'output', sigs, f'{ctype}.{sigs}{channels}.all')

        # Per-run output directory, created up front. exist_ok=True already makes this
        # a no-op when the directory exists, so no separate existence check is needed.
        run_output_dir = os.path.join(output_dir, f'{ctype}_{sigs}{channels}')
        os.makedirs(run_output_dir, exist_ok=True)

        # One job per line; the adjacent f-strings are concatenated into a single command
        ofd.write(
            f'python {code} '
            f'-i {input_file} '
            f'-o {run_output_dir} '
            f'-ct {sigs}{channels} '
            f'--nmf-replicates {nmf_replicates} '
            f'--max-sigs {max_sigs} '
            f'-c {cores} \n'
        )