# Remove drivers 

Remove any mutation overlapping the CDS, splice sites, introns, 5' or 3' UTR, or proximal or distal promoter of a cancer gene.

Cancer genes are those included in the COSMIC CGC and IntOGen (not restricted to a specific cancer type).

In [13]:
import os
from collections import defaultdict
import gzip

from bgparsers import readers
from intervaltree import IntervalTree
import pandas as pd

In [2]:
# Root directory of the project; the code and data paths defined below are built from it.
# NOTE(review): left empty here -- presumably must be set to the real project root before running.
main_dir = ''

In [3]:
# Script that each job runs, and the map file listing those jobs;
# both live in the project's `code` directory.
code_dir = os.path.join(main_dir, 'code')
code = os.path.join(code_dir, 'remove_drivers.py')
map_file = os.path.join(code_dir, '6_remove_drivers.map')

In [4]:
# Input directory holding the filtered mutation files and output directory for
# the driver-free versions. Built with os.path.join (like the other paths in
# this notebook) so the result is correct whether or not `main_dir` is empty
# or ends with a path separator.
mutations_dir = os.path.join(main_dir, 'data', 'cancertypes_filtered')
output_dir = os.path.join(main_dir, 'data', 'cancertypes_filtered_nodrivers')

In [8]:
# Table of cancer-driver regions passed to every job via `-d`.
# Built with os.path.join so it stays correct when `main_dir` is empty or ends
# with a path separator.
drivers_f = os.path.join(main_dir, 'data', 'cancerdrivers_regions.tsv')

In [9]:
# Header lines of the job map file, grouped by section. Entries ending in
# '\n' produce a blank separator line once each entry is written with its own
# trailing newline.
params_section = ['[params]', 'cores=1', 'memory=8G\n']
pre_section = [
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',
]
jobs_header = ['[jobs]']

info = [*params_section, *pre_section, *jobs_header]

In [10]:
# Write the job map file: the static header sections first, then one
# `remove_drivers.py` command per cohort mutation file found in `mutations_dir`.
with open(map_file, 'w') as ofd:

    ofd.writelines(f'{line}\n' for line in info)

    # os.scandir yields entries in arbitrary order and holds a directory handle:
    # use it as a context manager so the handle is released promptly, and sort
    # by file name so the generated map file is reproducible between runs.
    with os.scandir(mutations_dir) as entries:
        cohort_entries = sorted(
            (
                entry for entry in entries
                if entry.name.endswith('.filtered.in.gz') and entry.is_file()
            ),
            key=lambda entry: entry.name,
        )

    for entry in cohort_entries:
        # The cohort identifier is the file name up to the first dot.
        cohort = entry.name.split('.')[0]
        cohort_output_file = os.path.join(output_dir, f'{cohort}.filtered.nodrivers.in.gz')
        ofd.write(f'python {code} -m {entry.path} -d {drivers_f} -o {cohort_output_file}\n')

## Just checking

Double-check that mutations in driver regions have been filtered out.

In [14]:
# Re-point the project root for the checks below; this overrides the value
# assigned to `main_dir` at the top of the notebook.
main_dir = '../'

In [15]:
# Driver-regions table, resolved under `main_dir`/data like the mutation files
# checked below. The previous concatenation `main_dir + '../data/...'` climbed
# one directory higher than every other data path in this notebook.
drivers_f = os.path.join(main_dir, 'data', 'cancerdrivers_regions.tsv')

In [16]:
# Cancer type whose files are used for the sanity check
ctype = 'ACC'

# Mutation file before and after removing driver regions
data_dir = os.path.join(main_dir, 'data')
original_f = os.path.join(data_dir, 'cancertypes_filtered', f'{ctype}.filtered.in.gz')
nodrivers_f = os.path.join(data_dir, 'cancertypes_filtered_nodrivers', f'{ctype}.filtered.nodrivers.in.gz')

In [17]:
# Both files are tab-separated with a header row, so they share read options.
tsv_options = dict(sep='\t', header=0)
ori_df = pd.read_csv(original_f, **tsv_options)    # mutations before driver removal
nod_f = pd.read_csv(nodrivers_f, **tsv_options)    # mutations after driver removal

In [18]:
# How many rows were dropped by the driver filter: row count of the original
# table minus row count of the driver-free table.
ori_df.shape[0] - nod_f.shape[0]

617

In [19]:
# Load driver regions into one interval tree per chromosome.
# Each data line is expected to hold three tab-separated fields:
# chromosome, start, end.
tree = defaultdict(IntervalTree)
with open(drivers_f, 'r') as fd:
    next(fd)  # skip the header row
    for line in fd:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line at the end of
            # the file) instead of failing on the tuple unpacking below.
            continue
        chrom, start, end = line.split('\t')
        # IntervalTree intervals are half-open [begin, end), so the end is
        # shifted by one to make the region's last position count as inside.
        tree[chrom].addi(int(start), int(end) + 1)

In [20]:
# Count the mutations of the original file that fall inside a driver region;
# this number should equal the number of mutations removed by the filter.
#
# The variants reader is handed the file path directly, so the separate
# gzip handle that used to wrap this loop (and was never read) is not needed.
overlap = 0
for row in readers.variants(
        file=original_f,
        required=['CHROMOSOME', 'POSITION', 'REF', 'ALT', 'SAMPLE'],
        extra=['COHORT', 'CANCER_TYPE', 'PLATFORM', 'TYPE', 'AGE', 'TREATED', 'MUTYPE']
):
    chrom = row['CHROMOSOME']
    pos = row['POSITION']

    # Point query on the chromosome's tree: the result is non-empty when at
    # least one driver interval contains the position.
    if tree[chrom][int(pos)]:
        overlap += 1

In [21]:
# Display the overlap count; it is expected to match the number of
# filtered-out mutations computed from the two tables earlier.
overlap

617

In [22]:
# Check that no mutation in the filtered file overlaps a driver region.
#
# The variants reader is handed the file path directly, so the separate
# gzip handle that used to wrap this loop (and was never read) is not needed.
overlap = 0
for row in readers.variants(
        file=nodrivers_f,
        required=['CHROMOSOME', 'POSITION', 'REF', 'ALT', 'SAMPLE'],
        extra=['COHORT', 'CANCER_TYPE', 'PLATFORM', 'TYPE', 'AGE', 'TREATED', 'MUTYPE']
):
    chrom = row['CHROMOSOME']
    pos = row['POSITION']

    # Point query on the chromosome's tree: the result is non-empty when at
    # least one driver interval contains the position.
    if tree[chrom][int(pos)]:
        overlap += 1
print(overlap)    # should be 0

# Enforce the expectation so that a broken filter cannot go unnoticed.
assert overlap == 0, f'{overlap} mutations in the filtered file still overlap driver regions'

0
