# Mutations per bin

This notebook intersects the filtered mutations (those within mappable bins) with sets of bins of different lengths (autosomes only; driver regions removed).

Mutations in a cancer type are assigned to a signature by maximum likelihood 

Once the mutations of each signature have been intersected with each set of bins, a normalised mutation rate across bins is computed for that signature.

In [1]:
import json

import pandas as pd

In [2]:
# Cancer-type cohorts processed throughout the notebook
cancer_types = [
    'BRCA',
    'BLADDER_URI',
    'COADREAD',
    'ESOPHA_STOMACH',
    'NSCLC',
    'PROSTATE',
    'SKCM',
]

In [3]:
# Bin lengths in base pairs, from 1 Mb down to 10 kb
bins = [1_000_000, 500_000, 250_000, 100_000, 50_000, 25_000, 10_000]

In [4]:
# Root directory of the project; every path used in this notebook is built from it.
# NOTE(review): left empty here, so the derived paths start at '/' -- set it before running.
main_dir = ''

In [5]:
# Directory holding the filtered mutations, one TSV per cancer type
input_dir = f'{main_dir}/propensity/data/filtered_mutations'
# Directory for the mutations-per-bin JSON files and the normalised rates derived from them
output_dir = f'{main_dir}/genomic_bins/data/mutations_per_bin'

### Intersect the filtered mutations (within mappable megabases) with the defined bins

In [6]:
# Script invoked by every job line (python <code_f> -m <mutations> -b <bins> -o <output>)
code_f = f'{main_dir}/genomic_bins/code/mutations_per_bin.py'
# Job map file that lists one command per cancer type and bin size
map_f = f'{main_dir}/genomic_bins/code/mutations_per_bin.map'

In [7]:
# Header lines of the job map file, written one per line ahead of the job commands.
# An entry ending in '\n' leaves a blank line after its section.
info = [
    # [params] section -- presumably the resources requested for each job
    '[params]',
    'cores=1',
    'memory=16G\n',

    # [pre] section -- presumably commands run before each job (activates the conda env)
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',

    # [jobs] marker -- the job commands are appended after it
    '[jobs]',
]

In [53]:
# Write the job map file: the header lines first, then one command per
# (cancer type, bin size) pair.
with open(map_f, 'w') as ofd:
    ofd.writelines(f'{line}\n' for line in info)
    for ctype in cancer_types:
        # The same 1000kb-filtered mutation file is the input for every bin size
        input_f = f'{input_dir}/{ctype}_SBS96_total_maxprob.originalref.hg38_1000kb_autosomes_bin.tsv'
        for bin_size in bins:
            size_kb = int(bin_size/1000)
            bins_f = f'{main_dir}/genomic_bins/data/hg38_{size_kb}kb_bin.nodrivers.filtered.mappable_positions.autosomes.binids.txt'
            output_f = f'{output_dir}/{ctype}.{size_kb}kb.nodrivers.total_maxprob.mutations_per_bin.json'
            command = f'python {code_f} -m {input_f} -b {bins_f} -o {output_f}\n'
            ofd.write(command)

### Check

Original number of available bins per bin size. If some bins are missing from the counts below, either they have no overlapping mutations or none of their sequence overlaps mappable, driver-free regions.
- 1 Mb: 2196
- 500kb: 4392
- 250kb: 8784
- 100kb: 21960
- 50kb: 43920
- 25kb: 87840
- 10kb: 219600

In [55]:
# Sanity check: for every cancer type and bin size, report how many bins
# are present in the corresponding mutations-per-bin JSON file.
for ctype in cancer_types:
    print(f'\n{ctype}')
    for bin_size in bins:
        # Keep the numeric size and its text label under separate names
        # instead of rebinding the loop variable from int to str.
        bin_size_str = f'{int(bin_size/1000)}kb'
        muts_f = f'{output_dir}/{ctype}.{bin_size_str}.nodrivers.total_maxprob.mutations_per_bin.json'
        with open(muts_f, 'rt') as fd:
            m_per_bin = json.load(fd)
        # One top-level key per bin present in the file
        print(bin_size_str, len(m_per_bin))


BRCA
1000kb 2196
500kb 4392
250kb 8783
100kb 21932
50kb 43750
25kb 87266
10kb 217527

BLADDER_URI
1000kb 2196
500kb 4392
250kb 8783
100kb 21932
50kb 43748
25kb 87246
10kb 217319

COADREAD
1000kb 2196
500kb 4392
250kb 8783
100kb 21933
50kb 43753
25kb 87268
10kb 217557

ESOPHA_STOMACH
1000kb 2196
500kb 4392
250kb 8783
100kb 21932
50kb 43745
25kb 87250
10kb 217353

NSCLC
1000kb 2196
500kb 4392
250kb 8783
100kb 21934
50kb 43759
25kb 87283
10kb 217625

PROSTATE
1000kb 2196
500kb 4392
250kb 8783
100kb 21931
50kb 43743
25kb 87226
10kb 216761

SKCM
1000kb 2196
500kb 4392
250kb 8783
100kb 21933
50kb 43756
25kb 87286
10kb 217660


### Normalise mutation rate across bins

In [56]:
# Signatures to normalise for each cancer type
data_to_run = {
    'BLADDER_URI': ['SBS2', 'SBS13', 'SBS5', 'SBS1'],
    'BRCA': ['SBS5', 'SBS1', 'SBS2', 'SBS13', 'SBS3'],
    'COADREAD': [
        'SBS1', 'SBS40', 'SBS17b', 'SBS5', 'SBS18', 'SBS93', 'SBS17a',
    ],
    'ESOPHA_STOMACH': ['SBS17b', 'SBS17a', 'SBS5', 'SBS1'],
    'NSCLC': ['SBS4', 'SBS5', 'SBS40', 'SBS2', 'SBS13', 'SBS1'],
    'PROSTATE': ['SBS5', 'SBS1', 'SBS8'],
    'SKCM': ['SBS7a', 'SBS7b'],
}

In [58]:
# Preview the pseudocount used for each bin size: the bin length
# expressed as a fraction of one megabase.
for bin_size in bins:
    pseudocount = int(bin_size) / 1_000_000
    print(pseudocount)

1.0
0.5
0.25
0.1
0.05
0.025
0.01


In [61]:
# Normalise the mutation counts of every signature across bins.
#
# For each bin size, cancer type and signature: take the signature's count in
# every available bin (0 when the bin or the signature is absent), add a
# pseudocount equal to the bin length as a fraction of 1 Mb, divide by the
# total over all bins so that the values sum to 1, and write the result as JSON.

for bin_size in bins:

    pseudocount = int(bin_size)/1000000

    bin_size_str = f'{int(bin_size/1000)}kb'

    # Bins available for this bin size
    bins_f = f'{main_dir}/genomic_bins/data/hg38_{bin_size_str}_bin.nodrivers.filtered.mappable_positions.autosomes.binids.txt'
    bins_df = pd.read_csv(bins_f, sep='\t', header=0)
    total_bins = set(bins_df['BINID'].unique())

    # Iterate the bins in a fixed order. A set of strings iterates in hash
    # order, which changes between interpreter sessions; sorting keeps the key
    # order of the output JSON and the floating-point totals reproducible.
    ordered_bins = sorted(total_bins, key=str)

    for ctype, signatures in data_to_run.items():

        # Read mutations per bin (bin id -> {signature: count})
        muts_f = f'{output_dir}/{ctype}.{bin_size_str}.nodrivers.total_maxprob.mutations_per_bin.json'
        with open(muts_f, 'r') as fd:
            m_per_bin = json.load(fd)

        for signature in signatures:

            # Signature count per bin plus the pseudocount. A bin missing from
            # the JSON (or present without this signature) contributes only
            # the pseudocount.
            mutations = {
                binid: (m_per_bin.get(binid) or {}).get(signature, 0) + pseudocount
                for binid in ordered_bins
            }

            # Scale so that the rates of all bins add up to 1
            total_mutations = sum(mutations.values())
            muts_norm = {
                binid: counts/total_mutations
                for binid, counts in mutations.items()
            }

            output_f = f'{output_dir}/{ctype}_{signature}.{bin_size_str}.nodrivers.normmuts.total_maxprob.mutations_per_bin.relat_pcount.json'
            with open(output_f, 'w') as ofd:
                json.dump(muts_norm, ofd)

            # Report the number of bins and the sum of the rates (expected ~1)
            print(bin_size, ctype, signature, len(muts_norm), sum(muts_norm.values()))


1000000 BLADDER_URI SBS2 2196 0.9999999999999992
1000000 BLADDER_URI SBS13 2196 0.9999999999999992
1000000 BLADDER_URI SBS5 2196 1.0000000000000018
1000000 BLADDER_URI SBS1 2196 1.0000000000000016
1000000 BRCA SBS5 2196 1.0000000000000009
1000000 BRCA SBS1 2196 1.0
1000000 BRCA SBS2 2196 1.0000000000000004
1000000 BRCA SBS13 2196 1.0000000000000022
1000000 BRCA SBS3 2196 0.9999999999999964
1000000 COADREAD SBS1 2196 0.9999999999999988
1000000 COADREAD SBS40 2196 1.0000000000000004
1000000 COADREAD SBS17b 2196 0.9999999999999988
1000000 COADREAD SBS5 2196 1.0000000000000027
1000000 COADREAD SBS18 2196 0.9999999999999989
1000000 COADREAD SBS93 2196 1.0000000000000018
1000000 COADREAD SBS17a 2196 1.0000000000000007
1000000 ESOPHA_STOMACH SBS17b 2196 0.9999999999999994
1000000 ESOPHA_STOMACH SBS17a 2196 1.0000000000000007
1000000 ESOPHA_STOMACH SBS5 2196 1.000000000000001
1000000 ESOPHA_STOMACH SBS1 2196 1.000000000000002
1000000 NSCLC SBS4 2196 0.9999999999999996
1000000 NSCLC SBS5 2196 0

25000 PROSTATE SBS5 87367 0.9999999999997782
25000 PROSTATE SBS1 87367 1.0000000000011486
25000 PROSTATE SBS8 87367 0.9999999999999583
25000 SKCM SBS7a 87367 0.9999999999984828
25000 SKCM SBS7b 87367 0.9999999999991253
10000 BLADDER_URI SBS2 217968 0.9999999999976973
10000 BLADDER_URI SBS13 217968 0.9999999999978314
10000 BLADDER_URI SBS5 217968 0.9999999999968141
10000 BLADDER_URI SBS1 217968 1.0000000000022022
10000 BRCA SBS5 217968 0.9999999999983777
10000 BRCA SBS1 217968 0.9999999999977921
10000 BRCA SBS2 217968 0.9999999999978689
10000 BRCA SBS13 217968 0.9999999999978676
10000 BRCA SBS3 217968 0.9999999999988541
10000 COADREAD SBS1 217968 0.9999999999984202
10000 COADREAD SBS40 217968 0.9999999999995753
10000 COADREAD SBS17b 217968 0.9999999999947795
10000 COADREAD SBS5 217968 0.9999999999984757
10000 COADREAD SBS18 217968 0.9999999999969962
10000 COADREAD SBS93 217968 0.9999999999988579
10000 COADREAD SBS17a 217968 1.0000000000015496
10000 ESOPHA_STOMACH SBS17b 217968 0.9999999