# Compute theoretical expected no. hotspots for SBS1

## * SBS1 only CpG sites

## * modified profiles with custom channels accounting for genome-wide CpG methylation

In [1]:
from collections import defaultdict
from decimal import *
import os
import glob
import gzip
import json
from functools import reduce
from operator import concat
import time

import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt

from utils import triplets, mut_key_gen, mut_key_gen_cosmic, sum_dict, sbs_normalize, sbs_format, sbs_index

## Constants

In [2]:
# Bin sizes to process. Each label is inserted verbatim into the input file names,
# and its numeric part (in kb) is parsed later to scale the pseudocounts.
chunksizes = ['1000kb'] # Alternatively use: ['1000kb', '500kb', '250kb', '100kb', '50kb', '25kb', '10kb']

In [3]:
# Root directory from which the methylation and genomic-bin paths below are built.
# NOTE(review): left empty here, so the derived paths start at '/' -- set it before running.
main_dir = ''

In [4]:
# Folder searched for the per-cancer-type '*.counts_methyl_muts.tsv' files
cpg_data_folder = f'{main_dir}/methylation/data/fractional_methylation'

In [5]:
# Folder holding, for every bin size, the list of mappable bin ids
bins_dir = f'{main_dir}/genomic_bins/data'

In [6]:
# CpG trinucleotide contexts analysed in this notebook.
# Note: this assignment replaces the `triplets` name imported from `utils`.
triplets = ['ACG', 'CCG', 'GCG', 'TCG']
# Every context immediately followed by its methylated channel: ACG, ACG_meth, CCG, ...
triplets_extended = [channel for t in triplets for channel in (t, t + '_meth')]
colors = ['pink', 'brown', 'green']

# Input Data

## load COSMIC Signatures and sort channels canonically

In [7]:
# Row labels in the order produced by `mut_key_gen_cosmic`
canonical_context_sorting = list(mut_key_gen_cosmic())
# Read the signature table indexed by channel ('Type') and reorder its rows accordingly
cosmic_df = (
    pd.read_csv('./COSMIC_v3.2_SBS_GRCh38.txt', sep='\t', index_col='Type')
    .loc[canonical_context_sorting]
)

## load CpG annotated data

**Note**: some bins may be missing from the input data due to lack of CpG methylation data overlapping it. To fix this, we add missing bins with 0 values in methylation categories. We then add their CpG trinucleotide counts as missing data.

In [8]:
# Row categories, in this exact order:
#   ME_POS_MUT_POS, ME_POS_MUT_NEG, ME_NEG_MUT_POS, ME_NEG_MUT_NEG
# The order matters: rows added for missing bins follow it, and the per-group
# helpers below read the counts by position.
mtypes = [
    f'ME_{methylation}_MUT_{mutation}'
    for methylation in ('POS', 'NEG')
    for mutation in ('POS', 'NEG')
]

In [9]:
%%time
# Merge datasets per cancer type and bin size.
# Result: data_dict[binsize][ctype] -> DataFrame with the rows of every matching input
# file, followed by placeholder rows for the mappable bins absent from those files.

data_dict = defaultdict(dict)
for binsize in chunksizes:

    # Load mappable bins
    bins_f = f'{bins_dir}/hg38_{binsize}_bin.nodrivers.filtered.mappable_positions.autosomes.binids.txt'
    bins_total = pd.read_csv(bins_f, sep='\t', header=0)['BINID'].tolist()

    # Load trinucleotide composition per bin (bin id -> trinucleotide -> count)
    trinuc_comp_f = f'{bins_dir}/hg38_{binsize}_bin.nodrivers.filtered.mappable_positions.autosomes.trinuc_per_bin.json'
    with open(trinuc_comp_f, 'rt') as fd:
        trinuc_dict = json.load(fd)

    print(binsize)
    for ctype in ['COADREAD', 'ESOPHA_STOMACH', 'NSCLC']:
        # List files to merge; sorted so that the row order is reproducible across runs
        pattern = os.path.join(cpg_data_folder, f'SBS1_{ctype}*.{binsize}.*.counts_methyl_muts.tsv')
        fns = sorted(glob.glob(pattern))
        if not fns:
            # Fail with an explicit message instead of an opaque pd.concat error
            # (or silently reusing the frame of the previous cancer type)
            raise FileNotFoundError(f'No input file matches {pattern}')

        # Merge files
        frames = [pd.read_csv(fn, sep='\t', header=0) for fn in fns]
        df_with_data = pd.concat(frames)

        # Template values for the placeholder rows, taken from the last file read
        columns = frames[-1].columns
        epigenomes = frames[-1]['EPIGENOMES'].iloc[0]

        # Add missing bins
        missing_bins = sorted(set(bins_total).difference(df_with_data['BINID'].unique()))
        missing_rows = []
        for binid in missing_bins:
            chrom = binid.split(':')[0]
            for trinuc in triplets:
                # Leave methylation data with 0 counts.
                # NOTE(review): 0.5 is the placeholder written to the third column;
                # that column's meaning is not visible in this notebook -- confirm.
                for mtype in mtypes:
                    missing_rows.append(['SBS1', epigenomes, 0.5, chrom, binid, trinuc, mtype, 0])
                # Add trinucleotide counts of the bin to "MISSING"
                trinuc_counts_bin = trinuc_dict[binid][trinuc]
                missing_rows.append(['SBS1', epigenomes, 0.5, chrom, binid, trinuc, 'MISSING', trinuc_counts_bin])

        # One DataFrame for all placeholder rows instead of one per row
        if missing_rows:
            frames.append(pd.DataFrame(missing_rows, columns=columns))

        data_dict[binsize][ctype] = pd.concat(frames, ignore_index=True)

1000kb
CPU times: user 1.96 s, sys: 105 ms, total: 2.06 s
Wall time: 5.64 s


# Methylated vs non-methylated mutation rate analysis

The goal is to 1) derive a mutational profile that accounts for the mutability differences between methylated and unmethylated CpGs, and 2) impute, per chunk, the CpG counts that lack a methylation annotation.

In what follows we measure the effect of methylated/non-methylated grouping by each cancer type and triplet context across bins.

Imputation of the CpG positions annotated as MISSING is based on the local proportion of methylated/unmethylated CpGs per bin.

## Compute mutation fold change

In [10]:
def fold_change(x, pcount=1):
    """Ratio of two pseudocount-smoothed fractions read positionally from `x`.

    x: pandas Series; only its first four values are used.
       Presumably ordered like `mtypes` (ME_POS_MUT_POS, ME_POS_MUT_NEG,
       ME_NEG_MUT_POS, ME_NEG_MUT_NEG) -- verify against the caller.
    pcount: pseudocount added to the numerator and denominator of each fraction.

    Returns (x0 + p) / (x0 + x1 + p) divided by (x2 + p) / (x2 + x3 + p).
    """
    counts = x.values
    first_fraction = (counts[0] + pcount) / (counts[0] + counts[1] + pcount)
    second_fraction = (counts[2] + pcount) / (counts[2] + counts[3] + pcount)
    return first_fraction / second_fraction

In [11]:
%%time

# fold_change_dict[binsize][cancer_type][triplet] -> ratio between the smoothed mutated
# fraction of methylated sites and that of unmethylated sites, pooled over all bins
fold_change_dict = defaultdict(dict)

for binsize in chunksizes:

    for cancer_type, df in data_dict[binsize].items():

        # Pseudocount is relative to the bin size (1 for 1000kb bins)
        pcount = int(binsize.split('kb')[0]) / 1000
        fold_change_trinuc = {}

        for triplet in triplets:
            tri_df = df.loc[df['TRINUC'] == triplet]
            # Genome-wide COUNT total of each row category for this context
            totals = {
                category: tri_df.loc[tri_df['TYPE'] == category, 'COUNT'].sum()
                for category in ('ME_POS_MUT_POS', 'ME_POS_MUT_NEG', 'ME_NEG_MUT_POS', 'ME_NEG_MUT_NEG')
            }

            rate_meth = (totals['ME_POS_MUT_POS'] + pcount) / (
                totals['ME_POS_MUT_POS'] + totals['ME_POS_MUT_NEG'] + pcount
            )
            rate_nometh = (totals['ME_NEG_MUT_POS'] + pcount) / (
                totals['ME_NEG_MUT_POS'] + totals['ME_NEG_MUT_NEG'] + pcount
            )

            fold_change_trinuc[triplet] = rate_meth / rate_nometh

        fold_change_dict[binsize][cancer_type] = fold_change_trinuc

CPU times: user 293 ms, sys: 8.05 ms, total: 301 ms
Wall time: 301 ms


## Create total mutrate dict merging mutation rates across NpCpGs in a bin

In [12]:
def get_total_mutations(x):
    """Add the first and the third value of `x`.

    x: pandas Series with at least three values, read by position.
       Presumably the ME_POS_MUT_POS and ME_NEG_MUT_POS counts of one
       (BINID, TRINUC) group -- this relies on the row order of the input frame.
    """
    counts = x.values
    first, third = counts[0], counts[2]
    return first + third

In [13]:
%%time
# Group by (BINID, TRINUC), apply get_total_mutations to each group,
# then add up the contexts of every bin and normalize across bins.
# mutrate_dict[binsize][(cancer_type, 'SBS1')][BINID] -> share of the smoothed mutation total

mutrate_dict = defaultdict(dict)

for binsize in chunksizes:
    # Pseudocount is relative to binsize
    pseudocount = int(binsize.split('kb')[0]) / 1000

    for cancer_type, df in data_dict[binsize].items():

        muts_per_context = (
            df.groupby(['BINID', 'TRINUC'])['COUNT']
            .apply(get_total_mutations)
            .reset_index()
            .rename(columns={'COUNT': 'TOTAL_MUTS'})
        )
        muts_per_bin = (
            muts_per_context.groupby(['BINID'])['TOTAL_MUTS']
            .apply(lambda bin_counts: sum(list(bin_counts)))
            .reset_index()
        )
        mutrate_dict[binsize][(cancer_type, 'SBS1')] = dict(
            zip(muts_per_bin['BINID'].values, muts_per_bin['TOTAL_MUTS'])
        )

    # Replace the raw totals by smoothed values that sum to 1 across bins
    for (cancer_type, signature), d in mutrate_dict[binsize].items():
        smoothed = {k: (v + pseudocount) for k, v in d.items()}
        total = sum(smoothed.values())
        mutrate_dict[binsize][(cancer_type, signature)] = {k: v / total for k, v in smoothed.items()}

    print(binsize)

1000kb
CPU times: user 5.56 s, sys: 0 ns, total: 5.56 s
Wall time: 5.56 s


## Get bin NpCpG composition 

Impute MISSING counts as being in the same meth/non-meth proportion as the rest

### Compute 

<span style="color:red">(This code only needs to run once for a given chunksize)</span>

In [14]:
def methylated_composition(x):
    """Split the unannotated count of a group between methylated and unmethylated.

    x: pandas Series read by position: values 0-1 are summed as methylated,
       values 2-3 as unmethylated and value 4 is the count to distribute
       (presumably the MISSING row -- relies on the row order of the input frame).

    The methylated share is (meth + 1) / (meth + nometh + 1), so a group with no
    annotated site assigns its whole unannotated count to the methylated side.
    Each imputed part is truncated to an integer independently.

    Returns the tuple (methylated, unmethylated) as Python ints.
    """
    counts = x.values
    annotated_meth = counts[0] + counts[1]
    annotated_nometh = counts[2] + counts[3]
    unannotated = counts[4]

    share_meth = (annotated_meth + 1) / (annotated_meth + annotated_nometh + 1)
    total_meth = annotated_meth + int(share_meth * unannotated)
    total_nometh = annotated_nometh + int((1 - share_meth) * unannotated)
    return int(total_meth), int(total_nometh)

def reformat_content_dict(d):
    """Flatten {triplet: (meth, nometh)} into one dict of counts.

    The result holds `<triplet>_meth` -> meth for every triplet first,
    followed by `<triplet>` -> nometh for every triplet.
    """
    flat = {}
    for t, pair in d.items():
        flat[t + '_meth'] = pair[0]
    flat.update({t: pair[1] for t, pair in d.items()})
    return flat

In [15]:
%%time

# Plain dict rather than defaultdict because this object is exported.
# Layout: binsize -> cancer type -> chunk -> triplet (plain and `_meth`) -> count
triplet_content_per_chunk = {}

for binsize in chunksizes:
    triplet_content_per_chunk_bin = {}
    for cancer_type, df in data_dict[binsize].items():

        # (meth, nometh) tuple for every (BINID, TRINUC) group
        composition = (
            df.groupby(by=['BINID', 'TRINUC'])['COUNT']
            .apply(methylated_composition)
            .reset_index()
            .rename(columns={'COUNT': 'METH_NOMETH'})
        )
        # one {triplet: (meth, nometh)} dict per bin
        per_bin = (
            composition.groupby(by=['BINID'])[['TRINUC', 'METH_NOMETH']]
            .apply(lambda r: dict(zip(r['TRINUC'], r['METH_NOMETH'])))
            .reset_index()
            .rename(columns={0: 'CONTENT'})
        )

        triplet_content_per_chunk_bin[cancer_type] = {
            chunk: reformat_content_dict(content_dict)
            for chunk, content_dict in zip(per_bin['BINID'], per_bin['CONTENT'])
        }
    triplet_content_per_chunk[binsize] = triplet_content_per_chunk_bin
    print(binsize)

1000kb
CPU times: user 5.8 s, sys: 0 ns, total: 5.8 s
Wall time: 5.8 s


In [16]:
# Cache the per-chunk triplet content as a gzip-compressed pickle in the working directory
with gzip.open('./cpg_triplet_count.pickle.gz', 'wb') as f:
    pickle.dump(triplet_content_per_chunk, f)

### Alternatively, load counts

In [17]:
# Reload the cached per-chunk triplet content from the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load a cache you produced yourself.
with gzip.open('./cpg_triplet_count.pickle.gz', 'rb') as f:
    triplet_content_per_chunk = pickle.load(f)

In [18]:
# Alias (same object, no copy): the hotspot computation reads this dict under this name
triplet_counts_per_chunk = triplet_content_per_chunk

## Triplet counts genomewide

In [19]:
%%time

# triplet_content_genomewide[binsize][cancer_type][TRINUC] -> COUNT summed over every
# row of that context (no filtering on bin or row type)
triplet_content_genomewide = defaultdict(dict)
for binsize in chunksizes:
    for cancer_type, df in data_dict[binsize].items():
        totals = df.groupby(by=['TRINUC'])['COUNT'].apply(sum)
        triplet_content_genomewide[binsize][cancer_type] = dict(zip(totals.index, totals))

CPU times: user 45 ms, sys: 0 ns, total: 45 ms
Wall time: 44.4 ms


## C>T mutability per triplet

In [20]:
%%time

# mutability_dict[binsize][cancer_type][channel] -> relative per-site mutability of each
# context and of its `_meth` counterpart, normalized to sum to 1 per cancer type
mutability_dict = defaultdict(dict)

sig = cosmic_df['SBS1']
for binsize in chunksizes:
    for cancer_type, d in triplet_content_genomewide[binsize].items():
        per_channel = {}
        for t, content in d.items():
            ind = sbs_index(t, 'T')
            # signature weight of the channel divided by the genome-wide count of the context
            unmethylated = sig[ind] / content
            per_channel[t] = unmethylated
            # methylated sites are scaled by the fold change of the context
            per_channel[f'{t}_meth'] = unmethylated * fold_change_dict[binsize][cancer_type][t]
        mutability_dict[binsize][cancer_type] = per_channel

    # Normalize so that the values of every cancer type add up to 1
    for cancer_type, d in mutability_dict[binsize].items():
        total = sum(d.values())
        mutability_dict[binsize][cancer_type] = {k: v / total for k, v in d.items()}
    print(binsize)

1000kb
CPU times: user 1.62 ms, sys: 0 ns, total: 1.62 ms
Wall time: 1.56 ms


# TripletRegion class

This class implements the method to compute expected number of hotspots described above.

In [21]:
class TripletRegion:
    """Expected hotspots in a region whose sites admit three mutation types.

    Each of the N samples independently leaves a site unmutated with probability
    q = 1 - (pa + pb + pc), or mutates it with exactly one of the three types.
    A site is a hotspot when at least two samples carry the same mutation type.
    """

    # decimal.Decimal gives arithmetic at an arbitrary precision level.
    # NOTE: this sets the precision of the current decimal context when the class is
    # defined, so it affects every Decimal operation of the thread, and any later
    # assignment to getcontext().prec (e.g. by TripletRegionSingleMutation) overrides it.
    getcontext().prec = 100

    def __init__(self, N, L, pa, pb, pc):
        """
        N: number of samples
        L: region length (number of sites)
        pa, pb, pc: per-base probabilities of each of the 3 possible mutation types
        """

        self.N = Decimal(N)
        self.L = L
        self.pa = Decimal(pa)
        self.pb = Decimal(pb)
        self.pc = Decimal(pc)
        self.p_mutation = self.pa + self.pb + self.pc
        self.q = 1 - self.p_mutation

    def _q_power(self, k):
        """Return q ** (N - k), with a zero exponent giving 1 (Decimal raises on 0 ** 0)."""
        exponent = self.N - k
        if exponent == 0:
            return Decimal(1)
        return self.q ** exponent

    @property
    def p_hotspot(self):
        """Probability (float) that a single site is a hotspot.

        Computed as 1 - P(no mutation type is seen in more than one sample), i.e. one
        minus the probability of 0, 1, 2 or 3 mutated samples with distinct types.
        Terms needing more mutated samples than N are skipped: their coefficient is 0,
        and evaluating them would raise a negative power of q (an error when q == 0).
        """
        N = self.N
        pa, pb, pc = self.pa, self.pb, self.pc

        p_no_hotspot = self._q_power(0)
        if N >= 1:
            p_no_hotspot += N * self._q_power(1) * (pa + pb + pc)
        if N >= 2:
            p_no_hotspot += N * (N - 1) * self._q_power(2) * (pa * pb + pa * pc + pb * pc)
        if N >= 3:
            p_no_hotspot += N * (N - 1) * (N - 2) * self._q_power(3) * pa * pb * pc
        return float(Decimal(1) - p_no_hotspot)

    @property
    def expected_hotspots(self):
        """Expected number of hotspots in the region: p_hotspot * L."""
        return self.p_hotspot * self.L

    @property
    def binomial_distribution(self):
        """Frozen scipy binomial distribution Binomial(L, p_hotspot) of the hotspot count."""
        # Imported here because the notebook never imports `binom`
        from scipy.stats import binom
        rv = binom(self.L, self.p_hotspot)
        return rv

In [22]:
class TripletRegionSingleMutation:
    """Expected hotspots in a region whose sites admit a single mutation type.

    Each of the N samples independently mutates a site with probability pa.
    A site is a hotspot when at least two samples carry the mutation.
    """

    # decimal.Decimal gives arithmetic at an arbitrary precision level.
    # NOTE: this sets the precision of the current decimal context when the class is
    # defined, so it affects every Decimal operation of the thread from then on.
    getcontext().prec = 1000

    def __init__(self, N, L, pa):
        """
        N: number of samples
        L: region length (number of sites)
        pa: per-base probability of the mutation type
        """

        self.N = Decimal(N)
        self.L = L
        self.pa = Decimal(pa)
        self.p_mutation = self.pa
        self.q = 1 - self.p_mutation

    def _q_power(self, k):
        """Return q ** (N - k), with a zero exponent giving 1 (Decimal raises on 0 ** 0)."""
        exponent = self.N - k
        if exponent == 0:
            return Decimal(1)
        return self.q ** exponent

    @property
    def p_hotspot(self):
        """Probability (float) that a single site is a hotspot.

        Computed as 1 - P(at most one of the N samples is mutated at the site).
        The one-mutation term is skipped when N < 1: its coefficient is 0, and
        evaluating it would raise a negative power of q (an error when q == 0).
        """
        N = self.N

        p_no_hotspot = self._q_power(0)
        if N >= 1:
            p_no_hotspot += N * self._q_power(1) * self.pa
        return float(Decimal(1) - p_no_hotspot)

    @property
    def expected_hotspots(self):
        """Expected number of hotspots in the region: p_hotspot * L."""
        return self.p_hotspot * self.L

    @property
    def binomial_distribution(self):
        """Frozen scipy binomial distribution Binomial(L, p_hotspot) of the hotspot count."""
        # Imported here because the notebook never imports `binom`
        from scipy.stats import binom
        rv = binom(self.L, self.p_hotspot)
        return rv

## Expected number of hotspots per chunk with variable mutation rate

In [23]:
def expected_hotspots_per_chunk(ttype, muts_per_sample, N, binsize):
    """Expected number of SBS1 hotspots in every chunk of the genome.

    ttype: cancer type
    muts_per_sample: # mutations per sample
    N: # samples
    binsize: bin size label used as key of the module-level dictionaries (e.g. '1000kb')

    Reads the module-level `mutrate_dict`, `mutability_dict`, `triplet_counts_per_chunk`
    and `triplets_extended`.

    Returns a dict: chunk -> expected number of hotspots, summed over the plain and
    `_meth` CpG contexts of the chunk.
    """

    # relative mutation rate of every chunk
    chunk_rates = mutrate_dict[binsize][(ttype, 'SBS1')]
    # relative per-site mutability of every context
    weights = mutability_dict[binsize][ttype]

    # dictionary with the expected hotspots of each chunk
    res = {}

    for chunk, abundance_dict in triplet_counts_per_chunk[binsize][ttype].items():

        # mutations per sample expected to fall in this chunk
        chunk_mutrate = muts_per_sample * chunk_rates[chunk]

        # relative mutation load of each context: mutability times number of sites
        lengths = {t: abundance_dict[t] for t in triplets_extended}
        load = {t: weights[t] * lengths[t] for t in triplets_extended}
        total_load = sum(load.values())

        res[chunk] = 0
        for t in triplets_extended:
            if lengths[t] > 0 and total_load > 0:
                # share of the chunk's mutations assigned to this context,
                # spread evenly over the sites of the context
                pa = (load[t] / total_load) * chunk_mutrate / lengths[t]
            else:
                # no site of this context, or nothing mutable in the chunk: no mutation can
                # land here (previously this reused the rates of the preceding chunk or failed)
                pa = 0

            res[chunk] += TripletRegionSingleMutation(N, lengths[t], pa).expected_hotspots

    return res

# Compute expected hotspot propensity

Expected hotspot propensity is the sum of expected hotspots across chunks

In [24]:
# Example run: COADREAD, 1000kb bins, 100 samples with 300 mutations each
ttype = 'COADREAD'
binsize = '1000kb'
N = 100
n_muts = 300

# res: chunk -> expected number of hotspots
res = expected_hotspots_per_chunk(ttype, n_muts, N, binsize)
# The propensity is the sum across chunks (not the per-chunk values themselves)
expected_hotspot_propensity = sum(res.values())