## 1. Import packages

In [1]:
# Import packages
import hail as hl
from bokeh.io import output_notebook,show
import gnomad.utils.vep
from hail.ggplot import *
import plotly
import plotly.io as pio
# Make 'iframe' the default plotly renderer for figures displayed in this notebook
pio.renderers.default='iframe'

In [2]:
# Execute the helper script inside this notebook's namespace (-i), so that the names it defines
# become available to the cells below.
# NOTE(review): train_on_synonymous and regress_per_context are called later but are not defined
# in this notebook; presumably they come from this script — confirm.
# NOTE(review): the path is relative to the user's home directory, so the notebook only runs
# where ~/BroadIS exists.
%run -i ~/BroadIS/utils/01_maps_funcs.py

## 2. Import data

In [None]:
# Load the gnomAD v3.1.2 genomes sites table from the public bucket
ht = hl.read_table(
    'gs://gcp-public-data--gnomad/release/3.1.2/ht/genomes/gnomad.genomes.v3.1.2.sites.ht'
)

# Load the mutation rates published with the gnomAD paper (supplementary dataset 10).
# The table is keyed right away: a key is required to look rows up from another table.
ht_mu = hl.import_table(
    'data/supplementary_dataset_10_mutation_rates.tsv.gz',
    delimiter='\t',
    impute=True,
    force_bgz=True,
).key_by('context', 'ref', 'alt', 'methylation_level')

# Load the GRCh38 context table shipped with gnomad
# (https://broadinstitute.github.io/gnomad_methods/api_reference/utils/vep.html?highlight=context#gnomad.utils.vep.get_vep_context)
# and drop every row whose methyl_mean is missing.
context_table = gnomad.utils.vep.get_vep_context("GRCh38").ht()
context_table = context_table.filter(
    hl.is_defined(context_table.methyl_mean)
)

# Keep only the context and methyl_mean fields, then replace the context
# string by its [2:5] slice (three characters).
context_table_parsed = context_table.select('context', 'methyl_mean')
context_table_parsed = context_table_parsed.transmute(
    context=context_table_parsed.context[2:5]
)

Initializing Hail with default parameters...


In [None]:
# Convert methyl_mean to float64 so it can be compared against numeric thresholds
context_table_parsed = context_table_parsed.annotate(
    methyl_mean=hl.float64(context_table_parsed.methyl_mean)
)

In [None]:
# Bin the mean methylation into three discrete levels:
#   methyl_mean <= 0.2        -> 0
#   0.2 < methyl_mean <= 0.8  -> 1
#   methyl_mean > 0.8         -> 2
# NOTE(review): confirm that these cut-offs match the binning that was used to produce the
# methylation_level column of ht_mu, since the two are matched on that field later.
context_table_parsed = context_table_parsed.annotate(
    methylation_level=(
        hl.case()
        .when(context_table_parsed.methyl_mean <= 0.2, 0)
        .when(context_table_parsed.methyl_mean <= 0.8, 1)
        .default(2)
    )
)

### Show the data structure

In [None]:
# Preview the first three rows of the gnomAD sites table
ht.show(n=3)

In [None]:
# Preview the mutation-rate table, keyed by context, ref, alt and methylation_level
ht_mu.show(n=3)

In [None]:
# Preview the parsed context table. Its context field holds only the [2:5] slice of the
# precalculated context string, alongside methyl_mean and the derived methylation_level.
context_table_parsed.show(n=3)

## 3. Add context field to main data

In [None]:
# Number of rows before the tri-nucleotide context of each mutation is added
ht.count() # data too big (original note) — presumably this count is slow on the full table

In [None]:
# Look up each (locus, alleles) pair in the parsed context table and copy the matching
# fields onto ht. All rows of ht are kept; rows without a match get missing values.
# A key_by('locus', 'alleles').join(..., how='left') was tried instead, but it re-sorts
# the data, which makes it slow.
ht = ht.annotate(
    **context_table_parsed.index(ht.locus, ht.alleles)
)


In [None]:
# Number of rows after the context fields have been added
ht.count() # data too big (original note) — presumably this count is slow on the full table

In [None]:
# Keep only rows that received a methylation level, then subset to the first 10000 rows
ht = ht.filter(hl.is_defined(ht.methylation_level)).head(10000)

# Preview the subset
ht.show(n=3)

## 4. Add mutation rates for added contexts

In [None]:
# Expose the reference and alternate allele as separate fields
ht = ht.annotate(
    ref=ht.alleles[0],
    alt=ht.alleles[1],
)

# Look up the mutation-rate row matching (context, ref, alt, methylation_level) and copy its
# fields onto ht. A key_by("context", "ref", "alt").join(..., how='left') was tried instead,
# but it re-sorts the data, which makes it slow.
# NOTE(review): combinations that are absent from ht_mu yield missing values; verify that
# ht_mu lists both strand orientations of every context, otherwise some rows stay unannotated.
ht = ht.annotate(
    **ht_mu.index(ht.context, ht.ref, ht.alt, ht.methylation_level)
)


In [None]:
# Number of rows after adding context and mutation rates to the main table.
# Both were added with annotate(), which only adds fields to existing rows, so the count
# cannot exceed the 10000 rows kept by head() even though the same context occurs at many loci.
ht.count()

In [None]:
# Preview the annotated table: rows can share a context while sitting at entirely different loci
ht.show(n=3)

## 5. Train linear model on synonymous variants for mutational class correction

In [None]:
# Build the input table for the regression on synonymous variants.
# NOTE(review): train_on_synonymous is not defined in this notebook; presumably it is provided
# by the 01_maps_funcs.py script run at the top — confirm. The result is later read for the
# fields ps, mu_snp and N_variants.
ht_syn_ps = train_on_synonymous(ht)

In [None]:
# Preview the first three rows of the regression input table
ht_syn_ps.show(n=3)

### Show input table for regression

In [None]:
# Number of rows in the regression input table (built after the mutation rates were added)
ht_syn_ps.count()

### Perform regression

In [None]:
# Weighted linear regression of ps on mu_snp (the constant 1 is the intercept term),
# weighting each row by N_variants. Only the fitted coefficients are kept.
ht_syn_lm = ht_syn_ps.aggregate(
    hl.agg.linreg(
        y=ht_syn_ps.ps,
        x=[1, ht_syn_ps.mu_snp],
        weight=ht_syn_ps.N_variants,
    ).beta
)

# Display the coefficients: intercept first, then the mu_snp slope
ht_syn_lm

## 6. Predict expected number of variants for each context

### The regression code from this point on will eventually be turned into a function and moved to the `/utils/utils.py` script

In [None]:
# Build maps_table from the annotated variant table and the regression coefficients (ht_syn_lm).
# NOTE(review): regress_per_context is not defined in this notebook; presumably it is provided
# by the 01_maps_funcs.py script run at the top — confirm. The result is later read for the
# fields consequence and maps.
maps_table = regress_per_context(ht, ht_syn_lm)

In [None]:
# Display every row of maps_table. A negative row limit makes Table.show print all rows,
# which avoids running a separate count() pass over the table just to size the output.
maps_table.show(-1)

In [None]:
# Bar chart of the maps value per consequence, with one fill colour per consequence
(
    ggplot(maps_table, aes(x=maps_table.consequence, y=maps_table.maps))
    + geom_col(aes(fill=maps_table.consequence))
)
