## Imports and model initialization

In [2]:
%load_ext autoreload
%autoreload 2

import csv
from datetime import datetime
import math
import os
from pathlib import Path
import pickle

import kipoi
from kipoiseq.dataloaders import SeqIntervalDl
from IPython.display import clear_output
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from utils import detect_device

import bpnet
from bpnet.datasets import StrandedProfile
from bpnet.dataspecs import DataSpec, TaskSpec
from bpnet.utils import create_tf_session
from bpnet.utils import read_json
from bpnet.seqmodel import SeqModel
from bpnet.plot.evaluate import plot_loss, regression_eval

from in_silico_mutagenesis import compute_summary_statistics, generate_wt_mut_batches, write_results

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Using TensorFlow backend.






2020-09-02 21:27:05,074 [INFO] NumExpr defaulting to 8 threads.


In [3]:
!pwd

/home/stephenmalina/project/src


In [5]:
timestamp = datetime.now().strftime('%Y-%m-%H-%M-%S')
output_dir = f'/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-{timestamp}'
exposure_name = 'Sox2'
outcome_name = 'Nanog'

results_fname = f'{exposure_name}_{outcome_name}_effect_sizes.csv'

In [6]:
os.makedirs(output_dir, exist_ok=True)

# Loading BPNet

In [7]:
class Ensemble:
    def __init__(self, model_base_dir, n_reps=5):
        models = []
        for i in range(n_reps):
            models.append(SeqModel.from_mdir(os.path.join(model_base_dir, str(i))))
        self.models = models

    def predict(self, seqs):
        preds = {}
        for model in self.models:
            model_preds = model.predict(seqs)
            for key, preds_ in model_preds.items():
                preds.setdefault(key, []).append(preds_.mean(-1))
        return {k: np.stack(v) for k, v in preds.items()}

In [12]:
model_base_dir = "/home/stephenmalina/project/dat/res-bpnet-training-2020-09-02-16-18-50/output_ensemble"

In [13]:
model_dir = Path(model_base_dir)

In [14]:
create_tf_session(0)
clear_output()

In [15]:
model = Ensemble(model_base_dir, n_reps=3)
clear_output()

# Loading data

In [16]:
!cat {model_base_dir}/0/evaluation.valid.json

{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.174503907210406,
    "Oct4/profile/binsize=1/random_auprc": 0.002968693730871131,
    "Oct4/profile/binsize=1/n_positives": 49840,
    "Oct4/profile/binsize=1/frac_ambigous": 0.07240139387999564,
    "Oct4/profile/binsize=1/imbalance": 0.002925521986142981,
    "Oct4/profile/binsize=10/auprc": 0.47671418115881486,
    "Oct4/profile/binsize=10/random_auprc": 0.03541463757849398,
    "Oct4/profile/binsize=10/n_positives": 39957,
    "Oct4/profile/binsize=10/frac_ambigous": 0.3622726777741479,
    "Oct4/profile/binsize=10/imbalance": 0.03411483457844183,
    "Oct4/counts/mse": 0.32489997148513794,
    "Oct4/counts/var_explained": 0.3644736409187317,
    "Oct4/counts/pearsonr": 0.6039534407707656,
    "Oct4/counts/spearmanr": 0.5689491091522799,
    "Oct4/counts/mad": 0.4609355032444,
    "Sox2/profile/binsize=1/auprc": 0.38153955490138125,
    "Sox2/profile/binsize=1/random_auprc": 0.00560025646079356,
    "Sox2

In [17]:
gin_config = read_json(os.path.join(model_base_dir, str(0), 'config.gin.json'))

In [18]:
ds = DataSpec.load(os.path.join(model_base_dir, '0', 'dataspec.yml'))
tasks = list(ds.task_specs)
tasks

['Oct4', 'Sox2', 'Nanog', 'Klf4']

In [19]:
dl_valid = StrandedProfile(ds, 
                           incl_chromosomes=gin_config['bpnet_data.valid_chr'], 
                           peak_width=gin_config['bpnet_data.peak_width'],
                           seq_width=gin_config['bpnet_data.seq_width'],
                           shuffle=False)

In [20]:
valid = dl_valid.load_all(num_workers=1)

100%|██████████| 915/915 [01:54<00:00,  8.01it/s]


In [21]:
valid['inputs'].keys(), valid['targets'].keys(), valid['metadata']

(dict_keys(['seq']),
 dict_keys(['Oct4/profile', 'Sox2/profile', 'Nanog/profile', 'Klf4/profile', 'Oct4/counts', 'Sox2/counts', 'Nanog/counts', 'Klf4/counts']),
 {'range': {'chr': array(['chr3', 'chr2', 'chr3', ..., 'chr3', 'chr4', 'chr4'], dtype='<U4'),
   'start': array([122145078,  52071743,  96334135, ...,   3672964, 127020434,
          124905354]),
   'end': array([122146078,  52072743,  96335135, ...,   3673964, 127021434,
          124906354]),
   'id': array([    0,     1,     2, ..., 29274, 29275, 29276]),
   'strand': array(['.', '.', '.', ..., '.', '.', '.'], dtype='<U1')},
  'interval_from_task': array(['Oct4', 'Oct4', 'Oct4', ..., 'Klf4', 'Klf4', 'Klf4'], dtype='<U5')})

## Predictions and in-silico mutagenesis

In [22]:
EXPOSURE_COL = f'{exposure_name}/counts'
OUTCOME_COL = f'{outcome_name}/counts'

In [23]:
sample_seqs = valid['inputs']['seq'][0:2]
sample_seqs.shape

(2, 1000, 4)

In [25]:
n_seqs = sample_seqs.shape
preds = {}
for seq in sample_seqs:
    muts = generate_wt_mut_batches(seq.T, seq.shape[0] * seq.shape[1]).squeeze()
    preds_ = model.predict(muts.transpose(0, 2, 1))
    for key, value in preds_.items():
        if key in (EXPOSURE_COL, OUTCOME_COL):
            preds.setdefault(key, []).append(preds_[key])

In [26]:
alphabet_size = 4
formatted_preds = np.stack((preds[EXPOSURE_COL], preds[OUTCOME_COL]))
n_features, n_seqs, n_reps, n_variants = formatted_preds.shape
formatted_preds = formatted_preds.reshape((n_reps, n_seqs, alphabet_size, -1, n_features))
formatted_preds.shape

(3, 2, 4, 1000, 2)

In [27]:
means, mean_diffs, stderrs = compute_summary_statistics(formatted_preds, sample_seqs.transpose(0, 2, 1))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




  stderrs = np.sqrt(ref_vars + mut_vars - 2 * covs)


In [28]:
np.mean(means), np.max(means)

(6.4399824, 7.166176)

In [29]:
results_fpath = os.path.join(output_dir, results_fname)
write_results(results_fpath, mean_diffs, stderrs)

In [30]:
results_fpath

'/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-21-27-21/Sox2_Nanog_effect_sizes.csv'