## Imports and model initialization

In [22]:
%load_ext autoreload
%autoreload 2

import csv
from datetime import datetime
import math
import os
from pathlib import Path
import pickle

import kipoi
from kipoiseq.dataloaders import SeqIntervalDl
from IPython.display import clear_output
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from utils import detect_device

import bpnet
from bpnet.datasets import StrandedProfile
from bpnet.dataspecs import DataSpec, TaskSpec
from bpnet.utils import create_tf_session
from bpnet.utils import read_json
from bpnet.seqmodel import SeqModel
from bpnet.plot.evaluate import plot_loss, regression_eval

from in_silico_mutagenesis import compute_summary_statistics, generate_wt_mut_batches, write_results

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
!pwd

/home/stephenmalina/dev/an1lam/deepmr/src


In [25]:
timestamp = datetime.now().strftime('%Y-%m-%H-%M-%S')
output_dir = f'/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-{timestamp}'
exposure_name = 'Sox2'
outcome_name = 'Nanog'

results_fname = f'{exposure_name}_{outcome_name}_effect_sizes.csv'

In [28]:
os.makedirs(output_dir, exist_ok=True)

# Loading BPNet

In [4]:
class Ensemble:
    def __init__(self, model_base_dir, n_reps=5):
        models = []
        for i in range(n_reps):
            models.append(SeqModel.from_mdir(os.path.join(model_base_dir, str(i))))
        self.models = models

    def predict(self, seqs):
        preds = {}
        for model in self.models:
            model_preds = model.predict(seqs)
            for key, preds_ in model_preds.items():
                preds.setdefault(key, []).append(preds_.mean(-1))
        return {k: np.stack(v) for k, v in preds.items()}

In [5]:
model_base_dir = "./bpnet/examples/chip-nexus/output_ensemble"

In [6]:
model_dir = Path(model_base_dir)

In [7]:
create_tf_session(0)
clear_output()

In [8]:
model = Ensemble(model_base_dir, n_reps=3)
clear_output()

# Loading data

In [9]:
!cat {model_base_dir}/0/evaluation.valid.json

{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.2988962008129486,
    "Oct4/profile/binsize=1/random_auprc": 0.10733649347249394,
    "Oct4/profile/binsize=1/n_positives": 17906,
    "Oct4/profile/binsize=1/frac_ambigous": 0.2969518272425249,
    "Oct4/profile/binsize=1/imbalance": 0.10576866279962668,
    "Oct4/profile/binsize=10/auprc": 0.8636563781189565,
    "Oct4/profile/binsize=10/random_auprc": 0.7731474464412409,
    "Oct4/profile/binsize=10/n_positives": 9216,
    "Oct4/profile/binsize=10/frac_ambigous": 0.5080564784053156,
    "Oct4/profile/binsize=10/imbalance": 0.7779841296640216,
    "Oct4/counts/mse": 1.327919840812683,
    "Oct4/counts/var_explained": -0.6340404748916626,
    "Oct4/counts/pearsonr": -0.019652674369757955,
    "Oct4/counts/spearmanr": -0.0273016972445214,
    "Oct4/counts/mad": 0.9159870743751526,
    "Sox2/profile/binsize=1/auprc": 0.2922871596759037,
    "Sox2/profile/binsize=1/random_auprc": 0.10754282518904301,
    "Sox2/

In [10]:
gin_config = read_json(os.path.join(model_base_dir, str(0), 'config.gin.json'))

In [11]:
ds = DataSpec.load(os.path.join(model_base_dir, '0', 'dataspec.yml'))
tasks = list(ds.task_specs)
tasks

['Oct4', 'Sox2', 'Nanog']

In [12]:
dl_valid = StrandedProfile(ds, 
                           incl_chromosomes=gin_config['bpnet_data.valid_chr'], 
                           peak_width=gin_config['bpnet_data.peak_width'],
                           seq_width=gin_config['bpnet_data.seq_width'],
                           shuffle=False)

In [13]:
valid = dl_valid.load_all(num_workers=1)

100%|██████████| 221/221 [00:09<00:00, 24.01it/s]


In [14]:
valid['inputs'].keys(), valid['targets'].keys(), valid['metadata']

(dict_keys(['seq']),
 dict_keys(['Oct4/profile', 'Sox2/profile', 'Nanog/profile', 'Oct4/counts', 'Sox2/counts', 'Nanog/counts']),
 {'range': {'chr': array(['chr2', 'chr2', 'chr2', ..., 'chr2', 'chr2', 'chr2'], dtype='<U4'),
   'start': array([ 52072143,  27539745, 119046676, ...,  72287298,  10272211,
          171629019]),
   'end': array([ 52072343,  27539945, 119046876, ...,  72287498,  10272411,
          171629219]),
   'id': array([   0,    1,    2, ..., 7062, 7063, 7064]),
   'strand': array(['.', '.', '.', ..., '.', '.', '.'], dtype='<U1')},
  'interval_from_task': array(['Oct4', 'Oct4', 'Oct4', ..., 'Nanog', 'Nanog', 'Nanog'],
        dtype='<U5')})

## Predictions and in-silico mutagenesis

In [15]:
EXPOSURE_COL = f'{exposure_name}/counts'
OUTCOME_COL = f'{outcome_name}/counts'

In [16]:
sample_seqs = valid['inputs']['seq'][0:2]
sample_seqs.shape

(2, 200, 4)

In [17]:
n_seqs = sample_seqs.shape
preds = {}
for seq in sample_seqs:
    muts = generate_wt_mut_batches(seq.T, seq.shape[0] * seq.shape[1]).squeeze()
    preds_ = model.predict(muts.transpose(0, 2, 1))
    for key, value in preds_.items():
        if key in (EXPOSURE_COL, OUTCOME_COL):
            preds.setdefault(key, []).append(preds_[key])

(800,)
(800,)


In [18]:
alphabet_size = 4
formatted_preds = np.stack((preds[EXPOSURE_COL], preds[OUTCOME_COL]))
n_features, n_seqs, n_reps, n_variants = formatted_preds.shape
formatted_preds = formatted_preds.reshape((n_reps, n_seqs, alphabet_size, -1, n_features))
formatted_preds.shape

(3, 2, 4, 200, 2)

In [19]:
means, mean_diffs, stderrs = compute_summary_statistics(formatted_preds, sample_seqs.transpose(0, 2, 1))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




  stderrs = np.sqrt(ref_vars + mut_vars - 2 * covs)


In [31]:
np.mean(means), np.max(means)

(4.7667036, 5.064296)

In [29]:
results_fpath = os.path.join(output_dir, results_fname)
write_results(results_fpath, mean_diffs, stderrs)