## Imports and model initialization

In [1]:
%load_ext autoreload
%autoreload 2

import csv
from datetime import datetime
import math
import os
from pathlib import Path
import pickle

import kipoi
from kipoiseq.dataloaders import SeqIntervalDl
from IPython.display import clear_output
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from utils import detect_device

import bpnet
from bpnet.datasets import StrandedProfile
from bpnet.dataspecs import DataSpec, TaskSpec
from bpnet.utils import create_tf_session
from bpnet.utils import read_json
from bpnet.seqmodel import SeqModel
from bpnet.plot.evaluate import plot_loss, regression_eval

from in_silico_mutagenesis import compute_summary_statistics, generate_wt_mut_batches, write_results

Using TensorFlow backend.






2020-09-09 13:33:14,282 [INFO] NumExpr defaulting to 8 threads.


In [2]:
# Show the kernel's working directory.
# NOTE(review): the local imports (`utils`, `in_silico_mutagenesis`) presumably resolve relative to it — confirm.
!pwd

/home/stephenmalina/project/src


In [3]:
# --- Run-wide settings -------------------------------------------------------
# Number of validation sequences to sample, and the size of the one-hot alphabet.
n_seqs = 2000
alphabet_size = 4

# Factors whose count predictions are compared against each other.
factor_names = ['Oct4', 'Sox2', 'Nanog', 'Klf4']

# Every run writes into its own directory, named after the second-resolution start time.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
output_dir = f'/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-{timestamp}'

# All ordered (exposure, outcome) pairs of distinct factors: 4 * 3 = 12 pairs.
exposure_outcome_pairs = [
    (exposure_name, outcome_name)
    for exposure_name in factor_names
    for outcome_name in factor_names
    if exposure_name != outcome_name
]

# One results file name per pair, in the same order as exposure_outcome_pairs.
results_fnames = [
    f'{exposure_name}_{outcome_name}_effect_sizes.csv'
    for exposure_name, outcome_name in exposure_outcome_pairs
]

In [4]:
# Create the results directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

## Loading BPNet

In [5]:
class Ensemble:
    """A group of independently trained replicate models queried together.

    Replicates are loaded with ``SeqModel.from_mdir`` from the subdirectories
    ``"0"``, ``"1"``, ..., ``str(n_reps - 1)`` of ``model_base_dir``.
    """

    def __init__(self, model_base_dir, n_reps=5):
        """Load ``n_reps`` replicate models.

        Args:
            model_base_dir: Directory holding one numbered subdirectory per replicate.
            n_reps: Number of replicates to load (default 5).
        """
        self.models = [
            SeqModel.from_mdir(os.path.join(model_base_dir, str(rep)))
            for rep in range(n_reps)
        ]

    def predict(self, seqs):
        """Run every replicate on ``seqs`` and collect the outputs per key.

        Args:
            seqs: Input passed unchanged to each replicate's ``predict``.

        Returns:
            dict with the keys of the replicates' prediction dicts. Each value
            is the ``np.stack`` of the per-replicate outputs, so axis 0 indexes
            the replicate; every output is first averaged over its last axis.
        """
        per_head = {}
        for member in self.models:
            for head, output in member.predict(seqs).items():
                if head not in per_head:
                    per_head[head] = []
                # NOTE(review): the averaged last axis is presumably the strand axis — confirm.
                per_head[head].append(output.mean(-1))

        stacked = {}
        for head, outputs in per_head.items():
            stacked[head] = np.stack(outputs)
        return stacked

In [6]:
# Root directory of the trained BPNet ensemble; other cells read the numbered
# per-replicate subdirectories ("0", "1", ...) underneath it.
# NOTE(review): absolute, machine-specific path — point it at your own training output.
model_base_dir = "/home/stephenmalina/project/dat/res-bpnet-training-2020-09-02-16-18-50/output_ensemble"

In [7]:
# pathlib view of the ensemble directory.
# NOTE(review): not referenced by any other cell visible in this notebook — confirm before removing.
model_dir = Path(model_base_dir)

In [8]:
# Set up the TensorFlow session through bpnet's helper before any model is loaded.
# NOTE(review): the argument 0 is presumably the GPU/device index — confirm against bpnet.utils.create_tf_session.
create_tf_session(0)
# Drop whatever the session setup printed into the cell output.
clear_output()

In [9]:
# Load the ensemble with the default n_reps=5, i.e. the replicates stored in
# subdirectories "0" to "4" of model_base_dir.
model = Ensemble(model_base_dir)
# Drop whatever model loading printed into the cell output.
clear_output()

## Loading data

In [10]:
# Print the stored validation metrics of replicate 0
# ({model_base_dir} is interpolated by IPython before the shell runs).
!cat {model_base_dir}/0/evaluation.valid.json

{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.174503907210406,
    "Oct4/profile/binsize=1/random_auprc": 0.002968693730871131,
    "Oct4/profile/binsize=1/n_positives": 49840,
    "Oct4/profile/binsize=1/frac_ambigous": 0.07240139387999564,
    "Oct4/profile/binsize=1/imbalance": 0.002925521986142981,
    "Oct4/profile/binsize=10/auprc": 0.47671418115881486,
    "Oct4/profile/binsize=10/random_auprc": 0.03541463757849398,
    "Oct4/profile/binsize=10/n_positives": 39957,
    "Oct4/profile/binsize=10/frac_ambigous": 0.3622726777741479,
    "Oct4/profile/binsize=10/imbalance": 0.03411483457844183,
    "Oct4/counts/mse": 0.32489997148513794,
    "Oct4/counts/var_explained": 0.3644736409187317,
    "Oct4/counts/pearsonr": 0.6039534407707656,
    "Oct4/counts/spearmanr": 0.5689491091522799,
    "Oct4/counts/mad": 0.4609355032444,
    "Sox2/profile/binsize=1/auprc": 0.38153955490138125,
    "Sox2/profile/binsize=1/random_auprc": 0.00560025646079356,
    "Sox2

In [11]:
# Gin configuration saved with replicate 0; the dataset cell reads the
# validation chromosomes and the peak/sequence widths from it.
gin_config = read_json(os.path.join(model_base_dir, str(0), 'config.gin.json'))

In [12]:
# Load the dataspec saved with replicate 0.
ds = DataSpec.load(os.path.join(model_base_dir, '0', 'dataspec.yml'))
# Iterating task_specs yields the task names (see the recorded output).
tasks = list(ds.task_specs)
tasks

['Oct4', 'Sox2', 'Nanog', 'Klf4']

In [13]:
# Dataset over the validation chromosomes, using the peak and sequence widths
# recorded in the gin configuration.
# NOTE(review): shuffle=False presumably keeps examples in a fixed order — confirm.
dl_valid = StrandedProfile(ds, 
                           # Disabled override, kept for reference (presumably restricts the
                           # examples to the Nanog CRISPR-seq peaks — confirm before re-enabling):
#                            intervals_file='/home/stephenmalina/project/dat/bpnet-manuscript-data/data/chip-nexus/Nanog/crispr-seq.narrowPeak.gz',
                           incl_chromosomes=gin_config['bpnet_data.valid_chr'], 
                           peak_width=gin_config['bpnet_data.peak_width'],
                           seq_width=gin_config['bpnet_data.seq_width'],
                           shuffle=False)

In [14]:
# Load the complete validation set with a single worker.
# Slow: the recorded run took about 1m44s.
valid = dl_valid.load_all(num_workers=1)

100%|██████████| 915/915 [01:44<00:00,  8.77it/s]


In [15]:
# Peek at the loaded structure: input keys, targets and metadata.
# NOTE(review): this echoes whole target arrays into the saved output; displaying
# only keys/shapes would keep the notebook smaller and easier to read.
valid['inputs'].keys(), valid['targets'], valid['metadata']

(dict_keys(['seq']),
 {'Oct4/profile': array([[[0., 0.],
          [0., 0.],
          [0., 0.],
          ...,
          [0., 0.],
          [1., 0.],
          [1., 0.]],
  
         [[1., 1.],
          [0., 0.],
          [1., 0.],
          ...,
          [0., 0.],
          [1., 0.],
          [0., 0.]],
  
         [[0., 0.],
          [0., 0.],
          [0., 0.],
          ...,
          [0., 0.],
          [0., 1.],
          [0., 0.]],
  
         ...,
  
         [[0., 0.],
          [0., 0.],
          [0., 0.],
          ...,
          [0., 0.],
          [0., 0.],
          [0., 0.]],
  
         [[0., 0.],
          [0., 0.],
          [0., 0.],
          ...,
          [0., 0.],
          [0., 1.],
          [0., 1.]],
  
         [[0., 0.],
          [0., 0.],
          [0., 0.],
          ...,
          [0., 1.],
          [0., 1.],
          [0., 0.]]], dtype=float32),
  'Sox2/profile': array([[[0., 0.],
          [1., 0.],
          [0., 0.],
          ...,
       

## Predictions and in-silico mutagenesis

In [16]:
# Prediction keys to keep, one '<factor>/counts' entry per factor; the
# mutagenesis loop discards every model output whose key is not listed here.
cols = [f'{factor_name}/counts' for factor_name in factor_names]

In [17]:
# Recorded output: (29277, 1000, 4).
# NOTE(review): presumably (examples, sequence length, one-hot base) — confirm.
valid['inputs']['seq'].shape

(29277, 1000, 4)

In [18]:
# Keep only the sequences whose entries are all exactly 0.0 or 1.0.
# NOTE(review): the rejected examples are presumably windows containing
# ambiguous bases — confirm.
#
# One vectorized mask replaces the per-sequence Python loop and the
# list-to-array copy. Unlike np.array([]) built from an empty list, the result
# keeps its (length, alphabet) axes even when no sequence passes the filter.
all_seqs = valid['inputs']['seq']
is_binary = ((all_seqs == 0.0) | (all_seqs == 1.0)).all(axis=(1, 2))
valid_seqs = all_seqs[is_binary]
valid_seqs.shape

(29264, 1000, 4)

In [19]:
# Seed NumPy's global RNG so the same subset is drawn on every run.
np.random.seed(42)
idxs = np.arange(len(valid_seqs))
# Shuffles the index array in place.
np.random.shuffle(idxs)
# Take the first n_seqs shuffled indices, i.e. a sample without replacement.
# n_seqs is used as a slice bound, so it must be an int when this cell runs.
sample_seqs = valid_seqs[idxs[:n_seqs]]
sample_seqs.shape

(2000, 1000, 4)

In [20]:
# Predict every single-base variant of each sampled sequence with the ensemble.
#
# n_seqs is deliberately NOT rebound here. The previous `n_seqs = sample_seqs.shape`
# replaced the integer sample size with a tuple that nothing read afterwards, and
# it broke the `idxs[:n_seqs]` slice whenever the sampling cell was re-run.
#
# preds maps each '<factor>/counts' key to a list with one entry per sequence.
# Resetting it here keeps the cell idempotent on re-run.
preds = {}
for seq in tqdm(sample_seqs):
    # The helper receives the sequence transposed, with a batch size equal to
    # the number of entries in `seq`, so all variants arrive in one batch.
    # NOTE(review): the layout of the returned batch is defined by
    # in_silico_mutagenesis.generate_wt_mut_batches — confirm there.
    muts = generate_wt_mut_batches(seq.T, seq.shape[0] * seq.shape[1]).squeeze()
    # Transpose back to the axis order of `seq` before handing the batch to the model.
    preds_ = model.predict(muts.transpose(0, 2, 1))
    for key, value in preds_.items():
        # Keep only the count heads listed in cols; other outputs are dropped.
        if key in cols:
            preds.setdefault(key, []).append(value)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [21]:
np.array(preds['Oct4/counts']).shape

(2000, 5, 4000)

In [22]:
# The sampled sequences are (2000, 1000, 4); swapping the last two axes gives
# compute_summary_statistics a (2000, 4, 1000) array.
seqs = sample_seqs.transpose(0, 2, 1)

for exposure, outcome in exposure_outcome_pairs:
    exposure_col, outcome_col = f'{exposure}/counts', f'{outcome}/counts'

    # Stack to (feature, seq, rep, variant): feature 0 is the exposure,
    # feature 1 the outcome.
    formatted_preds = np.stack((preds[exposure_col], preds[outcome_col]))
    n_features, n_seqs, n_reps, n_variants = formatted_preds.shape

    # Reorder to (rep, seq, variant, feature), then split the flat variant axis
    # into (alphabet_size, -1).
    # NOTE(review): this split assumes the variants are ordered base-major, as
    # produced by generate_wt_mut_batches — confirm.
    formatted_preds = formatted_preds.transpose(2, 1, 3, 0).reshape(
        n_reps, n_seqs, alphabet_size, -1, n_features
    )

    # Only mean_diffs and stderrs are written out; means is not used below.
    means, mean_diffs, stderrs = compute_summary_statistics(formatted_preds, seqs)

    results_fpath = os.path.join(output_dir, f'{exposure}_{outcome}_effect_sizes.csv')
    write_results(results_fpath, mean_diffs, stderrs)
    print(results_fpath)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




  stderrs = np.sqrt(ref_vars + mut_vars - 2 * covs)


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Oct4_Sox2_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Oct4_Nanog_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Oct4_Klf4_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Sox2_Oct4_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Sox2_Nanog_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Sox2_Klf4_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Nanog_Oct4_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Nanog_Sox2_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Nanog_Klf4_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Klf4_Oct4_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Klf4_Sox2_effect_sizes.csv


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


/home/stephenmalina/dev/an1lam/deepmr/dat/res-bpnet-2020-09-09-13-33-15/Klf4_Nanog_effect_sizes.csv
