## Imports and model initialization

In [1]:
%load_ext autoreload
%autoreload 2

import csv
from datetime import datetime
import math
import os
from pathlib import Path
import pickle

from IPython.display import clear_output
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from tqdm.notebook import tqdm
from utils import detect_device
import uncertainty_toolbox
import uncertainty_toolbox.data as udata
import uncertainty_toolbox.metrics as umetrics
from uncertainty_toolbox.metrics_calibration import (
    get_proportion_lists_vectorized,
)
import uncertainty_toolbox.viz as uviz
from uncertainty_toolbox.recalibration import iso_recal

import bpnet
from bpnet.datasets import StrandedProfile
from bpnet.dataspecs import DataSpec, TaskSpec
from bpnet.utils import create_tf_session
from bpnet.utils import read_json
from bpnet.seqmodel import SeqModel
from bpnet.plot.evaluate import plot_loss, regression_eval

from filter_instrument_candidates import filter_variants_by_score
from in_silico_mutagenesis import compute_summary_statistics, generate_wt_mut_batches, write_results

Using TensorFlow backend.






2022-07-04 21:30:20,585 [INFO] NumExpr defaulting to 4 threads.


In [2]:
!pwd

/home/ubuntu/dev/an1lam/deepmr/src


In [3]:
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
output_dir = f'/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-{timestamp}'
model_base_dir = "/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-training-2022-06-18-14-11-05/output_ensemble"
factor_names = ['Oct4', 'Sox2', 'Nanog', 'Klf4']

exposure_outcome_pairs = [
    (exposure_name, outcome_name)
    for exposure_name in factor_names
    for outcome_name in factor_names
    if exposure_name != outcome_name
]

results_fnames = [
    f'{exposure_name}_{outcome_name}_effect_sizes.csv' for exposure_name, outcome_name in exposure_outcome_pairs
]
n_seqs = 2000
n_reps = 5
alphabet_size = 4

In [4]:
os.makedirs(output_dir, exist_ok=True)

# Loading BPNet

In [5]:
class Ensemble:
    def __init__(self, model_base_dir, n_reps=5):
        models = []
        for i in range(n_reps):
            models.append(SeqModel.from_mdir(os.path.join(model_base_dir, str(i))))
        self.models = models

    def predict(self, seqs):
        preds = {}
        for model in self.models:
            model_preds = model.predict(seqs)
            for key, preds_ in model_preds.items():
                preds.setdefault(key, []).append(preds_.mean(-1))
        return {k: np.stack(v) for k, v in preds.items()}

In [6]:
model_dir = Path(model_base_dir)

In [7]:
create_tf_session(0)
clear_output()

In [8]:
!ls {model_base_dir}/0/

bpnet-train.kwargs.json  events.out.tfevents.1655561487.ip-172-31-61-149
config.gin		 history.csv
config.gin.json		 input-config.gin
dataspec.yml		 log
evaluate.html		 model.h5
evaluate.ipynb		 note_params.json
evaluation.valid.json	 seq_model.pkl


In [9]:
model = Ensemble(model_base_dir, n_reps=5)
clear_output()

# Loading data

In [10]:
!cat {model_base_dir}/0/evaluation.valid.json

{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.1842908885317406,
    "Oct4/profile/binsize=1/random_auprc": 0.0029172058898126905,
    "Oct4/profile/binsize=1/n_positives": 49497,
    "Oct4/profile/binsize=1/frac_ambigous": 0.07206144485858364,
    "Oct4/profile/binsize=1/imbalance": 0.00292374568581084,
    "Oct4/profile/binsize=10/auprc": 0.49499918176971536,
    "Oct4/profile/binsize=10/random_auprc": 0.03453527112065736,
    "Oct4/profile/binsize=10/n_positives": 39527,
    "Oct4/profile/binsize=10/frac_ambigous": 0.3608791931593949,
    "Oct4/profile/binsize=10/imbalance": 0.03389930806887065,
    "Oct4/counts/mse": 0.2637477517127991,
    "Oct4/counts/var_explained": 0.387393057346344,
    "Oct4/counts/pearsonr": 0.6247291515481017,
    "Oct4/counts/spearmanr": 0.5989260316683799,
    "Oct4/counts/mad": 0.386260986328125,
    "Sox2/profile/binsize=1/auprc": 0.3810130867099396,
    "Sox2/profile/binsize=1/random_auprc": 0.005999426096531899,
    "Sox

In [11]:
gin_config = read_json(os.path.join(model_base_dir, '0', 'config.gin.json'))
gin_config

{'Adam.amsgrad': False,
 'Adam.beta_1': 0.9,
 'Adam.beta_2': 0.999,
 'Adam.decay': 0.0,
 'Adam.epsilon': 'None',
 'Adam.lr': 0.004,
 'DeConv1D.batchnorm': False,
 'DeConv1D.filters': 64,
 'DeConv1D.n_hidden': 0,
 'DeConv1D.n_tasks': 2,
 'DeConv1D.padding': 'same',
 'DeConv1D.tconv_kernel_size': 25,
 'DilatedConv1D.add_pointwise': False,
 'DilatedConv1D.batchnorm': False,
 'DilatedConv1D.conv1_kernel_size': 25,
 'DilatedConv1D.filters': 64,
 'DilatedConv1D.n_dil_layers': 9,
 'DilatedConv1D.padding': 'same',
 'DilatedConv1D.skip_type': 'residual',
 'GlobalAvgPoolFCN.batchnorm': False,
 'GlobalAvgPoolFCN.dropout': 0,
 'GlobalAvgPoolFCN.dropout_hidden': 0,
 'GlobalAvgPoolFCN.hidden': 'None',
 'GlobalAvgPoolFCN.n_splines': 0,
 'GlobalAvgPoolFCN.n_tasks': 2,
 'MovingAverages.window_sizes': [1, 50],
 'PeakPredictionProfileMetric.binsizes': [1, 10],
 'PeakPredictionProfileMetric.neg_max_threshold': 0.005,
 'PeakPredictionProfileMetric.pos_min_threshold': 0.015,
 'PeakPredictionProfileMetric.re

In [12]:
ds = DataSpec.load(os.path.join(model_base_dir, '0', 'dataspec.yml')) # remember to re-add 0
tasks = list(ds.task_specs)
tasks

['Oct4', 'Sox2', 'Nanog', 'Klf4']

In [13]:
dl_valid = StrandedProfile(ds, 
                           incl_chromosomes=gin_config['bpnet_data.valid_chr'], 
                           peak_width=gin_config['bpnet_data.peak_width'],
                           seq_width=gin_config['bpnet_data.seq_width'],
                           shuffle=False)

In [14]:
valid = dl_valid.load_all(batch_size=256, num_workers=2)

100%|██████████| 115/115 [01:00<00:00,  1.89it/s]


In [15]:
valid['targets']['Oct4/counts'].shape

(29277, 2)

# (Re-)Calibration

In [16]:
def fit_recalibrators(model, features, dataset: dict, batch_size=256):
    seqs = dataset['inputs']['seq']
    targets = dataset['targets']
    predictions = {f: np.zeros((seqs.shape[0], n_reps)) for f in features}
    ys = {k: np.zeros(seqs.shape[0]) for k, v in dataset['targets'].items()}
    for start_idx in range(0, len(seqs), batch_size):
        end_idx = min(start_idx + batch_size, len(seqs))
        seq_batch = seqs[start_idx: end_idx]
        p = model.predict(seq_batch)
        if start_idx % (256 * 10) == 0:
            print(start_idx)
        for f in features:
            predictions[f][start_idx: end_idx, :] = p[f].T
            ys[f][start_idx: end_idx] = targets[f][start_idx: end_idx, :].mean(axis=-1)

    pred_means = {}
    pred_stds = {}
    for f in features:
        pred_means[f] = np.mean(predictions[f], axis=1).squeeze()
        pred_stds[f] = np.std(predictions[f], axis=1).squeeze()

    recal_models = {}
    for f in features:
        y = ys[f]
        pred_mean, pred_std = pred_means[f], pred_stds[f]
        exp_props, obs_props = get_proportion_lists_vectorized(pred_mean, pred_std, y)
        recal_model = iso_recal(exp_props, obs_props)
        recal_models[f] = recal_model
    return recal_models

def recal_predict(recalibrators, preds, features):
    pred_means = {}
    pred_stds = {}
    for f in features:
        pred_means[f] = np.mean(preds[f], axis=0).squeeze()
        pred_stds[f] = np.std(preds[f], axis=0).squeeze()
    recal_preds = {k: np.zeros_like(v) for k, v in preds.items()}
    for f in features:
        pred_dist = stats.norm(loc=pred_means[f], scale=pred_stds[f])
        for c in range(preds[f].shape[0]):
            recal_model = recalibrators[f]
            orig_preds = preds[f][c, :]
            orig_quantiles = pred_dist.cdf(orig_preds)
            recal_quantiles = recal_model.predict(orig_quantiles)
            recal_preds[f][c] = pred_dist.ppf(recal_quantiles)
    return recal_preds

## Predictions and in-silico mutagenesis

In [17]:
cols = [f'{factor_name}/counts' for factor_name in factor_names]

In [18]:
valid['inputs']['seq'].shape

(29277, 1000, 4)

In [19]:
valid_range_meta

NameError: name 'valid_range_meta' is not defined

In [20]:
valid_records = []
valid_range_meta = valid['metadata']['range']
it = zip(valid['inputs']['seq'], valid_range_meta['start'], valid_range_meta['end'], valid_range_meta['strand'], valid_range_meta['chr'])
for seq, start, end, strand, chrom in tqdm(it):
    if ((seq == 0.0) | (seq == 1.0)).all():
        valid_records.append({
            "seq": seq,
            "start": start,
            "end": end,
            "strand": strand,
            "chrom": chrom
        })
valid_ranges = pd.DataFrame.from_records(valid_records)
valid_seqs = valid_ranges.seq.values
valid_seqs.shape

0it [00:00, ?it/s]

(29264,)

In [21]:
np.random.seed(42)
idxs = np.arange(len(valid_seqs))
np.random.shuffle(idxs)
sample_seqs = valid_seqs[idxs[:n_seqs]]
sample_ranges = valid_ranges.iloc[idxs[:n_seqs]].copy()
sample_seqs.shape, sample_ranges.shape

((2000,), (2000, 5))

In [22]:
mfi_df = pd.read_csv('/home/ubuntu/motif-instances.bed', sep='\t', header=None)
mfi_df.columns = ["chrom", "start", "end", "tf", "match-score", "strand", "contrib-score", "log-odds-score"]

In [23]:
valid_mfi_df = mfi_df.query('chrom in ["chr2", "chr3", "chr4"]').sort_values(by=['chrom', 'start'])

In [24]:
for idx, row in sample_ranges.iterrows():
    mfi_rows = valid_mfi_df.query(
        f"start >= {row.start} and end <= {row.end} and chrom == '{row.chrom}'"
    )
    # mfi_rows = mfi_rows.query(f"strand == '{row.strand}'")
    for _, mfi_row in mfi_rows.iterrows():
        sample_ranges.loc[idx, f"{mfi_row.tf}_start"] = mfi_row.start
        sample_ranges.loc[idx, f"{mfi_row.tf}_end"] = mfi_row.end  
        sample_ranges.loc[idx, f"{mfi_row.tf}_match_score"] = mfi_row["match-score"]

In [25]:
sample_ranges.columns

Index(['seq', 'start', 'end', 'strand', 'chrom', 'Oct4_start', 'Oct4_end',
       'Oct4_match_score', 'Sox2_start', 'Sox2_end', 'Sox2_match_score',
       'Oct4-Sox2_start', 'Oct4-Sox2_end', 'Oct4-Sox2_match_score',
       'Nanog_start', 'Nanog_end', 'Nanog_match_score', 'Essrb_start',
       'Essrb_end', 'Essrb_match_score', 'Oct4-Oct4_start', 'Oct4-Oct4_end',
       'Oct4-Oct4_match_score', 'Klf4_start', 'Klf4_end', 'Klf4_match_score',
       'Zic3_start', 'Zic3_end', 'Zic3_match_score', 'B-Box_start',
       'B-Box_end', 'B-Box_match_score', 'Nanog-partner_start',
       'Nanog-partner_end', 'Nanog-partner_match_score', 'Klf4-Klf4_start',
       'Klf4-Klf4_end', 'Klf4-Klf4_match_score'],
      dtype='object')

In [26]:
def get_distance(start1, start2, end1, end2):
    if np.isnan(start1) or np.isnan(start2):
        return np.nan
    if end1 < start2:
        return start2 - end1
    elif end2 < start1:
        return start1 - end2
    else:
        return 0

for exposure, outcome in exposure_outcome_pairs:
    distance_column_name = f"{exposure}_{outcome}_distance"
    assert f"{exposure}_start" in sample_ranges.columns and f"{outcome}_start" in sample_ranges.columns
    columns = [f"{exposure}_start", f"{outcome}_start", f"{exposure}_end", f"{outcome}_end"]
    sample_ranges[distance_column_name] = sample_ranges[columns].apply(
        lambda row: get_distance(*[row[c] for c in columns]),
        axis=1
    )

In [27]:
(~pd.isna(sample_ranges.Klf4_Oct4_distance)).sum()

359

In [28]:
valid['inputs']['seq'].shape, valid['targets'][cols[0]].shape

((29277, 1000, 4), (29277, 2))

In [29]:
recal_models = fit_recalibrators(model, cols, valid)

0
2560
5120
7680
10240
12800
15360
17920
20480
23040
25600
28160


In [30]:
n_seqs = sample_seqs.shape
preds = {}
recal_preds = {}
for seq in tqdm(sample_seqs):
    muts = generate_wt_mut_batches(seq.T, seq.shape[0] * seq.shape[1]).squeeze()
    preds_ = model.predict(muts.transpose(0, 2, 1))
    recal_preds_ = recal_predict(recal_models, preds_, cols)
    for key, value in preds_.items():
        if key in cols:
            preds.setdefault(key, []).append(preds_[key])
            recal_preds.setdefault(key, []).append(recal_preds_[key])

  0%|          | 0/2000 [00:00<?, ?it/s]

In [64]:
sample_seqs[0].shape

(1000, 4)

In [69]:
np.array(recal_preds['Sox2/counts'])[0].reshape(5, 1000, 4)[0][sample_seqs[0].astype(np.bool)]

array([2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  , 2.93739  ,
       2.93739  , 2.93739  , 2.93739  , 2.93739  , 

In [32]:
print(exposure_outcome_pairs)

[('Oct4', 'Sox2'), ('Oct4', 'Nanog'), ('Oct4', 'Klf4'), ('Sox2', 'Oct4'), ('Sox2', 'Nanog'), ('Sox2', 'Klf4'), ('Nanog', 'Oct4'), ('Nanog', 'Sox2'), ('Nanog', 'Klf4'), ('Klf4', 'Oct4'), ('Klf4', 'Sox2'), ('Klf4', 'Nanog')]


In [47]:
sample_seqs = np.array([seq for seq in sample_seqs])
sample_seqs.shape

(2000, 1000, 4)

In [55]:
print(np.array(recal_preds[exposure_col]).shape)

(2000, 5, 4000)


In [None]:
seqs = sample_seqs.transpose(0, 2, 1)

for exposure, outcome in exposure_outcome_pairs:
    exposure_col = f'{exposure}/counts'
    outcome_col = f'{outcome}/counts'
    distance_col = f"{exposure}_{outcome}_distance"
    
    if distance_col in sample_ranges.columns:
        print(distance_col)
        formatted_preds = np.stack((recal_preds[exposure_col], recal_preds[outcome_col]))
        n_features, n_seqs, n_reps, n_variants = formatted_preds.shape
        formatted_preds = formatted_preds.transpose(2, 1, 3, 0)
        formatted_preds = formatted_preds.reshape(n_reps, n_seqs, -1, alphabet_size, n_features)
        formatted_preds = formatted_preds.transpose(0, 1, 3, 2, 4)
        
        motif_distances = sample_ranges[distance_col].values
        means, mean_diffs, stderrs = compute_summary_statistics(formatted_preds, seqs)
        assert len(motif_distances) == mean_diffs.shape[0]

        sig_var_idxs = filter_variants_by_score(mean_diffs[:, :, :, 0], z_threshold=3.0)
        print(
            "Reduced number of instruments down from %d to %d (%.2f %%)"
            % (
                np.prod(mean_diffs.shape),
                len(np.nonzero(sig_var_idxs)[0]),
                float(len(np.nonzero(sig_var_idxs)[0]) / np.prod(mean_diffs.shape)) * 100,
            )
        )
        print(sig_var_idxs.shape)

        results_fname = f'{exposure}_{outcome}_effect_sizes_v2.csv'
        results_fpath = os.path.join(output_dir, results_fname)
        write_results(results_fpath, mean_diffs, stderrs,  sig_idxs=sig_var_idxs, motif_distance=motif_distances)
        print(results_fpath)

Oct4_Sox2_distance


  0%|          | 0/2000 [00:00<?, ?it/s]

  stderrs = np.sqrt(lambdas * ref_vars + lambdas * mut_vars - 2 * lambdas * covs)


Reduced number of instruments down from 12000000 to 74337 (0.62 %)
(2000, 3, 1000)
/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-2022-07-04-21-30-22/Oct4_Sox2_effect_sizes_v2.csv
Oct4_Nanog_distance


  0%|          | 0/2000 [00:00<?, ?it/s]

Reduced number of instruments down from 12000000 to 74337 (0.62 %)
(2000, 3, 1000)
/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-2022-07-04-21-30-22/Oct4_Nanog_effect_sizes_v2.csv
Oct4_Klf4_distance


  0%|          | 0/2000 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
mean_diffs.shape