## Imports and model initialization

In [1]:
%load_ext autoreload
%autoreload 2

import csv
from datetime import datetime
import math
import os
from pathlib import Path
import pickle

from IPython.display import clear_output
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from tqdm.notebook import tqdm
from utils import detect_device
import uncertainty_toolbox
import uncertainty_toolbox.data as udata
import uncertainty_toolbox.metrics as umetrics
from uncertainty_toolbox.metrics_calibration import (
    get_proportion_lists_vectorized,
)
import uncertainty_toolbox.viz as uviz
from uncertainty_toolbox.recalibration import iso_recal

import bpnet
from bpnet.datasets import StrandedProfile
from bpnet.dataspecs import DataSpec, TaskSpec
from bpnet.utils import create_tf_session
from bpnet.utils import read_json
from bpnet.seqmodel import SeqModel
from bpnet.plot.evaluate import plot_loss, regression_eval

from filter_instrument_candidates import filter_variants_by_score
from in_silico_mutagenesis import compute_summary_statistics, generate_wt_mut_batches, write_results

Using TensorFlow backend.






2022-06-29 11:59:13,890 [INFO] NumExpr defaulting to 4 threads.


In [None]:
# Print the kernel's current working directory.
!pwd

In [82]:
# --- Run configuration -------------------------------------------------------
# Every run writes into a fresh directory named after the current wall-clock time.
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
output_dir = f'/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-{timestamp}'
# Directory holding one sub-directory per ensemble member ("0", "1", ...).
model_base_dir = "/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-training-2022-01-29-22-58-12/output_ensemble"
factor_names = ['Oct4', 'Sox2', 'Nanog', 'Klf4']

# All ordered (exposure, outcome) pairs of distinct factors: 4 * 3 = 12 pairs.
exposure_outcome_pairs = [
    (exposure_name, outcome_name)
    for exposure_name in factor_names
    for outcome_name in factor_names
    if exposure_name != outcome_name
]

# One results file name per pair, in the same order as exposure_outcome_pairs.
results_fnames = [
    f'{exposure_name}_{outcome_name}_effect_sizes.csv' for exposure_name, outcome_name in exposure_outcome_pairs
]
n_seqs = 2000  # number of validation sequences sampled for mutagenesis
n_reps = 5  # ensemble size assumed when sizing per-member prediction arrays
alphabet_size = 4  # used to reshape per-variant predictions (presumably A/C/G/T — confirm)

In [None]:
# Create the timestamped results directory; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# Loading BPNet

In [None]:
class Ensemble:
    """A group of independently trained SeqModels queried together.

    Member ``i`` is loaded from the sub-directory ``<model_base_dir>/<i>``
    for ``i`` in ``range(n_reps)``.
    """

    def __init__(self, model_base_dir, n_reps=5):
        """Load ``n_reps`` models from consecutively numbered sub-directories."""
        self.models = [
            SeqModel.from_mdir(os.path.join(model_base_dir, str(rep)))
            for rep in range(n_reps)
        ]

    def predict(self, seqs):
        """Predict with every member and stack the results per output key.

        Each member's output for a key is averaged over its last axis before
        stacking, so the returned arrays have the ensemble member as their
        first axis.

        Returns:
            Dict mapping each output key to ``np.stack`` of the members'
            last-axis-averaged predictions.
        """
        per_key = {}
        for member in self.models:
            for key, values in member.predict(seqs).items():
                if key not in per_key:
                    per_key[key] = []
                per_key[key].append(values.mean(-1))
        return {key: np.stack(stacked) for key, stacked in per_key.items()}

In [None]:
# pathlib view of the ensemble directory.
# NOTE(review): no later cell visible here reads model_dir (they use os.path.join on model_base_dir) — confirm it is still needed.
model_dir = Path(model_base_dir)

In [None]:
# Set up the TensorFlow session through BPNet's helper (the argument 0 is presumably the GPU index — confirm),
# then clear the cell output to hide whatever the call printed.
create_tf_session(0)
clear_output()

In [None]:
# List the files stored for ensemble member 0.
!ls {model_base_dir}/0/

In [None]:
# Load the ensemble and clear the (noisy) loading output.
# NOTE(review): this loads only 2 members while the config cell sets n_reps = 5 — confirm which ensemble size is intended.
model = Ensemble(model_base_dir, n_reps=2)
clear_output()

# Loading data

In [None]:
# Print the validation evaluation JSON saved alongside ensemble member 0.
!cat {model_base_dir}/0/evaluation.valid.json

In [None]:
# Read the gin configuration saved with ensemble member 0; the data loader below takes its
# validation chromosomes, peak width and sequence width from it.
gin_config = read_json(os.path.join(model_base_dir, '0', 'config.gin.json'))
gin_config

In [None]:
# Load the data specification saved with ensemble member 0 and list the tasks it defines.
ds = DataSpec.load(os.path.join(model_base_dir, '0', 'dataspec.yml')) # remember to re-add 0
tasks = list(ds.task_specs)
tasks

In [None]:
# Validation-set loader restricted to the chromosomes, peak width and sequence width
# recorded in the training gin config; shuffle=False keeps the example order deterministic.
# NOTE(review): a dangling, truncated `inter` token used to sit between `seq_width` and
# `shuffle`, making this cell a SyntaxError. It has been removed. If an argument such as
# `intervals_file=` or `interval_transformer=` was intended, re-add it explicitly.
dl_valid = StrandedProfile(ds,
                           incl_chromosomes=gin_config['bpnet_data.valid_chr'],
                           peak_width=gin_config['bpnet_data.peak_width'],
                           seq_width=gin_config['bpnet_data.seq_width'],
                           shuffle=False)

In [None]:
# Load the whole validation set into memory; later cells read its 'inputs', 'targets' and 'metadata' entries.
valid = dl_valid.load_all(batch_size=256, num_workers=1)

In [None]:
# Sanity check: shape of the Oct4 count targets.
valid['targets']['Oct4/counts'].shape

# (Re-)Calibration

In [None]:
def fit_recalibrators(model, features, dataset: dict, batch_size=256):
    """Fit one recalibration model per requested output feature.

    The model is run over every sequence in ``dataset`` in batches. For each
    feature, the per-member predictions are summarised into a mean and a
    standard deviation per sequence, turned into expected/observed proportion
    lists, and passed to ``iso_recal``.

    The number of ensemble members is taken from the predictions themselves,
    so the function does not depend on any notebook-level ``n_reps`` value.

    Args:
        model: Object whose ``predict(seq_batch)`` returns a dict mapping each
            feature to an array indexed as (ensemble member, sequence).
        features: Feature names to calibrate; each must be a key of the model
            output and of ``dataset['targets']``.
        dataset: Dict providing ``dataset['inputs']['seq']`` and
            ``dataset['targets'][feature]`` (2-D; the last axis is averaged).
        batch_size: Number of sequences per ``model.predict`` call.

    Returns:
        Dict mapping each feature to the object returned by ``iso_recal``.

    Raises:
        ValueError: If the dataset contains no sequences.
    """
    seqs = dataset['inputs']['seq']
    targets = dataset['targets']
    n_total = len(seqs)
    if n_total == 0:
        raise ValueError("Cannot fit recalibrators on an empty dataset")

    predictions = {}
    ys = {f: np.zeros(n_total) for f in features}
    for start_idx in range(0, n_total, batch_size):
        end_idx = min(start_idx + batch_size, n_total)
        p = model.predict(seqs[start_idx:end_idx])
        # Progress marker every ten batches.
        if start_idx % (batch_size * 10) == 0:
            print(start_idx)
        for f in features:
            batch_preds = np.asarray(p[f]).T  # -> (sequence, member)
            if f not in predictions:
                # Allocate lazily so the member axis always matches the model's real output.
                predictions[f] = np.zeros((n_total, batch_preds.shape[1]))
            predictions[f][start_idx:end_idx, :] = batch_preds
            ys[f][start_idx:end_idx] = targets[f][start_idx:end_idx, :].mean(axis=-1)

    recal_models = {}
    for f in features:
        # Mean and spread across ensemble members, per sequence.
        pred_mean = np.mean(predictions[f], axis=1).squeeze()
        pred_std = np.std(predictions[f], axis=1).squeeze()
        exp_props, obs_props = get_proportion_lists_vectorized(pred_mean, pred_std, ys[f])
        recal_models[f] = iso_recal(exp_props, obs_props)
    return recal_models

def recal_predict(recalibrators, preds, features):
    """Recalibrate every ensemble member's predictions for the given features.

    For each feature the ensemble is summarised as a normal distribution whose
    mean and standard deviation are taken across axis 0 of ``preds[feature]``.
    Each member's predictions are converted to quantiles under that
    distribution, passed through the feature's recalibrator, and mapped back
    through the distribution's inverse CDF.

    Args:
        recalibrators: Dict of feature -> object exposing ``predict(quantiles)``.
        preds: Dict of key -> array whose first axis indexes ensemble members.
        features: Keys of ``preds`` to recalibrate.

    Returns:
        Dict with the same keys and array shapes as ``preds``; entries whose
        key is not in ``features`` are left as zeros.
    """
    recalibrated = {name: np.zeros_like(values) for name, values in preds.items()}
    for f in features:
        member_preds = preds[f]
        ensemble_dist = stats.norm(
            loc=np.mean(member_preds, axis=0).squeeze(),
            scale=np.std(member_preds, axis=0).squeeze(),
        )
        for member in range(member_preds.shape[0]):
            recalibrator = recalibrators[f]
            quantiles = ensemble_dist.cdf(member_preds[member, :])
            recalibrated[f][member] = ensemble_dist.ppf(recalibrator.predict(quantiles))
    return recalibrated

## Predictions and in-silico mutagenesis

In [None]:
# One '<factor>/counts' key per factor; these keys index both model predictions and dataset targets below.
cols = [f'{factor_name}/counts' for factor_name in factor_names]

In [None]:
# Sanity check: shape of the validation input sequences.
valid['inputs']['seq'].shape

In [None]:
# Keep only validation examples whose encoded sequence consists solely of exact 0s and 1s,
# and record each kept sequence together with its genomic range metadata.
valid_records = []
valid_range_meta = valid['metadata']['range']
it = zip(valid['inputs']['seq'], valid_range_meta['start'], valid_range_meta['end'], valid_range_meta['strand'])
for seq, start, end, strand in it:
    if ((seq == 0.0) | (seq == 1.0)).all():
        valid_records.append({
            "seq": seq,
            "start": start,
            "end": end,
            "strand": strand,
        })
valid_ranges = pd.DataFrame.from_records(valid_records)
# Fix: this previously read from `valid_df`, which is never defined in this notebook.
# The per-row arrays are stacked into one array because later cells index it by position
# and call .transpose(0, 2, 1) on the sampled subset, which needs a real 3-D array.
valid_seqs = np.stack(valid_ranges["seq"].to_numpy())
valid_seqs.shape

In [None]:
# Draw a reproducible random subset of (at most) n_seqs validation sequences:
# seed the legacy global RNG, shuffle the positions, keep the first n_seqs.
np.random.seed(42)
idxs = np.arange(len(valid_seqs))
np.random.shuffle(idxs)
sample_seqs = valid_seqs[idxs[:n_seqs]]
# Selecting the same positions with .iloc keeps sample_ranges row-aligned with sample_seqs.
sample_ranges = valid_ranges.iloc[idxs[:n_seqs]]
sample_seqs.shape

In [None]:
# Read the tab-separated, header-less motif-instance table and name its eight columns.
mfi_df = pd.read_csv('/home/ubuntu/motif-instances.bed', sep='\t', header=None)
mfi_df.columns = ["chrom", "start", "end", "tf", "match-score", "strand", "contrib-score", "log-odds-score"]

In [None]:
# Restrict motif instances to chr2-chr4 and order them by position.
# NOTE(review): presumably these are the validation chromosomes from gin_config['bpnet_data.valid_chr'] — confirm they match.
valid_mfi_df = mfi_df.query('chrom in ["chr2", "chr3", "chr4"]').sort_values(by=['chrom', 'start'])

In [None]:
# Annotate each validation range with the motif instances lying entirely inside it:
# for every hit, write <tf>_start, <tf>_end and <tf>_match_score onto that range's row.
# Columns are created on first assignment, so ranges without a hit for a factor keep NaN there.
# If several instances of one factor fall in the same range, later rows overwrite earlier
# ones, so the last instance in (chrom, start) order wins.
# NOTE(review): this reads row.sequence_start, row.sequence_end and row.chrom, but the cell
# that builds valid_ranges only creates seq/start/end/strand — confirm where these columns come from.
for idx, row in valid_ranges.iterrows():
    mfi_rows = valid_mfi_df.query(
        f"start >= {row.sequence_start} and end <= {row.sequence_end} and chrom == '{row.chrom}'"
    )
    # mfi_rows = mfi_rows.query(f"strand == '{row.strand}'")
    for _, mfi_row in mfi_rows.iterrows():
        valid_ranges.loc[idx, f"{mfi_row.tf}_start"] = mfi_row.start
        valid_ranges.loc[idx, f"{mfi_row.tf}_end"] = mfi_row.end  
        valid_ranges.loc[idx, f"{mfi_row.tf}_match_score"] = mfi_row["match-score"]                

In [None]:
def get_distance(start1, start2, end1, end2):
    """Return the gap between two intervals that do not touch or overlap.

    Note the argument order: both starts first, then both ends.

    Args:
        start1: Start of the first interval.
        start2: Start of the second interval.
        end1: End of the first interval.
        end2: End of the second interval.

    Returns:
        ``start2 - end1`` when the first interval ends strictly before the
        second starts, otherwise ``start1 - end2`` when the second ends
        strictly before the first starts.

    Raises:
        ValueError: If neither interval lies strictly before the other.
    """
    first_precedes = end1 < start2
    second_precedes = end2 < start1
    if not (first_precedes or second_precedes):
        raise ValueError(f"Invalid positions: {start1}-{end1}, {start2}-{end2}")
    return (start2 - end1) if first_precedes else (start1 - end2)

# For every ordered (exposure, outcome) pair, store the gap between the two factors'
# motif instances in each range as the column "<exposure>_<outcome>_distance".
for exposure, outcome in exposure_outcome_pairs:
    distance_column_name = f"{exposure}_{outcome}_distance"
    assert f"{exposure}_start" in valid_ranges.columns and f"{outcome}_start" in valid_ranges.columns
    # Order matches get_distance(start1, start2, end1, end2).
    # Fix: this must be a list — indexing a DataFrame with a tuple looks up a single
    # column labelled by that tuple and raises KeyError.
    columns = [f"{exposure}_start", f"{outcome}_start", f"{exposure}_end", f"{outcome}_end"]
    # Rows lacking either motif carry NaN coordinates; drop them before calling
    # get_distance (which raises on NaN input) so they end up with a NaN distance
    # instead of aborting the whole computation. Overlapping motifs still raise.
    complete = valid_ranges[columns].dropna()
    valid_ranges[distance_column_name] = pd.Series(
        [get_distance(*coords) for coords in complete.itertuples(index=False, name=None)],
        index=complete.index,
        dtype=float,
    )

In [None]:
# Sanity check: input sequences and the first factor's count targets should agree on the number of examples.
valid['inputs']['seq'].shape, valid['targets'][cols[0]].shape

In [None]:
# Fit one recalibrator per count key on the full validation set (runs the ensemble over every validation sequence).
recal_models = fit_recalibrators(model, cols, valid)

In [None]:
# Run the ensemble on the batch that generate_wt_mut_batches builds for each sampled
# sequence, and collect raw and recalibrated predictions per count key.
# After the loop, preds[key] and recal_preds[key] are lists with one array per sampled sequence.
# Fix: the former first line `n_seqs = sample_seqs.shape` was never used here and replaced the
# integer n_seqs from the config cell with a tuple, breaking a re-run of the sampling cell.
preds = {}
recal_preds = {}
for seq in sample_seqs:
    # The generator receives the transposed sequence; its output is transposed back for the model.
    muts = generate_wt_mut_batches(seq.T, seq.shape[0] * seq.shape[1]).squeeze()
    preds_ = model.predict(muts.transpose(0, 2, 1))
    recal_preds_ = recal_predict(recal_models, preds_, cols)
    for key in cols:
        preds.setdefault(key, []).append(preds_[key])
        recal_preds.setdefault(key, []).append(recal_preds_[key])

In [None]:
# Sanity check: shape of the collected Oct4 predictions once the per-sequence list is stacked.
np.array(preds['Oct4/counts']).shape

In [None]:
# Show the ordered (exposure, outcome) pairs; the next cell slices this list by position.
print(exposure_outcome_pairs)

In [None]:
# Summarise the mutagenesis predictions for each (exposure, outcome) pair and write the
# per-variant effect sizes, standard errors and motif distances to a CSV in output_dir.
seqs = sample_seqs.transpose(0, 2, 1)

# NOTE(review): the [6:] slice processes only the last six of the twelve pairs (the ones
# with Nanog or Klf4 as exposure) — confirm that skipping the first six is intentional.
for exposure, outcome in exposure_outcome_pairs[6:]:
    exposure_col = f'{exposure}/counts'
    outcome_col = f'{outcome}/counts'
    # Fix: print only after both names are assigned (previously this ran first and raised
    # NameError on a fresh kernel, or showed the previous pair).
    print(exposure_col, outcome_col)

    # Stacked layout: (feature, sequence, member, variant).
    formatted_preds = np.stack((preds[exposure_col], preds[outcome_col]))
    # Fix: unpack into local names so the config values n_seqs / n_reps are not overwritten.
    n_features, n_sample_seqs, n_members, n_variants = formatted_preds.shape
    # Reorder to (member, sequence, variant, feature), then split the variant axis into
    # (alphabet_size, remaining positions).
    formatted_preds = formatted_preds.transpose(2, 1, 3, 0)
    formatted_preds = formatted_preds.reshape(n_members, n_sample_seqs, alphabet_size, -1, n_features)
    # Fix: take the distances for the sampled rows, in sample order. Using all of
    # valid_ranges gave a different length and row order than the predictions.
    motif_distances = valid_ranges.loc[sample_ranges.index, f"{exposure}_{outcome}_distance"].values
    means, mean_diffs, stderrs = compute_summary_statistics(formatted_preds, seqs)
    assert len(motif_distances) == mean_diffs.shape[0]

    # Keep only variants whose effect on the exposure (feature index 0) passes the z-score threshold.
    sig_var_idxs = filter_variants_by_score(mean_diffs[:, :, :, 0], z_threshold=3.0)
    n_significant = len(np.nonzero(sig_var_idxs)[0])
    print(
        "Reduced number of instruments down from %d to %d (%.2f %%)"
        % (
            np.prod(mean_diffs.shape),
            n_significant,
            float(n_significant / np.prod(mean_diffs.shape)) * 100,
        )
    )
    print(sig_var_idxs.shape)

    # NOTE(review): the "_v2" suffix differs from the names listed in results_fnames — confirm which is current.
    results_fname = f'{exposure}_{outcome}_effect_sizes_v2.csv'
    results_fpath = os.path.join(output_dir, results_fname)
    write_results(results_fpath, mean_diffs, stderrs,  sig_idxs=sig_var_idxs, motif_distance=motif_distances)
    print(results_fpath)

In [None]:
# Shape of mean_diffs left over from the last pair processed by the loop above.
mean_diffs.shape