## Imports and model initialization

In [1]:
%load_ext autoreload
%autoreload 2
# !pip install kipoi
# !pip install kipoiseq
# !pip install pybedtools
# !pip uninstall -y kipoi_veff
# !pip install git+https://github.com/an1lam/kipoi-veff
# !pip install pyvcf
import csv
import math
import os
import pickle

import kipoi
from kipoi_interpret.importance_scores.ism import Mutation
from kipoiseq.dataloaders import SeqIntervalDl
from matplotlib import pyplot as plt
import numpy as np
from tqdm import tqdm

from motif_classifier import MotifClassifier
from utils import load_pfm

In [2]:
!pwd

/home/stephenmalina/project/src


# Constants

In [3]:
# Directory (relative to the notebook's working directory) that holds the data files.
DATA_DIR = '../dat'

# DeepSEA target column labels of interest; they are matched against
# `deepsea.schema.targets.column_labels` when `relevant_cols` is built.
# NOTE(review): `TF_COL` is rebound to an integer column index in later cells,
# so re-running cells out of order changes what this name refers to.
CHROM_ACC_COL = 'HepG2_DNase_None'
TF_COL = 'HepG2_FOXA1_None'

# Loading DNA sequence data

In [4]:
# Build every input path from DATA_DIR so the data location is configured in
# exactly one place (the motif file is already loaded this way).
genome_fasta = os.path.join(DATA_DIR, "hg19.fa")

# Both dataloaders read their intervals from a BED file and their sequence from
# the same FASTA; `auto_resize_len=1000` requests 1000-long intervals.
train_dl = SeqIntervalDl(os.path.join(DATA_DIR, "FOXA1.train.samples.bed"), genome_fasta, auto_resize_len=1000)
test_dl = SeqIntervalDl(os.path.join(DATA_DIR, "FOXA1.test.samples.bed"), genome_fasta, auto_resize_len=1000)

# Loading DeepSEA

In [None]:
import IPython
import tensorflow as tf
print("TF version:", tf.__version__)
import torch
print("torch version:", torch.__version__)
from torch import nn
IPython.display.clear_output()

In [None]:
# Fetch the "DeepSEA/predict" model from the "kipoi" model source and show the
# wrapped model object as the cell output.
deepsea = kipoi.get_model("DeepSEA/predict", source="kipoi")
deepsea.model

In [None]:
# Sanity check: run the pipeline's example prediction and display the shape of the result.
deepsea.pipeline.predict_example().shape

# Loading Motifs

In [None]:
# Load the FOXA1 motif(s) from the PFM file under DATA_DIR.
# NOTE(review): presumably position frequency matrices — confirm against utils.load_pfm.
motifs = load_pfm(os.path.join(DATA_DIR, 'foxa1.pfm'))

## Predictions and in-silico mutagenesis

In [None]:
# Pull the whole training set into memory.
train_data = train_dl.load_all()
train_labels = train_data['targets']
# Swap the last two axes of the inputs and store them as float32.
# NOTE(review): presumably (n, length, 4) -> (n, 4, length) — confirm against the dataloader.
train_seqs = np.transpose(train_data['inputs'], (0, 2, 1)).astype(np.float32)

In [None]:
# Build the motif-based baseline classifier and train it on the training
# sequences and labels.
# NOTE(review): presumably this fits the `pwm_thresholds` printed in the next cell — confirm.
motif_classifier = MotifClassifier(motifs)
motif_classifier.train(train_seqs, train_labels)

In [None]:
# Inspect the classifier's `pwm_thresholds` attribute after training.
print(motif_classifier.pwm_thresholds)

In [None]:
# Pull the whole test set into memory.
test_data = test_dl.load_all()
test_labels = test_data['targets']
# Same axis swap and float32 cast as applied to the training inputs.
test_seqs = np.transpose(test_data['inputs'], (0, 2, 1)).astype(np.float32)

In [None]:
from sklearn.metrics import auc, roc_curve

# ROC analysis of the motif classifier on the test sequences.
print(len(test_seqs))
scores_preds = [motif_classifier(test_seq) for test_seq in test_seqs]
# The first element of each classifier result is used as the ranking score.
motif_scores = [score_pred[0] for score_pred in scores_preds]
fpr, tpr, thresholds = roc_curve(test_labels, motif_scores)
roc_auc = auc(fpr, tpr)
print(fpr, tpr, thresholds)

lw = 2
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange',
            lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = chance-level classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# (index, label) pairs of the DeepSEA output columns whose label matches one of
# the two configured target labels, ordered by column index.
target_labels = (CHROM_ACC_COL, TF_COL)
relevant_cols = sorted(
    (idx, name)
    for idx, name in enumerate(deepsea.schema.targets.column_labels)
    if name in target_labels
)


def output_sel_fn(result):
    """Keep only the columns listed in `relevant_cols`.

    Each selected column `result[:, idx]` is collected, the collection is
    turned into an array and transposed, so for a 2-D `result` the output has
    one row per input row and one column per entry of `relevant_cols`.
    """
    selected = [result[:, idx] for idx, _ in relevant_cols]
    return np.transpose(np.array(selected))


relevant_cols

In [None]:
def next_seq(it):
    """Return the inputs of the next batch from `it` as a squeezed float32 array.

    The batch's "inputs" are transposed with axes (0, 2, 1), given an extra
    axis at position 2, cast to float32 and finally squeezed, which removes
    every length-1 axis (including the one that was just added).
    """
    return (np
            .expand_dims(next(it)["inputs"].transpose(0, 2, 1), 2)
            .astype(np.float32)
            .squeeze())
    

# NOTE(review): `n_seqs` is 0 here, so `preds` holds empty lists and the loop
# below never executes; later cells load previously pickled predictions instead.
epochs, n_seqs, batch_size = 50, 0, 301
# preds[epoch][seq] collects one selected-output array per wild-type/mutant batch.
preds = [[[] for _ in range(n_seqs)] for _ in range(epochs)]
# NOTE(review): `dl` is not defined in any visible cell (only `train_dl` and
# `test_dl` are) — confirm which dataloader is intended.
it = dl.batch_iter(batch_size=1, num_workers=0, drop_last=False)

print(f"Generating predictions for {len(it)} seqs")

batch_size = 301
for i in range(min(n_seqs, len(it))):
    seq = next_seq(it)
    # Skip sequences whose entries are all (close to) 0.25.
    # NOTE(review): presumably an uninformative / all-N sequence — confirm.
    if np.allclose(seq, .25): continue
    # NOTE(review): `generate_wt_mut_batches` is neither defined nor imported in
    # the visible cells — confirm where it comes from.
    wt_mut_batches = generate_wt_mut_batches(seq, batch_size)
    for batch in tqdm(wt_mut_batches):
        # The same batch is predicted `epochs` times and every result is kept.
        # NOTE(review): presumably the model is stochastic at prediction time
        # (the pickle file name mentions MC dropout) — confirm.
        for epoch in range(epochs):
            preds[epoch][i].append(output_sel_fn(deepsea.predict_on_batch(np.expand_dims(batch, axis=2))))


# np_preds = np.array(preds)
# assert np_preds.shape[:2] == (epochs, n_seqs), np_preds.shape

In [None]:
# plt.hist(
#     [np.sqrt(epoch_count_vars[:, :, 1:, 1].reshape(-1)) for epoch_count_vars in diff_epoch_count_vars],
#     range=[0.0, .3],
#     histtype="bar",
#     label=[1, 10, 50, 100, 150]
# )
# plt.legend();

In [None]:
import pickle

# Location of the cached saturation-mutagenesis predictions, built from DATA_DIR
# so the data directory is configured in a single place.
# pickle_file = "../dat/most_recent_sat_mut_results__drop_channel.pickle"
pickle_file = os.path.join(DATA_DIR, "most_recent_sat_mut_results__original_mc_dropout.pickle")
# with open(pickle_file, 'wb') as f: pickle.dump(np_preds, f)

In [None]:
# SECURITY: unpickling can execute arbitrary code — only load files that were
# produced by this project.
with open(pickle_file, 'rb') as f:
    np_preds = pickle.load(f)

# The array is 5-dimensional; the first four sizes are reused throughout the
# notebook. The last size is given a real name instead of `_`, which would
# clobber IPython's "last result" variable.
epochs, n_seqs, n_batches, batch_size, n_pred_cols = np_preds.shape
np_preds.shape

In [None]:
# Log-odds of the uniform 5% positive proportion used as the reference prior.
log_uniform_prob = math.log(.05/(1-.05))


def compute_normalized_prob(prob, train_prob):
    """Rescale `prob` from the training positive proportion to a 5% one.

    The log-odds of `prob` are shifted by the difference between the 5% prior
    log-odds and the log-odds of `train_prob`, then mapped back to a
    probability with the logistic function.
    Formula source: http://deepsea.princeton.edu/help/
    """
    pred_log_odds = np.log(prob / (1 - prob))
    train_log_odds = np.log(train_prob / (1 - train_prob))
    shifted_log_odds = pred_log_odds + log_uniform_prob - train_log_odds
    return 1 / (1 + np.exp(-shifted_log_odds))


# Ratios and normalization formula drawn from here: http://deepsea.princeton.edu/media/help/posproportion.txt
def tf_compute_normalized_prob(prob):
    """Normalize a TF prediction using a training positive proportion of 0.02394."""
    return compute_normalized_prob(prob, 0.02394)


def chrom_acc_normalized_prob(prob):
    """Normalize a chromatin-accessibility prediction using a training positive proportion of 0.049791."""
    return compute_normalized_prob(prob, 0.049791)

# Results & Analysis
## Computing relevant statistics
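In this section, we compute the predictive mean and variance of the raw predictions and of each ref/mut difference. The variance of a difference requires the covariance between the ref and mut predictions, since $\mathrm{Var}(\text{mut} - \text{ref}) = \mathrm{Var}(\text{mut}) + \mathrm{Var}(\text{ref}) - 2\,\mathrm{Cov}(\text{mut}, \text{ref})$.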

In [None]:
# Training positive proportion for each of the three prediction columns, in
# column order.
# NOTE: this rewrites np_preds in place, so running the cell twice normalizes
# the predictions twice; reload the pickle before re-running.
train_positive_rates = (0.049791, 0.020508, 0.02394)
for col_idx, train_rate in enumerate(train_positive_rates):
    np_preds[..., col_idx] = compute_normalized_prob(np_preds[..., col_idx], train_rate)

In [None]:
# Re-derive the batch count and batch size from axes 2 and 3 of the predictions.
n_batches, batch_size = np_preds.shape[2:4]
np_preds.shape

In [None]:
# Collapse axis 0 of the predictions into a mean and a (population, ddof=0)
# variance; the variance is accumulated in float64.
np_pred_means = np_preds.mean(axis=0)
np_pred_vars = np_preds.var(axis=0, dtype=np.float64)
np_pred_means.shape

In [None]:
# Entry 0 along axis 2 is the baseline of its batch; subtract it (broadcast)
# from every other entry so only the differences remain.
ref_pred_means = np_pred_means[:, :, :1, :]
np_pred_mean_diffs = np_pred_means[:, :, 1:, :] - ref_pred_means
np_pred_mean_diffs.shape

In [None]:
# For every (seq, batch, mut, col), store the 2x2 population (ddof=0) covariance
# matrix between the batch's entry 0 (the ref) and entry `mut`, taken over axis 0
# of np_preds:
#   [0, 0] = Var(ref), [1, 1] = Var(mut), [0, 1] = [1, 0] = Cov(ref, mut).
# The last axis is sized from np_preds itself so it always matches the predictions.
n_pred_cols = np_preds.shape[-1]
np_pred_covs = np.zeros((n_seqs, n_batches, batch_size, 2, 2, n_pred_cols))
for seq in range(n_seqs):
    for batch in range(n_batches):
        # All mutations and columns of one batch are handled at once instead of
        # one np.cov call per (mut, col) pair. float64 matches np.cov's precision.
        batch_preds = np.asarray(np_preds[:, seq, batch, :, :], dtype=np.float64)
        centered = batch_preds - batch_preds.mean(axis=0)
        ref_centered = centered[:, 0:1, :]  # kept 3-D so it broadcasts over mutations
        ref_mut_cov = (ref_centered * centered).mean(axis=0)
        np_pred_covs[seq, batch, :, 0, 0, :] = (ref_centered ** 2).mean(axis=0)
        np_pred_covs[seq, batch, :, 1, 1, :] = (centered ** 2).mean(axis=0)
        np_pred_covs[seq, batch, :, 0, 1, :] = ref_mut_cov
        np_pred_covs[seq, batch, :, 1, 0, :] = ref_mut_cov

In [None]:
# Spot check: the [1, 1] entry of the covariance matrix printed for mutation
# index 50 should equal the matching entry of np_pred_vars (both use ddof=0);
# np_pred_vars is shown for indices 48..51 of the same seq/batch/column.
print(np_pred_covs[0, 0, 50, :, :, 0])
print(np_pred_vars[0, 0, 48:52, 0])

In [None]:
print(np_pred_covs.dtype)

In [None]:
# Var(mut - ref) = Var(mut) + Var(ref) - 2 Cov(ref, mut), read entirely from the
# stored 2x2 covariance matrices (entry 0 of each batch, the ref, is skipped).
diff_vars = (
    np_pred_covs[:, :, 1:, 1, 1, :]
    + np_pred_covs[:, :, 1:, 0, 0, :]
    - 2 * np_pred_covs[:, :, 1:, 0, 1, :]
)
# Rounding can leave a mathematically non-negative variance slightly below
# zero; clip so the square root never produces NaN for that reason.
np_pred_uncertainties = np.sqrt(np.clip(diff_vars, 0.0, None))
# Average over the batch and mutation axes.
np.mean(np_pred_uncertainties, axis=(1, 2))

In [None]:
# Same quantity as Var(mut) + Var(ref) - 2 Cov(ref, mut), but taking the two
# variances from np_pred_vars (the ref variance broadcasts over mutations) and
# only the covariance term from np_pred_covs.
diff_vars = (
    np_pred_vars[:, :, 1:, :]
    + np_pred_vars[:, :, 0:1, :]
    - 2 * np_pred_covs[:, :, 1:, 0, 1, :]
)
# Rounding can leave a mathematically non-negative variance slightly below
# zero; clip so the square root never produces NaN for that reason.
np_pred_uncertainties = np.sqrt(np.clip(diff_vars, 0.0, None))
# Average over the batch and mutation axes.
np.mean(np_pred_uncertainties, axis=(1, 2))

Results from prior runs:

    array([[0.02932001, 0.16316762, 0.16552047],
       [0.15966936, 0.22315478, 0.22175914]])

    array([[0.00388914, 0.03407012, 0.03650351],
       [0.04776923, 0.09040056, 0.08849534]])

## Accuracy & Calibration

In [None]:
import sklearn
from sklearn.metrics import auc, brier_score_loss, roc_auc_score, roc_curve
from sklearn.calibration import calibration_curve 

# Labels are hard-coded as 25 positives followed by 25 negatives.
binding_labels = np.concatenate((np.ones(25), np.zeros(25)))
# Predictive mean of column 1 for entry 0 of batch 0 of every sequence.
ref_tf_means = np_pred_means[:, 0, 0, 1]
score = roc_auc_score(binding_labels, ref_tf_means)
score

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.calibration import calibration_curve

# NOTE(review): this cell recomputes the same ROC AUC as the cell before it.
binding_labels = np.concatenate((np.ones(25), np.zeros(25)))
ref_tf_means = np_pred_means[:, 0, 0, 1]
score = roc_auc_score(binding_labels, ref_tf_means)
score

In [None]:
# NOTE(review): third computation of the same ROC AUC; here the value is stored
# in `score` without being displayed.
binding_labels = np.concatenate((np.ones(25), np.zeros(25)))
ref_tf_means = np_pred_means[:, 0, 0, 1]
score = roc_auc_score(binding_labels, ref_tf_means)

In [None]:
# ROC curve of the column-1 predictive means against the hard-coded labels
# (25 positives followed by 25 negatives).
binding_labels = np.concatenate((np.ones(25), np.zeros(25)))
fpr, tpr, thresholds = roc_curve(binding_labels, np_pred_means[:, 0, 0, 1])
roc_auc = auc(fpr, tpr)

lw = 2
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange',
            lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = chance-level classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic (TF binding)')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# Calibration curve (10 bins) and Brier score of the column-1 predictive means
# against the hard-coded labels (25 positives followed by 25 negatives).
y_test = np.concatenate((np.ones(25), np.zeros(25)))
prob_pos = np_pred_means[:, 0, 0, 1]
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test,
    prob_pos,
    n_bins=10
)
clf_score = brier_score_loss(y_test, prob_pos)
plt.plot(mean_predicted_value, fraction_of_positives, "s-",
     label="%s (%1.3f)" % ("MC dropout predictive means (TF)", clf_score))
plt.xlabel("Mean predicted value")
plt.ylabel("Fraction of positives")
# Without a legend the label (which carries the Brier score) is never shown.
plt.legend()
# ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
#          histtype="step", lw=2)
prob_pos

In [None]:
# Distribution of the predictive means currently held in `prob_pos`, as a
# 10-bin step histogram over [0, 1].
hist_style = dict(range=(0, 1), bins=10, histtype="step", lw=2)
plt.hist(prob_pos, label="Predictive means (TF)", **hist_style)
plt.legend()

In [None]:
# Calibration curve (10 bins) and Brier score of the column-0 predictive means
# against the hard-coded labels (25 positives followed by 25 negatives).
y_test = np.concatenate((np.ones(25), np.zeros(25)))
prob_pos = np_pred_means[:, 0, 0, 0]
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test,
    prob_pos,
    n_bins=10
)
clf_score = brier_score_loss(y_test, prob_pos)
plt.plot(mean_predicted_value, fraction_of_positives, "s-",
     label="%s (%1.3f)" % ("MC dropout predictive means (CA)", clf_score))
plt.xlabel("Mean predicted value")
plt.ylabel("Fraction of positives")
# Without a legend the label (which carries the Brier score) is never shown.
plt.legend();

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 20))
plt.suptitle("Predictive Means (Ref)")

# One histogram panel per prediction column, comparing the first 25 sequences
# with the remaining ones; only entry 0 of batch 0 of each sequence is used.
hist_panels = (
    (ax1, 1, ("binding", "no binding"), f"TF {relevant_cols[1][1]}"),
    (ax2, 0, ("accessible", "not accessible"), f"DNase {relevant_cols[0][1]}"),
)
for panel_ax, pred_col, group_labels, panel_title in hist_panels:
    first_group = np_pred_means[:25, 0, 0, pred_col].reshape(-1)
    second_group = np_pred_means[25:, 0, 0, pred_col].reshape(-1)
    panel_ax.hist((first_group, second_group), label=group_labels)
    panel_ax.set_title(panel_title)
    panel_ax.legend()

# Shared axis captions for the whole figure.
fig.text(0.5, 0.08, "Predictive Means for y=0 vs y=1", ha="center")
fig.text(0.07, 0.5, "# of Seqs", va='center', rotation='vertical')

plt.show();

### Standard Error Calibration

In [None]:
from scipy import stats

# Standardize every individual (mut - ref) prediction difference of one sequence
# by its predictive mean difference and standard error, then draw a probability
# plot of the standardized values.
sample_seq = 2
sample_seq_preds = np_preds[:, sample_seq]
sample_pred_diffs = (sample_seq_preds[:, :, 1:, :] - sample_seq_preds[:, :, 0:1, :]).reshape(epochs, -1)
sample_std_errs = np_pred_uncertainties[sample_seq].reshape(-1)
sample_mean_diffs = np_pred_mean_diffs[sample_seq].reshape(-1)
normalized_preds = (sample_pred_diffs - sample_mean_diffs) / sample_std_errs

res = stats.probplot(normalized_preds.reshape(-1), plot=plt)

## TF-CA Relationship and Mutation Effect Exploration 

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 10))
plt.suptitle("Predictive Mean Diffs (Mut vs. Ref)")

# np_pred_mean_diffs already excludes the reference entry of each batch, so the
# whole mutation axis is used (slicing it with `1:` again would silently drop
# the first mutation of every batch).
ax1.hist(
    (np_pred_mean_diffs[:25, :, :, 1].reshape(-1),
     np_pred_mean_diffs[25:, :, :, 1].reshape(-1)),
    label=("binding", "no binding"),
    log=True)
ax1.set_title(f"TF {relevant_cols[1][1]}")
ax1.legend()
ax2.hist(
    (np_pred_mean_diffs[:25, :, :, 0].reshape(-1),
     np_pred_mean_diffs[25:, :, :, 0].reshape(-1)),
    label=("binding", "no binding"),
    log=True)
# This panel plots prediction column 0, so its title must use the label of
# relevant_cols[0], as the other DNase/CA panels in this notebook do.
ax2.set_title(f"DNase {relevant_cols[0][1]}")
ax2.legend()

# Shared axis captions for the whole figure.
fig.text(0.5, 0.08, "Predictive Mean Diff", ha="center")
fig.text(0.07, 0.5, "# of Seqs", va='center', rotation='vertical')


plt.show();

In [None]:
# Integer indices into the last axis of the prediction arrays.
# NOTE: from here on TF_COL shadows the string label defined in the constants cell.
TF_COL = 1
CA_COL = 0

# `fig` is bound here so the captions below land on this figure rather than on
# one created by an earlier cell.
fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2, 2, figsize=(14, 10))
plt.suptitle("S.E. vs. Predictive Mean Diff (TF)")

# The first half of the sequences goes to the "binding" panels and the second
# half to the "no binding" panels, for both prediction columns.
n_first_half = n_seqs // 2
scatter_panels = (
    (ax11, TF_COL, range(n_first_half), f"{relevant_cols[1][1]} (binding)"),
    (ax12, TF_COL, range(n_first_half, n_seqs), f"{relevant_cols[1][1]} (no binding)"),
    (ax21, CA_COL, range(n_first_half), f"{relevant_cols[0][1]} (binding)"),
    (ax22, CA_COL, range(n_first_half, n_seqs), f"{relevant_cols[0][1]} (no binding)"),
)
for panel_ax, pred_col, seq_range, panel_title in scatter_panels:
    # One scatter call per sequence so every sequence gets its own colour.
    for seq in seq_range:
        panel_ax.scatter(
            np_pred_mean_diffs[seq, :, :, pred_col].reshape(-1),
            np_pred_uncertainties[seq, :, :, pred_col].reshape(-1))
    panel_ax.set_title(panel_title)

# Shared axis captions for the whole figure.
fig.text(0.5, 0.08, "Predictive Mean Diff", ha="center")
fig.text(0.07, 0.5, "Predictive S.E.", va='center', rotation='vertical')

plt.show()

In [None]:
# Number of leading sequences shown in the "(binding)" panel.
n_binding_seqs = 25
# `fig` is bound here so the captions below land on this figure rather than on
# one created by an earlier cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 10))
plt.suptitle("CA predictive mean diff vs. TF predictive mean diff")

scatter_panels = (
    (ax1, range(n_binding_seqs), "(binding)"),
    (ax2, range(n_binding_seqs, n_seqs), "(no binding)"),
)
for panel_ax, seq_range, panel_title in scatter_panels:
    # One scatter call per sequence: x = TF mean diff, y = CA mean diff.
    for seq in seq_range:
        panel_ax.scatter(
            np_pred_mean_diffs[seq, :, :, TF_COL].reshape(-1),
            np_pred_mean_diffs[seq, :, :, CA_COL].reshape(-1),
            label=seq)
    # Title and legend are set once per panel (and only if something was drawn).
    if len(seq_range) > 0:
        panel_ax.set_title(panel_title)
        panel_ax.legend()

# Shared axis captions for the whole figure.
fig.text(0.5, 0.08, "TF Predictive Mean Diff", ha="center")
fig.text(0.07, 0.5, "CA Predictive Mean Diff", va='center', rotation='vertical')

plt.show()

In [None]:
from matplotlib import patches
from matplotlib import cm

# One hexbin panel per sequence: individual ref predictions (x) against
# individual mut predictions (y) for the TF column.
cols, margin = 3, 10 # margin determined empirically
n_rows = math.ceil(n_seqs / float(cols))
# squeeze=False keeps `axs` two-dimensional even when the grid has a single row,
# so the [row, col] indexing below always works.
fig, axs = plt.subplots(n_rows, cols, figsize=(16, n_seqs + margin), squeeze=False)
# sample_seqs = np.random.random_integers(0, n_seqs, size=4)
# sample_batches = np.random.random_integers(0, n_batches, size=4)
# sample_muts = np.random.random_integers(0, batch_size-1, size=4)
# sample_col = np.random.random_integers(0, len(relevant_cols)-1)

for i in range(n_seqs):
    np_sample_mut_preds = np_preds[:, i, :, 1:, TF_COL]
    # Repeat each batch's ref prediction across that batch's mutations so both
    # arrays have the same shape.
    np_sample_ref_preds = np.zeros_like(np_sample_mut_preds) + np_preds[:, i, :, :1, TF_COL]
    assert np.allclose(np_sample_ref_preds[:, 0, 0], np_sample_ref_preds[:, 0, 1]) # spot check
    # Colour = squared distance of each (ref, mut) point from the two overall means.
    colors = (
        (np_sample_mut_preds.ravel() - np_sample_mut_preds.mean())**2 + 
        (np_sample_ref_preds.ravel() - np_sample_ref_preds.mean())**2
    )
    ax = axs[i // cols, i % cols]
    ax.hexbin(
        np_sample_ref_preds.ravel(), 
        np_sample_mut_preds.ravel(), 
        C=colors,
        cmap=cm.jet,
        bins=None,
    )
    # Outline the box spanned by the inter-quartile ranges of both axes.
    xq1, xq2 = np.quantile(np_sample_ref_preds, (.25, .75))
    yq1, yq2 = np.quantile(np_sample_mut_preds, (.25, .75))
    rect = patches.Rectangle((xq1, yq1), xq2 - xq1, yq2 - yq1, fill=False, edgecolor='black')
    rect = ax.add_patch(rect)
    xlabel = "seq {i} ref (std: {stddev:.3f})".format(i=i, stddev=np.std(np_sample_ref_preds))
    ylabel = "seq {i} mut (std: {stddev:.3f})".format(i=i, stddev=np.std(np_sample_mut_preds))
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

# Hide grid cells left over when n_seqs is not a multiple of `cols`.
for unused_ax in axs.ravel()[n_seqs:]:
    unused_ax.set_visible(False)

In [None]:
import scipy
from scipy import stats
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Per-sequence OLS regression of the CA predictive mean diff (y) on the TF
# predictive mean diff (x), one panel per sequence.
cols, margin = 3, 20 # margin determined empirically
# squeeze=False keeps `axs` two-dimensional even when the grid has a single row.
fig, axs = plt.subplots(math.ceil(n_seqs / float(cols)), cols, figsize=(16, n_seqs + margin), squeeze=False)
# plt.suptitle("CA predictive mean diff vs. TF predictive mean diff")
slopes = []
rsquareds = []

for seq in range(n_seqs):
    ax = axs[seq // cols, seq % cols]

    x = np_pred_mean_diffs[seq, :, :, TF_COL].reshape(-1)
    y = np_pred_mean_diffs[seq, :, :, CA_COL].reshape(-1)
    # add_constant prepends the intercept column, so params = (intercept, slope).
    result = sm.OLS(y, sm.add_constant(x)).fit()
    intercept, slope = result.params
    slopes.append(slope)
    rsquared = result.rsquared
    rsquareds.append(rsquared)
    stderr = result.bse[1]  # standard error of the slope

    title = "%d - " % (seq)
    title += " (b) " if seq < n_binding_seqs else " (nb)"
    title += "(slope: %.2f, r^2: %.3f, std: %.4f)" % (slope, rsquared, stderr)
    ax.set_title(title)

    prstd, iv_l, iv_u = wls_prediction_std(result)
    # Draw the curves in increasing-x order; plotting lines over unsorted x
    # values produces a zig-zag instead of a clean band.
    order = np.argsort(x)
    x_sorted = x[order]
    ax.plot(x_sorted, slope * x_sorted + intercept, 'r', label="OLS fit")
    ax.plot(x_sorted, y[order], 'o', label="mutations")
    ax.plot(x_sorted, iv_u[order], 'r--', label="prediction interval")
    ax.plot(x_sorted, iv_l[order], 'r--')
    # Every artist above carries a label, so the legend is no longer empty.
    ax.legend(loc="best")


plt.show()

In [None]:
# Arithmetic mean of the per-sequence slopes, then of the per-sequence r^2 values.
for per_seq_values in (slopes, rsquareds):
    print(sum(per_seq_values) / len(per_seq_values))

In [None]:

IDX_TO_NT = 'ACGT'

def _convert_to_mutation(pos_nt_pair):
    return "%d%s" % (pos_nt_pair[0], IDX_TO_NT[pos_nt_pair[1]])


# Integer indices into the last axis of the prediction arrays.
TF_COL = 1
CA_COL = 0


def _write_row(writer, seq, batch, i):
    """Write the TF (X) and CA (Y) statistics of one mutation as a CSV row.

    Args:
        writer: a csv.DictWriter whose fieldnames include the keys written below.
        seq: zero-based sequence index (written one-based as "seq_num").
        batch: batch index within the sequence.
        i: mutation index within the batch, indexing np_pred_mean_diffs and
            np_pred_uncertainties (both exclude the batch's reference entry).
    """
    seq_num = seq+1
    # Each batch holds one reference entry plus (batch_size - 1) mutations, so
    # the running mutation number advances by batch_size - 1 per batch; using
    # batch_size as the stride shifts every position after the first batch.
    muts_per_batch = batch_size - 1
    mut_num = (batch * muts_per_batch) + i
    # Three mutations per position: position = mut_num // 3.
    # NOTE(review): mut_num % 3 only ever selects 'A', 'C' or 'G' from IDX_TO_NT;
    # the true alternative base depends on the reference base — confirm against
    # the code that generated the mutations.
    mut = _convert_to_mutation((mut_num // 3, mut_num % 3))
    x_pred_mean_diff = np_pred_mean_diffs[seq, batch, i, TF_COL]
    x_pred_uncertainty = np_pred_uncertainties[seq, batch, i, TF_COL]
    y_pred_mean_diff = np_pred_mean_diffs[seq, batch, i, CA_COL]
    y_pred_uncertainty = np_pred_uncertainties[seq, batch, i, CA_COL]
    # NOTE(review): the "*_pred_var" columns receive np_pred_uncertainties, which
    # is computed with a square root (a standard deviation, not a variance) —
    # confirm what downstream consumers expect.
    writer.writerow(
        {
            "seq_num": seq_num,
            "mut": mut,
            "X_pred_mean": x_pred_mean_diff,
            "X_pred_var": x_pred_uncertainty,
            "Y_pred_mean": y_pred_mean_diff,
            "Y_pred_var": y_pred_uncertainty,
        }
    )
        

# Export one CSV row per (sequence, batch, mutation).
with open("../dat/means_and_uncertainties_hopefully_correct.csv", 'w', newline="") as out_file:
    fieldnames = [
        "seq_num",
        "mut",
        "X_pred_mean",
        "X_pred_var",
        "Y_pred_mean",
        "Y_pred_var",
    ]
    writer = csv.DictWriter(out_file, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()

    # Rows are written only inside the innermost loop; an extra call placed
    # before the inner loops would reuse whatever `batch` and `i` were left over
    # from earlier cells and emit a spurious row for every sequence.
    for seq in range(n_seqs):
        for batch in range(n_batches):
            # batch_size - 1 mutations per batch (the reference entry is excluded).
            for i in range(batch_size - 1):
                _write_row(writer, seq, batch, i)
 


In [None]:
# zeros = np.zeros((2, 3))
# def sanitize_scores(scores):
#     orig_shape = scores.shape
#     sanitized_scores = np.ndarray((*orig_shape, 2, 3), dtype=scores.dtype)
#     flattened_scores = scores.reshape(-1)
    
#     for i, score in enumerate(flattened_scores):
#         idx = np.unravel_index(i, orig_shape)
#         if score is None: sanitized_scores[idx] = zeros
#         else: sanitized_scores[idx] = np.array(score)
#     return sanitized_scores

# sanitized_scores = sanitize_scores(np.squeeze(ism_score))
# ctcf_original_preds = sanitized_scores[:, :, :, 0, 1]
# ctcf_pred_diffs = sanitized_scores[:, :, :, 1, 1]
# dnase_original_preds = sanitized_scores[:, :, :, 0, 0]
# dnase_pred_diff = sanitized_scores[:, :, :, 1, 0]

In [None]:

# NOTE(review): this cell repeats the definition of compute_normalized_prob from
# an earlier cell and silently replaces it when run.
# Log-odds of the uniform 5% positive proportion used as the reference prior.
log_uniform_prop = math.log(.05/(1-.05))


def compute_normalized_prob(prob, train_prob):
    """Rescale `prob` from the training positive proportion to a 5% one.

    The log-odds of `prob` are shifted by the difference between the 5% prior
    log-odds and the log-odds of `train_prob`, then mapped back to a
    probability with the logistic function.
    """
    pred_log_odds = np.log(prob / (1 - prob))
    train_log_odds = np.log(train_prob / (1 - train_prob))
    shifted_log_odds = pred_log_odds + log_uniform_prop - train_log_odds
    return 1 / (1 + np.exp(-shifted_log_odds))

# Ratios and normalization formula drawn from here: http://deepsea.princeton.edu/media/help/posproportion.txt
# tf_compute_normalized_prob = lambda prob: compute_normalized_prob(prob, .020029)
# ENCODE	A549	DNase	None	0.048136
# chrom_acc_normalized_prob = lambda prob: compute_normalized_prob(prob, 0.048136)