In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [33]:
import os

import h5py
import kipoi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import seaborn as sns
import scipy
import scipy.io

# Load DeepSEA Test Data

In [15]:
# Directory that the DeepSEA .mat files are read from (relative to this notebook).
data_dir = "../data/deepsea_train/"

In [16]:
# Load test.mat from the data directory and pull out the two arrays used below:
# 'testxdata' (sequences) and 'testdata' (labels).
test_mat_file = scipy.io.loadmat(os.path.join(data_dir, 'test.mat'))
test_seqs = test_mat_file['testxdata']
test_labels = test_mat_file['testdata']
# Display the sequence array's dimensions as the cell output.
test_seqs.shape

(455024, 4, 1000)

In [43]:
# Show that the first sequence and the one exactly half-way through the array
# (the 227,513th of 455,024) are reverse complements of each other.
#
# The partner index is derived from the array length rather than hard-coded, so
# the cell keeps pointing at the half-way sequence if the data size changes.
# NOTE(review): this assumes the second half of test_seqs holds the reverse
# complements of the first half, in the same order — confirm against the data source.
#
# np.flip(..., axis=1) undoes only the position reversal, so the second array
# is displayed with its channel rows in the opposite order to the first array.
rc_offset = test_seqs.shape[0] // 2
test_seqs[0], "\n", np.flip(test_seqs[rc_offset], axis=1)

(array([[1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 1, 0, 1],
        [0, 1, 0, ..., 0, 1, 0]], dtype=uint8),
 '\n',
 array([[0, 1, 0, ..., 0, 1, 0],
        [0, 0, 1, ..., 1, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]], dtype=uint8))

# Load DeepSEA Sequence Region Metadata

In [48]:
# Read the region BED file into a DataFrame.
bed = pybedtools.BedTool('../data/allTFs.pos.bed')
all_tfs_df = bed.to_dataframe()

# Keep only the rows whose chromosome is chr8 or chr9.
on_test_chroms = all_tfs_df["chrom"].isin(["chr8", "chr9"])

# Stack the selection on top of itself so every region appears twice
# (original row index is kept, so index labels repeat).
test_regions_df = pd.concat([all_tfs_df[on_test_chroms]] * 2)
test_regions_df.head()

# Misc. Discussion

Sequences: $ \mathbf{X} $ with dimension $ N \times 4 \times L $ (four channels per position; $ L = 1000 $ for the test sequences loaded above).
Labels: $ \mathbf{Y} $ with dimension $ N \times D $ where $ D = 919 $.

DeepSEA learns $ P(Y \mid X) $.

You're going to learn the joint distribution $ P(Y, X) $. From it you can derive the conditional $ P(Y \mid X) = P(Y, X) / P(X) $, which you can compare to DeepSEA's predictions.

For a fair comparison, split $ X $ into $ X_{\text{train}} $ and $ X_{\text{test}} $. Train both DeepSEA
and your model on $ X_{\text{train}} $ and then compare their performance at approximating
$ P(Y_{\text{test}} \mid X_{\text{test}}). $

# Load DeepSEA Model

In [14]:
# Fetch the "DeepSEA/predict" model through kipoi; the log line below shows it
# resolving to a locally downloaded weights file.
model = kipoi.get_model("DeepSEA/predict")

Using downloaded and verified file: /home/ubuntu/.kipoi/models/DeepSEA/predict/downloaded/model_files/weights/89e640bf6bdbe1ff165f484d9796efc7


In [26]:
# Build a small input batch: the first 100 test sequences with a singleton
# axis inserted at position 2, cast to float32.
sample_inputs = test_seqs[:100][:, :, np.newaxis, :].astype(np.float32)

# Run the model once on that batch.
sample_preds = model.predict_on_batch(sample_inputs)

In [30]:
# Inspect the dimensions of the batch predictions (rows = sequences in the batch).
sample_preds.shape

(100, 919)

In [32]:
# Put the predictions for output column 0 next to the matching labels
# for the first 10 sequences.
sample_preds[:10, 0], test_labels[:10, 0]

(array([0.06626297, 0.00989131, 0.0716022 , 0.11329851, 0.11433659,
        0.15173239, 0.04799287, 0.07929325, 0.02166701, 0.05837096],
       dtype=float32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8))