# Run and evaluate DeepSEA

In [1]:
import collections
import os

import tensorflow as tf
import h5py
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics
import kipoi

## User-specific locations

In [2]:
########################## PATHS ###########################
# Machine-specific locations used by the rest of the notebook.
# Edit these to match your own environment before running.

# DeepSEA training data and preprocessed DNase data.
deepsea_path = "/data/akmorrow/epitome_data/deepsea_train/"
dnase_preprocessed_path = "/data/akmorrow/epitome_data/processed_dnase/"

# Feature-name file shipped in the repository's data directory.
feature_path = "/home/eecs/akmorrow/epitome/data/feature_name"

# Directory holding the DNase BAM files. The BAMs must be sorted and indexed;
# see bin/download_dnase_encode.sh for the data processing steps.
_ENCODE_DATA_PREFIX = "/data/akmorrow/encode_data/"

# BED file of genome regions.
_DEEPSEA_GENOME_REGIONS_FILENAME = "/home/eecs/akmorrow/epitome/data/allTFs.pos.bed"



In [3]:
# Execute the project's helper scripts directly in the notebook namespace so
# that the names they define become available to the cells below.
# NOTE(review): exec-ing source files hides where names come from; consider
# turning these scripts into importable modules.
for _script_path in ("./constants.py", "./functions.py"):
    # Read inside a context manager so the file handle is closed promptly
    # (the bare open(...).read() form leaves it to the garbage collector).
    with open(_script_path) as _script_file:
        _script_source = _script_file.read()
    # Compiling with the real filename makes tracebacks point at the script.
    exec(compile(_script_source, _script_path, "exec"))

# Re-assigned after the scripts run.
# NOTE(review): presumably because the executed scripts may overwrite
# deepsea_path -- confirm, and drop this line if they do not.
deepsea_path = "/data/akmorrow/epitome_data/deepsea_train/"

# Load Data

In [10]:
# Load the train / validation / test splits from the DeepSEA data directory.
train_data, valid_data, test_data = load_deepsea_data(deepsea_path)

# Print the shapes of the inputs ("x") and then the labels ("y") for each
# split, in the order: validation, train, test.
for _split_key in ("x", "y"):
    print(
        valid_data[_split_key].shape,
        train_data[_split_key].shape,
        test_data[_split_key].shape,
    )

(408000, 4, 1000) (4455024, 4, 1000) (455024, 4, 1000)
(919, 408000) (919, 4455024) (919, 455024)


# Choose cell types

In [29]:
# `matrix` is cell types by factors; each entry is an index into the feature
# vector. Entries of -1 are treated below as "assay not available".
matrix, cellmap, assaymap = get_assays_from_feature_file()

# Show the first 11 cell types and the first 20 assays.
first_cell_types = list(cellmap.keys())[:11]
first_assays = list(assaymap.keys())[:20]
print(first_cell_types, first_assays)

# Feature-vector indices of every assay for the A549 cell type.
a549_row = cellmap["A549"]
y_indices = matrix[a549_row]

# Remaining rows of `matrix` after dropping rows 0, 3 and 5.
# NOTE(review): assuming cellmap values follow the printed key order, these
# rows are K562, GM12878 and A549 -- confirm, and consider deriving the row
# numbers from cellmap instead of hard-coding them.
rows_to_drop = [0, 3, 5]
indices_mat = np.delete(matrix, rows_to_drop, axis=0)


['K562', 'HepG2', 'H1-hESC', 'GM12878', 'HeLa-S3', 'A549', 'HUVEC', 'HCT-116', 'GM12892', 'GM12891', 'MCF-7'] ['Pol2', 'DNase', 'CTCF', 'YY1', 'TAF1', 'Pol2-4H8', 'c-Myc', 'Max', 'p300', 'Rad21', 'NRSF', 'GABP', 'EZH2', 'CEBPB', 'c-Jun', 'ZBTB33', 'USF2', 'USF-1', 'TBP', 'RFX5']


# Check how DeepSEA performs

In [None]:
# Identifier of the pretrained model to load through Kipoi.
DEEPSEA_MODEL_NAME = 'DeepSEA/predict'

# Load the model.
model = kipoi.get_model(DEEPSEA_MODEL_NAME)

In [12]:

# Run the model over the test inputs in fixed-size batches and stack the
# per-batch predictions along the first axis.
batch_size = 100
num_test_examples = len(test_data["x"])

batch_preds = []
for start in range(0, num_test_examples, batch_size):
    # Insert a singleton axis at position 2, then swap entries 1 and 2 along
    # axis 1.
    # NOTE(review): presumably this matches the input layout and channel order
    # the Kipoi DeepSEA model expects -- confirm against the model's docs.
    model_input = np.expand_dims(test_data["x"][start:start + batch_size], 2)[:, [0, 2, 1, 3]]
    batch_preds.append(model.predict_on_batch(model_input.astype(np.float32)))

preds = np.concatenate(batch_preds, axis=0)

### Direct performance of DeepSEA on A549


In [101]:
assaylist = list(assaymap)

# An entry of -1 in y_indices marks an assay with no feature index for A549;
# keep only the assays that have one.
has_a549_feature = y_indices != -1
A549_assays = [
    (assay_name, feature_idx)
    for assay_name, feature_idx in zip(assaylist, y_indices)
    if feature_idx != -1
]

# Rows of predictions / labels for the available A549 features.
a549_feature_indices = y_indices[has_a549_feature]
A549_preds = preds.T[a549_feature_indices]
A549_truth = test_data["y"][a549_feature_indices]

# Per-assay ROC AUC for every assay available in A549.
for (assay_name, _), truth_row, pred_row in zip(A549_assays, A549_truth, A549_preds):
    print(assay_name, sklearn.metrics.roc_auc_score(truth_row.T, pred_row.T, average="macro"))

# Aggregate ROC AUC over all available assays.
for label, averaging in (("Macro", "macro"), ("Micro", "micro")):
    print(
        label + " score from averaging",
        sklearn.metrics.roc_auc_score(A549_truth.T, A549_preds.T, average=averaging),
    )

Pol2 0.9835379602141645
DNase 0.8891200468674318
CTCF 0.9796873209063074
YY1 0.9412569522126875
TAF1 0.9339937597111095
Max 0.9700962068145859
p300 0.88337104127891
Rad21 0.9885791954539361
NRSF 0.9048150350795633
GABP 0.9430083472897535
CEBPB 0.9715652508801853
ZBTB33 0.9131234213730131
USF-1 0.9305090220181208
Macro score from averaging 0.9409741200076746
Micro score from averaging 0.9545428044977946


### Performance of DeepSEA through averaging

In [93]:
# Boolean weights with the same shape as preds.T[indices_mat]: True where
# indices_mat holds a real feature index, False where it holds -1. The mask is
# repeated along a new trailing axis, one copy per test example.
num_test_examples = test_data["y"].shape[-1]
has_feature = indices_mat != -1
weights = np.repeat(has_feature[..., np.newaxis], num_test_examples, axis=-1)

# Gather predictions for every entry of indices_mat. A -1 entry selects the
# last row of preds.T, but its weight is zero so it does not contribute.
gathered_preds = preds.T[indices_mat]

# Weighted mean over the first axis (the rows of indices_mat), then keep only
# the assays that have a feature index in y_indices.
mean_over_rows = np.average(gathered_preds, axis=0, weights=weights)
average_preds = mean_over_rows[y_indices != -1]


(8, 20, 455024)

In [102]:
# Score the averaged predictions against the A549 labels.
# `truth` was never defined in this notebook, so the original cell only ran on
# leftover kernel state. A549_truth holds the label rows selected with the same
# `y_indices != -1` mask used to build average_preds, so the rows line up.
print("Macro score from averaging", sklearn.metrics.roc_auc_score(A549_truth.T, average_preds.T, average='macro'))
print("Micro score from averaging", sklearn.metrics.roc_auc_score(A549_truth.T, average_preds.T, average='micro'))


# Per-assay ROC AUC of the averaged predictions.
for row, (assay_name, _) in enumerate(A549_assays):
    print(assay_name, sklearn.metrics.roc_auc_score(A549_truth[row].T, average_preds[row].T, average='macro'))


Macro score from averaging 0.9038947798546453
Micro score from averaging 0.9144402337031453
Pol2 0.9752907041571531
DNase 0.8700758330282392
CTCF 0.9728487642891959
YY1 0.9011811622763934
TAF1 0.9156682727829428
Max 0.9674639814541746
p300 0.8213540913260786
Rad21 0.9875811386426235
NRSF 0.8245115947487238
GABP 0.8445088893090317
CEBPB 0.9682144553662642
ZBTB33 0.8552369335274348
USF-1 0.8466963172021347
