In [1]:
import argparse
import os
from epitome.models import *
from epitome.functions import *
from epitome.viz import *

from epitome.constants import *
import yaml
import subprocess
from timeit import default_timer as timer

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


### Set Up

In [31]:
# --- Experiment configuration ---------------------------------------------
# Which transcription factor to model and which cell type the run is labelled
# with in the output file names.
TF = "EGR1"
query_cell = 'K562'  # alternative used previously: 'T47D'

# Input / output locations, relative to the notebook's working directory.
results_path = "results"
epitome_data_path = "data/epitome_data"
feature_path = os.path.join(epitome_data_path, "feature_name")
# motif_dir = "data/motif_data/"  # not used by the cells in this notebook

In [32]:
# Create the output directory tree if it does not exist yet.
#
# os.makedirs(..., exist_ok=True) is a no-op for a directory that is already
# present, so the cell is safe to re-run and does not suffer from the
# check-then-create race of `if not os.path.exists(p): os.makedirs(p)`.

# <results_path>/epitome_results and its per-TF subdirectory
epitome_results_dir = os.path.join(results_path, "epitome_results")
os.makedirs(epitome_results_dir, exist_ok=True)
tf_epitome_results_dir = os.path.join(epitome_results_dir, TF + "_results")
os.makedirs(tf_epitome_results_dir, exist_ok=True)

# <results_path>/epitome_models and its per-TF subdirectory
model_dir = os.path.join(results_path, "epitome_models")
os.makedirs(model_dir, exist_ok=True)
tf_model_dir = os.path.join(model_dir, TF + "_models")
os.makedirs(tf_model_dir, exist_ok=True)

### Load in Data for Epitome

In [33]:
# Read the three dataset splits. Each one is stored as a scipy sparse .npz
# file under epitome_data_path and is converted to a dense array with
# .toarray(). The generator is consumed in order: train, valid, test.
train_data, valid_data, test_data = (
    scipy.sparse.load_npz(os.path.join(epitome_data_path, npz_name)).toarray()
    for npz_name in ('train.npz', 'valid.npz', 'test.npz')
)
# Kept for reference (disabled): joining all splits along axis 1.
# data = {Dataset.TRAIN: train_data, Dataset.VALID: valid_data, Dataset.TEST: test_data}
# all_data = np.concatenate((data[Dataset.TRAIN], data[Dataset.VALID], data[Dataset.TEST]), axis=1)

In [34]:
# Map each Dataset split key to the dense matrix loaded for it.
data = dict(
    zip(
        (Dataset.TRAIN, Dataset.VALID, Dataset.TEST),
        (train_data, valid_data, test_data),
    )
)

In [35]:
# Transcription factors treated as "anchor" TFs (twelve names, fixed order).
anchor_tfs = (
    "CTCF E2F1 EGR1 FOXA1 FOXA2 GABPA "
    "HNF4A JUND MAX NANOG REST TAF1"
).split()
# Disabled overlap check; `epitome_tfs` is not defined in the visible cells.
# anchor_overlap_tfs = set(epitome_tfs).intersection(set(anchor_tfs))
# len(anchor_tfs), len(anchor_overlap_tfs), anchor_overlap_tfs

### VLP Model With Early Stopping

In [36]:
# Build the assay matrix and the cell / assay lookup maps from the feature
# file. No cell or assay whitelist is applied (both eligible_* are None);
# both minimum-count thresholds are 2 (min_assays_per_cell was previously 10).
matrix, cellmap, assaymap = get_assays_from_feature_file(
    feature_path,
    eligible_assays=None,
    eligible_cells=None,
    min_cells_per_assay=2,
    min_assays_per_cell=2,
)

In [37]:
# Build a VLP model for the single target TF. The value below is handed to
# VLP as `max_valid_records`; presumably it drives the validation-based early
# stopping this section is named after -- TODO confirm against the VLP docs.
# NOTE(review): `data`, `matrix`, `cellmap` and `assaymap` from earlier cells
# are not passed to VLP, and `query_cell` is used only in the output file
# name. Confirm that VLP's defaults are what this experiment intends.
max_valid_iterations = 1000
model = VLP([TF], max_valid_records=max_valid_iterations)

# Time the training call with a wall-clock timer.
start = timer()
iter_trained = model.train(5000)  # train for 5000 iterations
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

# Save under <model_dir>/<query_cell>_<TF>_no_motif_early_stop_<max_valid_iterations>.
model_path = os.path.join(
    model_dir,
    "".join([query_cell, "_", TF, "_no_motif_early_stop_", str(max_valid_iterations)]),
)
model.save(model_path)

indices.shape:  (14839,)
indices.shape:  (1000,)
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.VALID
INFO:tensorflow:Starting Training
INFO:tensorflow:0 tf.Tensor(49.05606, shape=(), dtype=float32)tf.Tensor(41.69575, shape=(), dtype=float32)tf.Tensor(7.360309, shape=(), dtype=float32)
INFO:tensorflow:0 Validation:tf.Tensor(42.204384, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:1000 tf.Tensor(7.1004367, shape=(), dtype=float32)tf.Tensor(1.8727999, shape=(), dtype=float32)tf.Tensor(5.227637, shape=(), dtype=float32)
INFO:tensorflow:1000 Validation:tf.Tensor(12.6759815, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:2000 tf.Tensor(11.200504, shape=(), dtype=float32)tf.Tensor(7.461975, shape=(), dtype=float32)tf.Tensor(3.7385287, shape=(), dtype=float32)
INFO:tensorflow:2000 Validation:tf.Tensor(11.870875, shape=(), dtype=fl

In [38]:
# Evaluate the trained model (first argument 10000, metrics enabled) and
# print the two headline scores from the returned results mapping.
model_results = model.test(10000, calculate_metrics=True)
print(
    'Model auROC: %s. Model auPRC: %s.'
    % (model_results['auROC'], model_results['auPRC'])
)

157it [00:19,  7.87it/s]

INFO:tensorflow:macro auROC:     0.7816391364420292
INFO:tensorflow:auPRC:     0.10222287596374605
INFO:tensorflow:GINI:     0.563278282953341
Model auROC: 0.7816391364420292. Model auPRC: 0.10222287596374605.





In [39]:
# Display the 'preds_mean' entry of the test results (rendered as the cell output).
model_results['preds_mean']

<tf.Tensor: shape=(10000, 1), dtype=float32, numpy=
array([[0.01288659],
       [0.00488806],
       [0.00672068],
       ...,
       [0.00624098],
       [0.00804776],
       [0.06892596]], dtype=float32)>

In [40]:
# Record this run's evaluation metrics as a one-row table and write it as a
# tab-separated file into the per-TF model directory.
#
# The row is passed straight to the DataFrame constructor: DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0. All six columns are
# listed explicitly so the column order matches what the append-based code
# produced (declared columns first, then the two extra keys).
eval_results_df = pd.DataFrame(
    [{
        'transcription_factor': TF,
        'query_cell': query_cell,
        'auROC': model_results['auROC'],
        'auPRC': model_results['auPRC'],
        'iterations_trained': iter_trained,
        'train_time': train_time,
    }],
    columns=['transcription_factor', 'query_cell', 'auROC', 'auPRC',
             'iterations_trained', 'train_time'],
)
eval_results_df.to_csv(os.path.join(tf_model_dir, query_cell + "_" + TF + '_no_motif_early_stop_' +
                                    str(max_valid_iterations) + '.csv'), sep="\t")

### VLP Model Without Early Stopping

In [41]:
# Rebuild the assay matrix and lookup maps from the feature file.
# NOTE(review): this repeats, with identical arguments, the call made in the
# early-stopping section; it may be redundant -- confirm before removing.
matrix, cellmap, assaymap = get_assays_from_feature_file(
    feature_path,
    eligible_assays=None,
    eligible_cells=None,
    min_cells_per_assay=2,
    min_assays_per_cell=2,  # previously 10
)

In [42]:
# Baseline for comparison: a VLP model for the same TF, constructed without
# the max_valid_records argument.
# NOTE(review): `data`, `matrix`, `cellmap` and `assaymap` from earlier cells
# are not passed to VLP, and `query_cell` is used only in the output file
# name. Confirm that VLP's defaults are what this experiment intends.
model = VLP([TF])

# Time the training call with a wall-clock timer.
start = timer()
iter_trained = model.train(5000)  # train for 5000 iterations
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

# Save under <model_dir>/<query_cell>_<TF>_no_motif.
model_path = os.path.join(
    model_dir,
    "".join([query_cell, "_", TF, "_no_motif"]),
)
model.save(model_path)

using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.VALID
INFO:tensorflow:Starting Training
INFO:tensorflow:0 tf.Tensor(48.95171, shape=(), dtype=float32)tf.Tensor(41.546783, shape=(), dtype=float32)tf.Tensor(7.4049263, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:1000 tf.Tensor(14.521696, shape=(), dtype=float32)tf.Tensor(9.224182, shape=(), dtype=float32)tf.Tensor(5.2975144, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:2000 tf.Tensor(4.7312956, shape=(), dtype=float32)tf.Tensor(0.9051087, shape=(), dtype=float32)tf.Tensor(3.8261867, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:3000 tf.Tensor(3.875723, shape=(), dtype=float32)tf.Tensor(0.91823786, shape=(), dtype=float32)tf.Tensor(2.957485, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:4000 tf.Tensor(3.6272936, shape=(), dtype=float32)tf.Tensor(1.081

In [43]:
# Evaluate the baseline model (first argument 10000, metrics enabled) and
# print the two headline scores from the returned results mapping.
model_results = model.test(10000, calculate_metrics=True)
print(
    'Model auROC: %s. Model auPRC: %s.'
    % (model_results['auROC'], model_results['auPRC'])
)

157it [00:18,  8.50it/s]


INFO:tensorflow:macro auROC:     0.7775049899549683
INFO:tensorflow:auPRC:     0.09626827599994939
INFO:tensorflow:GINI:     0.5550099814998234
Model auROC: 0.7775049899549683. Model auPRC: 0.09626827599994939.


In [44]:
# Record the baseline run's evaluation metrics as a one-row table and write
# it as a tab-separated file into the per-TF model directory.
#
# The row is passed straight to the DataFrame constructor: DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0. All six columns are
# listed explicitly so the column order matches what the append-based code
# produced (declared columns first, then the two extra keys).
eval_results_df = pd.DataFrame(
    [{
        'transcription_factor': TF,
        'query_cell': query_cell,
        'auROC': model_results['auROC'],
        'auPRC': model_results['auPRC'],
        'iterations_trained': iter_trained,
        'train_time': train_time,
    }],
    columns=['transcription_factor', 'query_cell', 'auROC', 'auPRC',
             'iterations_trained', 'train_time'],
)

eval_results_df.to_csv(os.path.join(tf_model_dir,
                                    query_cell + "_" + TF + '_no_motif' + '.csv'), sep="\t")