In [1]:
import argparse
import os
from epitome.models import *
from epitome.functions import *
from epitome.viz import *

from epitome.constants import *
import yaml
import subprocess
from timeit import default_timer as timer

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


### Set Up

In [40]:
# Experiment configuration: which transcription factor / cell line to run,
# and where inputs are read from and outputs are written to.
TF = "EGR1"
query_cell = 'K562'  # alternative noted by the author: 'T47D'

# Root folder for everything this notebook writes.
results_path = "results"

# Folder holding the Epitome inputs; the feature-name file sits inside it.
epitome_data_path = "data/epitome_data"
feature_path = os.path.join(epitome_data_path, "feature_name")

In [41]:
# Create the output directory tree if it does not exist yet.
#
# Layout (all under results_path):
#   epitome_results/<TF>_results   -> evaluation tables
#   epitome_models/<TF>_models     -> saved models
#
# os.makedirs(..., exist_ok=True) creates missing parents and is a no-op when
# the directory is already there, so the separate os.path.exists() checks (and
# the check-then-create race they carry) are not needed. Unlike the exists()
# check, it raises if the path exists but is not a directory, which surfaces
# a misconfigured results_path early instead of failing later on write.
epitome_results_dir = os.path.join(results_path, "epitome_results")
tf_epitome_results_dir = os.path.join(epitome_results_dir, TF + "_results")

model_dir = os.path.join(results_path, "epitome_models")
tf_model_dir = os.path.join(model_dir, TF + "_models")

for output_dir in (epitome_results_dir, tf_epitome_results_dir, model_dir, tf_model_dir):
    os.makedirs(output_dir, exist_ok=True)

### Load in Data for Epitome

In [42]:
# Read the three sparse .npz splits from epitome_data_path (train, then valid,
# then test) and convert each one to a dense array with .toarray().
# NOTE(review): `scipy` is not imported explicitly in this notebook; presumably
# it comes in through one of the epitome star imports -- confirm.
train_data, valid_data, test_data = [
    scipy.sparse.load_npz(os.path.join(epitome_data_path, split_file)).toarray()
    for split_file in ('train.npz', 'valid.npz', 'test.npz')
]
# data = {Dataset.TRAIN: train_data, Dataset.VALID: valid_data, Dataset.TEST: test_data}
# all_data = np.concatenate((data[Dataset.TRAIN], data[Dataset.VALID], data[Dataset.TEST]), axis=1)

In [43]:
# Group the three loaded splits in one dict, keyed by the matching
# Dataset.TRAIN / Dataset.VALID / Dataset.TEST member.
data = {
    Dataset.TRAIN: train_data,
    Dataset.VALID: valid_data,
    Dataset.TEST: test_data,
}

In [44]:
# Transcription factors treated as the "anchor" set, listed alphabetically.
# NOTE(review): not referenced by any other cell visible here -- confirm it is
# still needed.
anchor_tfs = [
    "CTCF",
    "E2F1",
    "EGR1",
    "FOXA1",
    "FOXA2",
    "GABPA",
    "HNF4A",
    "JUND",
    "MAX",
    "NANOG",
    "REST",
    "TAF1",
]
# anchor_overlap_tfs = set(epitome_tfs).intersection(set(anchor_tfs))
# len(anchor_tfs), len(anchor_overlap_tfs), anchor_overlap_tfs

### VLP Model With Early Stopping

In [45]:
# Query the feature file with no assay/cell restriction, requiring at least
# 2 cells per assay and 2 assays per cell (the author also noted 10 as an
# alternative for the last threshold). The call returns three values, unpacked
# here as matrix, cellmap and assaymap.
# NOTE(review): none of the three results is passed to VLP in the cells
# visible here -- confirm whether this call is still required.
assay_filters = dict(
    eligible_assays=None,
    eligible_cells=None,
    min_cells_per_assay=2,
    min_assays_per_cell=2,
)
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path, **assay_filters)

In [46]:
# Build a VLP model for the single target TF, passing max_valid_records.
# NOTE(review): the variable is named max_valid_iterations but is handed to
# the max_valid_records argument -- confirm which quantity it actually bounds.
# NOTE(review): query_cell is only used in the saved file name below; it is
# not passed to VLP here -- confirm the query cell is meant to be included.
max_valid_iterations = 1000
model = VLP([TF], max_valid_records=max_valid_iterations)

# Wall-clock the training call (requested length: 5000 iterations) and keep
# whatever train() returns for the results table.
start = timer()
iter_trained = model.train(5000)
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

# Persist the trained model under the per-TF model directory.
model_name = query_cell + "_" + TF + "_no_motif_early_stop_" + str(max_valid_iterations)
model_path = os.path.join(tf_model_dir, model_name)
model.save(model_path)

indices.shape:  (14839,)
indices.shape:  (1000,)
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.VALID
INFO:tensorflow:Starting Training
INFO:tensorflow:0 tf.Tensor(53.44295, shape=(), dtype=float32)tf.Tensor(46.102493, shape=(), dtype=float32)tf.Tensor(7.340458, shape=(), dtype=float32)
INFO:tensorflow:0 Validation:tf.Tensor(45.60356, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:1000 tf.Tensor(7.273051, shape=(), dtype=float32)tf.Tensor(2.094101, shape=(), dtype=float32)tf.Tensor(5.17895, shape=(), dtype=float32)
INFO:tensorflow:1000 Validation:tf.Tensor(14.276231, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:2000 tf.Tensor(4.647738, shape=(), dtype=float32)tf.Tensor(0.9399397, shape=(), dtype=float32)tf.Tensor(3.7077985, shape=(), dtype=float32)
INFO:tensorflow:2000 Validation:tf.Tensor(13.186669, shape=(), dtype=float3

In [47]:
# Evaluate the current model (first argument 10000, metrics enabled) and
# report the two headline scores read from the returned mapping.
model_results = model.test(10000, calculate_metrics=True)
metrics_line = 'Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'],
                                                      model_results['auPRC'])
print(metrics_line)

157it [00:26,  6.02it/s]

INFO:tensorflow:macro auROC:     0.7879201303303518
INFO:tensorflow:auPRC:     0.10178134574069214
INFO:tensorflow:GINI:     0.5758402754996467
Model auROC: 0.7879201303303518. Model auPRC: 0.10178134574069214.





In [48]:
# Record this run's metrics as a one-row table and write it to disk.
#
# The row is built directly from a list of records: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and the old initial `columns`
# list omitted 'iterations_trained' and 'train_time'. Passing `columns`
# explicitly pins the column order written to the file. Numeric columns now
# keep their native dtype instead of passing through an empty object frame,
# so integer-like values may be written without a trailing ".0".
eval_results_df = pd.DataFrame(
    [{
        'transcription_factor': TF,
        'query_cell': query_cell,
        'auROC': model_results['auROC'],
        'auPRC': model_results['auPRC'],
        'iterations_trained': iter_trained,
        'train_time': train_time,
    }],
    columns=['transcription_factor', 'query_cell', 'auROC', 'auPRC',
             'iterations_trained', 'train_time'],
)

# NOTE: despite its name, eval_results_dir is the path of the output *file*;
# the name is kept so any other cell that refers to it keeps working.
# The file is tab-separated even though it carries a .csv extension.
eval_results_dir = os.path.join(tf_epitome_results_dir, query_cell + "_" + TF +
                           '_no_motif_early_stop_' + str(max_valid_iterations) +
                           '.csv')
eval_results_df.to_csv(eval_results_dir, sep="\t")

### VLP Model Without Early Stopping

In [49]:
# matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
#                                                          eligible_assays = None,
#                                                          eligible_cells = None, 
#                                                          min_cells_per_assay = 2, 
#                                                          min_assays_per_cell= 2) #10)

In [50]:
# Build a VLP model for the single target TF with default constructor
# arguments (no max_valid_records is passed in this variant).
# NOTE(review): query_cell is only used in the saved file name below; it is
# not passed to VLP here -- confirm the query cell is meant to be included.
model = VLP([TF])

# Wall-clock the training call (requested length: 5000 iterations) and keep
# whatever train() returns for the results table.
start = timer()
iter_trained = model.train(5000)
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

# Persist the trained model under the per-TF model directory.
model_name = query_cell + "_" + TF + "_no_motif"
model_path = os.path.join(tf_model_dir, model_name)
model.save(model_path)

using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'H1', 'GM12878'] as labels for mode Dataset.VALID
INFO:tensorflow:Starting Training
INFO:tensorflow:0 tf.Tensor(49.49818, shape=(), dtype=float32)tf.Tensor(42.18291, shape=(), dtype=float32)tf.Tensor(7.3152695, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:1000 tf.Tensor(7.015763, shape=(), dtype=float32)tf.Tensor(1.8432049, shape=(), dtype=float32)tf.Tensor(5.172558, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:2000 tf.Tensor(4.631224, shape=(), dtype=float32)tf.Tensor(0.89494103, shape=(), dtype=float32)tf.Tensor(3.7362833, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:3000 tf.Tensor(4.6076465, shape=(), dtype=float32)tf.Tensor(1.7090476, shape=(), dtype=float32)tf.Tensor(2.898599, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:4000 tf.Tensor(3.3653479, shape=(), dtype=float32)tf.Tensor(0.85764

In [51]:
# Evaluate the current model (first argument 10000, metrics enabled) and
# report the two headline scores read from the returned mapping.
model_results = model.test(10000, calculate_metrics=True)
metrics_line = 'Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'],
                                                      model_results['auPRC'])
print(metrics_line)

157it [00:24,  6.29it/s]

INFO:tensorflow:macro auROC:     0.7928965182650259
INFO:tensorflow:auPRC:     0.10307252958579244
INFO:tensorflow:GINI:     0.5857930412997123
Model auROC: 0.7928965182650259. Model auPRC: 0.10307252958579244.





In [52]:
# Record this run's metrics as a one-row table and write it to disk.
#
# The row is built directly from a list of records: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and the old initial `columns`
# list omitted 'iterations_trained' and 'train_time'. Passing `columns`
# explicitly pins the column order written to the file. Numeric columns now
# keep their native dtype instead of passing through an empty object frame,
# so integer-like values may be written without a trailing ".0".
eval_results_df = pd.DataFrame(
    [{
        'transcription_factor': TF,
        'query_cell': query_cell,
        'auROC': model_results['auROC'],
        'auPRC': model_results['auPRC'],
        'iterations_trained': iter_trained,
        'train_time': train_time,
    }],
    columns=['transcription_factor', 'query_cell', 'auROC', 'auPRC',
             'iterations_trained', 'train_time'],
)

# The file is tab-separated even though it carries a .csv extension.
eval_results_df.to_csv(os.path.join(tf_epitome_results_dir,
                                    query_cell + "_" + TF + '_no_motif' + '.csv'), sep="\t")