In [1]:
import argparse
import os
from epitome.models import *
from epitome.functions import *
from epitome.viz import *

from epitome.constants import *
import yaml
import subprocess
from timeit import default_timer as timer

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


### Set Up

In [2]:
results_path = "results"
epitome_data_path = "data/epitome_data" 
feature_path = os.path.join(epitome_data_path, "feature_name")
TF = "USF2"
query_cell = 'K562' #'T47D'
prefix = "cons_dec" # consecutive decrease

In [3]:
# create user directories if they do not exist
epitome_results_dir = os.path.join(results_path, "epitome_results")
if not os.path.exists(epitome_results_dir):
    os.makedirs(epitome_results_dir)
    
tf_epitome_results_dir = os.path.join(epitome_results_dir, TF + "_" + prefix + "_results")
if not os.path.exists(tf_epitome_results_dir):
    os.makedirs(tf_epitome_results_dir)
    
model_dir = os.path.join(results_path, "epitome_models")
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    
tf_model_dir = os.path.join(model_dir, TF +  "_" + prefix + "_models")
if not os.path.exists(tf_model_dir):
    os.makedirs(tf_model_dir)

### Load in Data for Epitome

In [None]:
train_data = scipy.sparse.load_npz(os.path.join(epitome_data_path, 'train.npz')).toarray()
valid_data = scipy.sparse.load_npz(os.path.join(epitome_data_path, 'valid.npz')).toarray()
test_data = scipy.sparse.load_npz(os.path.join(epitome_data_path, 'test.npz')).toarray()
data = {Dataset.TRAIN: train_data, Dataset.VALID: valid_data, Dataset.TEST: test_data}
# all_data = np.concatenate((data[Dataset.TRAIN], data[Dataset.VALID], data[Dataset.TEST]), axis=1)

### VLP Model with Multiple TFs

In [None]:
anchor_overlap_tfs = pd.read_csv(
    "/home/eecs/jahnavis/epitome_new/epitome-1/data/epitome_data/Anchor_Epitome_Overlap_TFs.csv")['TF'].tolist()
anchor_tfs = ["CTCF", "E2F1", "EGR1", "FOXA1", "FOXA2", "GABPA", "HNF4A", "JUND", 
              "MAX", "NANOG", "REST", "TAF1"]
# anchor_overlap_tfs = set(epitome_tfs).intersection(set(anchor_tfs))
# len(anchor_tfs), len(anchor_overlap_tfs), anchor_overlap_tfs

In [None]:
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
                                                         eligible_assays = anchor_overlap_tfs,
                                                         eligible_cells = None, 
                                                         min_cells_per_assay = 2, 
                                                         min_assays_per_cell= 2)
VLP(anchor_overlap_tfs,
    data = data,
    matrix = matrix,
    cellmap = cellmap,
    assaymap = assaymap)

In [None]:
TFs = ["CEBPB", "CHD2", "CTCF", "EP300", "GABPA", "JUND", "MAFK", "MAX", 
       "MYC", "NRF1", "RAD21", "REST", "RFX5", "SRF", "TAF1", "TBP", "USF2"]
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
                                                         eligible_assays = TFs,
                                                         eligible_cells = None, 
                                                         min_cells_per_assay = 2, 
                                                         min_assays_per_cell= 2)
VLP(TFs,
    data = data,
    matrix = matrix,
    cellmap = cellmap,
    assaymap = assaymap)

### VLP Model with Early Stopping (Single TF)

In [7]:
max_valid_iterations= 1000

model = VLP([TF], 
            max_valid_records=max_valid_iterations)

start = timer()
model_checkpoint_path = os.path.join(tf_model_dir, TF + "_test_earlystop_model_checkpoint")
best_iters_trained, actual_iters_trained, valid_losses = model.train(2, checkpoint_path = model_checkpoint_path)
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)
# model_path = os.path.join(tf_model_dir, TF + "_early_stop_" + str(iters_trained) + "_" + str(max_valid_iterations))
# model.save(model_path)

using ['K562', 'IMR-90', 'HepG2', 'HeLa-S3', 'H1', 'GM12878', 'A549'] as labels for mode Dataset.TRAIN
using ['K562', 'IMR-90', 'HepG2', 'HeLa-S3', 'H1', 'GM12878', 'A549'] as labels for mode Dataset.TRAIN
using ['K562', 'IMR-90', 'HepG2', 'HeLa-S3', 'H1', 'GM12878', 'A549'] as labels for mode Dataset.VALID
INFO:tensorflow:Starting Training
INFO:tensorflow:0 tf.Tensor(46.807243, shape=(), dtype=float32)tf.Tensor(46.795624, shape=(), dtype=float32)tf.Tensor(0.011619375, shape=(), dtype=float32)
INFO:tensorflow:0 Validation Generator Time: 217.7490125373006 seconds
INFO:tensorflow:0 Validation:tf.Tensor(45.24661, shape=(), dtype=float32)
INFO:tensorflow:
epitome train: 220.723507


In [18]:
model_results = model.test(10000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'], model_results['auPRC'])) 

157it [01:04,  2.44it/s]

INFO:tensorflow:macro auROC:     0.9969883234872206
INFO:tensorflow:auPRC:     0.6081196403616143
INFO:tensorflow:GINI:     0.9939766243796371
Model auROC: 0.9969883234872206. Model auPRC: 0.6081196403616143.





In [19]:
model_checkpoint_model = VLP(checkpoint=model_checkpoint_path)

using ['K562', 'IMR-90', 'HepG2', 'HeLa-S3', 'H1', 'GM12878', 'A549'] as labels for mode Dataset.TRAIN
using ['K562', 'IMR-90', 'HepG2', 'HeLa-S3', 'H1', 'GM12878', 'A549'] as labels for mode Dataset.TRAIN
using ['K562', 'IMR-90', 'HepG2', 'HeLa-S3', 'H1', 'GM12878', 'A549'] as labels for mode Dataset.VALID


In [20]:
model_checkpoint_results = model_checkpoint_model.test(10000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_checkpoint_results['auROC'], model_checkpoint_results['auPRC'])) 

157it [01:06,  2.35it/s]

INFO:tensorflow:macro auROC:     0.997354660579044
INFO:tensorflow:auPRC:     0.590895647096396
INFO:tensorflow:GINI:     0.9947092910316824
Model auROC: 0.997354660579044. Model auPRC: 0.590895647096396.





In [27]:
eval_results_df = pd.DataFrame(columns=['valid_losses'])
eval_results_df = eval_results_df.append({ 'valid_losses':valid_losses}, ignore_index=True)

[<tf.Tensor: shape=(), dtype=float32, numpy=35.49988>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.34261>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.11333>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.866198>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.798954>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.6424932>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.615142>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.87403>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.5501018>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.5447745>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.628159>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.5731063>]

In [23]:
pd.Series(valid_losses).to_csv()

0      tf.Tensor(35.49988, shape=(), dtype=float32)
1       tf.Tensor(6.34261, shape=(), dtype=float32)
2       tf.Tensor(6.11333, shape=(), dtype=float32)
3      tf.Tensor(5.866198, shape=(), dtype=float32)
4      tf.Tensor(5.798954, shape=(), dtype=float32)
5     tf.Tensor(6.6424932, shape=(), dtype=float32)
6      tf.Tensor(5.615142, shape=(), dtype=float32)
7       tf.Tensor(5.87403, shape=(), dtype=float32)
8     tf.Tensor(5.5501018, shape=(), dtype=float32)
9     tf.Tensor(5.5447745, shape=(), dtype=float32)
10     tf.Tensor(5.628159, shape=(), dtype=float32)
11    tf.Tensor(5.5731063, shape=(), dtype=float32)
dtype: object

In [17]:
iters_trained

9000

In [None]:
eval_results_df = pd.DataFrame(columns=['transcription_factor', 'query_cell', 'auROC', 'auPRC'])
eval_results_df = eval_results_df.append({ 
   'transcription_factor' : TF,
   'query_cell' : query_cell,
   'auROC' : model_results['auROC'],
   'auPRC' : model_results['auPRC'],
   'iterations_trained' : iter_trained,
   'train_time': train_time}, 
    ignore_index=True)

eval_results_dir = os.path.join(tf_epitome_results_dir, query_cell + "_" + TF + 
                           '_no_motif_early_stop_'+ str(max_valid_iterations) + 
                           '.csv')
eval_results_df.to_csv(eval_results_dir, sep="\t")

### VLP Model Without Early Stopping

In [None]:
# matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
#                                                          eligible_assays = None,
#                                                          eligible_cells = None, 
#                                                          min_cells_per_assay = 2, 
#                                                          min_assays_per_cell= 2) #10)

In [3]:
TF = "JUND"
model = VLP([TF])

start = timer()
iter_trained = model.train(500) # train for 5000 iterations
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

model_results = model.test(10000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'], model_results['auPRC'])) 

using ['K562', 'HepG2', 'HeLa-S3', 'HCT116', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'HepG2', 'HeLa-S3', 'HCT116', 'H1', 'GM12878'] as labels for mode Dataset.TRAIN
using ['K562', 'HepG2', 'HeLa-S3', 'HCT116', 'H1', 'GM12878'] as labels for mode Dataset.VALID
Instructions for updating:
Please use `layer.add_weight` method instead.
INFO:tensorflow:Starting Training
INFO:tensorflow:0 tf.Tensor(42.065647, shape=(), dtype=float32)tf.Tensor(42.05568, shape=(), dtype=float32)tf.Tensor(0.0099677, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:200 tf.Tensor(3.3761077, shape=(), dtype=float32)tf.Tensor(3.3661175, shape=(), dtype=float32)tf.Tensor(0.009990176, shape=(), dtype=float32)
INFO:tensorflow:
INFO:tensorflow:400 tf.Tensor(1.4245248, shape=(), dtype=float32)tf.Tensor(1.4145114, shape=(), dtype=float32)tf.Tensor(0.010013362, shape=(), dtype=float32)
INFO:tensorflow:


0it [00:00, ?it/s]

epitome train: 135.497210


157it [00:34,  4.56it/s]

INFO:tensorflow:macro auROC:     0.9939963978387032
INFO:tensorflow:auPRC:     0.17390570108786427
INFO:tensorflow:GINI:     0.9879927956774065
Model auROC: 0.9939963978387032. Model auPRC: 0.17390570108786427.





In [None]:
model_results = model.test(10000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'], model_results['auPRC'])) 

In [None]:
eval_results_df = pd.DataFrame(columns=['transcription_factor', 'query_cell', 'auROC', 'auPRC'])
eval_results_df = eval_results_df.append({ 
   'transcription_factor' : TF,
   'query_cell' : query_cell,
   'auROC' : model_results['auROC'],
   'auPRC' : model_results['auPRC'],
   'iterations_trained' : iter_trained,
   'train_time': train_time}, 
    ignore_index=True)

eval_results_df.to_csv(os.path.join(tf_epitome_results_dir,
                                    query_cell + "_" + TF + '_no_motif' + '.csv'), sep="\t")

In [None]:
model_results['preds_mean'].shape

In [None]:
TF = "JUND"

matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
                                                         eligible_assays = TF,
                                                         eligible_cells = None, 
                                                         min_cells_per_assay = 2, 
                                                         min_assays_per_cell= 2)
    
model2 = VLP([TF],
            data = data,
            matrix = matrix,
            cellmap = cellmap,
            assaymap = assaymap)

start = timer()
iter_trained = model2.train(500) # train for 5000 iterations
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

model2_results = model2.test(10000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'], model_results['auPRC'])) 

In [None]:
model2_results['preds_mean'].shape

In [None]:
model2_results

In [None]:
JUND_results = os.path.join(tf_epitome_results_dir, "K562_JUND_motif_anchor.npz")
JUND_preds = np.load(JUND_results)

In [None]:
JUND_preds['pred'].shape