In [1]:
import argparse
import os
from epitome.models import *
from epitome.functions import *
from epitome.viz import *

from epitome.constants import *
import yaml
import subprocess
from datetime import datetime
from timeit import default_timer as timer

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


### Set Up

In [None]:
# Experiment configuration shared by the cells below.
TF = "USF2"
query_cell = "K562"  # alternative: 'T47D'
prefix = "cons_dec"  # consecutive decrease

# Output root and Epitome input locations (relative to the working directory).
results_path = "results"
epitome_data_path = "data/epitome_data"
feature_path = os.path.join(epitome_data_path, "feature_name")

In [None]:
# Create the output directory tree if it does not exist yet.
# Layout:
#   <results_path>/epitome_results/<TF>_<prefix>_results
#   <results_path>/epitome_models/<TF>_<prefix>_models
epitome_results_dir = os.path.join(results_path, "epitome_results")
tf_epitome_results_dir = os.path.join(epitome_results_dir, TF + "_" + prefix + "_results")

model_dir = os.path.join(results_path, "epitome_models")
tf_model_dir = os.path.join(model_dir, TF + "_" + prefix + "_models")

# os.makedirs creates missing parents, so creating the two leaf directories
# also creates epitome_results_dir and model_dir. exist_ok=True keeps the
# cell idempotent and avoids the check-then-create race of os.path.exists.
for directory in (tf_epitome_results_dir, tf_model_dir):
    os.makedirs(directory, exist_ok=True)

### Load in Data for Epitome

In [None]:
# Load each split from its sparse .npz archive under epitome_data_path and
# convert it to a dense array.
train_data, valid_data, test_data = (
    scipy.sparse.load_npz(os.path.join(epitome_data_path, filename)).toarray()
    for filename in ('train.npz', 'valid.npz', 'test.npz')
)

# Splits keyed by the Dataset enum, as passed to VLP(data=...) below.
data = {
    Dataset.TRAIN: train_data,
    Dataset.VALID: valid_data,
    Dataset.TEST: test_data,
}
# all_data = np.concatenate((data[Dataset.TRAIN], data[Dataset.VALID], data[Dataset.TEST]), axis=1)

### VLP Model with Multiple TFs

In [None]:
# TF names read from the 'TF' column of Anchor_Epitome_Overlap_TFs.csv.
# The file is resolved relative to epitome_data_path instead of a
# user-specific absolute home-directory path, so the notebook runs on any
# checkout that has the data directory in place.
anchor_overlap_tfs = pd.read_csv(
    os.path.join(epitome_data_path, "Anchor_Epitome_Overlap_TFs.csv"))['TF'].tolist()
anchor_tfs = ["CTCF", "E2F1", "EGR1", "FOXA1", "FOXA2", "GABPA", "HNF4A", "JUND",
              "MAX", "NANOG", "REST", "TAF1"]
# anchor_overlap_tfs = set(epitome_tfs).intersection(set(anchor_tfs))
# len(anchor_tfs), len(anchor_overlap_tfs), anchor_overlap_tfs

In [None]:
# Build the matrix / cell map / assay map restricted to the overlap TFs.
matrix, cellmap, assaymap = get_assays_from_feature_file(
    feature_path,
    eligible_assays=anchor_overlap_tfs,
    eligible_cells=None,
    min_cells_per_assay=2,
    min_assays_per_cell=2,
)

# The model is constructed but not bound to a name; it only appears as the
# cell's displayed value.
VLP(anchor_overlap_tfs, data=data, matrix=matrix, cellmap=cellmap, assaymap=assaymap)

In [None]:
# Rebuild matrix/cellmap/assaymap from the feature file using only the
# feature_path argument (no eligibility filters are passed). This overwrites
# the values produced by the previous cell.
# The restricted-TF variant and its VLP construction are kept below, disabled.
# TFs = ["CEBPB", "CHD2", "CTCF", "EP300", "GABPA", "JUND", "MAFK", "MAX", 
#        "MYC", "NRF1", "RAD21", "REST", "RFX5", "SRF", "TAF1", "TBP", "USF2"]
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path)
#                                                          eligible_assays = TFs, 
#                                                          eligible_cells = None, 
#                                                          min_cells_per_assay = 2)
# VLP(TFs,
#     data = data,
#     matrix = matrix,
#     cellmap = cellmap,
#     assaymap = assaymap)

In [None]:
# Every assay in assaymap is a feature; every assay except DNase is a label.
feature_assays = list(assaymap)
label_assays = [assay for assay in feature_assays if assay != "DNase"]

# Concatenate the per-assay index arrays for all label assays.
indices = np.concatenate(
    [get_y_indices_for_assay(matrix, assaymap, label) for label in label_assays]
)

In [None]:
indices

### VLP Model with Early Stopping (Single TF)

In [None]:
max_valid_iterations = 1000

# Single-assay (CTCF) model; max_valid_records is set from max_valid_iterations.
model = VLP(
    assays=["CTCF"],
    matrix=matrix,
    cellmap=cellmap,
    assaymap=assaymap,
    max_valid_records=max_valid_iterations,
)

# NOTE(review): this path is never passed to model.train() below, yet a later
# cell loads VLP(checkpoint=model_checkpoint_path) -- confirm a checkpoint is
# actually written there.
model_checkpoint_path = os.path.join("test_path")

# Time the training call; train() is unpacked into three values here.
start = timer()
best_iters_trained, actual_iters_trained, valid_losses = model.train(
    200, patience=2, min_delta=0.1
)
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

# Saving the model is currently disabled:
# model_path = os.path.join(tf_model_dir, TF + "_early_stop_" + str(iters_trained) + "_" + str(max_valid_iterations))
# model.save(model_path)

In [None]:
# Evaluate the trained model: test() is called with 200 as its first argument
# and metric calculation enabled. The summary print is disabled.
model_results = model.test(200, calculate_metrics=True)
# print('Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'], model_results['auPRC'])) 

In [None]:
# Construct a VLP from the checkpoint path set in the training cell.
# NOTE(review): that path is not passed to train() in this notebook, so the
# checkpoint may not exist on a fresh run -- confirm.
model_checkpoint_model = VLP(checkpoint=model_checkpoint_path)

In [None]:
# Evaluate the checkpoint-restored model (first argument 10000, metrics on)
# and print the 'auROC' and 'auPRC' entries of the returned results.
model_checkpoint_results = model_checkpoint_model.test(10000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_checkpoint_results['auROC'], model_checkpoint_results['auPRC'])) 

In [None]:
# Single-row frame whose 'valid_losses' cell holds the whole validation-loss
# sequence. Built directly with the constructor because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
eval_results_df = pd.DataFrame([{'valid_losses': valid_losses}])

In [None]:
# With no path argument, to_csv() returns the CSV text instead of writing a
# file; the string is shown as the cell output and nothing is saved to disk.
pd.Series(valid_losses).to_csv()

In [None]:
# Show the iteration counts unpacked from model.train() in the training cell.
# (The previous name `iters_trained` is never defined in this notebook.)
best_iters_trained, actual_iters_trained

In [None]:
# One-row summary of the early-stopping run.
# - 'iterations_trained' uses actual_iters_trained from the training cell;
#   the previous `iter_trained` is only defined by a later cell, so this cell
#   failed on a fresh kernel.
# - The frame is built with the constructor because DataFrame.append was
#   removed in pandas 2.0. Column order follows the dict's insertion order.
eval_results_df = pd.DataFrame([{
    'transcription_factor': TF,
    'query_cell': query_cell,
    'auROC': model_results['auROC'],
    'auPRC': model_results['auPRC'],
    'iterations_trained': actual_iters_trained,
    'train_time': train_time,
}])

# Despite its name, eval_results_dir is the path of the output TSV file.
eval_results_dir = os.path.join(tf_epitome_results_dir, query_cell + "_" + TF +
                                '_no_motif_early_stop_' + str(max_valid_iterations) +
                                '.csv')
eval_results_df.to_csv(eval_results_dir, sep="\t")

### VLP Model Without Early Stopping

In [None]:
# matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
#                                                          eligible_assays = None,
#                                                          eligible_cells = None, 
#                                                          min_cells_per_assay = 2, 
#                                                          min_assays_per_cell= 2) #10)

In [None]:
TF = "JUND"
# NOTE(review): tf_epitome_results_dir was built from the earlier TF value,
# so results written for this TF still land in that directory -- confirm.
model = VLP([TF])

start = timer()
# train() is unpacked as three values (best iters, total iters, validation
# losses) everywhere else in this notebook; keep only the total iteration
# count so 'iterations_trained' in the results table is a scalar rather than
# the whole tuple.
_best_iters, iter_trained, _valid_losses = model.train(500)  # train for 500 iterations
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

model_results = model.test(1000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'], model_results['auPRC']))

In [None]:
# Re-evaluate the same model with 10000 as the first argument to test();
# this overwrites model_results from the previous cell.
model_results = model.test(10000, calculate_metrics=True)
print('Model auROC: %s. Model auPRC: %s.' % (model_results['auROC'], model_results['auPRC'])) 

In [None]:
# One-row summary of the run without early stopping, written as a
# tab-separated file. Built with the DataFrame constructor because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Column order follows the dict's insertion order.
eval_results_df = pd.DataFrame([{
    'transcription_factor': TF,
    'query_cell': query_cell,
    'auROC': model_results['auROC'],
    'auPRC': model_results['auPRC'],
    'iterations_trained': iter_trained,
    'train_time': train_time,
}])

eval_results_df.to_csv(os.path.join(tf_epitome_results_dir,
                                    query_cell + "_" + TF + '_no_motif' + '.csv'), sep="\t")

In [None]:
model_results['preds_mean'].shape

In [None]:
TF = "JUND"

# eligible_assays is passed as a list, matching every other call in this
# notebook. NOTE(review): a bare string would presumably be treated as a
# sequence of characters or matched by substring -- confirm against
# get_assays_from_feature_file.
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
                                                         eligible_assays = [TF],
                                                         eligible_cells = None,
                                                         min_cells_per_assay = 2,
                                                         min_assays_per_cell= 2)

model2 = VLP([TF],
             data = data,
             matrix = matrix,
             cellmap = cellmap,
             assaymap = assaymap)

start = timer()
# train() is unpacked as three values elsewhere in this notebook; keep only
# the total iteration count.
_best_iters, iter_trained, _valid_losses = model2.train(500)  # train for 500 iterations
end = timer()
train_time = end - start
print('epitome train: %f' % train_time)

model2_results = model2.test(10000, calculate_metrics=True)
# Report model2's own metrics (previously this printed the stale model_results
# from the earlier model).
print('Model auROC: %s. Model auPRC: %s.' % (model2_results['auROC'], model2_results['auPRC']))

In [None]:
model2_results['preds_mean'].shape

In [None]:
model2_results

In [None]:
# Load a saved .npz archive of K562/JUND predictions from the results
# directory. This file is not written anywhere in this notebook, so it must
# already exist on disk.
JUND_results = os.path.join(tf_epitome_results_dir, "K562_JUND_motif_anchor.npz")
JUND_preds = np.load(JUND_results)

In [None]:
JUND_preds['pred'].shape

### VLP Early Stop Model (Multi TF)

In [None]:
TFs = ['ZNF384',
 'ZNF274',
 'ZNF24',
 'ZNF143',
 'ZKSCAN1',
 'ZFP36',
 'ZBTB40',
 'ZBTB33',
 'YY1',
 'YBX1',
 'USF2',
 'USF1',
 'TCF7L2',
 'TCF12',
 'TBP',
 'TARDBP',
 'TAF1',
 'SUZ12',
 'SRF',
 'SP1',
 'SMC3',
 'SIN3A',
 'RNF2',
 'RFX5',
 'REST',
 'RCOR1',
 'RAD51',
 'RAD21',
 'POLR2AphosphoS5',
 'POLR2AphosphoS2',
 'POLR2A',
 'PKNOX1',
 'PHF8',
 'NRF1',
 'NR2C2',
 'NFRKB',
 'NFE2L2',
 'NBN',
 'MYC',
 'MXI1',
 'MAZ',
 'MAX',
 'MAFK',
 'KDM1A',
 'JUND',
 'JUN',
 'HDGF',
 'HDAC2',
 'HCFC1',
 'H4K20me1',
 'H3K9me3',
 'H3K9me2',
 'H3K9ac',
 'H3K79me2',
 'H3K4me3',
 'H3K4me2',
 'H3K4me1',
 'H3K36me3',
 'H3K27me3',
 'H3K27ac',
 'H3F3A',
 'H2AFZ',
 'GTF2F1',
 'GABPA',
 'FOXK2',
 'FOXA1',
 'FOS',
 'EZH2phosphoT487',
 'EZH2',
 'ETS1',
 'ESRRA',
 'EP300',
 'ELK1',
 'ELF1',
 'CTCF',
 'CHD2',
 'CHD1',
 'CEBPB',
 'BRCA1',
 'BHLHE40',
 'ATF7',
 'ATF3',
 'ATF2',
 'ARNT',
 'ARID3A']

dateTimeObj = datetime.now()
# Only used by the disabled timestamped-prefix line below.
timestampStr = dateTimeObj.strftime("%d%b%Y_%H%M")
#     prefix = prefix + "_" + timestampStr

print(TFs)

# Label used by the next cell to name the checkpoint and the saved model.
# It is now defined for every list length: previously TF_names and matrix
# were only bound when len(TFs) > 1, so a shorter list left the next cells
# running against undefined or stale values from earlier cells.
TF_names = "ALL_TFS" if len(TFs) > 1 else "_".join(TFs)

# matrix and cellmaps from feature file
feature_path = os.path.join(epitome_data_path, "feature_name")
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path,
                                                         eligible_assays = TFs,
                                                         eligible_cells = None,
                                                         min_cells_per_assay = 2,
                                                         min_assays_per_cell= 2)


In [None]:
# TRAIN_VALID_ITERS is only referenced by the disabled max_valid_records
# argument at the end of this cell.
TRAIN_VALID_ITERS = 100
# Names derived from TF_names for the checkpoint and for the saved model.
model_check_p = TF_names + "_modelcheck"
model_path = TF_names + "_model"

# Multi-TF model over the TFs list, using the matrix/maps built from it.
model = VLP(TFs,
            data = data,
            matrix = matrix,
            cellmap = cellmap,
            assaymap = assaymap) #,
#             max_valid_records = TRAIN_VALID_ITERS)

In [None]:
# patience and min_delta are defined here but only referenced by the disabled
# keyword arguments below, so this call passes TRAIN_ITERS alone.
TRAIN_ITERS, TEST_ITERS, patience, min_delta = 500, 1000, 2, 0.1
start = timer()
# train() is unpacked into three values; checkpoint_path is not passed, so
# model_check_p is not handed to train() by this cell.
best_train_iters, tot_train_iters, valid_losses = model.train(TRAIN_ITERS) #,
#                                                               checkpoint_path = model_check_p,
#                                                               patience = patience,
#                                                               min_delta = min_delta)
end = timer()
train_time = end - start

# Evaluate with TEST_ITERS as the first argument and metrics enabled.
model_results = model.test(TEST_ITERS, calculate_metrics=True)

In [None]:
model_results

In [None]:
import pickle

# Persist the results dict to file.pkl. The context manager closes the file
# even if pickle.dump raises, which the manual open/close pair did not.
with open("file.pkl", "wb") as f:
    pickle.dump(model_results, f)

In [None]:
# Read the results back from file.pkl. The context manager closes the file
# even if loading fails.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# (or another trusted source) wrote.
with open('file.pkl', 'rb') as pkl_file:
    mydict2 = pickle.load(pkl_file)

In [None]:
mydict2

In [None]:
# Same training call as the earlier multi-TF training cell, applied again to
# the existing `model` object, followed by a save to model_path.
# patience and min_delta are only referenced by the disabled keyword
# arguments below.
TRAIN_ITERS, TEST_ITERS, patience, min_delta = 500, 1000, 2, 0.1
start = timer()
# train() is unpacked into three values; only TRAIN_ITERS is passed.
best_train_iters, tot_train_iters, valid_losses = model.train(TRAIN_ITERS) #,
#                                                               checkpoint_path = model_check_p,
#                                                               patience = patience,
#                                                               min_delta = min_delta)
end = timer()
train_time = end - start
model.save(model_path)

# Evaluate the model just trained. The early-stopping arguments above are
# disabled, so this call does not pass patience/min_delta to train().
model_results = model.test(TEST_ITERS, calculate_metrics=True)

In [None]:
model_results

In [None]:
# Display the results dict as a DataFrame (not assigned to a name).
pd.DataFrame.from_dict(model_results)

In [None]:
# Restore a model from the checkpoint name and evaluate it.
# NOTE(review): model_check_p is only passed to train() in commented-out
# code in this notebook, so a checkpoint at this path may not exist -- confirm.
best_model = VLP(checkpoint= model_check_p)
best_model_results = best_model.test(TEST_ITERS, calculate_metrics=True)

### Early Stopping Test

In [2]:
# Small fixture for the early-stopping test: five cell types, DNase + CTCF,
# with DNase as the similarity assay. No feature path is passed here.
eligible_cells = ['K562', 'HepG2', 'H1', 'A549', 'HeLa-S3']
eligible_assays = ['DNase', 'CTCF']
similarity_assays = ['DNase']

matrix, cellmap, assaymap = get_assays_from_feature_file(
    eligible_assays=eligible_assays,
    similarity_assays=similarity_assays,
    eligible_cells=eligible_cells,
    min_cells_per_assay=3,
    min_assays_per_cell=1,
)
matrix.shape




(5, 2)

In [3]:
matrix

array([[249, 120],
       [396, 398],
       [364, 276],
       [535, 114],
       [614, 701]])

In [4]:
# All-ones integer array with the same (5, 2) shape as `matrix`; it appears
# only as a commented-out alternative for the matrix argument in later cells.
np.ones((5,2)).astype(int)

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1]])

In [None]:
# Model for the early-stopping test, with K562 as the test cell type.
model = VLP(
    list(eligible_assays),
    test_celltypes=['K562'],
    matrix=matrix,  # np.ones((5,2)).astype(int),
    assaymap=assaymap,
    cellmap=cellmap,
)

In [None]:
# Iteration budget for the second training call and the value passed to test().
train_iters = 200
validation_size = 10

# Train the same model twice -- first with 1, then with train_iters -- and
# call test(validation_size) after each so the two results can be compared.
# create model and train
best_model_steps, num_steps, train_valid_losses = model.train(1)
results1 = model.test(validation_size)
best_model_steps2, num_steps2, train_valid_losses2 = model.train(train_iters)
results2 = model.test(validation_size)

In [None]:
# Both evaluation runs must return one prediction row per requested record.
# NOTE(review): the previous second assertion compared a prediction value
# (results2['preds_mean'][0]) against results1's row count, which holds for
# any probability and therefore checked nothing. If a "predictions changed
# after training" check is intended, compare results2 against results1
# predictions explicitly.
assert results1['preds_mean'].shape[0] == validation_size
assert results2['preds_mean'].shape[0] == validation_size

In [None]:
best_model_steps, best_model_steps2, num_steps, num_steps2, train_valid_losses, train_valid_losses2

In [5]:
train_iters = 200
validation_size = 10

# Same test model as before, now with max_valid_records=10.
model = VLP(
    list(eligible_assays),
    test_celltypes=['K562'],
    matrix=matrix,  # np.ones((5,2)).astype(int),
    assaymap=assaymap,
    cellmap=cellmap,
    max_valid_records=10,
)

# create model and train
# NOTE: the recorded output of this cell ends in a TypeError raised from
# epitome/models.py line 359 (int() applied to a Tensor).
best_model_steps, num_steps, train_valid_losses = model.train(1)

using ['HepG2', 'HeLa-S3', 'H1', 'A549'] as labels for mode Dataset.TRAIN
using ['HepG2', 'HeLa-S3', 'H1', 'A549'] as labels for mode Dataset.TRAIN
using ['HepG2', 'HeLa-S3', 'H1', 'A549'] as labels for mode Dataset.VALID
using ['K562'] as labels for mode Dataset.TEST
Instructions for updating:
Please use `layer.add_weight` method instead.
INFO:tensorflow:Starting Training
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Num'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Num'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Num'
Please repor

TypeError: in converted code:

    /home/eecs/jahnavis/epitome_new/epitome-1/epitome/models.py:359 loopiter
        new_valid_loss = int(tf.concat(new_valid_loss, axis=0))

    TypeError: int() argument must be a string, a bytes-like object or a number, not 'Tensor'


In [None]:
# Evaluate after the single-iteration training call from the previous cell,
# then train with train_iters and evaluate again.
results1 = model.test(validation_size)
best_model_steps2, num_steps2, train_valid_losses2 = model.train(train_iters)
results2 = model.test(validation_size)

In [None]:
best_model_steps, best_model_steps2, num_steps, num_steps2, train_valid_losses, train_valid_losses2