# Demonstration Notebook


# Imports

In [None]:
import collections
import datetime

import tensorflow as tf
import h5py
from scipy.io import loadmat

import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics
import os
import h5sparse
import datetime
import logging

from scipy import stats

from scipy.sparse import coo_matrix, vstack
from epitome.functions import *
from epitome.models import *
import datetime
import sys
import scipy.stats as stats
import yaml
import h5py

# Load Data

In [None]:
# Parse the YAML configuration file. Later cells read the
# 'epitome_data_dir' and 'deepsea_data_path' entries from the result.
with open('/home/eecs/akmorrow/epitome/config.yml') as config_file:
    config = yaml.safe_load(config_file)
# Bare expression so the notebook echoes the parsed configuration.
config

In [None]:

# Load the train, validation and test splits from the directory configured
# under config['epitome_data_dir'].
train_data, valid_data, test_data = load_epitome_data(config['epitome_data_dir'])


In [None]:
# Key each split by its Dataset member so later cells can select a split by
# mode, then print the shape of every split (train, valid, test order).
data = dict(zip((Dataset.TRAIN, Dataset.VALID, Dataset.TEST),
                (train_data, valid_data, test_data)))
print(*(split.shape for split in data.values()))


# Validation and Test Cell Types

## Get matrix of cell types and assays

In [None]:
# Assays that are eligible when building the cell type x assay matrix.
assays = [
    'DNase', 'ZZZ3', 'ZNF274', 'ZBTB7A', 'ZBTB33', 'YY1', 'USF2', 'TEAD4',
    'TCF7L2', 'TCF12', 'TBP', 'TAF1', 'STAT5A', 'STAT3', 'STAT1', 'SRF',
    'SP2', 'SP1', 'SMC3', 'SIX5', 'RXRA', 'RFX5', 'POU2F2', 'PML',
    'NFIC', 'MEF2A', 'MAZ', 'IRF3', 'HDAC2', 'GTF2F1', 'FOXA1', 'FOSL2',
    'EZH2', 'ETS1', 'ELK1', 'ELF1', 'E2F6', 'E2F4', 'CTCF', 'CHD2',
    'CHD1', 'CEBPB', 'BRCA1', 'BHLHE40', 'BCLAF1', 'BCL3', 'ATF3', 'ARID3A',
]

# Cell types that are eligible for the same matrix.
cells = [
    'MCF-7', 'K562', 'HepG2', 'HeLa-S3',
    'GM12892', 'GM12891', 'GM12878', 'A549',
]

# Sanity check: number of cell types, then number of assays.
print(len(cells), len(assays))

In [None]:
# Build the availability matrix plus the cell-type and assay lookup maps from
# the epitome feature-name file, restricted to the eligible assays/cells.
matrix, cellmap, assaymap = get_assays_from_feature_file(
    EPITOME_FEATURE_NAME_FILE,
    eligible_assays=assays,
    eligible_cells=cells,
    min_cells_per_assay=2,
    min_assays_per_cell=2)

# Single axes with one x tick per assay and one y tick per cell type,
# labelled with the keys of the corresponding map.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_aspect('equal')
plt.xticks(np.arange(len(assaymap)), rotation=90)
ax.set_xticklabels(assaymap.keys())
plt.yticks(np.arange(len(cellmap)))
ax.set_yticklabels(cellmap.keys())

# Boolean image: True wherever the matrix entry differs from -1
# (presumably -1 marks a missing cell type / assay combination).
plt.imshow(matrix != -1)
print(len(assaymap), len(cellmap))

In [None]:
# DeepSEA counterpart of the epitome loading/plotting cells: load the label
# splits from config["deepsea_data_path"] and key them by Dataset member.
dtrain_data, dvalid_data, dtest_data = load_deepsea_label_data(config["deepsea_data_path"])
ddata = dict(zip((Dataset.TRAIN, Dataset.VALID, Dataset.TEST),
                 (dtrain_data, dvalid_data, dtest_data)))
print(*(split.shape for split in ddata.values()))

# Same eligibility filters as for the epitome matrix, but read from the
# DeepSEA feature-name file.
dmatrix, dcellmap, dassaymap = get_assays_from_feature_file(
    DEEPSEA_FEATURE_NAME_FILE,
    eligible_assays=assays,
    eligible_cells=cells,
    min_cells_per_assay=2,
    min_assays_per_cell=2)

# One x tick per assay, one y tick per cell type, labelled from the maps.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_aspect('equal')
plt.xticks(np.arange(len(dassaymap)), rotation=90)
ax.set_xticklabels(dassaymap.keys())
plt.yticks(np.arange(len(dcellmap)))
ax.set_yticklabels(dcellmap.keys())

# True wherever the matrix entry differs from -1.
plt.imshow(dmatrix != -1)

# Run the Model

In [None]:
import multiprocessing

# Cell type passed to MLP as the test cell type list.
test_celltypes = ['A549']  # most available cell types

# Model over the epitome splits and the epitome matrix/maps.
model = MLP(data, test_celltypes, matrix, assaymap, cellmap,
            shuffle_size=2, batch_size=64)

# Short run (train(10)), then score the regions in the BED file; as the last
# expression of the cell the scoring result is echoed by the notebook.
model.train(10)
model.score_peak_file("/home/eecs/akmorrow/epitome/data/test.bed")

In [None]:
from epitome.models import *

# Cell type passed to MLP as the test cell type list.
test_celltypes = ['A549']  # most available cell types

# Second model: same settings as `model`, but built from the DeepSEA label
# data and the DeepSEA matrix/maps (ddata, dmatrix, dassaymap, dcellmap).
model2 = MLP(ddata,
             test_celltypes,
             dmatrix,
             dassaymap,
             dcellmap,
             shuffle_size=2,
             batch_size=64)

model2.train(3000)
# Score with model2, the model trained in this cell, rather than `model`
# from the previous cell.
model2.score_peak_file("/home/eecs/akmorrow/epitome/data/test.bed")

In [None]:
# Rebuild `model` on the epitome data with the same settings as before and
# train it with a larger argument (train(5000)) than the first short run.
test_celltypes = ['A549']  # most available cell types

model = MLP(data, test_celltypes, matrix, assaymap, cellmap,
            shuffle_size=2, batch_size=64)

model.train(5000)



In [None]:
# Score the regions listed in test.bed with `model` and keep the result in
# `res` for inspection.
res = model.score_peak_file("/home/eecs/akmorrow/epitome/data/test.bed")


In [None]:
from epitome.generators import *


def _validation_dataset():
    """Build a validation dataset from the epitome data using `model`'s settings.

    Reads the notebook globals `data` and `model`. Returns whatever
    generator_to_tf_dataset returns (a 2-tuple at the call sites below).
    """
    examples = load_data(data[Dataset.VALID],
                         model.eval_cell_types,
                         model.eval_cell_types,
                         model.matrix,
                         model.assaymap,
                         model.cellmap,
                         radii=model.radii, mode=Dataset.VALID)
    return generator_to_tf_dataset(examples, model.batch_size, 1, model.prefetch_size)


# Both models are evaluated on validation data generated from the epitome
# splits with `model`'s matrix and maps; a fresh dataset is built for each
# evaluation (presumably because the first pass consumes it).
# NOTE(review): model2 was trained on ddata/dmatrix but is fed examples
# built from data/model.matrix here - confirm the feature layouts agree.
_, g = _validation_dataset()
results2_epitome = model2.test_from_generator(4000 * len(model2.eval_cell_types), g)

_, g = _validation_dataset()
results_original_epitome = model.test_from_generator(4000 * len(model.eval_cell_types), g)




In [None]:

from epitome.viz import *

# Element 3 of each result is indexed by assay name; the keys of its "CTCF"
# entry are the metric names. Draw one joint plot per metric, comparing the
# model2 results against the `model` results.
for metric in results2_epitome[3]["CTCF"]:
    joint_plot(results2_epitome[3],
               results_original_epitome[3],
               metric=metric,
               model1_name="DeepSEA_data",
               model2_name="new_dataset",
               outlier_filter='new_dataset < DeepSEA_data')

# Look for correlation between cell type counts and performance

In [None]:
# Two independent copies of the matrix with the row of the first test cell
# type removed (axis 0); `matrix` itself is left unchanged.
matrix2 = np.delete(np.copy(matrix), cellmap[test_celltypes[0]], axis=0)
matrix3 = np.delete(np.copy(matrix), cellmap[test_celltypes[0]], axis=0)

In [None]:


# Binarise matrix2 in place: entries >= 0 become 1, entries equal to -1
# become 0. The column sums then count, for every column, how many of the
# remaining rows have an entry.
matrix2[np.greater_equal(matrix2, 0)] = 1
matrix2[np.equal(matrix2, -1)] = 0
tmp = matrix2.sum(axis=0)

In [None]:
# assay_dict comes from model.test_from_generator in the evaluation cell
# further down this notebook, so that cell has to be run before this one.
t = [(assay, scores['AUC']) for assay, scores in assay_dict.items()]
# NOTE(review): zip pairs the i-th column count in tmp with the i-th entry
# of assay_dict - this assumes both follow the same assay order; confirm.
zipped = zip(tmp, t)

# Keep only the (count, (assay, AUC)) pairs whose AUC is defined.
k = [pair for pair in zipped if not np.isnan(pair[1][1])]

x = [count for count, _ in k]
y = [auc for _, (_, auc) in k]


fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('Number of cell types for each factor in train')
ax.set_ylabel('AUC for each factor')

# Label the poorly performing factors. With textcoords='offset points' the
# label position is an offset from the data point, so xytext is given
# explicitly; left unset it defaults to the data coordinates themselves.
for count, (label, auc) in k:
    if auc < 0.7:
        ax.annotate(label, (count, auc),
                    xytext=(-20, 20),
                    textcoords='offset points', ha='right', va='bottom',
                    bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))



In [None]:
# for each assay, count positives in train
# For each assay with a defined AUC, sum train_data["y"] over the rows named
# by the non-negative entries of that assay's column in matrix3, and store
# the total together with the AUC.
# NOTE(review): train_data is indexed with "y" here, while the "Load Data"
# cells call .shape on the split directly - confirm which layout is current.
results_dict = {}

for assay, res in assay_dict.items():
    if np.isnan(res['AUC']):
        continue

    x = matrix3[:, assaymap[assay]]
    filtered = x[np.nonzero(x >= 0)[0]]

    sum_ = np.sum(train_data["y"][filtered, :])
    results_dict[assay] = (sum_, res['AUC'])




In [None]:
# Echo macroAUC, which is bound by the model.test_from_generator call in the
# "Restore Model" part of this notebook (that cell must have been run first).
macroAUC

In [None]:

# results_dict maps assay -> (number of positives, AUC).
x = [positives for positives, _ in results_dict.values()]
y = [auc for _, auc in results_dict.values()]

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('Number of positive examples for factor')
ax.set_ylabel('AUC for each factor')
ax.set_xscale('symlog')

# Label factors with a low AUC or a very large positive count. With
# textcoords='offset points' the label position is an offset from the data
# point, so xytext is given explicitly; left unset it defaults to the data
# coordinates, which here would push labels hundreds of thousands of points
# away from their markers.
for label, (positives, auc) in results_dict.items():
    if auc < 0.6 or positives > 400000:
        ax.annotate("%s: %i" % (label, positives), (positives, auc),
                    xytext=(-20, 20),
                    textcoords='offset points', ha='right', va='bottom',
                    bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

# Results

- 96 TFs test GM12878 0.8819470290234287
- 96 TFs test K562    0.8340156671491048
- 96 TFs test K562 with filtering 0.7928056830435212
- 96 TFs test K562 (one-hot encoding) 0.8608675037689738 (seems to help some)



Does it help to use "one hot" encoding? (yes)
Remove TFs with too few positives

In [None]:
# Save the current `model` to a checkpoint file.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
model.save("/data/akmorrow/epitome_data/saved_models/test_model_full_2.ckpt")


# Restore Model

In [None]:

# Construct an MLP and restore it from a previously saved checkpoint.
# NOTE(review): this call passes (4, [100, 100, 100, 50], tf.tanh, train_data,
# valid_data, test_data, [], gen_from_peaks, ...) positionally, whereas the
# "Run the Model" cells construct MLP(data, test_celltypes, matrix, assaymap,
# cellmap, ...). The two call shapes cannot both match one constructor -
# confirm which epitome.models version this cell targets.
# Presumably 4 / [100, 100, 100, 50] / tf.tanh are the layer count, layer
# widths and activation - TODO confirm against the MLP signature.
model  = MLP(4, [100, 100, 100, 50], 
            tf.tanh, 
            train_data, 
            valid_data, 
            test_data, 
            [],
            gen_from_peaks, 
            matrix,
            assaymap,
            cellmap,
            shuffle_size=2, 
            radii=[1,3,10,30])

model.restore('/data/akmorrow/epitome_data/saved_models/model__2019_04_01__14_48.ckpt')


In [None]:

# load in all cell types for evaluation (64 cell types)
# eligible_cells=None is passed instead of the `cells` list used earlier, and
# the eligible assays are the keys of the existing assaymap.
# NOTE(review): `feature_path` is not assigned anywhere in the visible
# notebook (earlier cells pass EPITOME_FEATURE_NAME_FILE positionally) -
# confirm it is defined before this cell runs.
all_matrix, all_cellmap, all_assaymap = get_assays_from_feature_file(feature_path=feature_path,eligible_assays = list(assaymap),
                                  eligible_cells = None, min_cells_per_assay = 2, min_assays_per_cell=2)


In [None]:
test_celltype = "A549"

# Start from the keys of cellmap as a fresh list.
eval_cell_types = list(cellmap)

# If the test cell type is among the evaluation cell types, swap it for a
# stand-in so the list keeps its length.
if test_celltype in eval_cell_types:
    # TODO AM 4/1/2019 maybe don't hardcode
    new_eval_celltype = "NT2-D1" if test_celltype == "PANC-1" else "PANC-1"

    print("removing %s from eval_celltypes and replacing with %s" % (test_celltype, new_eval_celltype))
    eval_cell_types.remove(test_celltype)
    eval_cell_types.append(new_eval_celltype)

# Iterator over the test data for the single test cell type, using the
# all-cell-type matrix and cell map together with the original assaymap.
_, iter_ = generator_to_one_shot_iterator(
    gen_from_peaks(test_data,
                   [test_celltype],
                   eval_cell_types,
                   all_matrix,
                   assaymap,
                   all_cellmap,
                   radii=model.radii, mode=Dataset.TEST),
    model.batch_size, 1, model.prefetch_size)



In [None]:

# Evaluate `model` on the iterator built in the previous cell and unpack the
# five results; assay_dict and macroAUC are read by the analysis cells
# earlier in this notebook.
# NOTE(review): test_data["y"].shape[1] is presumably the number of examples
# to evaluate, and test_data is indexed like a mapping here while the
# "Load Data" cells call .shape on the split directly - confirm.
preds, truth, assay_dict, microAUC, macroAUC = model.test_from_generator(test_data["y"].shape[1], iter_, log=True)


In [None]:
def bin_total(y_true, y_prob, n_bins):
    """Count how many predicted probabilities fall into each equal-width bin.

    The range [0, 1 + 1e-8] is divided into ``n_bins`` bins; the small
    epsilon on the upper edge keeps a probability of exactly 1.0 inside the
    last real bin.

    Args:
        y_true: Not used by this function.
        y_prob: Array-like of predicted probabilities.
        n_bins: Number of bins.

    Returns:
        Integer array with at least ``n_bins + 1`` entries holding the count
        per bin. As in sklearn.calibration.calibration_curve, the final
        entry is always 0 for probabilities up to 1.
    """
    edges = np.linspace(0., 1. + 1e-8, n_bins + 1)

    # side='right' yields the same indices as np.digitize with its default
    # right=False on increasing edges.
    bin_index = np.searchsorted(edges, y_prob, side='right') - 1

    return np.bincount(bin_index, minlength=edges.size)


In [None]:
# Materialise the assaymap keys as a list so the calibration-plot cell can
# look assay names up by position.
list_assaymap = list(assaymap)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms

fig, ax = plt.subplots()
# only these two lines are calibration curves
for i in range(truth.shape[1]):
    logreg_y, logreg_x = calibration_curve(truth[:,i], preds[:,i], n_bins=10)
 
    if (not np.isnan(assay_dict[list_assaymap[i+1]]["AUC"])): # test cell type does not have all factors!
        plt.plot(logreg_x,logreg_y, marker='o', linewidth=1, label=list_assaymap[i+1])

        
# reference line, legends, and axis labels
line = mlines.Line2D([0, 1], [0, 1], color='black')
transform = ax.transAxes
line.set_transform(transform)
ax.add_line(line)
fig.suptitle('Calibration plot for for A549 test regions')
ax.set_xlabel('Predicted probability')
ax.set_ylabel('True probability in each bin')

ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),
          ncol=2, fancybox=True, shadow=True)

plt.show()