# Testing single vs multilabel models
Alyssa's 11/6

Question: Do single-label or multi-label models perform better?

In this file, we first test the SMC3/Rad21/CTCF complex.

We then train a separate single-TF model for each TF, each for 20,000 iterations.


Answer: it depends on which TFs are chosen. Some perform better when modeled alone; others perform better when modeled together with other TFs.

# Imports

In [44]:
import collections
import datetime

import pyDNase
import tensorflow as tf
import h5py
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics
import kipoi
import os
import pybedtools
import torch
import h5sparse
import datetime
import logging

from scipy import stats

from pyDNase import GenomicInterval
from scipy.sparse import coo_matrix, vstack

from scipy.fftpack import fft, ifft

import sys

import json

# Define Paths for this user

In [2]:
# DNase BAMs have to be sorted and indexed; bin/download_dnase_encode.sh
# describes the data processing. Their location is required in constants.py:
# _ENCODE_DATA_PREFIX =  "/data/akmorrow/encode_data/"

# # where training data is stored
# deepsea_path = "/data/akmorrow/epitome_data/deepsea_train/"

# Directory passed to load_deepsea_label_data below.
data_path = "/data/akmorrow/epitome_data/deepsea_labels_train/"

# Output directory for this user.
output_path = '/home/eecs/akmorrow/epitome/out/Epitome'

# DeepSEA feature name file.
feature_path = '../data/feature_name'

In [4]:
# Execute the project's source files directly in this notebook's namespace,
# in the same order as before: constants, functions, generators, models.
# Each file is opened with `with` so the handle is closed right after reading,
# and compiled with its real path so tracebacks point at the file, not "<string>".
for _source_path in ("../constants.py",
                     "../functions.py",
                     "../generators.py",
                     "../models.py"):
    with open(_source_path) as _source_file:
        exec(compile(_source_file.read(), _source_path, "exec"))

# Load Data

### Load DeepSEA data

In [5]:
# Load the three DeepSEA label splits from data_path. Each split is indexed
# with the key "y" in the next cell; load_deepsea_label_data itself is defined
# in one of the project files exec'd above (not shown in this notebook).
train_data, valid_data, test_data = load_deepsea_label_data(data_path)


In [7]:
# Shapes of the "y" entry of each split, in the order: valid, train, test.
print(*(split["y"].shape for split in (valid_data, train_data, test_data)))

(919, 408000) (919, 4455024) (919, 455024)


# Choose cell types and assays

Here we choose SMC3, CTCF, and Rad21, which form a well-known complex and whose binding is correlated.

# Model definition

# Run on  all 3 TFs

['DNase', 'Rad21', 'CTCF', 'SMC3']

In [8]:
# Available cell types
validation_celltypes = ["K562"] # we remove hepg2 from the validation, as there are so few SMC3 cell types to begin with 
test_celltypes = ["HepG2"]


## How well did we perform on the held-out test cell type (HepG2) using all 3 TFs?

In [65]:

# Build the cell type x assay matrix. Entries equal to -1 mark combinations
# that are missing from the feature file.
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path='../../data/feature_name',
                                  eligible_assays = None,
                                  eligible_cells = None, min_cells_per_assay = 2, min_assays_per_cell=5)

# index -> assay name (the reverse of assaymap)
inv_assaymap = {v: k for k, v in assaymap.items()}

# Order the tick labels by the index stored in each map, rather than relying
# on dict iteration order happening to match those indices.
assay_labels = [inv_assaymap[i] for i in range(len(assaymap))]
cell_labels = sorted(cellmap, key=cellmap.get)

fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(matrix != -1)
ax.set_aspect('equal')
ax.set_xticks(np.arange(len(assay_labels)))
ax.set_xticklabels(assay_labels, rotation=90)
ax.set_yticks(np.arange(len(cell_labels)))
ax.set_yticklabels(cell_labels)
ax.set_xlabel("assay")
ax.set_ylabel("cell type")
ax.set_title("Assays available per cell type");


<matplotlib.image.AxesImage at 0x7f6db3e68748>

In [None]:
# Train one model on all assays in `matrix` at once (the multi-label case).
# `radii` is forwarded to MLP through its `radii` keyword and reused by the
# single-TF loop later in the notebook; its meaning is defined in models.py.
radii = [1,3,10,30]
# NOTE(review): MLP comes from the exec'd models.py, so the meaning of the
# positional arguments is not visible here. 4 and [100, 100, 100, 50] are
# presumably the number of layers and the layer widths -- TODO confirm.
model = MLP(4, [100, 100, 100, 50], 
            tf.tanh, 
            train_data, 
            valid_data, 
            test_data, 
            test_celltypes,
            gen_from_peaks, 
            matrix,
            assaymap,
            cellmap,
            shuffle_size=2, 
            radii=radii)
# 20,000 training iterations, as stated in the notebook introduction.
model.train(20000)


eval cell types ['SK-N-SH_RA', 'K562', 'HeLa-S3', 'H1-hESC', 'GM12878', 'A549']
using ['SK-N-SH_RA', 'K562', 'HeLa-S3', 'H1-hESC', 'GM12878', 'A549'] as labels for mode Dataset.TRAIN
using ['SK-N-SH_RA', 'K562', 'HeLa-S3', 'H1-hESC', 'GM12878', 'A549'] as labels for mode Dataset.VALID
using ['HepG2'] as labels for mode Dataset.TEST
INFO:tensorflow:Scale of 0 disables regularizer.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Initializing variables
INFO:tensorflow:Starting Training
INFO:tensorflow:1000 0.606929
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.959133762756
INFO:tensorflow:Our micro AUC:     0.959133762756
INFO:tensorflow:
INFO:tensorflow:2000 0.0278339
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.95978293593
INFO:tensorflow:Our micro AUC:     0.95978293593
INFO:tensorflow:
INFO:tensorflow:3000 0.00861721
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.96583775779
INFO:tensorflow:Our micro AUC:     0.96583775779
INFO:tensorflow:
INFO:tensorflow:4000 0.0156686
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.961813978834
INFO:tensorflow:Our micro AUC:     0.961813978834
INFO:tensorflow:
INFO:tensorflow:5000 0.254632
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.965727189548
INFO:tensorflow:Our micro AUC:     0.965727189548
INFO:tensorflow:


In [19]:
# Evaluate on the test split. 455024 equals the second dimension of
# test_data["y"] printed earlier, so this presumably covers every test
# example -- TODO confirm against MLP.test in models.py.
test_DNase = model.test(455024, log=True)


# INFO:tensorflow:2018-12-05 12:19:02.977124: 0, DNase, NaN
# INFO:tensorflow:2018-12-05 12:19:02.979063: 1, Rad21, 0.936362
# INFO:tensorflow:2018-12-05 12:19:03.135981: 2, CTCF, 0.972709
# INFO:tensorflow:2018-12-05 12:19:03.300613: 3, SMC3, 0.905783

INFO:tensorflow:Our macro AUC:     0.958331269719
INFO:tensorflow:Our micro AUC:     0.958331269719


  precision = tps / (tps + fps)


In [22]:
# Display the value returned by model.test (shown in the output below).
test_DNase

(array([[ 0.00776174,  0.0111055 ,  0.01409647],
        [ 0.00702672,  0.00960908,  0.01299283],
        [ 0.00754961,  0.01068342,  0.01376804],
        ..., 
        [ 0.01300736,  0.02671668,  0.02711702],
        [ 0.0109169 ,  0.01609266,  0.01691344],
        [ 0.07065497,  0.11577307,  0.08959483]], dtype=float32),
 array([[ 0.,  0.,  0.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.]], dtype=float32),
 {'CTCF': {'AUC': 0.96921207273245069,
   'GINI': 0.93842455835965499,
   'auPRC': 0.66959315982241596},
  'Rad21': {'AUC': 0.94812406335683352,
   'GINI': 0.89625013512858376,
   'auPRC': 0.60837253216734022},
  'SMC3': {'AUC': 0.95765767306660565,
   'GINI': 0.70143988061727014,
   'auPRC': 0.61844290285714454}},
 0.95833126971862992,
 0.95833126971862992)

# Running each TF on its own (single-label models)

In [66]:
# All cell types currently in cellmap are eligible for the single-TF runs.
eligible_cells = [*cellmap]
# Every assay except the one at position 0 of assaymap
# (get_assays_from_feature_file places DNase there).
factors = [*assaymap][1:]
factors

['p300',
 'c-Myc',
 'c-Jun',
 'c-Fos',
 'Znf143',
 'ZZZ3',
 'ZNF274',
 'ZBTB7A',
 'ZBTB33',
 'YY1',
 'USF2',
 'USF-1',
 'TR4',
 'TFIIIC-110',
 'TEAD4',
 'TCF7L2',
 'TCF12',
 'TBP',
 'TBLR1',
 'TAF7',
 'TAF1',
 'Sin3Ak-20',
 'STAT5A',
 'STAT3',
 'STAT1',
 'SRF',
 'SP2',
 'SP1',
 'SMC3',
 'SIX5',
 'SIN3A',
 'Rad21',
 'RXRA',
 'RPC155',
 'RFX5',
 'RBBP5',
 'Pol3',
 'Pol2-4H8',
 'Pol2(phosphoS2)',
 'Pol2(b)',
 'Pol2',
 'PU.1',
 'POU2F2',
 'PML',
 'PAX5-C20',
 'Nrf1',
 'NRSF',
 'NFKB',
 'NFIC',
 'NF-YB',
 'NF-YA',
 'NF-E2',
 'Mxi1',
 'Max',
 'MafK',
 'MafF',
 'MEF2A',
 'MAZ',
 'JunD',
 'Ini1',
 'IRF3',
 'HDAC2',
 'HA-E2F1',
 'GTF2F1',
 'GATA3',
 'GATA-2',
 'GABP',
 'FOXA1',
 'FOSL2',
 'FOSL1',
 'Egr-1',
 'EZH2',
 'ETS1',
 'ELK1',
 'ELF1',
 'E2F6',
 'E2F4',
 'CTCF',
 'COREST',
 'CHD2',
 'CHD1',
 'CEBPB',
 'Brg1',
 'Bach1',
 'BRF2',
 'BRF1',
 'BRCA1',
 'BHLHE40',
 'BDP1',
 'BCLAF1',
 'BCL3',
 'BCL11A',
 'ATF3',
 'ATF2',
 'ARID3A']

In [67]:

def get_assays_from_feature_file(feature_path, 
                                 eligible_assays = None, 
                                 eligible_cells = None,
                                 min_cells_per_assay= 3, 
                                 min_assays_per_cell = 2):
    ''' Parses a feature name file from DeepSea. File can be found in repo at ../data/feature_name.
    Returns a matrix of cell types by assays for the cell types and assays that pass the filters.

    Ordering: cells are sorted in reverse alphabetical order. Assays are sorted in
    reverse alphabetical order as well, except that DNase is always moved to
    column 0, because the model assumes the first column specifies DNase.

    Args:
        :param feature_path: location of the feature name file. Each line is tab separated
        and its second field looks like '8988T|DNase|None' (cell|assay|...).
        :param eligible_assays: list of assays to filter by (ie ["CTCF", "EZH2", ..]). If None, then returns all assays.
        Note that DNase will always be included in the factors, as it is required by the method.
        :param eligible_cells: list of cells to filter by (ie ["HepG2", "GM12878", ..]). If None, then returns all cell types.
        :param min_cells_per_assay: number of cell types an assay must have to be considered
        :param min_assays_per_cell: number of assays a cell type must have to be considered. Includes DNase.
    Returns
        matrix: integer cell type by assay matrix. An entry holds (line number - 1) of the
        matching line in the feature file, or -1 if the cell type has no such assay.
        cellmap: dict of cell type -> row index in matrix
        assaymap: dict of assay -> column index in matrix
    Raises
        Exception: if eligible_assays or eligible_cells is too short to ever satisfy
        min_assays_per_cell or min_cells_per_assay.
        ValueError: if DNase is not among the assays that survive the filters.
    '''

    # check argument validity
    if min_assays_per_cell < 2:
        print("Warning: min_assays_per_cell should not be < 2 (this means it only has DNase) but was set to %i" % min_assays_per_cell)

    if min_cells_per_assay < 2:
        print("Warning: min_cells_per_assay should not be < 2 (this means you may only see it in test) but was set to %i" % min_cells_per_assay)

    # `is not None` (instead of `!= None`) keeps these checks valid for array-like arguments.
    if eligible_assays is not None:
        # +1 accounts for DNase, which is always included
        if len(eligible_assays) + 1 < min_assays_per_cell:
            raise Exception("""%s is less than the minimum assays required (%i). 
            Lower min_assays_per_cell to (%i) if you plan to use only %i eligible assays""" \
                            % (eligible_assays, min_assays_per_cell, len(eligible_assays)+1, len(eligible_assays)))

    if eligible_cells is not None:
        # NOTE(review): the +1 mirrors the assay check above, but no cell type is added
        # implicitly. Kept as-is so existing callers see the same behaviour -- confirm intent.
        if len(eligible_cells) + 1 < min_cells_per_assay:
            raise Exception("""%s is less than the minimum cells required (%i). 
            Lower min_cells_per_assay to (%i) if you plan to use only %i eligible cells""" \
                            % (eligible_cells, min_cells_per_assay, len(eligible_cells)+1, len(eligible_cells)))

    # Only lines 1..815 of the file are used (line 0 is skipped). Per the original
    # notes these are the DNase and TF rows; histone information is not included.
    first_line, last_line = 1, 815

    indexed_assays = {}    # dict of {cell: {assay: index} }
    with open(feature_path) as f:
        for i, l in enumerate(f):
            if not (first_line <= i <= last_line):
                continue # skip first row and rows past the TFs

            # for example, split '8988T|DNase|None'
            cell, assay = l.split('\t')[1].split('|')[:2]

            # check if cell and assay is valid
            valid_cell = (eligible_cells is None) or (cell in eligible_cells)
            valid_assay = (eligible_assays is None) or (assay in eligible_assays) or (assay == "DNase")

            # if cell and assay is valid, record the index of the assay
            if valid_cell and valid_assay:
                indexed_assays.setdefault(cell, {})[assay] = i - 1

    # keep only cell types that have DNase and at least min_assays_per_cell assays
    indexed_assays = {cell: assays for cell, assays in indexed_assays.items()
                      if 'DNase' in assays and len(assays) >= min_assays_per_cell}

    # count in how many of the remaining cell types each assay occurs
    cells_per_assay = collections.Counter(assay
                                          for assays in indexed_assays.values()
                                          for assay in assays)

    # assays that meet the min_cells_per_assay criterion
    valid_assays = {assay for assay, n_cells in cells_per_assay.items()
                    if n_cells >= min_cells_per_assay}

    # remove assays that do not meet the criterion from every cell type.
    # Cell types are not filtered again afterwards.
    indexed_assays = {cell: {assay: index for assay, index in assays.items() if assay in valid_assays}
                      for cell, assays in indexed_assays.items()}

    # sort cells and assays in reverse alphabetical order
    cells = sorted(indexed_assays, reverse=True)
    potential_assays = sorted(valid_assays, reverse=True)

    # make sure DNase is first assay. This is because the model
    # assumes the first column specifies DNase
    if "DNase" not in potential_assays:
        raise ValueError("DNase is not among the assays that pass the filters; "
                         "check feature_path, eligible_cells, min_cells_per_assay and min_assays_per_cell")
    potential_assays.remove("DNase")
    potential_assays.insert(0, "DNase")

    cellmap = {cell: i for i, cell in enumerate(cells)}
    assaymap = {assay: i for i, assay in enumerate(potential_assays)}

    # -1 marks cell type/assay combinations that do not exist
    matrix = np.full((len(cellmap), len(assaymap)), -1, dtype=int)
    for cell in cells:
        for assay, index in indexed_assays[cell].items():
            matrix[cellmap[cell], assaymap[assay]] = index

    return matrix, cellmap, assaymap


In [None]:

# Train and evaluate one model per assay (DNase + that assay) and append the
# per-assay test metrics to a results file. For each assay the file gets its
# name on one line, followed by either a JSON line with the metrics or
# "<assay> failed".
#
# Note: this loop overwrites the notebook-level `matrix`, `cellmap`,
# `assaymap` and `model` on every iteration.

# Stamp the file name with the full date and time. Formatting `.time()` would
# drop the date, and %Y-%m-%d would then always render as 1900-01-01.
time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

# `with` closes the file even if the run is interrupted; `file` stays bound
# (closed) afterwards, so later cells can still reference it.
with open('/data/akmorrow/epitome_data/out/tmp_prediction_aucs_singleTFs_%s.json' % time, 'w') as file:

    for assay in factors:

        print(" Running on assay %s..." % assay)

        label_assays = ['DNase', assay]

        try:
            matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path='../../data/feature_name', eligible_assays = ["DNase", assay],
                                         eligible_cells = eligible_cells)
            print(cellmap)
            print(test_celltypes)

            # record which assay is running before the long training step
            file.write('%s\n' % (assay))
            file.flush()

            model = MLP(4, [100, 100, 100, 50],
                        tf.tanh,
                        train_data,
                        valid_data,
                        test_data,
                        test_celltypes,
                        gen_from_peaks,
                        matrix,
                        assaymap,
                        cellmap,
                        shuffle_size=2,
                        radii=radii)
            model.train(20000)

            test_DNase_1 = model.test(455024, log=True)

            # element 2 of the result is the metrics dict that gets serialized
            file.write(json.dumps(test_DNase_1[2]))
            file.write("\n")

            # flush to file
            file.flush()
        except Exception as err:
            # Best effort: a failing assay must not stop the sweep. Catching
            # Exception (not a bare except) still lets Ctrl-C interrupt it.
            print("%s failed: %r" % (assay, err))
            # newline-terminated so the next assay name starts on its own line
            file.write("%s failed\n" % assay)
            file.flush()

 Running on assay p300...
{'T-47D': 0, 'SK-N-SH_RA': 1, 'K562': 2, 'HepG2': 3, 'HeLa-S3': 4, 'H1-hESC': 5, 'GM12878': 6, 'A549': 7}
['HepG2']
eval cell types ['T-47D', 'SK-N-SH_RA', 'K562', 'HeLa-S3', 'H1-hESC', 'GM12878', 'A549']
using ['T-47D', 'SK-N-SH_RA', 'K562', 'HeLa-S3', 'H1-hESC', 'GM12878', 'A549'] as labels for mode Dataset.TRAIN
using ['T-47D', 'SK-N-SH_RA', 'K562', 'HeLa-S3', 'H1-hESC', 'GM12878', 'A549'] as labels for mode Dataset.VALID
using ['HepG2'] as labels for mode Dataset.TEST




INFO:tensorflow:Scale of 0 disables regularizer.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Initializing variables
INFO:tensorflow:Starting Training
INFO:tensorflow:1000 0.419568
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.851044764635
INFO:tensorflow:Our micro AUC:     0.851044764635
INFO:tensorflow:
INFO:tensorflow:2000 0.0270678
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.845228054516
INFO:tensorflow:Our micro AUC:     0.845228054516
INFO:tensorflow:
INFO:tensorflow:3000 0.0591824
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.850524763949
INFO:tensorflow:Our micro AUC:     0.850524763949
INFO:tensorflow:
INFO:tensorflow:4000 0.0548691
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.84987389615
INFO:tensorflow:Our micro AUC:     0.84987389615
INFO:tensorflow:
INFO:tensorflow:5000 0.0516278
INFO:tensorflow:On validation
INFO:tensorflow:Our macro AUC:     0.812337199814
INFO:tensorflow:Our micro AUC:     0.812337199814
INFO:tensorflow:
INFO:tensorflow:6000 0.434922
INFO:tenso

In [None]:
# Display the results file object from the single-TF loop above
# (that cell closes it once the loop finishes).
file

############## END TRIO EXPERIMENTS ##############