In [1]:
%load_ext autoreload
%autoreload 2

In [245]:
!pip install ipynb



# Imports

In [258]:
from epitome.models import *
from epitome.functions import *
from epitome.generators import *
from epitome.dataset import *

import tempfile
import os
from IPython.core.debugger import set_trace

import numpy as np

# from ipynb.fs.defs.debug_score_matrix import WrapperModel

# Setup

In [2]:
class WrapperModel(EpitomeModel):
    def score_matrix(self, accessibility_peak_matrix, regions):
        """ Runs predictions on a matrix of accessibility peaks, where columns are samples and
        rows are regions from regions_peak_file. rows in accessilibility_peak_matrix should matching

        :param numpy.matrix accessilibility_peak_matrix:  of (samples by genomic regions)
        :param str regions: either narrowpeak or bed file containing regions to score, OR a pyranges object
            with columns [Chomosome, Start, End, idx]. Index matches each genomic region to a row in
            accessilibility_peak_matrix. In both cases, number of regions Should
            match rows in accessilibility_peak_matrix

        :return: 3-dimensional numpy matrix of predictions: sized (samples by regions by ChIP-seq targets)
        :rtype: numpy matrix
        """

        conversionObject = RegionConversion(self.dataset.regions, regions)

        results = []
        # print(accessibility_peak_matrix.shape)
        matrix, indices = conversionObject.get_binary_vector(vector = accessibility_peak_matrix[0,:])
        gen = load_data_runtime(data=self.dataset.get_data(Dataset.ALL),
                 label_cell_types=self.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
                 eval_cell_types=self.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 matrix=self.dataset.matrix,
                 targetmap=self.dataset.targetmap,
                 cellmap=self.dataset.cellmap,
                 radii = self.radii,
                 mode = Dataset.RUNTIME,
                 similarity_matrix = matrix,
                 similarity_targets = ['DNase'],
                 indices = indices,
                 return_feature_names=False)

        to_stack = load_data_no_label_mask(data=self.dataset.get_data(Dataset.ALL),
                 label_cell_types=self.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
                 eval_cell_types=self.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 matrix=self.dataset.matrix,
                 targetmap=self.dataset.targetmap,
                 cellmap=self.dataset.cellmap,
                 radii = self.radii,
                 mode = Dataset.RUNTIME,
                 similarity_matrix = matrix,
                 similarity_targets = ['DNase'],
                 indices = indices,
                 return_feature_names=True)

        gen_to_list = list(gen())
        to_stack = list(to_stack())
        gen_to_list = np.array(gen_to_list)

        # reshape to n_regions [from regions] x nassays [acc dim 1] x n_samples
        radii = self.radii
        
        set_trace()

        stacked = np.stack([to_stack] * accessibility_peak_matrix.shape[0], axis=0)
        names = stacked[:, :, 1]
        to_stack = stacked[:, :, 0]
        to_stack = np.expand_dims(to_stack, axis=-1)

        same_size = accessibility_peak_matrix.shape[1] == len(conversionObject.joined.idx_base)

        if not same_size:
            added_indices = []
            old_idx, counter, old_i = 0, 0, 0
            indices_to_merge = []
            for ctr, (i, i_base) in enumerate(zip(conversionObject.joined.idx, conversionObject.joined.idx_base)):
                if i_base == -1:
                    continue
                if i != old_i:
                    indices_to_merge.append((old_idx, counter))
                    old_idx = counter
                added_indices.append(accessibility_peak_matrix[:, i])
                counter += 1
                old_i = i
            indices_to_merge.append((old_idx, len(conversionObject.joined.idx)))
            
            a = np.stack(added_indices)
        else:
            a = np.transpose(accessibility_peak_matrix, axes=[1, 0])
        
        
        a = a[:, None, :]
        
        set_trace()
        
        out = compute_casv(gen_to_list, a, radii)
        
        set_trace()

        casv_len = out.shape[1]
        num_cells = out.shape[3]
        num_regions = out.shape[0]
        num_celltypes = out.shape[2]
        num_targets = len(self.dataset.targets) if 'DNase' in self.dataset.targets else len(self.dataset.targets) + 1

        print(out)
        print(out.shape)
        # assert False

        for region in range(num_regions):
            for cell in range(num_cells):

                selected_gen = to_stack[cell, region, :]
                selected_casv = out[region, :, :, cell]

                len_feats_per_celltype = int(selected_gen[0].shape[0] / num_celltypes) # 24 / 2 = 12

                old_sg = selected_gen

                for celltype in range(num_celltypes):
                    # print(celltype)
                    idx = len_feats_per_celltype * celltype
                    casv_cell = selected_casv[:, celltype]
                    # print(idx + 4, idx + len_feats_per_celltype)
                    # print(names)
                    # print(num_targets)
                    selected_gen[0][idx + num_targets : idx + len_feats_per_celltype] = casv_cell
                    # assert np.any()

        results = []
        for c in range(num_cells):
            for r in range(num_regions):
                results.append(self._predict(to_stack[c, r, :][0][None, :]))
        
        
        results = np.stack(results)
        results = results.reshape((gen_to_list.shape[1], gen_to_list.shape[0])) # 4 x 5

        if not same_size:
            final = []
            final = np.empty((accessibility_peak_matrix.shape[0], accessibility_peak_matrix.shape[1], 1))
            final.fill(np.nan)
            for tup in indices_to_merge:
                final[:, 0, 0] = np.mean(results[:, tup[0]:tup[1]], axis=1)
            
            # final = np.stack(final)
            # final = final.reshape((accessibility_peak_matrix.shape[0], accessibility_peak_matrix.shape[1], 1))
            return final
        results = results.reshape((results.shape[0], results.shape[1], 1))
        return results

In [159]:
def compute_casv(m1, m2, radii, indices= None):
    '''
    Computes CASV between two matrices. CASV indiciates how similar
    two binary matrices are to eachother. m1 and m2 should have the
    same number of rows and columns, where rows indicate regions and
    columns indicate the assays used to compute the casv (ie DNase-seq, H3K27ac)
    :param np.matrix m1: 2D or 3D numpy matrix 2D shape (nregions x (nassays x ncelltypes))
      where 2nd dimension is blocked by cells (i.e. cell1assay1, cell1assay2, cell2assay1, cell2assay2)
      OR 3D: (nregions x nassays x ncells)
    :param np.matrix m2: 3D numpy matrix shape (nregions x nassays x nsamples)
    :param radii: list of radii to access surrounding region
    :param indices: indices on 0th axis of m1 and m2 to compute casv for
    :return numpy matrix of size (len(indices) x CASV dimension x ncelltypes x ncells)
    '''

    if indices is None:
        indices = range(m1.shape[0])

    # if only one sample, extend m2 along 2nd axis
    if len(m2.shape) == 2:
        m2 = m2[:,:,None]

    # if needed, reshape m1 to put all assay/train cells on the last axis
    if len(m1.shape) == 3:
      ncells = m1.shape[-1]
      m1 = m1.reshape(m1.shape[0],m1.shape[1]*m1.shape[2])
    else:
      denom = 1 if m2.shape[1]==0 else m2.shape[1]
      ncells = int(m1.shape[-1]/denom)

    if m2.shape[1] == 0:
      # in this case, there is no CASV to compute, so we just return
      return np.zeros((len(indices),0, ncells,m2.shape[-1]))

    print(m1.shape, m2.shape)
    assert m1.shape[0] == m2.shape[0]
    # verify number of assays match
    assert m2.shape[1] == m1.shape[-1]/ncells
    # print('HERE')
    
#     set_trace()

    def f(i):
        
#         set_trace()
        # get indices for each radius in radii
        radius_ranges = list(map(lambda x: get_radius_indices(radii, x, i, m1.shape[0]), range(len(radii))))

        if len(radius_ranges) > 0:
            radius_indices = np.concatenate(radius_ranges)

            # data from known cell types (m1 portion)
            m1_slice = m1[radius_indices, :]
            m2_slice = np.repeat(m2[radius_indices, :, :],axis=1, repeats = ncells)
            

            # shape: radius size x (nassaysxncells) by nsamples
            pos = (m1_slice.T*m2_slice.T).T
#             agree = (m1_slice.T == m2_slice.T).T

            # split pos and agree arrays to create new dimension for ncells
            # the new dimension will be 4D: (radius x nassays x ncells x nsamples)
            pos = np.stack(np.split(pos, ncells, axis=1), axis=2)
#             agree = np.stack(np.split(agree, ncells, axis=1), axis=2)
            
            # get indices to split on. remove last because it is empty
            split_indices = np.cumsum([len(i) for i in radius_ranges])[:-1]
            # slice arrays by radii
            pos_arrays = np.split(pos, split_indices, axis= 0 )
#             agree_arrays = np.split(agree, split_indices, axis = 0)

            # average over the radius (0th axis)
            tmp1 = list(map(lambda x: np.average(x, axis = 0), pos_arrays)) # this line is problematic
            # final concatenation combines agree, nassays, and radii on the 0th axis
            # this axis is ordered by (1) pos/agree, then (2) radii, then (2) n assays.
            # See ordering example when there are 2 radii (r1, r2):
            # - pos: r1, nassays | pos: r2, nassays | agree: r1: nassays | agree: r1: nassays
            tmp = np.concatenate(tmp1, axis=0)
            return tmp
        else:
            # no radius, so no similarities. just an empty placeholder
            # shaped with the number of cells (last dim of m1)
            return np.zeros((0,ncells,m2.shape[-1]))

    # for every region of interest
    # TODO: maybe something more efficient?

    # set_trace()
    tmp = []
    for i in indices:
        tmp.append(f(i))
    
    return np.stack(tmp)
#     return np.stack([f(i) for i in indices])

In [225]:
def load_data(data,
                 label_cell_types,  # used for labels. Should be all for train/eval and subset for test
                 eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 matrix,
                 targetmap,
                 cellmap,
                 radii,
                 similarity_targets = ['DNase'],
                 mode = Dataset.TRAIN,
                 similarity_matrix = None,
                 indices = None,
                 return_feature_names = False,
                 **kwargs):
    """
    Takes Deepsea data and calculates distance metrics from cell types whose locations
    are specified by label_cell_indices, and the other cell types in the set. Label space is only one cell type.
    :param data: dictionary of matrices. Should have keys x and y. x contains n by 1000 rows. y contains n by 919 labels.
    :param label_cell_types: list of cell types to be rotated through and used as labels (subset of eval_cell_types)
    :param eval_cell_types: list of cell types to be used in evaluation (includes label_cell_types)
    :param matrix: matrix of celltype, target positions
    :param targetmap: map of column target positions in matrix
    :param cellmap: map of row cell type positions in matrix
    :param radii: radii to compute similarity distances from
    :param similarity_targets: list of targets used to measure cell type similarity (default is DNase-seq)
    :param mode: Dataset.TRAIN, VALID, TEST or RUNTIME
    :param similarity_matrix: matrix with shape (len(similarity_targets), genome_size) containing binary 0/1s of peaks for similarity_targets
    to be compared in the CASV.
    :param indices: indices in genome to generate records for.
    :param return_feature_names: boolean whether to return string names of features
    :param kwargs: kargs

    :returns: generator of data with three elements:
        1. record features
        2. record labels for a given cell type
        3. 0/1 mask of labels that have validation data. For example, if this record is for celltype A549,
        and A549 does not have data for ATF3, there will be a 0 in the position corresponding to the label space.
    """

    # reshape similarity_matrix to a matrix if there is only one target
    if similarity_matrix is not None:
        if len(similarity_matrix.shape) == 1:
            similarity_matrix = similarity_matrix[None,:]

    if type(similarity_targets) is not list:
        similarity_targets = [similarity_targets]

    if len(similarity_targets) == 0 and len(radii) > 0:
        raise ValueError("Cannot set radii to anything if there are no similarity assays, but found len(radii)=%i" % len(radii))

    # get indices for features. rows are cells and cols are targets
    cellmap_idx = [cellmap[c] for c in list(eval_cell_types)]
    feature_cell_indices = matrix[cellmap_idx,:]

    # indices to be deleted. used for similarity comparison, not predictions.
    delete_indices = np.array([targetmap[s] for s in similarity_targets]).astype(int)

    # make sure no similarity comparison data is missing for all cell types
    assert np.invert(np.any(feature_cell_indices[:,delete_indices] == -1)), \
        "missing data for similarity target at %s" % (np.where(feature_cell_indices[:,delete_indices] == -1)[0])

    # names of labels that are being predicted
    feature_targets = [a for a in list(targetmap)] # targets used as features for each evaluation cell type
    label_targets = [a for a in feature_targets if a not in similarity_targets]

    if (not isinstance(mode, Dataset)):
        raise ValueError("mode is not a Dataset enum")

    if (not isinstance(indices, np.ndarray) and not isinstance(indices, list)):
        # model performs better when there are less 0s
        if mode == Dataset.TRAIN:
            feature_indices = np.concatenate(list(map(lambda c: EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, c),
                                     list(cellmap))))
            feature_indices = feature_indices[feature_indices != -1]

            # need to re-proportion the indices to oversample underrepresented labels
            if (len(list(targetmap)) > 2):
                # configure y: label matrix of ChIP for all targets from all cell lines in train
                indices = np.concatenate([EpitomeDataset.get_y_indices_for_target(matrix, targetmap, target) for target in label_targets])
                indices = indices[indices != -1]
                y = data[indices, :].T
                m = MLSMOTE(y)
                indices = m.fit_resample()

            else:
                # single TF model
                # get indices for DNase and chip for this mark
                feature_indices = np.concatenate(list(map(lambda c: EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, c),
                                                     list(cellmap))))

                # chop off targets being used in similarity metric
                not_similarity_indices = np.array([v for k,v in targetmap.items() if k not in similarity_targets])
                TF_indices = feature_indices.reshape([len(cellmap),len(targetmap)])[:,not_similarity_indices]

                TF_indices =  TF_indices[TF_indices != -1]
                feature_indices = feature_indices[feature_indices != -1]

                # sites where TF is bound in at least 2 cell line
                positive_indices = np.where(np.sum(data[TF_indices,:], axis=0) > 1)[0]
                indices_probs = np.ones([data.shape[1]])
                indices_probs[positive_indices] = 0
                indices_probs = indices_probs/np.sum(indices_probs, keepdims=1)

                # If there are nans, it means there were no 0 cases.
                # We use this for testing so models converge quickly
                # with all ones.
                if np.any(np.isnan(indices_probs)):
                  print("Warning: no negative examples in dataset!!!")
                  indices_probs[:] = 1/indices_probs.shape[0]

                # randomly select 10 fold sites where TF is not in any cell line
                negative_indices = np.random.choice(np.arange(0,data.shape[1]),
                                                    positive_indices.shape[0] * 10,
                                                    p=indices_probs)
                indices = np.sort(np.concatenate([negative_indices, positive_indices]))

        else:
            indices = range(0, data.shape[-1]) # not training mode, set to all points

#     set_trace()
    
    if (mode == Dataset.RUNTIME):
        label_cell_types = ["PLACEHOLDER_CELL"]
        if similarity_matrix is None:
            raise Exception("similarity_matrix must be defined in runtime mode")
        assert similarity_matrix.shape[0] == len(similarity_targets), \
            "similarity_matrix is missing data for targets (should have %i rows)" % (len(similarity_targets))
        random_cell = list(cellmap)[0] # placeholder to get label vector length

    print("using %s as labels for mode %s" % (label_cell_types, mode))

    # string of radii for meta data labeling
    radii_str = list(map(lambda x: "RADII_%i" % x, radii))

    def g():
        
        for i in indices: # for all records specified

            for (cell) in label_cell_types: # for all cell types to be used in labels

                # labels for this cell
                if (mode != Dataset.RUNTIME):
                    label_cell_indices = EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, cell)

                    # delete all indices being used in the similarity computation
                    label_cell_indices_no_similarities = np.delete(label_cell_indices, delete_indices)

                    # Copy target_index_no_similarities and turn into mask of 0/1 for whether data for this cell type for
                    # a given label is available.
                    target_mask = np.copy(label_cell_indices_no_similarities)
                    target_mask[target_mask > -1] = 1
                    target_mask[target_mask == -1] = 0

                else:
                    label_count = len(EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, random_cell))-len(similarity_targets)

                    # Mask and labels are all 0's because labels are missing during runtime
                    garbage_labels = target_mask = np.zeros(label_count)


                # get indices for targets used in similarity computation
                # for cell types that are going to be features
                similarity_indices = feature_cell_indices[:, delete_indices]

                    
                set_trace()

                # get indices for each radius in radii
                radius_ranges = list(map(lambda x: get_radius_indices(radii, x, i, data.shape[-1]), range(len(radii))))

                if len(radius_ranges) > 0:
                    radius_indices = np.concatenate(radius_ranges)

                    cell_train_data = data[similarity_indices[:,:,None],radius_indices]

                    if mode == Dataset.RUNTIME:

                        pos = cell_train_data*similarity_matrix[:,radius_indices]
                        agree = cell_train_data == similarity_matrix[:,radius_indices]

                    else:
                        cell_label_data = data[label_cell_indices[delete_indices][:,None],radius_indices]

                        # remove middle dimension and flatten similarity targets
                        pos = (cell_train_data*cell_label_data)
#                         agree = (cell_train_data == cell_label_data)
                        
#                         set_trace()

                    # get indices to split on. remove last because it is empty
                    split_indices = np.cumsum([len(i) for i in radius_ranges])[:-1]
                    # slice arrays by radii
                    pos_arrays = np.split(pos, split_indices, axis= -1 )
#                     agree_arrays = np.split(agree, split_indices, axis = -1)
                    
#                     set_trace()
                    # NEW ITER

                    similarities = np.stack(list(map(lambda x: np.average(x, axis = -1), pos_arrays)),axis=1)
                else:
                    # no radius, so no similarities. just an empty placeholder
                    similarities = np.zeros((len(eval_cell_types),0,0))

                # reshape similarities to flatten 1st dimension, which are the targets
                # results in the odering:
                ## row 1: cell 1: pos for each target and agree for each target for each radius
                similarities = similarities.reshape(similarities.shape[0], similarities.shape[1]*similarities.shape[2])

                ##### Concatenate all cell type features together ####
                final_features = np.concatenate([data[feature_cell_indices,i], similarities],axis=1).flatten()

                # mask missing data
                f_mask = np.concatenate([feature_cell_indices!=-1,
                                         np.ones(similarities.shape)],axis=1).flatten()
                final_features = final_features[f_mask != 0]

                if (mode != Dataset.RUNTIME):
                    labels = data[label_cell_indices_no_similarities,i]

                else: # used when just predicting
                    # The features going into the example.
                    labels = garbage_labels # all 0's

                # append labels and targetmask
                final= tuple([final_features, labels.astype(np.float32), target_mask.astype(np.float32)])

                #### Finish appending feature labels together ####
                if (return_feature_names):
                    all_labels = []
                    feature_names = []
                    similarity_labels_agreement = ['r%i_%s' % (radius, 'agree') for radius in radii]
#                     similarity_labels_dp = ['r%i_%s' % (radius, 'dp') for radius in radii]
                    similarity_labels = similarity_labels_agreement

                    # concatenate together feature names
                    for j,c in enumerate(eval_cell_types):
                        tmp = np.array(feature_targets)[feature_cell_indices[j,:] != -1]
                        al = ['%s_%s' % (c, a) for a in tmp]
                        sl = ['%s_%s' % (c, s) for s in similarity_labels]

                        feature_names.append(al)
                        feature_names.append(sl)

                    all_labels.append(np.concatenate(feature_names))
                    all_labels.append(['lbl_%s_%s' % (cell, a) for a in label_targets]) # of form lbl_cellline_target
                    all_labels.append(['mask_%s_%s' % (cell, a) for a in label_targets]) # of form mask_cellline_target

                    yield (final, tuple(all_labels))
                else:
                    yield final


    return g

In [237]:
radii = [1]
accessibility_peak_matrix = np.random.uniform(low=0., high=1., size=(4,2))
accessibility_peak_matrix = np.ones((4, 2))

eligible_cells = ['K562','HepG2','H1','A549','HeLa-S3']
eligible_targets = ['DNase','CTCF', 'RAD21']

dataset = EpitomeDataset(targets = eligible_targets,
    cells = eligible_cells)


model = WrapperModel(dataset,
    test_celltypes = ['K562'])

model.radii = radii

regions_peak_file = tempfile.NamedTemporaryFile(delete=False)

# Create dummy data
regions_dict = {'Chromosome': ['chr1', 'chr1'],
                    'Start': [10000, 30000],
                    'End': [10300, 31200]}

regions_pr = pr.from_dict(regions_dict)

# Write to tmp bed file
regions_pr.to_bed(regions_peak_file.name)
regions_peak_file.flush()

conversionObject = RegionConversion(model.dataset.regions, regions_peak_file.name)

# print(accessibility_peak_matrix.shape)
matrix, indices = conversionObject.get_binary_vector(vector = accessibility_peak_matrix[0,:])
gen = load_data_runtime(data=model.dataset.get_data(Dataset.ALL),
         label_cell_types=model.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
         eval_cell_types=model.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
         matrix=model.dataset.matrix,
         targetmap=model.dataset.targetmap,
         cellmap=model.dataset.cellmap,
         radii = model.radii,
         mode = Dataset.RUNTIME,
         similarity_matrix = matrix,
         similarity_targets = ['DNase'],
         indices = indices,
         return_feature_names=False)

using ['HepG2', 'HeLa-S3', 'H1', 'A549'] as labels for mode Dataset.TRAIN
using ['HepG2', 'HeLa-S3', 'H1', 'A549'] as labels for mode Dataset.VALID
using ['K562'] as labels for mode Dataset.TEST
(8587562,)
+--------------+-----------+-----------+-----------+--------------+------------+------------+
| Chromosome   |     Start |       End |       idx |   Start_base |   End_base |   idx_base |
| (category)   |   (int64) |   (int64) |   (int64) |      (int64) |    (int64) |    (int64) |
|--------------+-----------+-----------+-----------+--------------+------------+------------|
| chr1         |     10000 |     10300 |         0 |        10000 |      10200 |          0 |
| chr1         |     10000 |     10300 |         0 |        10200 |      10400 |          1 |
| chr1         |     30000 |     31200 |         1 |        30600 |      30800 |          6 |
| chr1         |     30000 |     31200 |         1 |        30800 |      31000 |          7 |
| chr1         |     30000 |     31200 |  

In [201]:
gen_to_list = np.array(list(gen()))

In [202]:
gen_to_list

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [203]:
accessibility_peak_matrix

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [204]:
conversionObject.joined

+--------------+-----------+-----------+-----------+--------------+------------+------------+
| Chromosome   |     Start |       End |       idx |   Start_base |   End_base |   idx_base |
| (category)   |   (int64) |   (int64) |   (int64) |      (int64) |    (int64) |    (int64) |
|--------------+-----------+-----------+-----------+--------------+------------+------------|
| chr1         |     10000 |     10300 |         0 |        10000 |      10200 |          0 |
| chr1         |     10000 |     10300 |         0 |        10200 |      10400 |          1 |
| chr1         |     30000 |     31200 |         1 |        30600 |      30800 |          6 |
| chr1         |     30000 |     31200 |         1 |        30800 |      31000 |          7 |
| chr1         |     30000 |     31200 |         1 |        31000 |      31200 |          8 |
+--------------+-----------+-----------+-----------+--------------+------------+------------+
Unstranded PyRanges object has 5 rows and 7 columns from 1 c

In [236]:
indices

array([0, 1, 6, 7, 8])

In [206]:
same_size = accessibility_peak_matrix.shape[1] == len(conversionObject.joined.idx_base)

if not same_size:
    added_indices = []
    old_idx, counter, old_i = 0, 0, 0
    indices_to_merge = []
    for ctr, (i, i_base) in enumerate(zip(conversionObject.joined.idx, conversionObject.joined.idx_base)):
        if i_base == -1:
            continue
        if i != old_i:
            indices_to_merge.append((old_idx, counter))
            old_idx = counter
        added_indices.append(accessibility_peak_matrix[:, i])
        counter += 1
        old_i = i
    indices_to_merge.append((old_idx, len(conversionObject.joined.idx)))

    a = np.stack(added_indices)
else:
    a = np.transpose(accessibility_peak_matrix, axes=[1, 0])

print(a)
print(gen_to_list.shape)

a = a[:, None, :]

[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
(5, 4)


In [207]:
gen_to_list

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [208]:
gen_to_list.shape

(5, 4)

In [209]:
a

array([[[1., 1., 1., 1., 1., 1., 1., 1.]],

       [[1., 1., 1., 1., 1., 1., 1., 1.]],

       [[1., 1., 1., 1., 1., 1., 1., 1.]],

       [[1., 1., 1., 1., 1., 1., 1., 1.]],

       [[1., 1., 1., 1., 1., 1., 1., 1.]]])

In [210]:
a.shape

(5, 1, 8)

In [211]:
model.radii

[1]

In [212]:
out = compute_casv(gen_to_list, a, model.radii)

(5, 4) (5, 1, 8)
> [0;32m/home/eecs/rvkoodli/epitome_modified/epitome/functions.py[0m(226)[0;36mcompute_casv[0;34m()[0m
[0;32m    224 [0;31m    [0;32mdef[0m [0mf[0m[0;34m([0m[0mi[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    225 [0;31m        [0;31m# get indices for each radius in radii[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 226 [0;31m        [0mradius_ranges[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0mget_radius_indices[0m[0;34m([0m[0mradii[0m[0;34m,[0m [0mx[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0mm1[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mradii[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    227 [0;31m[0;34m[0m[0m
[0m[0;32m    228 [0;31m        [0;32mif[0m [0mlen[0m[0;34m([0m[0mradius_ranges[0m[

ipdb>  c


In [213]:
out.shape

(5, 2, 4, 8)

In [214]:
out.squeeze()

array([[[[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]]],


       [[[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]]],


       [[[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.

In [226]:
orig_gen = load_data(model.dataset.get_data(Dataset.ALL),
                 model.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
                 model.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 model.dataset.matrix,
                 model.dataset.targetmap,
                 model.dataset.cellmap,
                 radii = model.radii,
                 mode = Dataset.RUNTIME,
                 similarity_matrix = matrix,
                 similarity_targets = model.dataset.similarity_targets,
                 indices = indices,
                 return_feature_names = True)

using ['PLACEHOLDER_CELL'] as labels for mode Dataset.RUNTIME


In [227]:
orig_list = list(orig_gen())

> [0;32m<ipython-input-225-6785598b8898>[0m(168)[0;36mg[0;34m()[0m
[0;32m    166 [0;31m[0;34m[0m[0m
[0m[0;32m    167 [0;31m                [0;31m# get indices for each radius in radii[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 168 [0;31m                [0mradius_ranges[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0mget_radius_indices[0m[0;34m([0m[0mradii[0m[0;34m,[0m [0mx[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0mdata[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mradii[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m[0;32m    170 [0;31m                [0;32mif[0m [0mlen[0m[0;34m([0m[0mradius_ranges[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  
ipdb>  
ipdb>  c


> [0;32m<ipython-input-225-6785598b8898>[0m(168)[0;36mg[0;34m()[0m
[0;32m    166 [0;31m[0;34m[0m[0m
[0m[0;32m    167 [0;31m                [0;31m# get indices for each radius in radii[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 168 [0;31m                [0mradius_ranges[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0mget_radius_indices[0m[0;34m([0m[0mradii[0m[0;34m,[0m [0mx[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0mdata[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mradii[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m[0;32m    170 [0;31m                [0;32mif[0m [0mlen[0m[0;34m([0m[0mradius_ranges[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


> [0;32m<ipython-input-225-6785598b8898>[0m(168)[0;36mg[0;34m()[0m
[0;32m    166 [0;31m[0;34m[0m[0m
[0m[0;32m    167 [0;31m                [0;31m# get indices for each radius in radii[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 168 [0;31m                [0mradius_ranges[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0mget_radius_indices[0m[0;34m([0m[0mradii[0m[0;34m,[0m [0mx[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0mdata[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mradii[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m[0;32m    170 [0;31m                [0;32mif[0m [0mlen[0m[0;34m([0m[0mradius_ranges[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


> [0;32m<ipython-input-225-6785598b8898>[0m(168)[0;36mg[0;34m()[0m
[0;32m    166 [0;31m[0;34m[0m[0m
[0m[0;32m    167 [0;31m                [0;31m# get indices for each radius in radii[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 168 [0;31m                [0mradius_ranges[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0mget_radius_indices[0m[0;34m([0m[0mradii[0m[0;34m,[0m [0mx[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0mdata[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mradii[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m[0;32m    170 [0;31m                [0;32mif[0m [0mlen[0m[0;34m([0m[0mradius_ranges[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


> [0;32m<ipython-input-225-6785598b8898>[0m(168)[0;36mg[0;34m()[0m
[0;32m    166 [0;31m[0;34m[0m[0m
[0m[0;32m    167 [0;31m                [0;31m# get indices for each radius in radii[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 168 [0;31m                [0mradius_ranges[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0mget_radius_indices[0m[0;34m([0m[0mradii[0m[0;34m,[0m [0mx[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0mdata[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mradii[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    169 [0;31m[0;34m[0m[0m
[0m[0;32m    170 [0;31m                [0;32mif[0m [0mlen[0m[0;34m([0m[0mradius_ranges[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


In [228]:
matrix.shape, indices

((8587562,), array([0, 1, 6, 7, 8]))

In [229]:
matrix[indices]

array([1., 1., 1., 1., 1.])

In [230]:
model.dataset.get_data(Dataset.ALL).shape

(10, 8587562)

In [231]:
len(orig_list)

5

In [232]:
print(orig_list[4][0][0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [233]:
np.array(list(map(lambda x: x[0][0], orig_list))).shape

(5, 12)

In [234]:
print(orig_list[1][1][0])

['HepG2_DNase' 'HepG2_CTCF' 'HepG2_r1_agree' 'HeLa-S3_DNase'
 'HeLa-S3_CTCF' 'HeLa-S3_r1_agree' 'H1_DNase' 'H1_CTCF' 'H1_r1_agree'
 'A549_DNase' 'A549_CTCF' 'A549_r1_agree']


In [235]:
for (i, j) in zip(orig_list[2][0][0], orig_list[2][1][0]):
    print("", i, '\t', j)

 0.0 	 HepG2_DNase
 0.0 	 HepG2_CTCF
 0.0 	 HepG2_r1_agree
 0.0 	 HeLa-S3_DNase
 0.0 	 HeLa-S3_CTCF
 0.0 	 HeLa-S3_r1_agree
 0.0 	 H1_DNase
 0.0 	 H1_CTCF
 0.0 	 H1_r1_agree
 0.0 	 A549_DNase
 0.0 	 A549_CTCF
 0.0 	 A549_r1_agree


In [30]:
out[2][0]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [31]:
out.shape

(5, 2, 4, 4)

In [32]:
a

array([[[1., 1., 1., 1.]],

       [[1., 1., 1., 1.]],

       [[1., 1., 1., 1.]],

       [[1., 1., 1., 1.]],

       [[1., 1., 1., 1.]]])

In [112]:
gen_to_list

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [113]:
radii

[1]

In [247]:
to_stack.shape

NameError: name 'to_stack' is not defined

In [253]:
to_stack = load_data_no_label_mask(model.dataset.get_data(Dataset.ALL),
                 model.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
                 model.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 model.dataset.matrix,
                 model.dataset.targetmap,
                 model.dataset.cellmap,
                 radii = model.radii,
                 mode = Dataset.RUNTIME,
                 similarity_matrix = matrix,
                 similarity_targets = model.dataset.similarity_targets,
                 indices = indices,
                 return_feature_names = True)

to_stack = list(to_stack())
stacked = np.stack([to_stack] * accessibility_peak_matrix.shape[0], axis=0)
names = stacked[:, :, 1]
to_stack = stacked[:, :, 0]
to_stack = np.expand_dims(to_stack, axis=-1)

using ['PLACEHOLDER_CELL'] as labels for mode Dataset.RUNTIME


In [255]:
to_stack.shape

(4, 5, 1)

In [256]:
to_stack[0, 0, :]

array([array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])],
      dtype=object)