In [168]:
from epitome.models import *
from epitome.functions import *
# from epitome.generators import *
from epitome.dataset import *

from tqdm import tqdm


import tempfile
import os
import time
from IPython.core.debugger import set_trace

# Setup

In [96]:
def load_data_runtime(data,
                 label_cell_types,  # used for labels. Should be all for train/eval and subset for test
                 eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 matrix,
                 targetmap,
                 cellmap,
                 radii,
                 similarity_targets = ['DNase'],
                 mode = Dataset.RUNTIME,
                 similarity_matrix = None,
                 indices = None,
                 return_feature_names = False,
                 **kwargs):
    """
    Takes Deepsea data and calculates distance metrics from cell types whose locations
    are specified by label_cell_indices, and the other cell types in the set. Label space is only one cell type.
    :param data: dictionary of matrices. Should have keys x and y. x contains n by 1000 rows. y contains n by 919 labels.
    :param label_cell_types: list of cell types to be rotated through and used as labels (subset of eval_cell_types)
    :param eval_cell_types: list of cell types to be used in evaluation (includes label_cell_types)
    :param matrix: matrix of celltype, target positions
    :param targetmap: map of column target positions in matrix
    :param cellmap: map of row cell type positions in matrix
    :param radii: radii to compute similarity distances from
    :param similarity_targets: list of targets used to measure cell type similarity (default is DNase-seq)
    :param mode: Dataset.TRAIN, VALID, TEST or RUNTIME
    :param similarity_matrix: matrix with shape (len(similarity_targets), genome_size) containing binary 0/1s of peaks for similarity_targets
    to be compared in the CASV.
    :param indices: indices in genome to generate records for.
    :param return_feature_names: boolean whether to return string names of features
    :param kwargs: kargs

    :returns: generator of data with three elements:
        1. record features
        2. record labels for a given cell type
        3. 0/1 mask of labels that have validation data. For example, if this record is for celltype A549,
        and A549 does not have data for ATF3, there will be a 0 in the position corresponding to the label space.
    """

    if similarity_matrix is not None:
        if len(similarity_matrix.shape) == 1:
            similarity_matrix = similarity_matrix[None,:]

    if type(similarity_targets) is not list:
        similarity_targets = [similarity_targets]

    if len(similarity_targets) == 0 and len(radii) > 0:
        raise ValueError("Cannot set radii to anything if there are no similarity assays, but found len(radii)=%i" % len(radii))

    # get indices for features. rows are cells and cols are targets
    cellmap_idx = [cellmap[c] for c in list(eval_cell_types)]
    feature_cell_indices = matrix[cellmap_idx,:]

    # indices to be deleted. used for similarity comparison, not predictions.
    delete_indices = np.array([targetmap[s] for s in similarity_targets]).astype(int)

    # make sure no similarity comparison data is missing for all cell types
    assert np.invert(np.any(feature_cell_indices[:,delete_indices] == -1)), \
        "missing data for similarity target at %s" % (np.where(feature_cell_indices[:,delete_indices] == -1)[0])

    # names of labels that are being predicted
    feature_targets = [a for a in list(targetmap)] # targets used as features for each evaluation cell type
    label_targets = [a for a in feature_targets if a not in similarity_targets]

    if (not isinstance(mode, Dataset)):
        raise ValueError("mode is not a Dataset enum")

    if (not isinstance(indices, np.ndarray) and not isinstance(indices, list)):
        # model performs better when there are less 0s
        if mode == Dataset.TRAIN:
            feature_indices = np.concatenate(list(map(lambda c: EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, c),
                                     list(cellmap))))
            feature_indices = feature_indices[feature_indices != -1]

            # need to re-proportion the indices to oversample underrepresented labels
            if (len(list(targetmap)) > 2):
                # configure y: label matrix of ChIP for all targets from all cell lines in train
                indices = np.concatenate([EpitomeDataset.get_y_indices_for_target(matrix, targetmap, target) for target in label_targets])
                indices = indices[indices != -1]
                y = data[indices, :].T
                m = MLSMOTE(y)
                indices = m.fit_resample()

            else:
                # single TF model
                # get indices for DNase and chip for this mark
                feature_indices = np.concatenate(list(map(lambda c: EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, c),
                                                     list(cellmap))))

                # chop off targets being used in similarity metric
                not_similarity_indices = np.array([v for k,v in targetmap.items() if k not in similarity_targets])
                TF_indices = feature_indices.reshape([len(cellmap),len(targetmap)])[:,not_similarity_indices]

                TF_indices =  TF_indices[TF_indices != -1]
                feature_indices = feature_indices[feature_indices != -1]

                # sites where TF is bound in at least 2 cell line
                positive_indices = np.where(np.sum(data[TF_indices,:], axis=0) > 1)[0]
                indices_probs = np.ones([data.shape[1]])
                indices_probs[positive_indices] = 0
                indices_probs = indices_probs/np.sum(indices_probs, keepdims=1)

                # If there are nans, it means there were no 0 cases.
                # We use this for testing so models converge quickly
                # with all ones.
                if np.any(np.isnan(indices_probs)):
                  print("Warning: no negative examples in dataset!!!")
                  indices_probs[:] = 1/indices_probs.shape[0]

                # randomly select 10 fold sites where TF is not in any cell line
                negative_indices = np.random.choice(np.arange(0,data.shape[1]),
                                                    positive_indices.shape[0] * 10,
                                                    p=indices_probs)
                indices = np.sort(np.concatenate([negative_indices, positive_indices]))

        else:
            indices = range(0, data.shape[-1]) # not training mode, set to all points

    if (mode == Dataset.RUNTIME):
        label_cell_types = ["PLACEHOLDER_CELL"]
        if similarity_matrix is None:
            raise Exception("similarity_matrix must be defined in runtime mode")
        assert similarity_matrix.shape[0] == len(similarity_targets), \
            "similarity_matrix is missing data for targets (should have %i rows)" % (len(similarity_targets))
        random_cell = list(cellmap)[0] # placeholder to get label vector length

    print("using %s as labels for mode %s" % (label_cell_types, mode))

    # string of radii for meta data labeling
    radii_str = list(map(lambda x: "RADII_%i" % x, radii))

    def g():
        for i in indices: # for all records specified

            for (cell) in label_cell_types: # for all cell types to be used in labels

                # labels for this cell
                if (mode != Dataset.RUNTIME):
                    label_cell_indices = EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, cell)

                    # delete all indices being used in the similarity computation
                    label_cell_indices_no_similarities = np.delete(label_cell_indices, delete_indices)

                    # Copy target_index_no_similarities and turn into mask of 0/1 for whether data for this cell type for
                    # a given label is available.
                    target_mask = np.copy(label_cell_indices_no_similarities)
                    target_mask[target_mask > -1] = 1
                    target_mask[target_mask == -1] = 0

                else:
                    label_count = len(EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, random_cell))-len(similarity_targets)

                    # Mask and labels are all 0's because labels are missing during runtime
                    garbage_labels = target_mask = np.zeros(label_count)


                # get indices for targets used in similarity computation
                # for cell types that are going to be features
                similarity_indices = feature_cell_indices[:, delete_indices]


                # get indices for each radius in radii
                radius_ranges = list(map(lambda x: get_radius_indices(radii, x, i, data.shape[-1]), range(len(radii))))

                if len(radius_ranges) > 0:
                    radius_indices = np.concatenate(radius_ranges)

                    cell_train_data = data[similarity_indices[:,:,None],radius_indices]

                    if mode == Dataset.RUNTIME:

                        pos = cell_train_data*similarity_matrix[:,radius_indices]
#                         agree = cell_train_data == similarity_matrix[:,radius_indices]

                    else:
                        cell_label_data = data[label_cell_indices[delete_indices][:,None],radius_indices]

                        # remove middle dimension and flatten similarity targets
                        pos = (cell_train_data*cell_label_data)
#                         agree = (cell_train_data == cell_label_data)

                    # get indices to split on. remove last because it is empty
                    split_indices = np.cumsum([len(i) for i in radius_ranges])[:-1]
                    # slice arrays by radii
                    pos_arrays = np.split(pos, split_indices, axis= -1 )
#                     agree_arrays = np.split(agree, split_indices, axis = -1)

                    similarities = np.stack(list(map(lambda x: np.average(x, axis = -1), pos_arrays)),axis=1)
                else:
                    # no radius, so no similarities. just an empty placeholder
                    similarities = np.zeros((len(eval_cell_types),0,0))

                # reshape similarities to flatten 1st dimension, which are the targets
                # results in the odering:
                ## row 1: cell 1: pos for each target and agree for each target for each radius
                similarities = similarities.reshape(similarities.shape[0], similarities.shape[1]*similarities.shape[2])

                ##### Concatenate all cell type features together ####
                final_features = np.concatenate([data[feature_cell_indices,i], similarities],axis=1).flatten()

                # mask missing data
                f_mask = np.concatenate([feature_cell_indices!=-1,
                                         np.ones(similarities.shape)],axis=1).flatten()
                final_features = final_features[f_mask != 0]

                if (mode != Dataset.RUNTIME):
                    labels = data[label_cell_indices_no_similarities,i]

                else: # used when just predicting
                    # The features going into the example.
                    labels = garbage_labels # all 0's

                # append labels and targetmask
                final = final_features

                #### Finish appending feature labels together ####
                # if (return_feature_names):
                all_labels = []
                feature_names = []
                similarity_labels_agreement = ['r%i_%s' % (radius, 'agree') for radius in radii]
#                 similarity_labels_dp = ['r%i_%s' % (radius, 'dp') for radius in radii]
                similarity_labels = similarity_labels_agreement

                # concatenate together feature names
                for j,c in enumerate(eval_cell_types):
                    tmp = np.array(feature_targets)[feature_cell_indices[j,:] != -1]
                    al = ['%s_%s' % (c, a) for a in tmp]
                    sl = ['%s_%s' % (c, s) for s in similarity_labels]

                    feature_names.append(al)
                    feature_names.append(sl)

                all_labels.append(np.concatenate(feature_names))
                # all_labels.append(['lbl_%s_%s' % (cell, a) for a in label_targets]) # of form lbl_cellline_target
                # all_labels.append(['mask_%s_%s' % (cell, a) for a in label_targets]) # of form mask_cellline_target

                    # yield (final, tuple(all_labels))
                all_labels = all_labels[0]
                indx_to_keep = []
                for i in range(len(all_labels)):
                    for a in similarity_targets:
                        # print(a, all_labels[i])
                        if a in all_labels[i]:
                            
                            indx_to_keep.append(final[i])

                yield np.array(indx_to_keep)


    return g

def load_data_no_label_mask(data,
                 label_cell_types,  # used for labels. Should be all for train/eval and subset for test
                 eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 matrix,
                 targetmap,
                 cellmap,
                 radii,
                 similarity_targets = ['DNase'],
                 mode = Dataset.TRAIN,
                 similarity_matrix = None,
                 indices = None,
                 return_feature_names = False,
                 **kwargs):
    """
    Takes Deepsea data and calculates distance metrics from cell types whose locations
    are specified by label_cell_indices, and the other cell types in the set. Label space is only one cell type.
    :param data: dictionary of matrices. Should have keys x and y. x contains n by 1000 rows. y contains n by 919 labels.
    :param label_cell_types: list of cell types to be rotated through and used as labels (subset of eval_cell_types)
    :param eval_cell_types: list of cell types to be used in evaluation (includes label_cell_types)
    :param matrix: matrix of celltype, target positions
    :param targetmap: map of column target positions in matrix
    :param cellmap: map of row cell type positions in matrix
    :param radii: radii to compute similarity distances from
    :param similarity_targets: list of targets used to measure cell type similarity (default is DNase-seq)
    :param mode: Dataset.TRAIN, VALID, TEST or RUNTIME
    :param similarity_matrix: matrix with shape (len(similarity_targets), genome_size) containing binary 0/1s of peaks for similarity_targets
    to be compared in the CASV.
    :param indices: indices in genome to generate records for.
    :param return_feature_names: boolean whether to return string names of features
    :param kwargs: kargs

    :returns: generator of data with three elements:
        1. record features
        2. record labels for a given cell type
        3. 0/1 mask of labels that have validation data. For example, if this record is for celltype A549,
        and A549 does not have data for ATF3, there will be a 0 in the position corresponding to the label space.
    """

    # reshape similarity_matrix to a matrix if there is only one target
    if similarity_matrix is not None:
        if len(similarity_matrix.shape) == 1:
            similarity_matrix = similarity_matrix[None,:]

    if type(similarity_targets) is not list:
        similarity_targets = [similarity_targets]

    if len(similarity_targets) == 0 and len(radii) > 0:
        raise ValueError("Cannot set radii to anything if there are no similarity assays, but found len(radii)=%i" % len(radii))

    # get indices for features. rows are cells and cols are targets
    cellmap_idx = [cellmap[c] for c in list(eval_cell_types)]
    feature_cell_indices = matrix[cellmap_idx,:]

    # indices to be deleted. used for similarity comparison, not predictions.
    delete_indices = np.array([targetmap[s] for s in similarity_targets]).astype(int)

    # make sure no similarity comparison data is missing for all cell types
    assert np.invert(np.any(feature_cell_indices[:,delete_indices] == -1)), \
        "missing data for similarity target at %s" % (np.where(feature_cell_indices[:,delete_indices] == -1)[0])

    # names of labels that are being predicted
    feature_targets = [a for a in list(targetmap)] # targets used as features for each evaluation cell type
    label_targets = [a for a in feature_targets if a not in similarity_targets]

    if (not isinstance(mode, Dataset)):
        raise ValueError("mode is not a Dataset enum")

    if (not isinstance(indices, np.ndarray) and not isinstance(indices, list)):
        # model performs better when there are less 0s
        if mode == Dataset.TRAIN:
            feature_indices = np.concatenate(list(map(lambda c: EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, c),
                                     list(cellmap))))
            feature_indices = feature_indices[feature_indices != -1]

            # need to re-proportion the indices to oversample underrepresented labels
            if (len(list(targetmap)) > 2):
                # configure y: label matrix of ChIP for all targets from all cell lines in train
                indices = np.concatenate([EpitomeDataset.get_y_indices_for_target(matrix, targetmap, target) for target in label_targets])
                indices = indices[indices != -1]
                y = data[indices, :].T
                m = MLSMOTE(y)
                indices = m.fit_resample()

            else:
                # single TF model
                # get indices for DNase and chip for this mark
                feature_indices = np.concatenate(list(map(lambda c: EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, c),
                                                     list(cellmap))))

                # chop off targets being used in similarity metric
                not_similarity_indices = np.array([v for k,v in targetmap.items() if k not in similarity_targets])
                TF_indices = feature_indices.reshape([len(cellmap),len(targetmap)])[:,not_similarity_indices]

                TF_indices =  TF_indices[TF_indices != -1]
                feature_indices = feature_indices[feature_indices != -1]

                # sites where TF is bound in at least 2 cell line
                positive_indices = np.where(np.sum(data[TF_indices,:], axis=0) > 1)[0]
                indices_probs = np.ones([data.shape[1]])
                indices_probs[positive_indices] = 0
                indices_probs = indices_probs/np.sum(indices_probs, keepdims=1)

                # If there are nans, it means there were no 0 cases.
                # We use this for testing so models converge quickly
                # with all ones.
                if np.any(np.isnan(indices_probs)):
                  print("Warning: no negative examples in dataset!!!")
                  indices_probs[:] = 1/indices_probs.shape[0]

                # randomly select 10 fold sites where TF is not in any cell line
                negative_indices = np.random.choice(np.arange(0,data.shape[1]),
                                                    positive_indices.shape[0] * 10,
                                                    p=indices_probs)
                indices = np.sort(np.concatenate([negative_indices, positive_indices]))

        else:
            indices = range(0, data.shape[-1]) # not training mode, set to all points

    if (mode == Dataset.RUNTIME):
        label_cell_types = ["PLACEHOLDER_CELL"]
        if similarity_matrix is None:
            raise Exception("similarity_matrix must be defined in runtime mode")
        assert similarity_matrix.shape[0] == len(similarity_targets), \
            "similarity_matrix is missing data for targets (should have %i rows)" % (len(similarity_targets))
        random_cell = list(cellmap)[0] # placeholder to get label vector length

    print("using %s as labels for mode %s" % (label_cell_types, mode))

    # string of radii for meta data labeling
    radii_str = list(map(lambda x: "RADII_%i" % x, radii))

    def g():
        for i in indices: # for all records specified

            for (cell) in label_cell_types: # for all cell types to be used in labels

                # labels for this cell
                if (mode != Dataset.RUNTIME):
                    label_cell_indices = EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, cell)

                    # delete all indices being used in the similarity computation
                    label_cell_indices_no_similarities = np.delete(label_cell_indices, delete_indices)

                    # Copy target_index_no_similarities and turn into mask of 0/1 for whether data for this cell type for
                    # a given label is available.
                    target_mask = np.copy(label_cell_indices_no_similarities)
                    target_mask[target_mask > -1] = 1
                    target_mask[target_mask == -1] = 0

                else:
                    label_count = len(EpitomeDataset.get_y_indices_for_cell(matrix, cellmap, random_cell))-len(similarity_targets)

                    # Mask and labels are all 0's because labels are missing during runtime
                    garbage_labels = target_mask = np.zeros(label_count)


                # get indices for targets used in similarity computation
                # for cell types that are going to be features
                similarity_indices = feature_cell_indices[:, delete_indices]


                # get indices for each radius in radii
                radius_ranges = list(map(lambda x: get_radius_indices(radii, x, i, data.shape[-1]), range(len(radii))))

                if len(radius_ranges) > 0:
                    radius_indices = np.concatenate(radius_ranges)

                    cell_train_data = data[similarity_indices[:,:,None],radius_indices]

                    if mode == Dataset.RUNTIME:

                        pos = cell_train_data*similarity_matrix[:,radius_indices]
#                         agree = cell_train_data == similarity_matrix[:,radius_indices]

                    else:
                        cell_label_data = data[label_cell_indices[delete_indices][:,None],radius_indices]

                        # remove middle dimension and flatten similarity targets
                        pos = (cell_train_data*cell_label_data)
#                         agree = (cell_train_data == cell_label_data)

                    # get indices to split on. remove last because it is empty
                    split_indices = np.cumsum([len(i) for i in radius_ranges])[:-1]
                    # slice arrays by radii
                    pos_arrays = np.split(pos, split_indices, axis= -1 )
#                     agree_arrays = np.split(agree, split_indices, axis = -1)

                    similarities = np.stack(list(map(lambda x: np.average(x, axis = -1), pos_arrays)),axis=1)
                else:
                    # no radius, so no similarities. just an empty placeholder
                    similarities = np.zeros((len(eval_cell_types),0,0))

                # reshape similarities to flatten 1st dimension, which are the targets
                # results in the odering:
                ## row 1: cell 1: pos for each target and agree for each target for each radius
                similarities = similarities.reshape(similarities.shape[0], similarities.shape[1]*similarities.shape[2])

                ##### Concatenate all cell type features together ####
                final_features = np.concatenate([data[feature_cell_indices,i], similarities],axis=1).flatten()

                # mask missing data
                f_mask = np.concatenate([feature_cell_indices!=-1,
                                         np.ones(similarities.shape)],axis=1).flatten()
                final_features = final_features[f_mask != 0]

                if (mode != Dataset.RUNTIME):
                    labels = data[label_cell_indices_no_similarities,i]

                else: # used when just predicting
                    # The features going into the example.
                    labels = garbage_labels # all 0's

                # append labels and targetmask
                final = np.array(final_features)

                #### Finish appending feature labels together ####
                if (return_feature_names):
                    all_labels = []
                    feature_names = []
                    similarity_labels_agreement = ['r%i_%s' % (radius, 'agree') for radius in radii]
#                     similarity_labels_dp = ['r%i_%s' % (radius, 'dp') for radius in radii]
                    similarity_labels = similarity_labels_agreement

                    # concatenate together feature names
                    for j,c in enumerate(eval_cell_types):
                        tmp = np.array(feature_targets)[feature_cell_indices[j,:] != -1]
                        al = ['%s_%s' % (c, a) for a in tmp]
                        sl = ['%s_%s' % (c, s) for s in similarity_labels]

                        feature_names.append(al)
                        feature_names.append(sl)

                    all_labels.append(np.concatenate(feature_names))
                    all_labels.append(['lbl_%s_%s' % (cell, a) for a in label_targets]) # of form lbl_cellline_target
                    all_labels.append(['mask_%s_%s' % (cell, a) for a in label_targets]) # of form mask_cellline_target

                    yield (final, tuple(all_labels))
                else:
                    yield final


    return g

In [267]:
class WrapperModel(EpitomeModel):
    def score_matrix_fast(self, accessibility_peak_matrix, regions):
        """ Runs predictions on a matrix of accessibility peaks, where columns are samples and
        rows are regions from regions_peak_file. rows in accessilibility_peak_matrix should matching

        :param numpy.matrix accessilibility_peak_matrix:  of (samples by genomic regions)
        :param str regions: either narrowpeak or bed file containing regions to score, OR a pyranges object
            with columns [Chomosome, Start, End, idx]. Index matches each genomic region to a row in
            accessilibility_peak_matrix. In both cases, number of regions Should
            match rows in accessilibility_peak_matrix

        :return: 3-dimensional numpy matrix of predictions: sized (samples by regions by ChIP-seq targets)
        :rtype: numpy matrix
        """

        conversionObject = RegionConversion(self.dataset.regions, regions)

        results = []
        # print(accessibility_peak_matrix.shape)
        matrix, indices = conversionObject.get_binary_vector(vector = accessibility_peak_matrix[0,:])
        gen = load_data_runtime(data=self.dataset.get_data(Dataset.ALL),
                 label_cell_types=self.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
                 eval_cell_types=self.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 matrix=self.dataset.matrix,
                 targetmap=self.dataset.targetmap,
                 cellmap=self.dataset.cellmap,
                 radii = self.radii,
                 mode = Dataset.RUNTIME,
                 similarity_matrix = matrix,
                 similarity_targets = ['DNase'],
                 indices = indices,
                 return_feature_names=False)

        to_stack = load_data_no_label_mask(data=self.dataset.get_data(Dataset.ALL),
                 label_cell_types=self.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
                 eval_cell_types=self.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
                 matrix=self.dataset.matrix,
                 targetmap=self.dataset.targetmap,
                 cellmap=self.dataset.cellmap,
                 radii = self.radii,
                 mode = Dataset.RUNTIME,
                 similarity_matrix = matrix,
                 similarity_targets = ['DNase'],
                 indices = indices,
                 return_feature_names=True)

        gen_to_list = list(gen())
        to_stack = list(to_stack())
        gen_to_list = np.array(gen_to_list)

        # reshape to n_regions [from regions] x nassays [acc dim 1] x n_samples
        radii = self.radii

        stacked = np.stack([to_stack] * accessibility_peak_matrix.shape[0], axis=0)
        names = stacked[:, :, 1]
        to_stack = stacked[:, :, 0]
        to_stack = np.expand_dims(to_stack, axis=-1)

        same_size = accessibility_peak_matrix.shape[1] == len(conversionObject.joined.idx_base)

        if not same_size:
            added_indices = []
            old_idx, counter, old_i = 0, 0, 0
            indices_to_merge = []
            for ctr, (i, i_base) in enumerate(zip(conversionObject.joined.idx, conversionObject.joined.idx_base)):
                if i_base == -1:
                    continue
                if i != old_i:
                    indices_to_merge.append((old_idx, counter))
                    old_idx = counter
                added_indices.append(accessibility_peak_matrix[:, i])
                counter += 1
                old_i = i
            indices_to_merge.append((old_idx, len(conversionObject.joined.idx)))
            
            a = np.stack(added_indices)
        else:
            a = np.transpose(accessibility_peak_matrix, axes=[1, 0])
        
        a = a[:, None, :]
        
        out = compute_casv(gen_to_list, a, radii)

        casv_len = out.shape[1]
        num_cells = out.shape[3]
        num_regions = out.shape[0]
        num_celltypes = out.shape[2]
        num_targets = len(self.dataset.targets) if 'DNase' in self.dataset.targets else len(self.dataset.targets) + 1
        total_targets = num_targets
        
        print('CALCULATED CASV')

        def first_substring(strings, substring, other, other2):
#             return next(i for i, string in enumerate(strings) if substring in string)
            for i, s in enumerate(strings):
                if substring in s:
                      return i
            print(strings)
            print(substring)
            print(other)
            print(other2)
            return None

        for region in tqdm(range(num_regions)):
            for cell in range(num_cells):

                selected_gen = to_stack[cell, region, :]
                naming_scheme = names[cell, region][0]
                selected_casv = out[region, :, :, cell]

                len_feats_per_celltype = int(selected_gen[0].shape[0] / num_celltypes) # 24 / 2 = 12

                old_sg = selected_gen
                

                for celltype in range(num_celltypes):
                    idx = (num_targets + 4) * celltype
                    if idx >= len(selected_gen[0]):
                        break
                    num_targets = first_substring(naming_scheme[idx:idx+len_feats_per_celltype+total_targets], '_agree', naming_scheme[idx-10:idx+len_feats_per_celltype+10], (idx, len_feats_per_celltype))
                    casv_cell = selected_casv[:, celltype]
                    selected_gen[0][idx + num_targets : idx + num_targets + 4] = casv_cell
        
        
#         set_trace()
        print('RUNNING PREDICTIONS')
        results = []
        for c in tqdm(range(num_cells)):
            for r in range(num_regions):
                results.append(self._predict(to_stack[c, r, :][0][None, :]))
        
        results = np.stack(results)
        results = results.reshape((to_stack.shape[0], to_stack.shape[1], results.shape[2])) # 4 x 5

        if not same_size:
            final = []
            final = np.empty((accessibility_peak_matrix.shape[0], accessibility_peak_matrix.shape[1], results.shape[2]))
            final.fill(np.nan)
            for i, tup in enumerate(indices_to_merge):
                final[:, i, :] = np.mean(results[:, tup[0]:tup[1], :], axis=1)
            
            # final = np.stack(final)
            # final = final.reshape((accessibility_peak_matrix.shape[0], accessibility_peak_matrix.shape[1], 1))
            return final
        results = results.reshape((results.shape[0], results.shape[1], total_targets-1))
        return results
    
    def score_matrix(self, accessilibility_peak_matrix, regions):
        """ Runs predictions on a matrix of accessibility peaks, where columns are samples and
        rows are regions from regions_peak_file. rows in accessilibility_peak_matrix should matching
        :param numpy.matrix accessilibility_peak_matrix:  of (samples by genomic regions)
        :param str regions: either narrowpeak or bed file containing regions to score, OR a pyranges object
            with columns [Chomosome, Start, End, idx]. Index matches each genomic region to a row in
            accessilibility_peak_matrix. In both cases, number of regions Should
            match rows in accessilibility_peak_matrix
        :return: 3-dimensional numpy matrix of predictions: sized (samples by regions by ChIP-seq targets)
        :rtype: numpy matrix
        """

        conversionObject = RegionConversion(self.dataset.regions, regions)

        results = []

        # TODO 9/10/2020: should do something more efficiently than a for loop
        for sample_i in tqdm.tqdm(range(accessilibility_peak_matrix.shape[0])):

            peaks_i, idx = conversionObject.get_binary_vector(vector = accessilibility_peak_matrix[sample_i,:])

            preds = self.eval_vector(peaks_i, idx)

            # group preds by joined['idx']
            results.append(preds)

        # stack all samples along 0th axis
        # shape: samples x regions x TFs
        tmp = np.stack(results)

        # mean and merge along 1st axis
        return conversionObject.merge(tmp, axis = 1)

def compute_casv(m1, m2, radii, indices= None):
    '''
    Computes CASV between two matrices. CASV indiciates how similar
    two binary matrices are to eachother. m1 and m2 should have the
    same number of rows and columns, where rows indicate regions and
    columns indicate the assays used to compute the casv (ie DNase-seq, H3K27ac)
    :param np.matrix m1: 2D or 3D numpy matrix 2D shape (nregions x (nassays x ncelltypes))
      where 2nd dimension is blocked by cells (i.e. cell1assay1, cell1assay2, cell2assay1, cell2assay2)
      OR 3D: (nregions x nassays x ncells)
    :param np.matrix m2: 3D numpy matrix shape (nregions x nassays x nsamples)
    :param radii: list of radii to access surrounding region
    :param indices: indices on 0th axis of m1 and m2 to compute casv for
    :return numpy matrix of size (len(indices) x CASV dimension x ncelltypes x ncells)
    '''

    if indices is None:
        indices = range(m1.shape[0])

    # if only one sample, extend m2 along 2nd axis
    if len(m2.shape) == 2:
        m2 = m2[:,:,None]

    # if needed, reshape m1 to put all assay/train cells on the last axis
    if len(m1.shape) == 3:
      ncells = m1.shape[-1]
      m1 = m1.reshape(m1.shape[0],m1.shape[1]*m1.shape[2])
    else:
      denom = 1 if m2.shape[1]==0 else m2.shape[1]
      ncells = int(m1.shape[-1]/denom)

    if m2.shape[1] == 0:
      # in this case, there is no CASV to compute, so we just return
      return np.zeros((len(indices),0, ncells,m2.shape[-1]))

    print(m1.shape, m2.shape)
    assert m1.shape[0] == m2.shape[0]
    # verify number of assays match
    assert m2.shape[1] == m1.shape[-1]/ncells
    # print('HERE')
    
#     set_trace()

    def f(i):
        
#         set_trace()
        # get indices for each radius in radii
        radius_ranges = list(map(lambda x: get_radius_indices(radii, x, i, m1.shape[0]), range(len(radii))))

        if len(radius_ranges) > 0:
            radius_indices = np.concatenate(radius_ranges)

            # data from known cell types (m1 portion)
            m1_slice = m1[radius_indices, :]
            m2_slice = np.repeat(m2[radius_indices, :, :],axis=1, repeats = ncells)
            

            # shape: radius size x (nassaysxncells) by nsamples
            pos = (m1_slice.T*m2_slice.T).T
#             agree = (m1_slice.T == m2_slice.T).T

            # split pos and agree arrays to create new dimension for ncells
            # the new dimension will be 4D: (radius x nassays x ncells x nsamples)
            pos = np.stack(np.split(pos, ncells, axis=1), axis=2)
#             agree = np.stack(np.split(agree, ncells, axis=1), axis=2)
            
            # get indices to split on. remove last because it is empty
            split_indices = np.cumsum([len(i) for i in radius_ranges])[:-1]
            # slice arrays by radii
            pos_arrays = np.split(pos, split_indices, axis= 0 )
#             agree_arrays = np.split(agree, split_indices, axis = 0)

            # average over the radius (0th axis)
            tmp1 = list(map(lambda x: np.average(x, axis = 0), pos_arrays)) # this line is problematic
            # final concatenation combines agree, nassays, and radii on the 0th axis
            # this axis is ordered by (1) pos/agree, then (2) radii, then (2) n assays.
            # See ordering example when there are 2 radii (r1, r2):
            # - pos: r1, nassays | pos: r2, nassays | agree: r1: nassays | agree: r1: nassays
            tmp = np.concatenate(tmp1, axis=0)
            return tmp
        else:
            # no radius, so no similarities. just an empty placeholder
            # shaped with the number of cells (last dim of m1)
            return np.zeros((0,ncells,m2.shape[-1]))

    # for every region of interest
    # TODO: maybe something more efficient?

    # set_trace()
    tmp = []
    for i in indices:
        tmp.append(f(i))
    
    return np.stack(tmp)
#     return np.stack([f(i) for i in indices])

In [268]:
eval_targets = ['SPI1',
     'JUNB',
     'NR3C1',
     'FOSL1',
     'FOS',
     'FOXP1',
     'MEF2B',
     'FOXP3',
     'NFATC2',
     'ETS1',
     'STAT3',
     'YY1',
     'REST',
     'NFE2',
     'MEIS1',
     'EGR1',
     'NFKB1',
     'IRF2',
     'EBF1',
     'RELA',
     'CEBPB',
     'CTCF']

# targets mentioned in Satpathy paper
satpathy_targets = ['BCL11A',
     'CEBPA',
     'FOXO1',
     'GATA1',
     'GATA2',
     'HIF1A',
     'IRF4',
     'JUN',
     'KLF4',
     'MAFK',
     'NFATC1',
     'NFIC',
     'NFKB1',
     'NR3C1',
     'NR4A1',
     'PAX5',
     'PRDM1',
     'RARA',
     'RBPJ',
     'RUNX1',
     'SPI1',
     'SRF',
     'STAT5A',
     'TCF3',
     'YY1']

targets = list(set(np.concatenate([eval_targets, satpathy_targets])))

In [269]:
# that we have data we can use for validation
CHIP_ALTAS_VALIDATION_CELLS = {'Monocytes':'Monocytes',
            'Dendritic Cells':'Dendritic_Cells',
            'Hematopoietic Stem Cells':'HSC',
            'B-Lymphocytes':'B_Cells',
            'PBMC':'PBMC',
            'CD34+ cells': 'CD34_Progenitors',
            'CD4-Positive T-Lymphocytes':'Naive_CD4_T_Cells',#'Memory_CD4_T_Cells']
           }

dataset = EpitomeDataset(targets = targets,
                            cells=None,
                           min_cells_per_target = 1,
                         min_targets_per_cell = 1)
cells = list(dataset.cellmap)
[cells.remove(i) for i in CHIP_ALTAS_VALIDATION_CELLS.keys() if i in cells]

dataset = EpitomeDataset(targets = targets,
                        cells=cells,
                       min_cells_per_target = 3,
                     min_targets_per_cell = 3)


  (a, min_cells_per_target, min_targets_per_cell))
  (a, min_cells_per_target, min_targets_per_cell))
  (a, min_cells_per_target, min_targets_per_cell))
  (a, min_cells_per_target, min_targets_per_cell))
  (a, min_cells_per_target, min_targets_per_cell))
  (a, min_cells_per_target, min_targets_per_cell))
  (a, min_cells_per_target, min_targets_per_cell))


In [270]:
os.environ["EPITOME_DATA_PATH"] = '/data/yosef2/users/akmorrow/data/scEpitome_data/CHIPATLAS/hg19'

# paths to Tal's dataset
DATA_PATH='/data/yosef/users/tal_ashuach/PeakVI/Datasets/Satpathy'
regions_peak_file = os.path.join(DATA_PATH, 'rds_f_regions.tsv')
cluster_file = os.path.join(DATA_PATH, 'rds_f_cells.tsv')

regions_peak_file = os.path.join(DATA_PATH, 'rds_f_regions.tsv')


# load in matrix
accessilibility_peak_matrix = np.load(os.path.join(DATA_PATH, 'imputed.npy'))

# load in regions and filter by regions we are actually scoring
regions = bed2Pyranges(regions_peak_file ).df
regions.sort_values(by='idx', inplace=True)

# load in cluster labels
clusters = pd.read_csv(cluster_file, sep='\t')
# group by clusters and get cell indices in each cluster
split_indices = clusters.groupby('CellType').indices
cluster_keys = list(split_indices.keys())
cluster_indices = list(split_indices.values())

# mean cells by clusters
means = [np.mean(accessilibility_peak_matrix[ind,:], axis=0) for ind in cluster_indices]
means_matrix = np.vstack(means)

means_matrix.shape

(21, 133962)

In [271]:
model = WrapperModel(dataset)
model.train(5)

using ['iPS cells', 'hESC derived neural cells', 'hESC derived mesendodermal cells', 'hESC H9', 'hESC H1', 'Unclassified', 'U2OS', 'U-937', 'Treg', 'Testis', 'T-47D', 'SW 480', 'SKH1', 'SK-N-SH', 'RS4-11', 'PC-3', 'PANC-1', 'Osteoblasts', 'OCI-LY-7', 'NB-4', 'NAMALWA', 'Mesenchymal stem cells', 'Macrophages', 'MOLM-13', 'MM.1S', 'MG-63', 'MDA-MB-231', 'MCF-7', 'Lymphoblastoid cell line', 'LoVo', 'LNCAP', 'L-1236', 'Keratinocytes', 'Kasumi-1', 'K-562', 'Jurkat', 'IMR-90', 'HuH-7', 'Hep G2', 'HeLa', 'HUVEC', 'HMEC', 'HL-60', 'HCT 116', 'HAP1', 'GP5d', 'GM12878', 'Fibroblasts', 'Erythroid Cells', 'ECC-1', 'Carcinoma, Renal Cell', 'CD34+', 'BJ', 'Acute myeloid leukemia', 'A549', '786-O', '293'] as labels for mode Dataset.TRAIN
using ['iPS cells', 'hESC derived neural cells', 'hESC derived mesendodermal cells', 'hESC H9', 'hESC H1', 'Unclassified', 'U2OS', 'U-937', 'Treg', 'Testis', 'T-47D', 'SW 480', 'SKH1', 'SK-N-SH', 'RS4-11', 'PC-3', 'PANC-1', 'Osteoblasts', 'OCI-LY-7', 'NB-4', 'NAMALWA

(0, 5, [])

In [272]:
if isinstance(regions, pd.core.frame.DataFrame):
    regions = pr.PyRanges(regions, int64=True)

In [273]:
results = model.score_matrix_fast(means_matrix[:, :13035], regions['chr1'])

(8299529,)
+--------------+-----------+-----------+-----------+--------------+------------+------------+
| Chromosome   | Start     | End       | idx       | Start_base   | End_base   | idx_base   |
| (category)   | (int64)   | (int64)   | (int64)   | (int64)      | (int64)    | (int64)    |
|--------------+-----------+-----------+-----------+--------------+------------+------------|
| chr1         | 10238     | 10738     | 0         | 10200        | 10400      | 3          |
| chr1         | 10238     | 10738     | 0         | 10400        | 10600      | 4          |
| chr1         | 10238     | 10738     | 0         | 10600        | 10800      | 5          |
| chr1         | 237511    | 238011    | 1         | 237400       | 237600     | 657        |
| ...          | ...       | ...       | ...       | ...          | ...        | ...        |
| chr1         | 249239576 | 249240076 | 13034     | 249239400    | 249239600  | 740999     |
| chr1         | 249239576 | 249240076 | 13034   

  0%|          | 38/45400 [00:00<02:00, 376.79it/s]

CALCULATED CASV


100%|██████████| 45400/45400 [02:01<00:00, 373.11it/s]
  0%|          | 0/21 [00:00<?, ?it/s]

RUNNING PREDICTIONS


100%|██████████| 21/21 [03:48<00:00, 10.88s/it]


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(134)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    132 [0;31m        [0mresults[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mstack[0m[0;34m([0m[0mresults[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    133 [0;31m        [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 134 [0;31m        [0mresults[0m [0;34m=[0m [0mresults[0m[0;34m.[0m[0mreshape[0m[0;34m([0m[0;34m([0m[0mto_stack[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mto_stack[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m [0;31m# 4 x 5[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    135 [0;31m[0;34m[0m[0m
[0m[0;32m    136 [0;31m        [0;32mif[0m [0;32mnot[0m [0msame_size[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(136)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    134 [0;31m        [0mresults[0m [0;34m=[0m [0mresults[0m[0;34m.[0m[0mreshape[0m[0;34m([0m[0;34m([0m[0mto_stack[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mto_stack[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m [0;31m# 4 x 5[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    135 [0;31m[0;34m[0m[0m
[0m[0;32m--> 136 [0;31m        [0;32mif[0m [0;32mnot[0m [0msame_size[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    137 [0;31m            [0mfinal[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(137)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    135 [0;31m[0;34m[0m[0m
[0m[0;32m    136 [0;31m        [0;32mif[0m [0;32mnot[0m [0msame_size[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 137 [0;31m            [0mfinal[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(138)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    136 [0;31m        [0;32mif[0m [0;32mnot[0m [0msame_size[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    137 [0;31m            [0mfinal[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0m

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(139)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    137 [0;31m            [0mfinal[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  final.shape


(21, 13035, 36)


ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(141)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0;34m:[0m[0;34m][0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0mresults[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mtup[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0mtup[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m[0;34m[0m[0m
[0m[0;32m    143 [0;31m   

ipdb>  np.mean(results[:, tup[0]:tup[1], :], axis=1).shape


(21, 36)


ipdb>  i


0


ipdb>  final[:, i, :].shape


(21, 36)


ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(141)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0;34m:[0m[0;34m][0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0mresults[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mtup[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0mtup[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m[0;34m[0m[0m
[0m[0;32m    143 [0;31m   

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(141)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0;34m:[0m[0;34m][0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0mresults[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mtup[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0mtup[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m[0;34m[0m[0m
[0m[0;32m    143 [0;31m   

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(141)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0;34m:[0m[0;34m][0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0mresults[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mtup[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0mtup[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m[0;34m[0m[0m
[0m[0;32m    143 [0;31m   

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(141)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0;34m:[0m[0;34m][0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0mresults[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mtup[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0mtup[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m[0;34m[0m[0m
[0m[0;32m    143 [0;31m   

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(141)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0;34m:[0m[0;34m][0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0mresults[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mtup[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0mtup[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m[0;34m[0m[0m
[0m[0;32m    143 [0;31m   

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(141)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mi[0m[0;34m,[0m [0;34m:[0m[0;34m][0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0mresults[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mtup[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0mtup[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m[0;34m[0m[0m
[0m[0;32m    143 [0;31m   

ipdb>  n


> [0;32m<ipython-input-267-ed8fa0af6cbc>[0m(140)[0;36mscore_matrix_fast[0;34m()[0m
[0;32m    138 [0;31m            [0mfinal[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mempty[0m[0;34m([0m[0;34m([0m[0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0maccessibility_peak_matrix[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mresults[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    139 [0;31m            [0mfinal[0m[0;34m.[0m[0mfill[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mnan[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 140 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mtup[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mindices_to_merge[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    141 [0;31m                [0mfinal[0m[0;34m[[0m[0;34m:[0m[0;34m,

ipdb>  c


In [None]:
accessibility_peak_matrix = means_matrix
# regions = pr.PyRanges(regions, int64=True)

In [238]:
means_matrix[:, :13035].shape

(21, 13035)

In [239]:
regions['chr1']

+--------------+-----------+-----------+-----------+
| Chromosome   | Start     | End       | idx       |
| (category)   | (int64)   | (int64)   | (int64)   |
|--------------+-----------+-----------+-----------|
| chr1         | 10238     | 10738     | 0         |
| chr1         | 237511    | 238011    | 1         |
| chr1         | 752456    | 752956    | 2         |
| chr1         | 761853    | 762353    | 3         |
| ...          | ...       | ...       | ...       |
| chr1         | 249201345 | 249201845 | 13031     |
| chr1         | 249218817 | 249219317 | 13032     |
| chr1         | 249219589 | 249220089 | 13033     |
| chr1         | 249239576 | 249240076 | 13034     |
+--------------+-----------+-----------+-----------+
Unstranded PyRanges object has 13,035 rows and 4 columns from 1 chromosomes.
For printing, the PyRanges was sorted on Chromosome.

In [111]:
conversionObject = RegionConversion(model.dataset.regions, regions['chr1'])

results = []
# print(accessibility_peak_matrix.shape)
matrix, indices = conversionObject.get_binary_vector(vector = accessibility_peak_matrix[0,:])
gen = load_data_runtime(data=model.dataset.get_data(Dataset.ALL),
         label_cell_types=model.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
         eval_cell_types=model.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
         matrix=model.dataset.matrix,
         targetmap=model.dataset.targetmap,
         cellmap=model.dataset.cellmap,
         radii = model.radii,
         mode = Dataset.RUNTIME,
         similarity_matrix = matrix,
         similarity_targets = ['DNase'],
         indices = indices,
         return_feature_names=False)

gen_to_list = list(gen())

AssertionError: Error: value_vector must be the same shape as self.compare

In [34]:
len(gen_to_list)

453764

In [35]:
g = np.array(gen_to_list)

In [36]:
g.shape

(453764, 57)

In [37]:
g[0]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 0.])

In [38]:
matrix.shape

(8299529,)

In [41]:
indices

array([      3,       4,       5, ..., 8190394, 8190395, 8190396])

In [46]:
len(model.eval_cell_types)

57

In [90]:
to_stack = load_data_no_label_mask(data=model.dataset.get_data(Dataset.ALL),
         label_cell_types=model.test_celltypes,   # used for labels. Should be all for train/eval and subset for test
         eval_cell_types=model.eval_cell_types,   # used for rotating features. Should be all - test for train/eval
         matrix=model.dataset.matrix,
         targetmap=model.dataset.targetmap,
         cellmap=model.dataset.cellmap,
         radii = model.radii,
         mode = Dataset.RUNTIME,
         similarity_matrix = matrix,
         similarity_targets = ['DNase'],
         indices = indices,
         return_feature_names=False)

to_stack = list(to_stack())

using ['PLACEHOLDER_CELL'] as labels for mode Dataset.RUNTIME


In [91]:
to_stacked = np.stack([to_stack] * accessibility_peak_matrix.shape[0], axis=0)
names = stacked[:, :, 1]
to_stack = stacked[:, :, 0]
to_stack = np.expand_dims(to_stack, axis=-1)

In [92]:
to_stack.shape

(21, 453764, 1)

In [95]:
to_stack[0, 0, :]

array([1.])

In [94]:
matrix.shape, indices.shape

((8299529,), (453764,))