In [1]:
import shutil
import os
import time
import sys
import multiprocessing
import numpy as np
import pandas as pd

# deepTools packages
import deeptools.utilities
from deeptools import bamHandler
from deeptools import mapReduce
from deeptoolsintervals import GTF
import pyBigWig

# Debug switch; it is not read by any of the cells visible in this notebook.
debug = 0
# Ignore all NumPy floating-point warnings; the value returned by np.seterr is kept in old_settings.
old_settings = np.seterr(all='ignore')

## Goal

**Modify countReadsPerBin so that it accepts a list of cell barcodes (the `barcodes` argument) and returns an np.array of shape (bins, barcodes\*len(bam)), i.e. one column per barcode per BAM file.**

In [2]:
# Put the current directory on the module search path (presumably so the local
# scReadCounter module imported in the next cell can be found).
sys.path.append(".")

In [3]:
import scReadCounter as countR

In [4]:
# Directory that holds the input BAM files. Kept in one place so the data location
# can be changed without editing every path.
BAM_DIR = ('/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/'
           'fESC_CTCF-K4me3_26-02-2020/dedup_bam')

# The four input files differ only in their index (i1 ... i4).
bamfiles = ['{}/HVG-ChIC-fESC-CTCF-i{}-26-02-20.bam'.format(BAM_DIR, file_index)
            for file_index in range(1, 5)]

In [5]:
## real barcodes
# Tab-separated file read without a header row; only its first column is used below.
barcodes_real = pd.read_csv(
    '/hpc/hub_oudenaarden/vbhardwaj/annotations/cell_barcodes_inhouse/maya_384NLA.bc',
    sep="\t",
    header=None,
)
# Plain Python list of the values in the first column.
barcodes = list(barcodes_real[0])

In [6]:
# Bin geometry: bin width and the gap left between consecutive bins.
binSize, distanceBetweenBins = 100000, 0
# Distance between the starts of consecutive bins (passed as stepSize to the counter).
stepsize = binSize + distanceBetweenBins

# Execution settings.
numberOfProcessors = 10
genomeChunkSize = None

In [7]:
# Keyword options for the counter, grouped by what they control.
count_options = dict(
    # which cells / how much of the genome
    barcodes=barcodes,
    numberOfSamples=None,
    region=None,
    bedFile=None,
    blackListFileName=None,
    stepSize=stepsize,
    # read filtering
    extendReads=None,
    minMappingQuality=10,
    ignoreDuplicates=True,
    center_read=False,
    samFlag_include=None,
    samFlag_exclude=None,
    minFragmentLength=1,
    maxFragmentLength=2000,
    # execution / output
    numberOfProcessors=numberOfProcessors,
    verbose=False,
    zerosToNans=False,
    out_file_for_raw_data=None,
)

# Barcode-aware counter over all BAM files with the configured bin size.
c = countR.CountReadsPerBin(bamfiles, binSize, **count_options)

In [9]:
# Run the counter configured above and keep the returned count matrix.
num_reads_per_bin = c.run(allArgs=None)

In [11]:
# Dimensions of the matrix returned by c.run(); per the goal above this should be
# (bins, barcodes * number of BAM files).
num_reads_per_bin.shape

(27348, 1536)

In [None]:
# Scratch matrix of zeros with 10 rows and 384 columns.
array = np.zeros(shape=(10, 384), dtype=np.float64)

In [None]:
# Inspect the scratch matrix. The original cell held an unfinished attribute access
# ("array."), which is a SyntaxError when the notebook is run top to bottom.
array.shape

In [None]:
def get_fragment_from_read(read):
    """
    Return the aligned blocks of ``read``.

    Thin wrapper around ``read.get_blocks()``; the result is consumed by
    get_coverage_of_region as an iterable of (start, end) pairs.
    """
    blocks = read.get_blocks()
    return blocks

In [None]:
import pysam

# htslib option restricting which record fields are decoded.
# NOTE(review): 0x1FF looks like it leaves out the auxiliary-tag field; this should
# only matter for CRAM input, but the BC tag is read later in this notebook - confirm
# before pointing this at CRAM files.
format_options = [b"required_fields=0x1FF"]

# Open every input file once; the handles are reused by the cells below.
bam_handles = []
for fname in bamfiles:
    # AlignmentFile is the current pysam class; Samfile is its legacy alias.
    bamHandle = pysam.AlignmentFile(fname, 'rb', format_options=format_options)
    bam_handles.append(bamHandle)

In [None]:
# Show the list of opened BAM handles (one per entry of bamfiles).
bam_handles

In [None]:
# Collect every read returned for 1:2000000-10000000 from the last opened BAM file.
# The handle is taken from bam_handles explicitly rather than from the loop variable
# `bamHandle`, which only exists as a leftover of the file-opening loop.
readlist = list(bam_handles[-1].fetch('1', 2000000, 10000000))


In [None]:
# Value of the 'BC' tag on the first fetched read ('BC' is also the default tagName
# of get_coverage_of_region).
readlist[0].get_tag('BC')

In [None]:
# Test window on chromosome '1' and the tile width used to split it into bins.
chrom, start, end = '1', 2000000, 10000000
binLength = 100000

# One chunk containing a single (start, end, tileSize) region.
transcriptsToConsider = [[(start, end, binLength)]]

In [None]:
# Show the region chunks that will be passed to get_coverage_of_region.
transcriptsToConsider

In [None]:
def get_coverage_of_region(bamHandle, chrom, regions, barcodes, tagName='BC',
                           fragmentFromRead_func=None):
    """
    Count, separately for every barcode, the reads overlapping each bin.

    Parameters
    ----------
    bamHandle : object
        Open alignment handle exposing ``references`` and
        ``fetch(chrom, start, end)``.
    chrom : str
        Chromosome to query; must be listed in ``bamHandle.references``.
    regions : sequence of tuples
        Each entry is either ``(start, end)`` (counted as one bin) or
        ``(start, end, tileSize)`` (split into tiles of ``tileSize``; a
        shorter trailing tile still gets its own bin).
    barcodes : list or tuple
        Barcodes to report. Reads whose tag value is not in this collection,
        or that do not carry the tag at all, are skipped.
    tagName : str
        Name of the read tag that holds the barcode.
    fragmentFromRead_func : callable, optional
        Maps a read to an iterable of ``(start, end)`` blocks. Defaults to
        ``get_fragment_from_read`` when omitted.

    Returns
    -------
    dict
        ``{barcode: np.ndarray}`` with one float64 vector per barcode whose
        length is the total number of bins over all ``regions``.

    Raises
    ------
    NameError
        If ``chrom`` is not present in ``bamHandle.references``.
    """
    # Honour a caller-supplied function; only fall back to the default when none is given.
    if fragmentFromRead_func is None:
        fragmentFromRead_func = get_fragment_from_read

    # The chromosome does not change between regions, so validate it once.
    if chrom not in bamHandle.references:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    # Number of bins contributed by each region (handles an empty region list too).
    bins_per_region = []
    for reg in regions:
        if len(reg) == 3:
            full_tiles, remainder = divmod(reg[1] - reg[0], int(reg[2]))
            # Don't eliminate small bins! Issue 887
            bins_per_region.append(int(full_tiles) + (1 if remainder > 0 else 0))
        else:
            bins_per_region.append(1)
    nbins = sum(bins_per_region)

    # Instead of a single array, keep one coverage vector per barcode.
    coverages = {bc: np.zeros(nbins, dtype='float64') for bc in barcodes}

    extension = 0
    vector_start = 0  # offset of the current region's first bin in the vectors
    for reg, nRegBins in zip(regions, bins_per_region):
        if len(reg) == 3:
            tileSize = int(reg[2])
        else:
            tileSize = int(reg[1] - reg[0])

        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)
        # Right-most coordinate representable in the coverage vectors. This must use
        # the number of bins: len(coverages) is the number of barcodes now that
        # coverages is a dict.
        maxFragmentEnd = reg[0] + nbins * tileSize

        for read in bamHandle.fetch(chrom, regStart, regEnd):
            # Get the barcode from the read; untagged reads cannot be assigned.
            try:
                bc = read.get_tag(tagName)
            except KeyError:
                continue
            barcode_coverage = coverages.get(bc)
            if barcode_coverage is None:
                # Barcode is not among the requested ones.
                continue

            # Since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # The fragment function returns None in some cases; skip those reads.
                continue

            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                if fragmentEnd - fragmentStart == 0:
                    continue
                # Skip blocks that lie outside the region being evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue

                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > maxFragmentEnd:
                    fragmentEnd = maxFragmentEnd

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(int(np.ceil(float(fragmentEnd - reg[0]) / tileSize)), nRegBins)
                if last_eIdx is not None:
                    # Do not count the same bin twice for consecutive blocks of one read.
                    sIdx = max(last_eIdx, sIdx)
                    if sIdx >= eIdx:
                        continue
                sIdx = int(sIdx)
                eIdx = int(eIdx)
                barcode_coverage[sIdx:eIdx] += 1
                last_eIdx = eIdx
        vector_start += nRegBins

    return coverages

In [None]:
# Accumulate one per-bin count vector per barcode, BAM file after BAM file.
subnum_reads_per_bin = []
for bam in bam_handles:
    for trans in transcriptsToConsider:
        # dict: barcode -> per-bin counts for this BAM file and region chunk
        tcov = get_coverage_of_region(bam, chrom, trans, barcodes)
        tcov_keys = [*tcov.keys()]
        # rows follow the dict's barcode order
        tcov_stack = np.stack([*tcov.values()])
        # append the rows of the stacked matrix one by one
        subnum_reads_per_bin += list(tcov_stack)

In [None]:
# subnum_reads_per_bin holds nCells*nBAM arrays, each of shape (nBins, ):
# print the number of arrays, then the shape of the first one.
print(len(subnum_reads_per_bin), subnum_reads_per_bin[0].shape, sep="\n")

In [None]:
# Per-bin counts of the first collected vector (first barcode of the first BAM file).
subnum_reads_per_bin[0]

In [None]:
# Region chunks that were used for the counts collected above.
transcriptsToConsider

In [None]:
# Every entry of subnum_reads_per_bin is the per-bin vector of one barcode in one BAM
# file, so placing the vectors side by side as columns gives the
# (bins, barcodes * BAM files) matrix directly.
# The earlier np.concatenate([...]).reshape(-1, n, order='F') produced the same shape
# but re-flowed the (cells, bins) array in column-major order, which is not a
# transpose and mixed values between bins and barcodes.
ct = np.stack(subnum_reads_per_bin, axis=1)
assert ct.shape[1] == len(barcodes) * len(bamfiles), "expected one column per barcode per BAM file"


In [None]:
# For each region chunk index, print the number of columns in that row of ct.
for i in range(len(transcriptsToConsider)):
    print(ct[i].shape[0])

In [None]:
# Show the assembled count matrix.
ct