# count reads per barcode for bedgraph 

In [1]:
## own functions
import sys
sys.path.append("./bin/")
import scReadCounter as cr
import pandas as pd
import itertools

In [2]:
from deeptools import countReadsPerBin as crb

In [3]:
import os
import sys
import shutil
import numpy as np
import pyBigWig

# own modules
from deeptools import mapReduce
from deeptools.utilities import getCommonChrNames
from deeptools import bamHandler
from deeptools import utilities

# Debug flag; it appears unused in the cells of this notebook.
debug = 0
# Tell NumPy to ignore all floating-point error conditions (divide, over,
# under, invalid). np.seterr returns the previous settings, which are kept
# here so they can be restored later.
old_settings = np.seterr(all='ignore')

In [4]:
from deeptools.writeBedGraph import bedGraphToBigWig, getGenomeChunkLength

In [5]:
    def scaleCoverage(tile_coverage, nCells, args):
        """
        Scale a tile's coverage and average it over the cells of a group.

        :param tile_coverage: coverage value of one tile
        :param nCells: number of cells the coverage is divided by
        :param args: dict holding the multiplier under the key 'scaleFactor'
        :return: ``args['scaleFactor'] * tile_coverage / nCells``
        """
        scaled = args['scaleFactor'] * tile_coverage
        return scaled / nCells


In [27]:
# Processor count and bin width used by the counting cells below.
cores = 20
binLength = 500
mappedList = []

# BAM file names: four fESC CTCF indices followed by four scNPC CTCF indices.
fl = [
    'HVG-ChIC-fESC-CTCF-i1-26-02-20.bam',
    'HVG-ChIC-fESC-CTCF-i2-26-02-20.bam',
    'HVG-ChIC-fESC-CTCF-i3-26-02-20.bam',
    'HVG-ChIC-fESC-CTCF-i4-26-02-20.bam',
    'HVG-frozen-fix-scNPC-CTCF-index-1-020620.bam',
    'HVG-frozen-fix-scNPC-CTCF-index-2-020620.bam',
    'HVG-frozen-fix-scNPC-CTCF-index-3-020620.bam',
    'HVG-frozen-fix-scNPC-CTCF-index-4-020620.bam',
]

# Full path of every BAM file, in the same order as `fl`.
bamFilesList = [
    "".join(("/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/downstream/ESC_NPC_clustering/LSA_clusters_split/dedup_bams/", bam_name))
    for bam_name in fl
]


In [29]:
# Load the cluster assignments: tab-separated, no header row, '#' lines skipped,
# columns named sample / barcode / cluster.
df = pd.read_csv(
    "/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/downstream/ESC_NPC_clustering/LSA_clusters_split/cluster_info.tsv",
    sep="\t",
    index_col=None,
    header=None,
    comment="#",
    names=['sample', 'barcode', 'cluster'],
)
# Key every row by "<sample>:<barcode>" so rows can be matched on the index.
df.index = df[['sample', 'barcode']].apply(':'.join, axis=1)

In [30]:
# Sample labels (presumably one per BAM file, in the same order — confirm).
labels = [
    'ESC_CTCF_i1', 'ESC_CTCF_i2', 'ESC_CTCF_i3', 'ESC_CTCF_i4',
    'NPC_CTCF_i1', 'NPC_CTCF_i2', 'NPC_CTCF_i3', 'NPC_CTCF_i4',
]
#labels = ['ESC_CTCF_i1']
barcodes = df['barcode'].unique().tolist()

# Full sample x barcode grid, sample-major: each label is repeated once per
# barcode while the barcode list is tiled once per label.
sm = [lab for lab in labels for _unused in barcodes]
bc = barcodes * len(labels)
groupInfo = pd.DataFrame({"sample": sm, "barcode": bc})
groupInfo.index = groupInfo[['sample', 'barcode']].apply(':'.join, axis=1)

# Left-join the cluster column on the "<sample>:<barcode>" key; grid entries
# that are missing from `df` end up with NaN as their cluster.
groupInfo = pd.merge(
    groupInfo,
    df['cluster'],
    how="left",
    left_index=True,
    right_index=True,
    sort=False,
)
# Drop the string key again and return to a positional index.
groupInfo = groupInfo.reset_index(drop=True)[['sample', 'barcode', 'cluster']]

In [31]:
groupInfo

Unnamed: 0,sample,barcode,cluster
0,ESC_CTCF_i1,ATACTGCG,1.0
1,ESC_CTCF_i1,ATAGCGAT,1.0
2,ESC_CTCF_i1,ATCAGTAT,1.0
3,ESC_CTCF_i1,CATCAGTA,1.0
4,ESC_CTCF_i1,CGCAGTCA,1.0
...,...,...,...
3003,NPC_CTCF_i4,GATTCGCA,2.0
3004,NPC_CTCF_i4,CAAGATCG,2.0
3005,NPC_CTCF_i4,GTCAATGC,
3006,NPC_CTCF_i4,TGTACTGG,2.0


In [9]:
len(barcodes)

376

In [32]:
# Cross-check the sample names used as labels against the sample names found
# in the cluster table.
labels_in_df = set(sm).difference(df['sample'])   # labels missing from the table
df_in_labels = set(df['sample']).difference(sm)   # table samples missing from the labels

# The two directions are reported independently: with an `elif`, a mismatch in
# the labels would stay hidden whenever the table also had unknown samples.
if df_in_labels:
    sys.stderr.write("Some (or all) of the samples indicated in the groupInfo file "
                     "are absent from the bam file labels! \n"
                     "Mismatched samples are: {} \n".format(df_in_labels))
if labels_in_df:
    sys.stderr.write("Some (or all) of the samples indicated in --labels "
                     "are absent from the groupInfo file! \n"
                     "Mismatched samples are: {} \n".format(labels_in_df))

In [33]:
# Region passed to mapReduce further down. The commented lines are alternative
# test regions written as 'chrom:start:end'.
# NOTE(review): None presumably means "no restriction / whole genome" — confirm
# against deepTools' mapReduce.
#region = 'chr1:118073048:120315739'
#region = 'chr1:118468726:118584268'
#region = 'chr1:118300678:118719272'
#region='chr1:118615384:118869730'
#region = 'chr1:105387163:132177293'
region=None


In [34]:
len(barcodes)

376

In [35]:
# Build the per-barcode read counter from the local scReadCounter module.
# stepSize equals binLength, so consecutive bins presumably do not overlap.
# Barcodes are read from the 'BC' tag (tagName) and groupInfo is supplied as
# the cluster table.
# NOTE(review): `region=None` is hard-coded here rather than using the
# notebook-level `region` variable — confirm this is intended.
c = cr.CountReadsPerBin(
        bamFilesList,
        binLength=binLength,
        stepSize=binLength,
        barcodes=barcodes,
        clusterInfo=groupInfo,
        motifFilter=None,
        tagName='BC',
        numberOfSamples=None,
        numberOfProcessors=cores,
        verbose=False,
        region=None,
        bedFile=None,
        blackListFileName=None,
        extendReads=None,
        minMappingQuality=0,
        ignoreDuplicates=False,
        center_read=False,
        samFlag_include=None,
        samFlag_exclude=None,
        minFragmentLength=1,
        maxFragmentLength=2000,
        zerosToNans=False,
        out_file_for_raw_data=None)

In [23]:
#counts = c.run(allArgs=None)

In [25]:
counts

(array([[11., 10., 24., ...,  4.,  9.,  2.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ...,
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  2.,  1., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array(['chrM_0_16299', 'KZ289083.1_0_66290', 'KZ289078.1_0_100000', ...,
        'GL456233.1_100000_200000', 'GL456233.1_200000_300000',
        'GL456233.1_300000_336933'], dtype='<U47'))

```
bam_handles = []
for fname in bamFilesList:
    bam_handles.append(bamHandler.openBam(fname))

cov = c.get_coverage_of_region(bam_handles[0], 'chr1', [(105387163, 132177293, 1000)])
```

In [26]:
# Dump the row-wise total of the count matrix (counts[0]) as a bedGraph,
# taking the coordinates from the matching bin name in counts[1].
line_string = "{}\t{}\t{}\t{:g}\n"
# The context manager guarantees the file is flushed and closed even if a
# row fails to format.
with open('test_cr_direct.bedgraph', 'w') as _file:
    for i in range(counts[0].shape[0]):
        # Bin names look like '<chrom>_<start>_<end>'. Splitting from the right
        # keeps chromosome names that themselves contain '_' intact.
        l = counts[1][i].rsplit('_', 2)
        value = np.sum(counts[0][i])
        _file.write(line_string.format(l[0], l[1], l[2], value))

Problem: the summed counts are zero across some large intervals, even before the barcodes are grouped by cluster.

### task
**1. np.sum across barcodes in a group**


**2. the scaleFactor would be equal to 1/nBarcodes in the pseudobulk group**

In [36]:
    def writeBedGraph_worker(self, chrom, start, end,
                             func_to_call, func_args,
                             bed_regions_list=None):
        """
        Write one temporary bedGraph file per cluster for a genomic chunk.

        The coverage matrix of the chunk is obtained from
        ``self.count_reads_in_region``. For every cluster listed in
        ``self.clusterInfo.cluster`` the columns belonging to that cluster are
        summed tile by tile, passed through ``func_to_call`` and written as
        bedGraph lines; consecutive tiles with the same value are merged into
        a single line.

        NOTE(review): the columns of the coverage matrix are assumed to be in
        the same order as the rows of ``self.clusterInfo`` — confirm against
        ``count_reads_in_region``.

        :param chrom: chromosome name written in the first bedGraph column
        :param start: start coordinate of the chunk
        :param end: end coordinate of the chunk
        :param func_to_call: callable invoked as
            ``func_to_call(summed_coverage, nCells, func_args)``
        :param func_args: extra arguments forwarded to ``func_to_call``
        :param bed_regions_list: accepted for interface compatibility; unused
        :return: tuple ``(chrom, start, end, tempfilenames)`` where
            ``tempfilenames`` maps each cluster to its temp-file name
            (``None`` for the missing/NaN cluster)
        :raises NameError: if ``start`` is larger than ``end``
        """
        if start > end:
            raise NameError("start position ({0}) bigger "
                            "than end position ({1})".format(start, end))
        coverage, _, _ = self.count_reads_in_region(chrom, start, end)

        ## get groups (clusters)
        cluster_info = self.clusterInfo
        clusters = cluster_info.cluster.unique().tolist()
        tempfilenames = dict.fromkeys(clusters)
        line_string = "{}\t{}\t{}\t{:g}\n"

        ## sum up tile coverage group-wise
        for cl in clusters:
            # Cells without a cluster assignment are skipped. pd.isnull also
            # copes with non-float labels, on which np.isnan would raise.
            if pd.isnull(cl):
                continue
            # Positional indices of the cells in this cluster, independent of
            # whatever index labels the cluster table carries.
            cl_idx = np.flatnonzero(np.asarray(cluster_info.cluster == cl))
            nCells = len(cl_idx)
            previous_value = None

            # The context manager closes the temp file even if func_to_call
            # raises part-way through the chunk.
            with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
                for tileIndex in range(coverage.shape[0]):
                    ## smoothing disabled for now
                    # Restrict the tile to this cluster's cells; without the
                    # subset every cluster would receive the all-cell total.
                    tileSum = np.sum(coverage[tileIndex, cl_idx])
                    if self.skipZeroOverZero and tileSum == 0:
                        continue
                    value = func_to_call(tileSum, nCells, func_args)
                    if previous_value is None:
                        writeStart = start + tileIndex * self.binLength
                        writeEnd = min(writeStart + self.binLength, end)
                        previous_value = value

                    elif previous_value == value:
                        # Same value as the running interval: extend it.
                        writeEnd = min(writeEnd + self.binLength, end)

                    else:
                        # Value changed: flush the finished interval (unless
                        # it is NaN) and start a new one.
                        if not np.isnan(previous_value):
                            _file.write(
                                line_string.format(chrom, writeStart, writeEnd, previous_value))
                        previous_value = value
                        writeStart = writeEnd
                        writeEnd = min(writeStart + self.binLength, end)

                # write remaining value if not a nan
                if previous_value is not None and writeStart != end and not np.isnan(previous_value):
                    _file.write(line_string.format(chrom, writeStart, end, previous_value))

                tempfilenames[cl] = _file.name

        return chrom, start, end, tempfilenames

works

In [37]:
# Open every BAM file once; the handles feed the chunk-length and common
# chromosome lookups at the end of this cell.
getStats = None
bam_handles = []
for x in bamFilesList:
    if getStats:
        # `cores` is the processor count set in the configuration cell; the
        # name `numberOfProcessors` used here before is not defined anywhere.
        bam, mapped, unmapped, stats = bamHandler.openBam(x, returnStats=True, nThreads=cores)
    else:
        bam = bamHandler.openBam(x)
    # Keep the handle whichever branch opened it (previously only the
    # `else` branch appended, so the stats path produced an empty list).
    bam_handles.append(bam)

genome_chunk_length = getGenomeChunkLength(bam_handles, binLength, c.mappedList)
chrom_names_and_size, non_common = getCommonChrNames(bam_handles, verbose=False)

In [38]:
def writeBedGraph_wrapper(args):
    """Unpack ``args`` and pass its items positionally to ``writeBedGraph_worker``."""
    worker_result = writeBedGraph_worker(*args)
    return worker_result

In [39]:
def scaleCoverage_simple(tile_coverage, nCells, args):
    """
    Multiply a tile's coverage by ``args['scaleFactor']``.

    ``nCells`` is accepted so the signature matches the other scaling
    functions, but it does not enter the result.
    """
    factor = args['scaleFactor']
    return factor * tile_coverage

In [40]:
genome_chunk_length

5000000

In [41]:
# Run writeBedGraph_wrapper over the genome in chunks of genome_chunk_length.
# The [function, args] pair is presumably forwarded to the worker as
# func_to_call / func_args — confirm against deepTools' mapReduce.
# With scaleFactor 1 and scaleCoverage_simple the written values are unscaled.
# NOTE: this runs on a single processor although `cores` is set to 20 above.
funcArgs = {'scaleFactor': 1}
res = mapReduce.mapReduce([scaleCoverage_simple, funcArgs],
                           writeBedGraph_wrapper,
                           chrom_names_and_size,
                           self_=c,
                           genomeChunkLength=genome_chunk_length,
                           region=region,
                           blackListFileName=None,
                           numberOfProcessors=1)

In [42]:
# Rank each chromosome by its position in chrom_names_and_size.
chrom_order = {entry[0]: rank for rank, entry in enumerate(chrom_names_and_size)}
# Replace the chromosome name of every chunk with its rank, then order the
# chunks so the temp files can be concatenated in genome order.
res = sorted([chrom_order[chunk[0]], chunk[1], chunk[2], chunk[3]] for chunk in res)

In [43]:
res

[[0,
  0,
  5000000,
  {1.0: '/tmp/_deeptools_8g1xv8kw.bg',
   2.0: '/tmp/_deeptools_k_zz7ztm.bg',
   nan: None}],
 [0,
  5000000,
  10000000,
  {1.0: '/tmp/_deeptools_40w4ymnh.bg',
   2.0: '/tmp/_deeptools_auyft04y.bg',
   nan: None}],
 [0,
  10000000,
  15000000,
  {1.0: '/tmp/_deeptools_6h1xngle.bg',
   2.0: '/tmp/_deeptools_en8ow94j.bg',
   nan: None}],
 [0,
  15000000,
  20000000,
  {1.0: '/tmp/_deeptools_e8fp_pzx.bg',
   2.0: '/tmp/_deeptools_vrertu0a.bg',
   nan: None}],
 [0,
  20000000,
  25000000,
  {1.0: '/tmp/_deeptools_pz5gnvcy.bg',
   2.0: '/tmp/_deeptools_j3nvnica.bg',
   nan: None}],
 [0,
  25000000,
  30000000,
  {1.0: '/tmp/_deeptools_heiwkvhv.bg',
   2.0: '/tmp/_deeptools_aowlm9q0.bg',
   nan: None}],
 [0,
  30000000,
  35000000,
  {1.0: '/tmp/_deeptools_ptr_61a8.bg',
   2.0: '/tmp/_deeptools_bsnxq_jm.bg',
   nan: None}],
 [0,
  35000000,
  40000000,
  {1.0: '/tmp/_deeptools_bqp4eb85.bg',
   2.0: '/tmp/_deeptools_97zp81q7.bg',
   nan: None}],
 [0,
  40000000,
  450000

In [None]:
# Scratch check: print the bedGraph lines of a single cluster for one chunk.
# NOTE(review): `df2` is not defined in any cell shown in this notebook, and
# after the sorting cell res[0][3] holds a dict of temp-file names rather than
# a coverage matrix — confirm both before running this cell.
coverage = res[0][3]
chrom='1'
start = 118030576
end = 119266341
# isolate 1 cluster
cl = 1
cl_idx = df2.index[pd.Series(df2.cluster == cl)].tolist()
previous_value = None
line_string = "{}\t{}\t{}\t{:g}\n"
# Intended output location. The lines are only printed for now, so the file is
# deliberately not opened: an unused write handle would truncate the file and
# never be closed.
test_cl1_path = "/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/downstream/ESC_NPC_clustering/LSA_clusters_split/scdt_test/test_cl1.bedgraph"

for tileIndex in range(coverage.shape[0]):
    tileCoverage = coverage[tileIndex, :]
    value = np.sum(tileCoverage[cl_idx])
    if previous_value is None:
        writeStart = start + tileIndex * binLength
        writeEnd = min(writeStart + binLength, end)
        previous_value = value
    elif previous_value == value:
        writeEnd = min(writeEnd + binLength, end)
    else:
        if not np.isnan(previous_value):
            print(line_string.format(chrom, writeStart, writeEnd, previous_value))
        # Advance to the new interval whether or not the old one was printed;
        # nesting this under the NaN check would freeze the state on a NaN run.
        previous_value = value
        writeStart = writeEnd
        writeEnd = min(writeStart + binLength, end)

# write remaining value if not a nan
if previous_value is not None and writeStart != end and not np.isnan(previous_value):
    print(line_string.format(chrom, writeStart, end, previous_value))

In [None]:
coverage.shape[0]

In [None]:
# For every tile, print the number of selected columns and their summed coverage.
for tileIndex in range(coverage.shape[0]):
    cluster_cov = coverage[tileIndex, cl_idx]
    print(len(cluster_cov), np.sum(cluster_cov), sep="\n")

In [44]:
# Output settings for the per-cluster tracks written in the next cell:
# 'bedgraph' concatenates the temp files as text, any other value takes the
# bigWig branch.
# NOTE(review): `format` shadows the Python builtin of the same name for the
# rest of the session; renaming it requires changing the next cell as well.
format = 'bedgraph'
out_file_prefix = '/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/downstream/ESC_NPC_clustering/LSA_clusters_split/scdt_test/test'

In [45]:
# Merge the per-chunk temp files of every cluster into one track per cluster.
clusters = groupInfo.cluster.unique().tolist()
for cl in clusters:
    # Barcodes without a cluster assignment (NaN) have no temp files.
    # pd.isnull also tolerates non-float cluster labels.
    if pd.isnull(cl):
        continue
    if format == 'bedgraph':
        # Context managers close both handles even if a copy fails midway.
        with open("{}_{}.bedgraph".format(out_file_prefix, cl), 'wb') as out_file:
            # `res` is expected to be sorted already, so appending the chunks
            # in list order yields a coordinate-sorted bedGraph.
            for r in res:
                chunk_path = r[3].get(cl)
                if chunk_path:
                    with open(chunk_path, 'rb') as _foo:
                        shutil.copyfileobj(_foo, out_file)
                    #os.remove(r[3][cl])
    else:
        bedGraphToBigWig(chrom_names_and_size, [x[3][cl] for x in res],
                         "{}_{}.bw".format(out_file_prefix, cl))