## Goal

**Modify countReadsPerBin such that it takes the barcode (list) argument and returns an np.array of shape (bins, barcodes\*len(bam))**

In [1]:
import os
import time
import sys
import numpy as np
from scipy import sparse, io

# debug flag; it is set here but not read by any cell visible in this notebook
debug = 0
# silence all numpy floating-point warnings; the previous settings are kept
# so they can be restored later with np.seterr(**old_settings)
old_settings = np.seterr(all='ignore')
## own functions: add the current directory to the import path so that the
## `import scReadCounter` below can find a module located next to the notebook
sys.path.append(".")
import scReadCounter as countR

In [None]:
import py2bit
## get the mm10 as 2bit (path is relative to the notebook's working directory)
genome = "mouse_ensembl_97.2bit"
# `tb` is read as a notebook-level global by checkMotifs further down.
# NOTE(review): the second positional argument is py2bit's storeMasked flag; with
# masking stored, soft-masked bases presumably come back in lower case — confirm
# before comparing fetched sequence against upper-case motifs.
tb = py2bit.open(genome, True)

In [2]:
# Deduplicated BAM files, one per library (i1-i4 in the file names).
# NOTE(review): the order presumably has to match `labels` in the config cell,
# because the column names are later built label by label — verify.
bamfiles = [ '/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/fESC_CTCF-K4me3_26-02-2020/dedup_bam/HVG-ChIC-fESC-CTCF-i1-26-02-20.bam',
             '/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/fESC_CTCF-K4me3_26-02-2020/dedup_bam/HVG-ChIC-fESC-CTCF-i2-26-02-20.bam',
             '/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/fESC_CTCF-K4me3_26-02-2020/dedup_bam/HVG-ChIC-fESC-CTCF-i3-26-02-20.bam',
             '/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/fESC_CTCF-K4me3_26-02-2020/dedup_bam/HVG-ChIC-fESC-CTCF-i4-26-02-20.bam'
           ]

In [3]:
whitelist = '/hpc/hub_oudenaarden/vbhardwaj/annotations/cell_barcodes_inhouse/maya_384NLA.bc'
# load the whitelist in one go, then split it into a list with one entry per line
with open(whitelist, 'r') as handle:
    whitelist_text = handle.read()
barcodes = whitelist_text.splitlines()

In [4]:
# bin width passed to the counter as binLength (presumably in bp)
binSize = 100000
distanceBetweenBins = 0
numberOfProcessors = 20
# NOTE(review): genomeChunkSize is not passed to any call visible in this notebook
genomeChunkSize = None
# passed to the counter as stepSize; equals binSize while distanceBetweenBins is 0
stepsize = binSize + distanceBetweenBins
bed_file = None#"/hpc/hub_oudenaarden/vbhardwaj/annotations/mm10_gencode23/custom_annotation/gencode.vM23.level1_only.gtf"
# NOTE(review): `blacklist` is defined here, but the CountReadsPerBin calls visible
# in this notebook pass blackListFileName=None — confirm whether it should be used
blacklist = "/hpc/hub_oudenaarden/vbhardwaj/annotations/blacklists/mm10-blacklist.v2.bed"

# one label per BAM file; used as the prefix of each barcode in the column names
labels = ['i1', 'i2', 'i3', 'i4']
outFilePrefix = "test_counts"

# output files: the count matrix plus its row and column names
mtxFile = outFilePrefix + ".counts.mtx"
rowNamesFile = outFilePrefix + ".rownames.txt"
colNamesFile = outFilePrefix + ".colnames.txt"

In [None]:
# Configure the barcode-aware read counter on the four BAM files, using
# fixed-size bins (binLength/stepSize) and no motif filter.
c = countR.CountReadsPerBin(
        bamfiles,
        binLength=binSize,
        stepSize=stepsize,
        barcodes=barcodes,
        motifFilter=None,
        # BAM tag holding the cell barcode
        tagName='BC',
        numberOfSamples=None,
        numberOfProcessors=numberOfProcessors,
        verbose=False,
        region=None,
        bedFile=bed_file,
        # NOTE(review): the `blacklist` path from the config cell is not used here;
        # None is passed — confirm this is intended
        blackListFileName=None,
        extendReads=None,
        minMappingQuality=10,
        ignoreDuplicates=True,
        center_read=False,
        samFlag_include=None,
        samFlag_exclude=None,
        minFragmentLength=1,
        maxFragmentLength=2000,
        zerosToNans=False,
        # NOTE(review): receives rowNamesFile — presumably the counter writes the bin
        # coordinates (matrix row names) there; verify in scReadCounter
        out_file_for_raw_data=rowNamesFile)

In [None]:
num_reads_per_bin = c.run(allArgs=None)

In [None]:
# NOTE(review): axis=0 sums down each column, so `rowSums` actually holds one total
# per column (per barcode, if the matrix is bins x barcodes as the goal states).
rowSums = np.sum(num_reads_per_bin, axis=0)
# axis=1 sums across each row, so `colSums` holds one total per row
# (per bin under the same assumption). The two names look swapped — confirm.
colSums = np.sum(num_reads_per_bin, axis=1)

In [None]:
np.sum(num_reads_per_bin)

In [None]:
import seaborn as sns

In [None]:
%matplotlib inline
sns.distplot(rowSums)

In [None]:
sns.distplot(colSums)

In [None]:
## add labels to barcodes and write
# column names are label-major: every barcode for the first label, then every
# barcode for the next label, and so on
newlabels = ["{}_{}".format(a, b) for a in labels for b in barcodes ]

# the context manager closes the file even if one of the writes fails
with open(colNamesFile, "w") as f:
    f.write("\n".join(newlabels))
    f.write("\n")

## write the matrix as .mtx
sp = sparse.csr_matrix(num_reads_per_bin)
io.mmwrite(mtxFile, sp, field="integer")

### Remaining updates

 - Filter reads based on sequence in read and corresponding genome sequence.
 - pseudo-bulk bigwigs: collapse barcodes into pre-defined sets and perform CPM normalization taking cell number into account

In [None]:
## Filter reads based on motif presence
## first let's see what info we get from a read
from deeptools import bamHandler

Criteria for scChIC. Forward-aligned read: the read starts with 'A' and the reference base immediately upstream is 'T'.

R1 ........A------->

----------TA------------\ Ref (+)

Reverse-aligned read: the read ends (in reference orientation) with 'T' and the reference base immediately downstream is 'A'.

<-------T....... R1

--------TA------------\ Ref (+)


In [None]:
bam = bamHandler.openBam(bamfiles[0], returnStats=False)

In [None]:
# materialise every alignment overlapping 1:2,000,000-5,000,000
reads = list(bam.fetch('1', 2000000, 5000000))
# show only a preview: rendering the full list for a 3 Mb window floods the output
reads[:5]

In [None]:
cigarlist = []
# Inspect at most the first 500 reads. Slicing (instead of indexing 0..499)
# avoids an IndexError when the region holds fewer reads, and keeps
# cigarlist[k] aligned with reads[k] for the index lookups in later cells.
for read in reads[:500]:
    print(read.is_reverse)
    cigarlist.append(read.cigarstring)
    print(read.get_forward_sequence())
    print(read.get_reference_sequence())
    if read.is_reverse:
        print(read.get_reference_positions()[0:2])
    else:
        # NOTE(review): [-3:-1] yields the third- and second-to-last aligned
        # positions, not the last two — confirm this is intended
        print(read.get_reference_positions()[-3:-1])

In [None]:
import re


In [None]:
# flag CIGAR strings that contain an insertion (I) or a soft clip (S);
# each entry is a match object when found and None otherwise
out = [re.search("I|S", x) for x in cigarlist ]

In [None]:
# indices of the flagged entries (truthy match objects) within `out`
pos = [i for i, x in enumerate(out) if x]

In [None]:
# pick the reads at the flagged indices; this relies on cigarlist having been
# filled in the same order as `reads`
specialReads = [reads[i] for i in  pos]

In [None]:
bam.get_reference_name(read.reference_id)

In [None]:
t = None
t2 = False
t3 = ['Some', 'Value']
if t3:
    print("yes")
else:
     print("No")   

In [None]:
len(t3)

In [None]:
def checkMotifs(read, readMotif, refMotif, bamHandle=None, genome=None):
    """Check whether a read and the reference next to it carry the expected motifs.

    The read motif is compared with the first ``len(readMotif)`` bases of the
    read in its original sequencing orientation.  The reference motif is
    compared with the forward-strand reference sequence anchored on the read:

    * forward read: the window ends on the first aligned base and extends
      ``len(refMotif) - 1`` bases upstream of it;
    * reverse read: the window starts on the last aligned base and extends
      ``len(refMotif) - 1`` bases downstream of it.

    Parameters
    ----------
    read
        Aligned read exposing ``get_forward_sequence()``, ``is_reverse``,
        ``reference_id``, ``reference_start`` and ``reference_end``.
    readMotif : str
        Sequence the read has to start with.
    refMotif : str
        Forward-strand reference sequence expected in the window described
        above; compared case-insensitively.
    bamHandle : optional
        Object with ``get_reference_name(reference_id)``.  Defaults to the
        notebook-level ``bam`` so existing three-argument calls keep working.
    genome : optional
        Object with ``sequence(chrom, start, end)``.  Defaults to the
        notebook-level ``tb``.

    Returns
    -------
    bool
        True when both motifs match, False otherwise (including when the
        reference window would start before the beginning of the chromosome).
    """
    # Fall back to the notebook-level handles only when none were passed in,
    # so the function no longer depends on hidden state when called explicitly.
    if bamHandle is None:
        bamHandle = bam
    if genome is None:
        genome = tb

    # cheap check first: skip the genome lookup when the read itself does not match
    if read.get_forward_sequence()[0:len(readMotif)] != readMotif:
        return False

    chrom = bamHandle.get_reference_name(read.reference_id)
    flank = len(refMotif) - 1

    if read.is_reverse:
        # for reverse reads the ref motif begins at the read end and runs downstream
        start = read.reference_end - 1
        end = read.reference_end + flank
    else:
        # for forward reads the ref motif begins upstream and ends at the read start
        start = read.reference_start - flank
        end = read.reference_start + 1

    # a window that begins before the chromosome start cannot contain the motif,
    # and a negative start is not a valid coordinate to request
    if start < 0:
        return False

    ref_motif = genome.sequence(chrom, start, end)
    # compare case-insensitively: a 2bit file opened with masking stored can
    # return soft-masked bases in lower case
    return ref_motif.upper() == refMotif.upper()

**Looks like it's working.** Next, test on a specific BAM file region.

In [None]:
# chr5:44,095,633-44,096,163
testbam = '/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/02_scChIC_test/10_singlecells_k4me3-ctcf_bonemarrow/dedup_bam/fBM-pCTCF-k4me3-i1.bam'
# NOTE: this rebinds the notebook-level `bam`, which checkMotifs uses to
# resolve chromosome names
bam = bamHandler.openBam(testbam, returnStats=False)
# collect every alignment overlapping the test window (also replaces the earlier `reads`)
reads = []
for read in bam.fetch('5', 44095633, 44096163):
    reads.append(read)

In [None]:
# there are 8 reads here
reads

![](./test_region_chic.png)

- The two forward reads have 'TA' and 'AA' in the genome.
- All 6 reverse reads have a 'TA' overhang.

In [None]:
# now test for A/T and TA motifs
# collect the per-read verdicts so the cell displays them; a bare loop
# discards the return values and shows nothing
[checkMotifs(r, 'A', 'TA') for r in reads]

**Perfect!** What happens if I ask for 'A' in read and 'AGATAA' in reference?

Only 2nd read should pop up

In [None]:
# one verdict per read, in the same order as `reads`; a bare loop would
# discard the return values and show nothing
[checkMotifs(r, 'A', 'AGATAA') for r in reads]

In [None]:
## if I ask for AGT in read?
# one verdict per read, in the same order as `reads`
[checkMotifs(r, 'AGT', 'AGATAA') for r in reads]

Check some motifs on reverse-read

In [None]:
## only 3 reverse reads should pop up
# one verdict per read, in the same order as `reads`
[checkMotifs(r, 'A', 'TAAA') for r in reads]

In [None]:
## only 1 reverse read should pop up
# one verdict per read, in the same order as `reads`
[checkMotifs(r, 'A', 'TAAG') for r in reads]

In [None]:
## only 1 reverse read should pop up
# one verdict per read, in the same order as `reads`
[checkMotifs(r, 'ACTG', 'TAAG') for r in reads]

**All good.. let's lock it!**

Implemented. Now let's see if the output counts are different using TA filter

In [10]:
bamfiles = ['/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/02_scChIC_test/10_singlecells_k4me3-ctcf_bonemarrow/dedup_bam/fBM-pCTCF-k4me3-i1.bam']

In [11]:
bed_file = "test_region.bed"

In [12]:
# Same counter as before, but restricted to the regions in `bed_file`
# (binLength/stepSize are None) and with the motif filter switched on.
c_filt = countR.CountReadsPerBin(
        bamfiles,
        binLength=None,
        stepSize=None,
        barcodes=barcodes,
        tagName='BC',
        # presumably [read motif, reference motif], mirroring checkMotifs(r, 'A', 'TA')
        # — verify against scReadCounter
        motifFilter=['A', 'TA'],
        # same 2bit file that is opened as `tb` near the top of the notebook
        genome2bit="mouse_ensembl_97.2bit",
        numberOfSamples=None,
        numberOfProcessors=10,
        verbose=False,
        region=None,
        bedFile=bed_file,
        blackListFileName=None,
        extendReads=None,
        minMappingQuality=10,
        ignoreDuplicates=True,
        center_read=False,
        samFlag_include=None,
        samFlag_exclude=None,
        minFragmentLength=1,
        maxFragmentLength=2000,
        zerosToNans=False,
        # NOTE(review): reuses rowNamesFile from the config cell, so running this
        # presumably overwrites the row names written by the unfiltered run — confirm
        out_file_for_raw_data=rowNamesFile)

In [13]:
num_reads_filt = c_filt.run(allArgs=None)

**It works**

### Remaining updates
 - low complexity filter: remove reads which are very low complexity
 - Stats: Store and output cell-level stats for all filters
 - pseudo-bulk bigwigs: collapse barcodes into pre-defined sets and perform CPM normalization taking cell number into account