In [1]:
import pysam
import os

from collections import Counter
from tqdm import tnrange, tqdm_notebook

In [2]:
fn = '/projects/ps-yeolab3/bay001/reference_data/204_01_RBFOX2/from_core_pipeline/204_01_RBFOX2.merged.r2.bam'

# Add a tag (such as RG) to read tags

In [3]:
bam = pysam.AlignmentFile(fn, 'rb')

for read in bam.fetch():
    name = read.qname
    tags = read.tags
    tags += [('BY', 'COOLNEWTAG')]
    print(tags)
    break

[('NH', 1), ('HI', 1), ('AS', 80), ('nM', 0), ('NM', 0), ('MD', '43'), ('jM', array('b', [-1])), ('jI', array('i', [-1])), ('RG', 'foo'), ('BY', 'COOLNEWTAG')]


# Iterate over reads in BAM file

In [4]:
def get_cell_barcode_counts(bamfile):
    '''
    Generates a dictionary of counts for each barcode observed in the XC tag
    :param bamfile: bamfile output from dropseqtools (star_gene_exon_tagged)
    :return: Dictionary of counts for each barcode detected
    '''
    final_bam = pysam.AlignmentFile(bamfile, "rb")
    cell_barcode_counts = Counter()

    for read in final_bam.fetch():
        barcode = dict(read.tags)['MD']
        cell_barcode_counts[barcode] +=1

    return cell_barcode_counts

In [7]:
"""%%timeit 
counts = get_cell_barcode_counts1(fn)"""

'%%timeit \ncounts = get_cell_barcode_counts1(fn)'

In [8]:
import concurrent.futures
import math

In [9]:
def get_cell_barcode_counts((bamfile, chrom)):
    '''
    Generates a dictionary of counts for each barcode observed in the XC tag
    :param bamfile: bamfile output from dropseqtools (star_gene_exon_tagged)
    :return: Dictionary of counts for each barcode detected
    '''
    final_bam = pysam.AlignmentFile(bamfile, "rb")
    cell_barcode_counts = Counter()

    for read in final_bam.fetch(chrom):
        barcode = dict(read.tags)['XC']
        cell_barcode_counts[barcode] +=1

    return cell_barcode_counts



def get_cell_barcode_counts_mt():
    fn = '/home/bay001/projects/codebase/metadata/results_dir/macosko_SRR1853178_4980a38b35412fe9a7e92226b5552768f5a9eb9b/results/SRX907219_sample_01.tagged1-12.tagged13-20.filtered.trimmed_smart.polyA_filtered.STARAligned.out.namesorted.merged.TaggedGeneExon.cleaned.bam'

    chroms = []
    for chrom in range(1, 20):
        chr_chrom = "{}".format(chrom)
        chroms.append(chr_chrom)
    
    dicts = []
    args = ((fn, c) for c in chroms)
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for d in executor.map(get_cell_barcode_counts, args):
            dicts.append(d)
    return dicts



In [10]:
%%timeit

d = get_cell_barcode_counts_mt()

1 loop, best of 3: 3min 1s per loop


In [12]:
d = get_cell_barcode_counts_mt()
len(d.keys())

AttributeError: 'list' object has no attribute 'keys'