In [1]:
import pysam
import os

from collections import Counter
from tqdm import tnrange, tqdm_notebook

In [5]:
fn = '/projects/ps-yeolab5/encode/analysis/encode_master/204_01_RBFOX2.merged.r2.bam'

# Get reads mapped to a region

In [6]:
bam = pysam.AlignmentFile(fn, 'rb')
for read in bam.fetch('chr1', 10000000, 10005000):
    print(read)

AATAT:SN1001:449:HGTN3ADXX:2:2106:9338:2103	147	0	10000029	255	43M	0	10000026	43	AGCTGGGCATGGTGGTGTGTGCTTGTAGTCCCAGCTACTTGGC	array('B', [40, 40, 40, 37, 40, 40, 40, 40, 40, 40, 40, 40, 40, 37, 37, 37, 37, 40, 40, 37, 37, 40, 40, 40, 40, 40, 40, 40, 37, 37, 37, 37, 37, 40, 40, 37, 37, 37, 37, 37, 37, 37, 37])	[('NH', 1), ('HI', 1), ('AS', 79), ('nM', 1), ('NM', 0), ('MD', '43'), ('jM', array('b', [-1])), ('jI', array('i', [-1])), ('RG', 'foo')]
CTGAA:SN1001:449:HGTN3ADXX:1:1107:7039:81143	147	0	10002791	255	43M	0	10002786	43	ATCAGGACTACAAAAGGCTCAAAGTTCAAACCACCTCGGGGTG	array('B', [40, 40, 37, 40, 40, 37, 40, 40, 37, 37, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 37, 37, 37, 40, 37, 40, 37, 37, 37, 33, 40, 37, 37, 37, 37, 37, 37, 37, 37, 37])	[('NH', 1), ('HI', 1), ('AS', 81), ('nM', 0), ('NM', 0), ('MD', '43'), ('jM', array('b', [-1])), ('jI', array('i', [-1])), ('RG', 'foo')]
CGACC:SN1001:449:HGTN3ADXX:1:1105:11551:78917	147	0	10002941	255	42M	0	10002916	42	GGGCGGGGGCACTTTCCCCC

# Add a tag (such as RG) to read tags

In [7]:
bam = pysam.AlignmentFile(fn, 'rb')

for read in bam.fetch():
    name = read.qname
    tags = read.tags
    tags += [('BY', 'COOLNEWTAG')]
    print(tags)
    break

[('NH', 1), ('HI', 1), ('AS', 80), ('nM', 0), ('NM', 0), ('MD', '43'), ('jM', array('b', [-1])), ('jI', array('i', [-1])), ('RG', 'foo'), ('BY', 'COOLNEWTAG')]


# Grab cell barcode counts from dropseqtools-processed BAM file
- These BAM files embed cell barcodes inside the 'XC' tag

In [11]:
fn = '/projects/ps-yeolab3/bay001/class_resources/permanent_data/dropseq_tutorial/star_gene_exon_tagged.bam'

In [14]:
def get_cell_barcode_counts(bamfile):
    '''
    Generates a dictionary of counts for each barcode observed in the XC tag
    :param bamfile: bamfile output from dropseqtools (star_gene_exon_tagged)
    :return: Dictionary of counts for each barcode detected
    '''
    final_bam = pysam.AlignmentFile(bamfile, "rb")
    cell_barcode_counts = Counter()

    for read in final_bam.fetch():
        barcode = dict(read.tags)['XC']
        cell_barcode_counts[barcode] +=1

    return cell_barcode_counts

a = get_cell_barcode_counts(fn)

In [15]:
%%timeit 
# profile the amount of time it takes to iterate over all reads
counts = get_cell_barcode_counts(fn)

1 loop, best of 3: 14 s per loop


# Grab cell barcode counts from dropseqtools-processed BAM file (multithreaded)
- These BAM files embed cell barcodes inside the 'XC' tag

In [16]:
import concurrent.futures
import math

In [17]:
def get_cell_barcode_counts((bamfile, chrom)):
    '''
    Generates a dictionary of counts for each barcode observed in the XC tag
    :param bamfile: bamfile output from dropseqtools (star_gene_exon_tagged)
    :return: Dictionary of counts for each barcode detected
    '''
    final_bam = pysam.AlignmentFile(bamfile, "rb")
    cell_barcode_counts = Counter()

    for read in final_bam.fetch(chrom):
        barcode = dict(read.tags)['XC']
        cell_barcode_counts[barcode] +=1

    return cell_barcode_counts



def get_cell_barcode_counts_mt(fn):
    chroms = []
    for chrom in range(1, 20):
        chr_chrom = "{}".format(chrom)
        chroms.append(chr_chrom)
    
    dicts = []
    args = ((fn, c) for c in chroms)
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for d in executor.map(get_cell_barcode_counts, args):
            dicts.append(d)
    return dicts

b = get_cell_barcode_counts_mt(fn)

In [18]:
%%timeit
d = get_cell_barcode_counts_mt(fn)

1 loop, best of 3: 2.52 s per loop


In [11]:
print(len(a.keys()), len(b.keys()))

AttributeError: 'list' object has no attribute 'keys'