In [22]:
import pysam
from collections import defaultdict, Counter
from tqdm import tnrange, tqdm_notebook
import os
import pandas as pd

In [2]:
def get_num_mapped(bamfile):
    '''
    Sums the mapped-read column of the ``pysam.idxstats`` report for a bamfile.

    Each report row is expected to hold four tab-separated fields
    (chrom, chrom_len, num_mapped, num_unmapped); only num_mapped is summed.
    :param bamfile: path handed straight to ``pysam.idxstats``
    :return: integer total of the num_mapped field over all well-formed rows
    '''
    total_mapped = 0
    for stat in pysam.idxstats(bamfile).split('\n'):
        if not stat.strip():
            # Splitting on '\n' leaves an empty string after the final newline;
            # it is not a malformed row, so skip it without printing anything.
            continue
        try:
            chrom, chrom_len, num_mapped, num_unmapped = stat.split('\t')
            total_mapped += int(num_mapped)
        except ValueError:
            # Wrong number of fields or a non-integer count: show the offending
            # row and keep going so one bad line does not abort the whole sum.
            print(stat)
    return total_mapped

In [7]:
def get_cell_barcode_counts(bamfile):
    '''
    Generates a dictionary of counts for each barcode observed in the XC tag
    :param bamfile: bamfile output from dropseqtools (star_gene_exon_tagged)
    :return: Counter mapping each XC tag value to the number of fetched reads
        carrying it
    :raises KeyError: if a fetched read has no XC tag
    '''
    cell_barcode_counts = Counter()
    # The context manager closes the BAM handle even if a read lacks the tag.
    with pysam.AlignmentFile(bamfile, "rb") as final_bam:
        for read in final_bam.fetch():
            # get_tag looks up one tag directly instead of building a dict of
            # every tag on every read.
            cell_barcode_counts[read.get_tag('XC')] += 1
    return cell_barcode_counts

In [8]:
def get_cell_barcode_counts_brian(bamfile):
    '''
    Collects, for each barcode observed in the XC tag, the XM tag (UMI) of
    every read carrying that barcode. Per-barcode read counts are the lengths
    of the returned lists.
    :param bamfile: bamfile output from dropseqtools (star_gene_exon_tagged)
    :return: defaultdict(list) mapping each XC value to the list of XM values
        seen with it, one entry per fetched read (duplicates kept)
    :raises KeyError: if a fetched read is missing the XC or XM tag
    '''
    all_barcodes = defaultdict(list)
    # The context manager closes the BAM handle even if a tag lookup fails.
    with pysam.AlignmentFile(bamfile, "rb") as samfile:
        for read in samfile.fetch():
            barcode = read.get_tag('XC')
            umi = read.get_tag('XM')
            all_barcodes[barcode].append(umi)
    return all_barcodes

In [57]:
# Input BAM passed to the barcode-counting functions; the path is split across
# two adjacent literals purely for readability (the value is unchanged).
bamfile = (
    '/projects/ps-yeolab3/bay001/for_alexc/barnyard_troubleshooting_inputs/'
    'from_alex/results/WT_Organoid_D71_SSS_star_gene_exon_tagged.bam'
)

In [55]:
%%timeit
cell_barcode_counts = get_cell_barcode_counts(bamfile)

1 loop, best of 3: 1min 58s per loop


In [58]:
# Despite the function's name this is not a table of counts: it maps each XC
# barcode to the list of XM values seen with it, so a barcode's read count is
# len(all_barcodes[barcode]).
all_barcodes = get_cell_barcode_counts_brian(bamfile)

In [47]:
# Dump the Counter of per-barcode read counts as "<barcode>\t<count>" lines.
# NOTE(review): relies on cell_barcode_counts already existing in the kernel.
output_dir = '/home/bay001/scratch/'
out_file = os.path.join(output_dir, 'emily.txt')
with open(out_file, 'w') as f:
    # .items() works on both Python 2 and 3; .iteritems() is Python-2 only.
    for key, value in cell_barcode_counts.items():
        f.write('{}\t{}\n'.format(str(key), value))

In [59]:
# Dump per-barcode read counts as "<barcode>\t<count>" lines; the count is the
# length of the list collected for that barcode in all_barcodes.
out_file = os.path.join(output_dir, 'brian.txt')
with open(out_file, 'w') as f:
    # .items() works on both Python 2 and 3; .iteritems() is Python-2 only.
    for key, value in all_barcodes.items():
        f.write('{}\t{}\n'.format(str(key), len(value)))

In [None]:
# NOTE(review): this never-executed cell repeats the preceding cell verbatim
# and rewrites the same brian.txt; it is a candidate for deletion.
out_file = os.path.join(output_dir, 'brian.txt')
with open(out_file, 'w') as f:
    # .items() works on both Python 2 and 3; .iteritems() is Python-2 only.
    for key, value in all_barcodes.items():
        f.write('{}\t{}\n'.format(str(key), len(value)))

In [60]:
# Load the headerless, tab-separated barcode/count table from whichever path
# out_file currently points at.
df = pd.read_csv(out_file, sep='\t', names=['barcode', 'from_bam_count'])
# Shape of the subset of barcodes with more than 2000 reads in the BAM.
df.loc[df['from_bam_count'] > 2000].shape

(2549, 2)

In [50]:
# Shape of df after removing rows that are identical in every column
# (drop_duplicates compares all columns when no subset is given).
df.drop_duplicates().shape

(1227648, 2)

In [51]:
# Per-barcode counts from the fastq-side demultiplexing output: a headerless,
# tab-separated file, so the column names are supplied here.
counts_from_fastq = pd.read_csv(
    '/projects/ps-yeolab3/bay001/for_alexc/barnyard_troubleshooting_outputs/naive_demux.txt',
    sep='\t',
    names=['barcode', 'from_fastq_count'],
)
# Preview the first five rows.
counts_from_fastq.head(5)

Unnamed: 0,barcode,from_fastq_count
0,ATTACATCCGGT,2
1,TTTGGAGCAAAA,2
2,CGCCGCGCACCC,1
3,CAGGGGTCCATT,5
4,ATAAAATTATCA,1


In [52]:
# Shape of the subset of barcodes with more than 2000 reads in the fastq-side
# counts; same threshold as the BAM-side filter so the two are comparable.
counts_from_fastq[counts_from_fastq['from_fastq_count']>2000].shape

(2181, 2)

In [53]:
# Outer join on barcode keeps barcodes present in only one source; the count
# column of the other source is NaN for those rows.
merged = counts_from_fastq.merge(df, how='outer', on='barcode')

Unnamed: 0,barcode,from_fastq_count,from_bam_count
0,ATTACATCCGGT,2,2.0
1,TTTGGAGCAAAA,2,2.0
2,CGCCGCGCACCC,1,
3,CAGGGGTCCATT,5,4.0
4,ATAAAATTATCA,1,
5,ATCTCGTTACAT,2,2.0
6,CAGGGGTCCATG,1,1.0
7,TATTAGAACAAA,1,
8,CCTAATACTGCC,4,2.0
9,CCTAATACTGCA,42,30.0
