In [0]:
# Root directory that later cells search (via `find` and `os.walk`) for fastq files.
# NOTE: hard-coded absolute path with a trailing slash; edit before running elsewhere.
file_dir = "/data7/eckertlab/projects/ethan/HiSeq_140603/"

In [0]:
from IPython.parallel import Client
# Connect to the IPython.parallel cluster running under the "sge" profile
# (presumably a Sun Grid Engine launcher -- confirm against the profile config).
rc = Client(profile="sge")
# Direct view over every engine: used to push names/functions to all of them.
dview = rc[:] #all nodes
# Load-balanced view: used to hand individual jobs to whichever engine is free.
lview = rc.load_balanced_view() # for load balancing

In [0]:
# Sanity check: report how many engine ids the client currently knows about.
print "%d nodes in the cluster" % len(rc.ids)
import socket

In [0]:
from IPython.display import FileLink, FileLinks

In [0]:
with dview.sync_imports():
    import stopwatch
    import numpy
    import numpy as np
    import scipy
    import pandas
    import gzip
    import os
    import tempfile
    import shutil
    import socket
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    from collections import deque, defaultdict
    import multiprocessing 
    from multiprocessing import Pool, Manager
    import traceback
    from itertools import izip

In [0]:
# List every regular file under file_dir whose path ends in "processed.fastq"
# and does not contain "test" anywhere in the path.
# NOTE: the "." in the grep pattern is unescaped, so it matches any character
# (e.g. "processed_fastq" would also pass).
processed = !find $file_dir -type f | grep processed.fastq\$ | grep -v test
# Sort so the listing order is stable.
processed = sorted(processed)
processed

In [0]:
# Single-ended samples: paths containing "R1" for which the same path with
# every "R1" replaced by "R2" does not exist on disk.
singles = [
    r1_path
    for r1_path in processed
    if "R1" in r1_path and not os.path.exists(r1_path.replace("R1", "R2"))
]
singles

In [0]:
# Map each single-end fastq path to the barcode sheet that sits next to it.
# The sheet is named "ethan_library<N>.csv", where <N> is the last character of
# the first "_"-delimited token of the fastq file name (so only a single
# character is used as the library number).
barcode_map = {}
for f in singles:
    base = os.path.basename(f)
    num = base.split("_")[0][-1]
    barcode = os.path.join(os.path.dirname(f), "ethan_library%s.csv" % num)
    barcode_map[f] = barcode
# Engines need the mapping because get_barcodes() looks it up by name.
dview['barcode_map'] = barcode_map
# Keep the display expression last: IPython only renders the final expression
# of a cell, so a bare name placed before the push is never shown.
barcode_map

In [0]:
def format_fastq_tuple(title, seq, qual):
    """Render one read as a four-line FASTQ record.

    title -- header text, written after the leading "@"
    seq   -- sequence line
    qual  -- quality line; must be the same length as seq

    Returns the record as a string ending in a newline.
    Raises AssertionError when seq and qual differ in length.
    """
    assert len(qual) == len(seq)
    record_template = "@%s\n%s\n+\n%s\n"
    return record_template % (title, seq, qual)
# Push the formatter to every engine; demult() calls it by name when it runs remotely.
dview['format_fastq_tuple'] = format_fastq_tuple

In [0]:
def get_writers(barcodes):
    """Open one persistent temporary file per barcode.

    barcodes -- mapping of barcode sequence -> sample name

    Returns a defaultdict (created without a default factory, so it behaves
    like a plain dict) mapping barcode -> [sample name, open temp file].
    The temp files are created with delete=False, so the caller is
    responsible for closing and removing them.
    """
    entries = (
        (barcode, [sample, tempfile.NamedTemporaryFile(delete=False)])
        for barcode, sample in barcodes.items()
    )
    return defaultdict(None, entries)

def get_barcodes(f):
    """Read the barcode sheet associated with fastq file `f`.

    The sheet path is looked up in the module-level `barcode_map`. The first
    line is treated as a header and skipped; remaining lines are split on
    tabs, and lines that yield fewer than two fields are ignored.

    For each data line:
      * column 0 has spaces removed and commas turned into "-" when it
        contains a comma;
      * column 4 is upper-cased, the literal
        "CTCTTTCCCTACACGACGCTCTTCCGATCT" is removed, and the final character
        is dropped; what remains is the barcode
        (presumably the removed literal is the adapter prefix -- confirm);
      * the barcode maps to "<column 1>_<column 0>".

    Returns (bcs, bc_lens): the barcode -> name mapping and the set of
    distinct barcode lengths seen.
    Raises IndexError if a data line has at least two but fewer than five fields.
    """
    bcs = defaultdict()
    bc_lens = set()
    b = os.path.join(os.path.dirname(f), barcode_map[f])
    print(b)
    # Context manager so the sheet is closed even if a line fails to parse.
    with open(b) as h:
        h.readline()  # skip the header line
        for line in h:
            line = line.strip()
            data = line.split("\t")
            if len(data) > 1: #skip blank lines
                if "," in data[0]:
                    data[0] = data[0].replace(" ", "").replace(",", "-")
                bc = data[4].upper().replace("CTCTTTCCCTACACGACGCTCTTCCGATCT", "")[:-1]
                bc_lens.add(len(bc))
                bcs[bc] = data[1] + "_" + data[0]

    return bcs, bc_lens

def check_barcodes(barcodes):
    """Assert that no barcode is a prefix of a different barcode.

    barcodes -- mapping keyed by barcode sequence (values are ignored)

    Raises AssertionError on the first offending pair; returns None otherwise.
    """
    all_codes = list(barcodes.keys())
    for prefix in all_codes:
        for candidate in all_codes:
            if candidate == prefix:
                continue
            assert not candidate.startswith(prefix)
                
def copy_file(src, dst):
    """Copy the file at `src` to `dst` with shutil.copy; returns None.

    Defined as a plain module-level function so it can be handed to
    Pool.apply_async by demult().
    """
    shutil.copy(src=src, dst=dst)
    return None
                
def demult(f):
    """Demultiplex fastq file `f` into one fastq per barcode found.

    Steps:
      1. Load the barcode sheet for `f` (get_barcodes) and verify that no
         barcode is a prefix of another (check_barcodes).
      2. Stream the reads; for each read, try every known barcode length and,
         on the first prefix that is a known barcode, write the read with the
         barcode trimmed from both sequence and quality to that barcode's
         temp file. Reads matching no barcode are not written anywhere.
      3. Copy each non-empty temp file to "<sample>_<barcode>.fastq" in the
         directory of `f`, using a multiprocessing Pool, then delete all
         temp files.

    Progress and the final timing/output list are printed; nothing is
    returned.

    Raises whatever a failed copy raised. This is checked before the temp
    files are removed, so demultiplexed data is not silently discarded when
    a copy fails.
    """
    print(f)
    hostname = socket.gethostname()
    timer = stopwatch.Timer()
    out_dir = os.path.dirname(f)
    barcodes, barcode_lens = get_barcodes(f)
    check_barcodes(barcodes)
    writers = get_writers(barcodes)
    count = 0
    found = defaultdict(int)
    # Context manager so the input handle is released once streaming ends.
    with open(f) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            for bc_len in barcode_lens:
                bc = seq[0:bc_len]
                if bc in barcodes:
                    found[bc] += 1
                    w = writers[bc][1]
                    w.write(format_fastq_tuple(title, seq[bc_len:], qual[bc_len:]))
                    # check_barcodes guarantees at most one length can match
                    break

            count += 1

            if count % 10000 == 0:
                print("%s %s %s" % (hostname, f, count))

    print("%s %s %s" % (hostname, f, "copying tmp files"))

    pool = Pool()
    out_files = []
    copy_jobs = []
    for k, v in writers.items():
        v[1].close()
        if k in found:
            out_file = os.path.join(out_dir, "%s_%s.fastq" % (v[0], k))
            out_files.append(out_file)
            copy_jobs.append(pool.apply_async(copy_file, (v[1].name, out_file)))
    pool.close()
    pool.join()
    # apply_async stores worker exceptions in the result object; surface them
    # here, before the temp files (the only copy of the data) are deleted.
    for job in copy_jobs:
        job.get()
    for k, v in writers.items():
        os.remove(v[1].name)
    timer.stop()
    print("%s %s" % (timer.elapsed, out_files))
    
# Publish the demultiplexing entry point and the helpers it calls to every
# engine, each under its own function name.
for remote_fn in (demult, check_barcodes, get_writers, get_barcodes, copy_file):
    dview[remote_fn.__name__] = remote_fn

In [0]:
# Show the single-end inputs once more before jobs are submitted for them.
singles

In [0]:
# Submit one load-balanced demult job per single-end file and keep the
# AsyncResult handles for later polling.
# (For local debugging, call demult(single) directly instead.)
results = [lview.apply_async(demult, single) for single in singles]

In [0]:
# Poll the demult jobs: print whether each one is finished together with the
# last complete line of its captured stdout, and the result once it is ready.
for r in results:
    stdout_lines = r.metadata.stdout.split("\n")
    # Output normally ends with "\n", so the last complete line is at [-2].
    # A job that has printed nothing yet yields [""], where [-2] would raise.
    last_line = stdout_lines[-2] if len(stdout_lines) >= 2 else ""
    print("%s %s" % (r.ready(), last_line))
    if r.ready():
        # .r returns the job's result (re-raising a remote failure, if any)
        print(r.r)

In [0]:
# Directory whose fastq files are counted below; currently the same tree that
# was demultiplexed above. The commented-out line is an earlier location.
# NOTE: file_dir ends with "/", so os.path.basename(demult_dir) is "".
# demult_dir = "/home/cfriedline/eckertlab/bccl.csbc.vcu.edu/internal/Eckert/HiSeq_130529"
demult_dir = file_dir

In [0]:
def count_fastq_reads(f):
    """Count the reads in fastq file `f`.

    Counts non-empty lines (the same lines `grep -c .` would count) and
    floor-divides by four, the number of lines per fastq record.

    Implemented in plain Python rather than via a `!grep` shell magic, so the
    path is never interpolated into a shell command (paths containing spaces
    or shell metacharacters are safe) and the function does not require an
    IPython shell to run.

    Returns (f, read_count).
    Raises IOError/OSError if `f` cannot be opened.
    """
    non_empty = 0
    # Binary mode: no newline translation, so "\r\n" still counts as non-empty.
    with open(f, "rb") as handle:
        for line in handle:
            if line.rstrip(b"\n"):
                non_empty += 1
    return f, non_empty // 4
# Push the counter to every engine so it is defined there under the same name.
dview['count_fastq_reads'] = count_fastq_reads

In [0]:
# Submit one read-count job for every file under demult_dir whose name ends
# with "fastq" and does not contain "processed" (the "processed" files are
# the demultiplexing inputs selected earlier).
count_jobs = [
    lview.apply_async(count_fastq_reads, os.path.join(root, name))
    for root, _subdirs, names in os.walk(demult_dir)
    for name in names
    if name.endswith("fastq") and "processed" not in name
]

In [0]:
# Report how many count jobs have not finished yet.
working = sum(1 for job in count_jobs if not job.ready())
print("%d working" % working)

In [0]:
# Write a tab-separated "file<TAB>reads" table of all count results into
# demult_dir and link to it.
# normpath strips a trailing slash first: os.path.basename("/a/b/") is "",
# which would otherwise name the report just "_counts.txt".
counts_prefix = os.path.basename(os.path.normpath(demult_dir))
out_file = os.path.join(demult_dir, "%s_%s" % (counts_prefix, "counts.txt"))
with open(out_file, "w") as o:
    o.write("file\treads\n")
    for j in count_jobs:
        # j.r waits for that job's (path, count) result and re-raises remote errors
        o.write("%s\n" % "\t".join([str(x) for x in j.r]))
FileLink(out_file)