In [0]:
import os, sys

In [0]:
from IPython.parallel import Client

In [0]:
# Connect to the IPython.parallel cluster running under the "sge" profile.
rc = Client(profile="sge")
# View over all engines known to the client (slice `[:]`); used below to push
# the helper functions into each engine's namespace.
dview = rc[:]
# Load-balanced view; used below to submit the demultiplexing and shell jobs.
lview = rc.load_balanced_view()

In [0]:
# Display the size of the all-engines view (presumably the number of engines
# that came up -- a quick sanity check before submitting work).
len(dview)

In [0]:
# Working directory for this run. It is created on first use and made the
# current directory so the relative paths used in later cells (I1/, I3/,
# sample_counts.txt) resolve inside it.
file_dir = "/home/cfriedline/eckertlab/gypsy_indiv/masked"
if not os.path.exists(file_dir):
    # os.mkdir creates only the leaf directory; the parent must already exist.
    os.mkdir(file_dir)
os.chdir(file_dir)

In [0]:
# Input FASTQ files to demultiplex (one per lane, judging by the file names).
proc_files = ['/gpfs_fs/home/eckertlab/gypsy_indiv/HiSeq_140425/lane1_Undetermined_L001_R1_001.fastq_masked_34.fastq_dropped_0.50.fastq',
             '/gpfs_fs/home/eckertlab/gypsy_indiv/HiSeq_140425/lane2_Undetermined_L002_R1_001.fastq_masked_34.fastq_dropped_0.50.fastq']

In [0]:
# Display the input list for inspection.
proc_files

In [0]:
# Create the two sub-directories (relative to the current directory) that will
# each hold one barcode table and the reads demultiplexed against it.
!mkdir I1
!mkdir I3

In [0]:
# List the current directory (relies on IPython's `ls` alias).
ls

In [0]:
# Install each barcode table under the fixed file name "barcodes" inside its
# sub-directory; get_barcodes() looks for a file with that name next to the
# FASTQ file it is given.
!cp /home/cfriedline/ipynb/gypsy_moth/barcodes_i1.txt I1/barcodes
!cp /home/cfriedline/ipynb/gypsy_moth/barcodes_i3.txt I3/barcodes

In [0]:
def format_fastq_tuple(title, seq, qual):
    """Render one FASTQ record as a four-line, newline-terminated string.

    Args:
        title: Record title; written after the leading ``@``.
        seq: Sequence line.
        qual: Quality line; must be the same length as ``seq``.

    Returns:
        ``"@<title>\\n<seq>\\n+\\n<qual>\\n"``.

    Raises:
        AssertionError: If ``seq`` and ``qual`` differ in length.
    """
    assert len(seq) == len(qual)
    pieces = ["@", str(title), "\n", str(seq), "\n+\n", str(qual), "\n"]
    return "".join(pieces)

def get_writers(barcodes):
    """Open one temporary output file per barcode.

    Args:
        barcodes: Mapping of barcode sequence -> sample name.

    Returns:
        A mapping of barcode sequence -> ``[sample_name, temp_file]``, where
        ``temp_file`` is an open, writable ``tempfile.NamedTemporaryFile``.
        The files are created with ``delete=False``, so the caller is
        responsible for closing and removing them.
    """
    import tempfile
    from collections import defaultdict
    w = defaultdict()
    for b, name in list(barcodes.items()):
        # Open in text mode: the records written to these files are str, and
        # NamedTemporaryFile's default binary mode ("w+b") rejects str on
        # Python 3 with a TypeError.
        w[b] = [name, tempfile.NamedTemporaryFile(mode="w+", delete=False)]
    return w

def get_barcodes(f):
    """Load the barcode table that sits next to the FASTQ file ``f``.

    The table is the tab-separated file named ``barcodes`` in ``f``'s
    directory. Its first line is discarded; lines with fewer than two fields
    are skipped. For every remaining line:

    * field 0 has spaces removed and commas replaced by ``-`` when it
      contains a comma;
    * field 4 is upper-cased, the fixed adapter sequence
      ``CTCTTTCCCTACACGACGCTCTTCCGATCT`` is removed from it, and its final
      character is dropped -- the remainder is the barcode;
    * the barcode maps to ``"<field 1>_<field 0>"``.

    Args:
        f: Path of a FASTQ file; only its directory is used.

    Returns:
        ``(bcs, bc_lens)``: the barcode -> name mapping and the set of
        distinct barcode lengths.

    Raises:
        IndexError: If a non-skipped line has fewer than five fields.
        IOError/OSError: If the ``barcodes`` file cannot be opened.
    """
    from collections import defaultdict
    import os
    bcs = defaultdict()
    bc_lens = set()
    b = os.path.join(os.path.dirname(f), "barcodes")
    print(b)
    # Use a context manager so the table is closed even if parsing fails.
    with open(b) as h:
        h.readline()  # discard the first line of the table
        for line in h:
            data = line.strip().split("\t")
            if len(data) > 1: #skip blank lines
                if "," in data[0]:
                    data[0] = data[0].replace(" ", "").replace(",", "-")
                bc = data[4].upper().replace("CTCTTTCCCTACACGACGCTCTTCCGATCT", "")[:-1]
                bc_lens.add(len(bc))
                bcs[bc] = data[1] + "_" + data[0]
    return bcs, bc_lens

def check_barcodes(barcodes):
    """Assert that no barcode is a prefix of a different barcode.

    Args:
        barcodes: Mapping whose keys are barcode sequences.

    Raises:
        AssertionError: If some key starts with another, distinct key.
    """
    all_codes = list(barcodes.keys())
    for prefix in all_codes:
        for candidate in all_codes:
            if prefix == candidate:
                continue
            assert not candidate.startswith(prefix)
                
def copy_file(src, dst):
    """Copy the file at ``src`` to ``dst`` using ``shutil.copy``.

    Returns nothing; any error raised by ``shutil.copy`` propagates.
    """
    from shutil import copy
    copy(src, dst)
                
def demult(f):
    """Demultiplex the FASTQ file ``f`` into one FASTQ file per barcode.

    Barcodes come from the ``barcodes`` table next to ``f`` (``get_barcodes``)
    and are first checked so that none is a prefix of another
    (``check_barcodes``). Each read whose leading bases equal a known barcode
    is written -- with the barcode trimmed from both sequence and quality --
    to that barcode's temporary file. Reads matching no barcode are dropped.

    After the input is exhausted, the temporary file of every barcode that was
    seen at least once is copied, in a process pool, to
    ``<dirname(f)>/<name>_<barcode>.fastq``; then all temporary files are
    removed. Progress is printed every 10000 reads, and the elapsed time plus
    the list of output paths is printed at the end.

    Args:
        f: Path of the FASTQ file to demultiplex.

    Raises:
        AssertionError: If one barcode is a prefix of another.
        Exception: Any error raised while copying a temporary file is
            re-raised here; in that case the temporary files are left in
            place so that no demultiplexed data is lost.
    """
    print(f)
    import socket, stopwatch, os
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    from multiprocessing import Pool
    from collections import defaultdict
    hostname = socket.gethostname()
    timer = stopwatch.Timer()
    out_dir = os.path.dirname(f)
    barcodes, barcode_lens = get_barcodes(f)
    check_barcodes(barcodes)
    writers = get_writers(barcodes)
    count = 0
    found = defaultdict(int)
    # Context manager closes the input handle instead of leaking it.
    with open(f) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            # barcode_lens is a set, so the order tried is arbitrary; that is
            # safe because check_barcodes() guarantees no barcode is a prefix
            # of another, so at most one length can match.
            for l in barcode_lens:
                bc = seq[0:l]
                if bc in barcodes:
                    found[bc] += 1
                    w = writers[bc][1]
                    w.write(format_fastq_tuple(title, seq[l:], qual[l:]))
                    break

            count += 1

            if count % 10000 == 0:
                print(hostname, f, count)

    print(hostname, f, "copying tmp files")

    pool = Pool()
    out_files = []
    copies = []
    for k, v in list(writers.items()):
        # Close (and thereby flush) every temp file before it is copied.
        v[1].close()
        if k in found:
            out_file = os.path.join(out_dir, "%s_%s.fastq" % (v[0], k))
            out_files.append(out_file)
            copies.append(pool.apply_async(copy_file, (v[1].name, out_file)))
    pool.close()
    pool.join()
    # apply_async stores worker exceptions on its result object; get()
    # re-raises them. Check every copy *before* deleting the temp files so a
    # failed copy cannot silently destroy the only copy of the data.
    for pending in copies:
        pending.get()
    for k, v in list(writers.items()):
        os.remove(v[1].name)
    timer.stop()
    print(timer.elapsed, out_files)

# Publish every helper into the engines' namespaces under its own name, so
# that demult() can resolve its sibling functions when it runs remotely.
for _engine_fn in (
    format_fastq_tuple,
    demult,
    check_barcodes,
    get_writers,
    get_barcodes,
    copy_file,
):
    dview[_engine_fn.__name__] = _engine_fn

In [0]:
# Display the working directory path.
file_dir

In [0]:
# Copy each input file into the sub-directory whose barcode table applies to
# it: paths containing 'lane1' go to I1, every other path goes to I3.
for f in proc_files:
    if 'lane1' in f:
        !cp {f} {os.path.join(file_dir, "I1")}
    else:
        !cp {f} {os.path.join(file_dir, "I3")}

In [0]:
# Show the current directory (IPython `pwd` magic) before the relative `find`.
pwd

In [0]:
# Rebind proc_files to the staged copies: every path under the current
# directory that ends in "fastq", captured from the shell as a list of lines.
# NOTE(review): the '.' in the grep pattern is an unescaped regex wildcard.
# NOTE(review): demult() writes its per-barcode "*.fastq" outputs next to the
# inputs, so re-running this cell after demultiplexing also picks those up.
proc_files = ! find . | grep '.fastq$'

In [0]:
# Make the paths absolute (resolved against the current directory) so they
# no longer depend on the notebook's working directory, then display them.
proc_files = [os.path.abspath(x) for x in proc_files]
proc_files

In [0]:
# Submit one demult() job per staged FASTQ file through the load-balanced
# view, keeping the handle returned by apply_async for each.
demult_jobs = []
for f in proc_files:
    print(f)
    demult_jobs.append(lview.apply_async(demult, f))

In [0]:
# Show the last two newline-separated pieces of each job's captured stdout
# (demult() prints its elapsed time and output list last).
# NOTE(review): the loop variable `r` stays bound at notebook scope afterwards.
for r in demult_jobs:
    print(r.stdout.split("\n")[-2:])

In [0]:
# Collect the FASTQ files in I1 and I3 whose names do not contain "lane"
# (i.e. excluding the staged lane-level inputs), as absolute paths, and
# display how many there are.
fastq_files = !ls I1/*.fastq | grep -v lane
fastq_files2 = !ls I3/*.fastq | grep -v lane
fastq_files.extend(fastq_files2)
fastq_files = [os.path.abspath(x) for x in fastq_files]
len(fastq_files)

In [0]:
@lview.remote()
def run_cmd(cmd):
    """Run the shell command string ``cmd`` and return ``(output, cmd)``.

    ``output`` is whatever IPython's ``!`` capture produces for the command
    (presumably a list of output lines -- confirm). The function is wrapped
    by ``lview.remote()``, so calling it dispatches through the load-balanced
    view rather than running in the notebook process.

    NOTE(review): later cells treat the value returned by a call as an async
    result (``.ready()``, ``.r``); confirm that matches ``lview.remote()``'s
    default blocking behaviour.
    """
    res = !$cmd
    return res, cmd
    

In [0]:
# Queue a `wc -l` line count for every collected FASTQ file.
# NOTE(review): the path is interpolated into the command unquoted, so a path
# containing spaces or shell metacharacters would break the command.
indiv_count_jobs = []
for f in fastq_files:
    indiv_count_jobs.append(run_cmd("wc -l %s" % f))

In [0]:
# Report how many of the submitted line-count jobs have finished.
# Each job is asked for its own status; the generator variable does not leak
# into the notebook namespace, so no stale name from another cell is involved.
count_ready = sum(1 for job in indiv_count_jobs if job.ready())
print("%d/%d" % (count_ready, len(indiv_count_jobs)))

In [0]:
# Build {fastq path: number of reads} from the finished `wc -l` jobs.
indiv_counts = {}
for j in indiv_count_jobs:
    # run_cmd returns (output, cmd); the first output line of `wc -l <path>`
    # is "<line count> <path>".
    data = j.r[0][0].split()
    # A FASTQ record spans four lines. Floor division keeps the read count an
    # int on Python 3, where `/` would produce a float.
    count = int(data[0]) // 4
    indiv_counts[data[1]] = count

In [0]:
# Write the per-file read counts to sample_counts.txt in the working
# directory: one "<path>\t<count>" line per file, sorted by path.
os.chdir("/home/cfriedline/eckertlab/gypsy_indiv/masked/")
with open("sample_counts.txt", "w") as o:
    keys = sorted(indiv_counts.keys())
    for f in keys:
        count = indiv_counts[f]
        o.write("%s\t%d\n" % (f, count))