In [0]:
import gzip, os, sys
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from IPython.parallel import Client

In [0]:
# Data directory for this run. The working directory is switched to it so the
# relative `ls` globs used further down resolve against these files.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
file_dir = "/home/cfriedline/eckertlab/gypsy_indiv/HiSeq_140425"
os.chdir(file_dir)

In [0]:
# Capture the .fastq file names in the current directory (one per output line),
# skipping any name that contains "processed".
files = !ls *.fastq | grep -v processed

In [0]:
# Resolve every listed file name to an absolute path (relative to the cwd).
files = list(map(os.path.abspath, files))

In [0]:
# Display the resolved input paths as a sanity check.
files

In [0]:
# Connect to the IPython.parallel cluster running under the "sge" profile.
rc = Client(profile="sge")
# dview: view obtained by slicing the client with [:] (all engines, per the
# IPython.parallel API); used below to push functions and imports.
dview = rc[:]
# lview: load-balanced view used below to submit the per-file jobs.
lview = rc.load_balanced_view()
# Displayed as the cell output (presumably the number of connected engines).
len(rc)

In [0]:
@dview.remote(block=True)
def get_cpu_count():
    """Return ``(hostname, cpu_count)`` for the engine executing this call.

    The imports are local because the body runs on the remote engines rather
    than in the notebook process.
    """
    import multiprocessing as mp
    import socket
    return socket.gethostname(), mp.cpu_count()

# One (hostname, cpu_count) tuple per engine targeted by dview.
cpu_counts = get_cpu_count()
from collections import defaultdict
# hostname -> ids of the engines on that host, for hosts with at least 16 CPUs.
cpu_dict = defaultdict(list)
# Pair each result with the engine id it came from. The position in cpu_counts
# only equals the engine id when ids are contiguous from 0, which is not
# guaranteed (e.g. after an engine has been restarted), and `targets=` below
# expects real engine ids.
for engine_id, (hostname, n_cpus) in zip(dview.targets, cpu_counts):
    if n_cpus >= 16:
        cpu_dict[hostname].append(engine_id)
print(cpu_dict)
# Load-balanced view restricted to the first engine seen on each qualifying host.
cview = rc.load_balanced_view(targets=[ids[0] for ids in cpu_dict.values()])

In [0]:
# Perform these imports both locally and on every engine in dview, so that
# functions pushed to the engines can use the modules by name.
# NOTE(review): IPython.parallel's sync_imports reportedly does not propagate
# `import x as y` aliases, so `np` may be undefined on the engines even though
# `numpy` is available there -- confirm before using `np` in remote code.
with dview.sync_imports():
    import stopwatch
    import numpy
    import numpy as np
    import scipy
    import pandas
    import gzip
    import os
    import tempfile
    import shutil
    import socket
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    from collections import deque, defaultdict
    import multiprocessing
    from multiprocessing import Pool, Manager
    import traceback
    

In [0]:
def mask_reads(args):
    """Run FASTX ``fastq_masker`` on one FASTQ file.

    Parameters
    ----------
    args : tuple
        ``(fastq, qual)``: path of the input FASTQ file and the integer value
        passed to the tool's ``-q`` option (presumably the quality threshold
        below which bases are masked -- see the FASTX-Toolkit usage text).
        A single tuple is used so the function can be submitted with one
        positional argument.

    Returns
    -------
    tuple
        ``(out, res)``: the output path ``<fastq>_masked_<qual>.fastq`` and the
        captured output lines of the command (run with ``-v``).
    """
    fastq, qual = args
    out = "%s_masked_%d.fastq" % (fastq, qual)
    # IPython shell capture: {qual}, {fastq} and {out} are filled in from the
    # local variables of the same names, so those names must not change.
    res = !/home/cfriedline/gpfs/opt/fastx/bin/fastq_masker -q {qual} -i {fastq} -o {out} -v
    return out, res
# Make the function available by name on every engine in dview.
dview['mask_reads'] = mask_reads

In [0]:
# Submit one asynchronous masking job per input file (-q value 33) through the
# load-balanced view; the AsyncResult handles are kept for polling.
mask_jobs = [lview.apply_async(mask_reads, (f, 33)) for f in files]

In [0]:
# Poll the masking jobs: prints one ready() flag per submitted job.
for m in mask_jobs:
    print(m.ready())

In [0]:
def drop_masked_reads(masked_fastq, cutoff):
    """Stream the reads of a masked FASTQ file, flagging over-masked reads.

    Parameters
    ----------
    masked_fastq : str
        Path of the FASTQ file to read.
    cutoff : float
        Maximum tolerated fraction of ``"N"`` characters in a read's sequence.

    Yields
    ------
    None or tuple
        One item per read, in file order: ``None`` when the fraction of ``N``
        is strictly greater than ``cutoff``; otherwise
        ``(name, seq, qual, n_perc)`` where ``n_perc`` is that fraction.
        A zero-length read has nothing masked and is reported with
        ``n_perc == 0.0`` instead of raising ``ZeroDivisionError``.
    """
    # Imported here so the function also works on engines it is pushed to.
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # The context manager closes the file once the generator is exhausted or
    # closed; previously the handle was never closed explicitly.
    with open(masked_fastq) as handle:
        for name, seq, qual in FastqGeneralIterator(handle):
            # Only upper-case "N" is counted, as before.
            n_perc = float(seq.count("N")) / len(seq) if seq else 0.0
            if n_perc > cutoff:
                yield None
            else:
                yield name, seq, qual, n_perc
# Make drop_masked_reads available by name on every engine in dview.
dview['drop_masked_reads'] = drop_masked_reads

In [0]:
# Capture the names of all files in the current directory matching
# *masked*.fastq, then display them.
masked = !ls *masked*.fastq
masked

In [0]:
def format_fastq_tuple(title, seq, qual):
    """Render one read as a four-line FASTQ record.

    ``title`` is written after the leading ``@``; ``seq`` and ``qual`` must be
    the same length (checked with an assertion). The returned string ends with
    a newline.
    """
    assert len(qual) == len(seq)
    record = "@%s\n%s\n+\n%s\n" % (title, seq, qual)
    return record
# Make format_fastq_tuple available by name on every engine in dview.
dview['format_fastq_tuple'] = format_fastq_tuple

In [0]:
def filter_masked(args):
    """Write the reads of a masked FASTQ file that pass the N-fraction cutoff.

    ``args`` is a ``(m, cutoff)`` pair: the masked FASTQ path and the cutoff
    handed to ``drop_masked_reads``. Surviving reads are written, in order, to
    ``<m>_dropped_<cutoff with two decimals>.fastq``. Nothing is returned.
    """
    m, cutoff = args
    out = '%s_dropped_%.2f.fastq' % (m, cutoff)
    with open(out, "w") as o:
        # drop_masked_reads yields None for rejected reads; keep the rest.
        kept = (res for res in drop_masked_reads(m, cutoff) if res)
        for name, seq, qual, _perc in kept:
            o.write(format_fastq_tuple(name, seq, qual))
# Make filter_masked available by name on every engine in dview.
dview['filter_masked'] = filter_masked

In [0]:
# Resolve the masked file names to absolute paths and display them.
masked = list(map(os.path.abspath, masked))
masked

In [0]:
# Submit one asynchronous filter job per masked file whose path contains
# "34.fastq" and has not already been through the drop step.
# NOTE(review): the masking jobs in this notebook are submitted with 33, which
# yields names ending in "_masked_33.fastq" -- confirm "34.fastq" is intended.
cutoff = 0.5
cutoff_jobs = []
for m in masked:
    if "dropped" in m or "34.fastq" not in m:
        continue
    cutoff_jobs.append(lview.apply_async(filter_masked, (m, cutoff)))

In [0]:
# Readiness flag of every filter job, shown as the cell output.
[job.ready() for job in cutoff_jobs]

In [0]:
# Inspect the first filter job via its `.r` attribute (presumably the
# AsyncResult shorthand for the job's result, which also surfaces a remote
# error -- confirm). Raises IndexError if no filter job was submitted.
cutoff_jobs[0].r