In [0]:
from IPython.parallel import Client
import os, time

In [0]:
# Connect to the IPython.parallel cluster started under the "huge" profile.
rc = Client(profile="huge")
# Direct view obtained by slicing the client with [:] (every engine it knows about).
dview = rc[:]
# Load-balanced view with no explicit targets.
lview = rc.load_balanced_view()
# Cell output: size of the client (presumably the number of connected engines).
len(rc)

In [0]:
def get_async_progress(jobs):
    """Summarise how many submitted jobs have finished.

    Parameters
    ----------
    jobs : sequence
        Objects exposing a ``ready()`` method (e.g. the handles returned by
        ``apply_async``). Must support ``len()``.

    Returns
    -------
    str
        ``"<finished>/<total>"``, e.g. ``"3/10"``.
    """
    finished = sum(1 for job in jobs if job.ready())
    return "%d/%d" % (finished, len(jobs))

In [0]:
def get_idle_engines(rc, settle_seconds=10):
    """Build views restricted to the engines that have nothing queued.

    Parameters
    ----------
    rc : client
        Object exposing ``queue_status()``, indexing by a list of engine ids,
        and ``load_balanced_view(targets=...)`` (an IPython.parallel Client).
    settle_seconds : number, optional
        Seconds to wait before the queue status is requested. Defaults to 10,
        the pause that used to be hard-coded.

    Returns
    -------
    tuple
        ``(direct_view, load_balanced_view)``, both targeting only the
        engines whose ``'queue'`` count was zero.
    """
    # Wait *before* taking the snapshot: sleeping after the query meant the
    # views were built from a status that was already settle_seconds old.
    time.sleep(settle_seconds)
    qs = rc.queue_status()
    # queue_status() reports an 'unassigned' task count next to the per-engine
    # entries. Exclude it by name instead of slicing off the last sorted key,
    # which depended on Python 2 ordering ints before strings and would have
    # dropped a real engine whenever that key was absent.
    engine_ids = sorted(eid for eid in qs if eid != 'unassigned')
    active = [eid for eid in engine_ids if not qs[eid]['queue']]
    d = rc[active]
    l = rc.load_balanced_view(targets=active)
    return d, l

In [0]:
# Rebind the module-level views to the ones returned by get_idle_engines, so
# later cells that use dview/lview work against the idle-engine selection.
dview, lview = get_idle_engines(rc)

In [0]:
# Imports performed inside dview.sync_imports() are meant to happen both here
# and on the engines behind dview, so functions shipped to the engines can use
# these module names.
with dview.sync_imports():
    import os
    import sys
    import socket
    # NOTE(review): stopwatch is not part of the standard library; it has to be
    # installed wherever this import runs.
    import stopwatch

In [0]:
def get_single_host_lview(host_list):
    """Return a load-balanced view with exactly one engine per selected host.

    Relies on the module-level ``dview`` (to ask every engine for its
    hostname) and ``rc`` (to build the resulting view).

    Parameters
    ----------
    host_list : str or container of str
        ``"all"`` selects every host that runs an engine; otherwise only
        hosts for which ``host in host_list`` is true are selected.

    Returns
    -------
    The object returned by ``rc.load_balanced_view(targets=...)``, targeting
    the first engine found on each selected host.
    """
    hosts = dview.apply(socket.gethostname).get()

    # Results come back in the order of the view's targets, so the position in
    # `hosts` is only an engine id when the view spans engines 0..n-1. Map the
    # position through dview.targets so a view over a subset of engines still
    # yields real engine ids. Fall back to positions when targets is not a
    # sequence of ids.
    engine_ids = getattr(dview, 'targets', None)
    if not isinstance(engine_ids, (list, tuple)):
        engine_ids = list(range(len(hosts)))

    first_engine = {}
    for position, host in enumerate(hosts):
        # Skip unselected hosts entirely; previously they received an empty
        # list, which made the later `v[0]` lookup raise IndexError.
        if host_list != "all" and host not in host_list:
            continue
        # Keep only the first engine seen on each host.
        first_engine.setdefault(host, engine_ids[position])

    hview = sorted(first_engine.values())
    hlview = rc.load_balanced_view(targets=hview)
    return hlview

In [0]:
# One engine per host, across every host that runs an engine.
hlview = get_single_host_lview("all")
# Cell output: size of that view.
len(hlview)

In [0]:
# Directory that holds the sorted BAM inputs.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
bam_dir = "/data7/eckertlab/projects/ethan/analysis"
# Output directory for this analysis, a subdirectory of bam_dir.
analysis_dir = os.path.join(bam_dir, "samtools1.1")
# Stop here if the output directory has not been created yet.
assert os.path.exists(analysis_dir)

In [0]:
# IPython shell capture: output lines of `ls` for every *_sorted.bam in bam_dir.
bam_files = !ls $bam_dir/*_sorted.bam

In [0]:
# Paths to the external programs used by the functions below.
# NOTE(review): absolute, user-specific paths; they must also be valid on the
# engines that run the jobs.
samtools = "/home/cfriedline/data7/src/samtools-1.1/samtools-1.1/samtools"
bcftools = "/home/cfriedline/data7/src/samtools-1.1/bcftools-1.1/bcftools"
picard = "/home/cfriedline/data7/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/jdk1.7.0_25/bin/java"
# NOTE(review): perl is not referenced by any code visible in this section.
perl = "/home/cfriedline/data7/opt/ActivePerl-5.16/bin/perl"

In [0]:
def mark_duplicates(args):
    """Run Picard MarkDuplicates on one BAM file.

    Picard writes to a temporary file; the result is moved to its final name
    only after the command has exited successfully, so a failed run never
    leaves a bogus ``*_dedup.bam`` behind.

    Parameters
    ----------
    args : tuple
        ``(java, picard, bam_file, analysis_dir)``: path to the java
        executable, path to picard.jar, the input BAM, and the directory that
        receives the output.

    Returns
    -------
    tuple
        ``(cmd, out_bam)``: the command line that was run and the path of the
        de-duplicated BAM (``<analysis_dir>/<basename of bam_file>_dedup.bam``).
        The Picard metrics are written to ``<out_bam>.metrics``.

    Raises
    ------
    subprocess.CalledProcessError
        If the Picard command exits with a non-zero status.
    """
    # Imported here because nothing else in the notebook imports these modules
    # and the function is shipped to remote engines, where it has to work.
    import shutil
    import subprocess
    import tempfile

    java, picard, bam_file, analysis_dir = args
    # Write into analysis_dir: it used to be unpacked but ignored, so the
    # output ended up in whatever directory the engine happened to run in.
    out_bam = os.path.join(analysis_dir,
                           "%s_dedup.bam" % os.path.basename(bam_file))
    t = tempfile.NamedTemporaryFile(delete=False)
    # Only the name is needed; close our handle so Picard is the sole writer.
    t.close()
    cmd = "%s -jar %s MarkDuplicates \
    INPUT=%s OUTPUT=%s METRICS_FILE=%s.metrics" % (java, picard, bam_file,
                                                   t.name, out_bam)
    print(cmd)
    try:
        # shell=True keeps the previous "run this string in a shell" behaviour.
        # NOTE(review): paths are interpolated unquoted, so they must not
        # contain spaces or shell metacharacters.
        subprocess.check_call(cmd, shell=True)
    except Exception:
        # Do not leave the placeholder temp file behind on failure.
        if os.path.exists(t.name):
            os.remove(t.name)
        raise
    shutil.move(t.name, out_bam)
    return cmd, out_bam
# Assign the function into the view under its own name, making it available by
# that name on the engines behind dview.
dview['mark_duplicates'] = mark_duplicates

In [0]:
# Submit one mark_duplicates task per sorted BAM through hlview and keep the
# handles returned by apply_async so progress can be polled later. The tuple is
# passed as the single `args` argument that mark_duplicates unpacks.
rmdup_jobs = []
for b in bam_files:
    rmdup_jobs.append(hlview.apply_async(mark_duplicates, (java, picard, b, analysis_dir)))

In [0]:
# Cell output: "<finished>/<total>" for the duplicate-marking jobs.
get_async_progress(rmdup_jobs)

In [0]:
# Fetch the result of the first job (the (cmd, out_bam) tuple returned by
# mark_duplicates); presumably blocks until that job has finished.
rmdup_jobs[0].get()

In [0]:
# Reference assembly (FASTA); passed to call_snps as the mpileup reference.
# NOTE(review): absolute, machine-specific path.
assembly = "/data7/cfriedline/assemblies/foxtail2/Green_26_ATCGCGCAA.fastq_31_data_31/contigs.fa_in_map.fa"

In [0]:
# IPython shell capture: output lines of `ls` for every *_dedup.bam in analysis_dir.
bam_rmdup_files = !ls $analysis_dir/*_dedup.bam
# Cell output: the captured list, for a visual check.
bam_rmdup_files

In [0]:
def create_ploidy_file(args):
    """Write a two-column, tab-separated ploidy table for a set of BAM files.

    Parameters
    ----------
    args : tuple
        ``(bam_files, analysis_dir)``: an iterable of BAM paths and the
        directory in which the table is created.

    Returns
    -------
    str
        Path of the written file, ``<analysis_dir>/all.ploidy``. Each line is
        ``<name>\\t2`` where ``<name>`` is the BAM basename cut at its first
        ``.fastq`` with ``.fastq`` appended again.
    """
    bam_files, analysis_dir = args
    diploid = 2
    ploidy_file = os.path.join(analysis_dir, "%s.ploidy" % "all")
    with open(ploidy_file, "w") as handle:
        handle.writelines(
            "%s\t%d\n" % ("%s.fastq" % os.path.basename(bam_path).split(".fastq")[0],
                          diploid)
            for bam_path in bam_files
        )
    return ploidy_file
# Assign the function into the view under its own name; call_snps calls it by
# that name on the engine.
dview['create_ploidy_file'] = create_ploidy_file

In [0]:
def call_snps(args):
    """Call variants with ``samtools mpileup | bcftools call`` over a set of BAMs.

    Parameters
    ----------
    args : sequence
        ``(samtools, reference, bam_sorted, bcftools, raw_vcf, out_dir)``:
        path to samtools, reference FASTA, list of BAM paths, path to
        bcftools, file name of the compressed VCF to write, and the output
        directory. A false ``out_dir`` selects ``$TMPDIR`` or, if that is not
        set, the system temporary directory.

    Returns
    -------
    tuple
        ``(pileup, elapsed)``: the shell pipeline that was run and the value
        of ``timer.elapsed`` after the timer was stopped.

    Raises
    ------
    subprocess.CalledProcessError
        If the pipeline exits with a non-zero status.
    """
    # Imported locally: the function is shipped to an engine and nothing else
    # in the notebook imports these modules.
    import subprocess
    import tempfile

    print(socket.gethostname())
    timer = stopwatch.Timer()
    samtools, reference, bam_sorted, bcftools, raw_vcf, out_dir = args
    if not out_dir:
        # A missing TMPDIR used to raise KeyError; fall back to the default
        # temporary directory instead.
        out_dir = os.environ.get('TMPDIR') or tempfile.gettempdir()
    raw_vcf = os.path.join(out_dir, raw_vcf)
    # The ploidy table is written next to the VCF and handed to bcftools via -S.
    ploidy_file = create_ploidy_file((bam_sorted, out_dir))
    pileup = "%s mpileup -ugf %s %s | %s call -S %s -vmO z -o %s" % (samtools,
                                                                     reference,
                                                                     ' '.join(bam_sorted),
                                                                     bcftools,
                                                                     ploidy_file,
                                                                     raw_vcf)

    print(pileup)
    # Run through a shell because the command is a pipeline. check_call raises
    # on a non-zero exit so a failed run is not reported as a normal timing.
    # NOTE(review): the shell reports the status of the last command only, so
    # a samtools failure is detected only if bcftools also fails as a result.
    subprocess.check_call(pileup, shell=True)
    timer.stop()
    return pileup, timer.elapsed
# Assign the function into the view under its own name, making it available by
# that name on the engines behind dview.
dview['call_snps'] = call_snps

In [0]:
# Positional arguments for call_snps; the order must match its unpacking:
# samtools, reference, bam_sorted, bcftools, raw_vcf, out_dir.
args = [samtools, 
        assembly, 
        bam_rmdup_files, 
        bcftools, 
        "samtools_1.1.vcf.gz", 
        analysis_dir]
# Submit a single task through the load-balanced view; the list is passed as
# the one `args` parameter of call_snps.
samtools_job = lview.apply_async(call_snps, args)

In [0]:
# Poll whether the SNP-calling task has finished.
print samtools_job.ready()