In [0]:
import os, sys, tempfile, shutil, socket
import multiprocessing as mp
from IPython.parallel import Client
import pandas as pd

In [0]:
# Connect to the IPython.parallel cluster running under the "huge" profile.
rc = Client(profile="huge")
# Direct view over every engine; used below for sync_imports and for pushing functions.
dview = rc[:]
# Load-balanced view over the engines (not referenced again in the visible cells).
lview = rc.load_balanced_view()
# Size of the client, shown as the cell output.
len(rc)

In [0]:
def get_async_progress(jobs):
    """Summarise how many asynchronous jobs have completed.

    jobs -- a sized collection of objects exposing a ``ready()`` method.

    Returns a string of the form "<finished>/<total>".
    """
    finished = sum(1 for job in jobs if job.ready())
    total = len(jobs)
    return "%d/%d" % (finished, total)

In [0]:
def get_sample_name_from_file(f):
    """Derive the sample name from a file path.

    The name is the first two underscore-separated tokens of the file's
    base name, re-joined with an underscore. A base name with fewer than
    two tokens is returned unchanged.
    """
    filename = os.path.split(f)[1]
    tokens = filename.split("_")
    return "_".join(tokens[:2])

In [0]:
# Perform these imports inside dview.sync_imports() so the module names are
# available to the functions that are later shipped to the engines
# (run_bowtie2, convert_sam_to_bam).
# NOTE(review): stopwatch is not part of the standard library; confirm it is
# installed on every engine.
with dview.sync_imports():
    import os, stopwatch, multiprocessing, tempfile, shutil, socket

In [0]:
def get_single_host_lview(host_list):
    """Build a load-balanced view that targets one engine per host.

    Every engine in ``dview`` is asked for its hostname; for each selected
    host the first engine reporting that hostname is kept, so that a job
    using all CPUs of a machine is scheduled at most once per machine.

    host_list -- the string "all" to select every host, or a collection of
                 hostnames to restrict the view to.

    Returns the view created by ``rc.load_balanced_view(targets=...)``.

    Raises ValueError when no engine runs on any of the requested hosts.
    """
    hosts = dview.apply(socket.gethostname).get()

    # NOTE(review): the position in `hosts` is used as the engine id, which
    # assumes engine ids are contiguous and start at 0 -- confirm.
    first_engine = {}
    for engine_id, host in enumerate(hosts):
        # Hosts that were not requested are skipped entirely; previously they
        # were recorded with an empty engine list, which made the v[0] lookup
        # raise IndexError.
        if host_list != "all" and host not in host_list:
            continue
        first_engine.setdefault(host, engine_id)

    if not first_engine:
        raise ValueError("no engines found on requested hosts: %r" % (host_list,))

    hview = sorted(first_engine.values())
    hlview = rc.load_balanced_view(targets=hview)
    return hlview

In [0]:
# Load-balanced view with one engine per host, across every host in the cluster.
hlview = get_single_host_lview("all")

In [0]:
# Size of the per-host view, shown as the cell output.
len(hlview)

In [0]:
# Assembly FASTA the reads are aligned against; the same path doubles as the
# bowtie2 index name when the index is built and when bowtie2 is run.
assembly = "/data7/cfriedline/assemblies/foxtail2/Green_26_ATCGCGCAA.fastq_31_data_31/contigs.fa_in_map.fa"

In [0]:
# Directory holding the input FASTQ files.
fastq_dir = "/data7/eckertlab/projects/ethan/HiSeq_140603/FASTQ"

In [0]:
# List directory entries ending in "fastq", dropping any whose name contains
# "processed", then turn them into a sorted list of full paths.
fastq_files = !ls $fastq_dir | grep 'fastq$' | grep -v processed
fastq_files = sorted([os.path.join(fastq_dir, x) for x in fastq_files])

In [0]:
# Display the resolved input list.
fastq_files

In [0]:
# Locations of the bowtie2 executables used by the cells below.
bowtie2_dir = "/home/cfriedline/data7/src/bowtie2-2.2.4"
bowtie2_build = os.path.join(bowtie2_dir, "bowtie2-build")
bowtie2 = os.path.join(bowtie2_dir, "bowtie2")

In [0]:
# Build the bowtie2 index from the assembly, using the assembly path itself as
# the index name; the command's output lines are captured in x.
x = !$bowtie2_build -f $assembly $assembly

In [0]:
# Directory that receives the SAM files written by run_bowtie2.
analysis_dir = "/data7/eckertlab/projects/ethan/analysis"

In [0]:
#modified from read_mapping notebook to align with LB and PL in @RG
def run_bowtie2(args):
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    bowtie2, assembly, reads, rglb, sam = args
    tmp = tempfile.NamedTemporaryFile(delete=False)
    rgid = os.path.basename(reads)
    rgsm = rgid
    cmd = "%s %s %s -p %d -x %s -U %s -S \
    %s --rg-id %s --rg SM:%s --rg PL:illumina --rg LB:%s" % (bowtie2,
                                                           "--local",
                                                           "--very-sensitive-local", 
                                                           cpus, 
                                                           assembly,
                                                           reads,
                                                           tmp.name,
                                                           rgid,
                                                           rgsm,
                                                           rglb)
    print socket.gethostname(), cmd
    !$cmd
    shutil.move(tmp.name, sam)
    timer.stop()
    return assembly, sam, cmd, timer.elapsed
dview['run_bowtie2'] = run_bowtie2

In [0]:
def create_library_index(barcode_files):
    """Map each sample name to the library it was sequenced in.

    barcode_files -- paths to tab-separated tables with ``Family_ID`` and
                     ``Sample_ID`` columns, one table per library.

    The sample name is "<Family_ID>_<Sample_ID>" and the library name is the
    table's base name with ".csv" removed. When a sample name occurs more
    than once, the first occurrence wins and a message is printed for each
    later one.

    Returns a dict of sample name -> library name.
    """
    index = {}
    for f in barcode_files:
        # The library name is the same for every row of a file.
        library = os.path.basename(f).replace(".csv", "")
        df = pd.read_csv(f, sep="\t")
        # iterrows() yields nothing for an empty table, whereas iterating the
        # result of df.apply(..., axis=1) on an empty frame walks its columns.
        for _, row in df.iterrows():
            n = "%s_%s" % (row.Family_ID, row.Sample_ID)
            if n not in index:
                index[n] = library
            else:
                # Two placeholders for two values; the original format string
                # had only one and raised TypeError on the first duplicate.
                print("duplicate sample found at %s (%s)" % (n, os.path.basename(f)))
    return index
        
# Sample name -> library name, built from the two library barcode tables.
library_index = create_library_index([
    "/data7/eckertlab/projects/ethan/ethan_library1.csv",
    "/data7/eckertlab/projects/ethan/ethan_library2.csv",
])

In [0]:
# Queue one bowtie2 alignment per FASTQ file on the one-engine-per-host view.
bowtie2_jobs = []
for f in fastq_files:
    # The library (LB read-group field) is looked up by sample name.
    sample_name = get_sample_name_from_file(f)
    rglb = library_index[sample_name]
    sam = os.path.join(analysis_dir, "%s_bowtie2.sam" % os.path.basename(f))
    # run_bowtie2 takes its inputs as a single tuple argument.
    job_args = (bowtie2, assembly, f, rglb, sam)
    bowtie2_jobs.append(hlview.apply_async(run_bowtie2, job_args))

In [0]:
# Report how many alignment jobs have finished. The list built by the
# submission cell is named bowtie2_jobs; the name bowtie_jobs used here
# before was never defined and raised NameError.
print(get_async_progress(bowtie2_jobs))

In [0]:
@hlview.remote()
def convert_sam_to_bam(sam):
    """Convert a SAM file to BAM, then sort and index it with samtools 0.1.19.

    The function is wrapped by hlview.remote(); the cells that call it poll
    the returned object with .ready().

    sam -- path to the SAM file; the output names are derived from it by
           replacing ".sam" with ".bam", "_sorted.bam" and "_sorted.bai".

    Returns (bam, bam_sorted, bam_index, elapsed), where elapsed comes from
    the stopwatch timer.
    """
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    # str.replace substitutes every occurrence of the suffix in the path.
    bam = sam.replace(".sam", ".bam")
    bam_sorted = "%s_sorted.bam" % bam.replace(".bam", "")
    bam_index = bam_sorted.replace(".bam", ".bai")
    # All three steps are skipped when the unsorted BAM already exists.
    # NOTE(review): if an earlier run stopped after the view step, the sort
    # and index steps are not retried -- confirm this is intended.
    if not os.path.exists(bam):
        !/home/cfriedline/data7/src/samtools-0.1.19/samtools view -bS $sam > $bam
        !/home/cfriedline/data7/src/samtools-0.1.19/samtools sort -@ $cpus -f $bam $bam_sorted
        !/home/cfriedline/data7/src/samtools-0.1.19/samtools index $bam_sorted $bam_index
    timer.stop()
    return bam, bam_sorted, bam_index, timer.elapsed

In [0]:
# Collect every *.sam path in the analysis directory via the shell.
sam_files = !ls $analysis_dir/*.sam

In [0]:
# Submit one SAM -> BAM conversion per SAM file and keep the returned handles.
sam_bam_jobs = [convert_sam_to_bam(f) for f in sam_files]

In [0]:
# Report how many conversion jobs have finished, using the shared helper
# instead of repeating its counting loop.
print(get_async_progress(sam_bam_jobs))