In [0]:
import os, sys
from ipyparallel import Client
import matplotlib.pyplot as plt
%matplotlib inline
from subprocess import Popen, PIPE
from Bio import SeqIO
import pandas as pd
import pickle

In [0]:
# Make the directory holding the include_utils helper module importable.
sys.path.append("/home/cfriedline/ipynb/include_utils")

In [0]:
# Helper module; this notebook calls its get_single_host_lview and
# get_async_progress functions further down.
import include_utils as u

In [0]:
cd ~/eckertlab/gypsy_indiv/raw_demult_gbsx/

In [0]:
fastq_files = !find I1 I3 -type f | grep '.fastq$' | grep -v lane | grep -v OTIS | grep -v undet
fastq_files = sorted([os.path.abspath(x) for x in fastq_files])
len(fastq_files)

In [0]:
# Connect to the ipyparallel cluster running under the "sge" profile.
rc = Client(profile="sge")

In [0]:
# dv: direct view over every engine; lv: load-balanced view that the remote
# functions below are scheduled on.  The last line displays the engine count.
dv = rc[:]
lv = rc.load_balanced_view()
len(dv)

In [0]:
# Perform these imports locally and on every engine in the direct view.
with dv.sync_imports():
    import os, sys, socket

In [0]:
# View built by the helper module; used below for the Picard read-group jobs.
# NOTE(review): presumably a load-balanced view with one engine per host — confirm in include_utils.
hv = u.get_single_host_lview(rc, "all")

In [0]:
# Reference contigs to align against; the commented-out line is a previously
# used assembly path kept for reference.
#assembly = "/home/cfriedline/gpfs/assemblies/gypsy/masurca_new/CA/10-gapclose/genome.ctg.fasta"
assembly = "/home/cfriedline/eckertlab/projects/gypsy_moth/assemblies/masurca3/CA/10-gapclose/genome.ctg.fasta"

In [0]:
# Build the bowtie2 index next to the FASTA: the FASTA path is also used as the
# index prefix, which is why run_bowtie2 can pass `assembly` straight to -x.
!/home/cfriedline/gpfs/src/bowtie2-2.2.4/bowtie2-build -f $assembly $assembly

In [0]:
# --very-fast-local
# Same as: -D 5 -R 1 -N 0 -L 25 -i S,1,2.00

# --fast-local
# Same as: -D 10 -R 2 -N 0 -L 22 -i S,1,1.75

# --sensitive-local
# Same as: -D 15 -R 2 -N 0 -L 20 -i S,1,0.75 (default in --local mode)

# --very-sensitive-local
# Same as: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50

@lv.remote()
def run_bowtie2(args):
    import os, stopwatch, multiprocessing
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    assembly, reads = args
    parent = os.path.dirname(reads)
    outdir = os.path.join(parent, "masurca3_rerun")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    sam = os.path.join(outdir, "%s.sam" % (os.path.basename(reads)))
    cmd = "/home/cfriedline/gpfs/src/bowtie2-2.2.4/bowtie2 --local -D 20 -R 3 -N 1 -L 20 -i S,1,0.50 -p %d -x %s -U %s -S %s" % (cpus,
                                                               assembly,
                                                               reads,
                                                               sam)
    res = None
    res = !$cmd
#     if not os.path.exists(sam):
#         res = !$cmd
    timer.stop()
    return assembly, sam, cmd, timer.elapsed, res

In [0]:
@lv.remote()
def convert_sam_to_bam(sam):
    import stopwatch, multiprocessing, os
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    bam = sam.replace(".sam", ".bam")
    bam_sorted = "%s_sorted.bam" % bam.replace(".bam", "")
    if not os.path.exists(bam):
        !/home/cfriedline/gpfs/src/samtools-1.3/samtools view -b $sam -o $bam
        !/home/cfriedline/gpfs/src/samtools-1.3/samtools sort -@ $cpus $bam -o $bam_sorted
        !/home/cfriedline/gpfs/src/samtools-1.3/samtools index $bam_sorted
    timer.stop()
    return bam, bam_sorted, timer.elapsed

In [0]:
# Sanity checks before submitting work: number of FASTQ inputs ...
len(fastq_files)

In [0]:
# ... number of engines in the direct view ...
len(dv)

In [0]:
# ... and one example input path.
fastq_files[0]

In [0]:
# Queue one bowtie2 alignment per FASTQ file; each call returns an async handle.
bowtie_jobs = [run_bowtie2((assembly, fastq)) for fastq in fastq_files]

In [0]:
# Report on the submitted alignment jobs via the include_utils helper.
u.get_async_progress(bowtie_jobs)

In [0]:
# For each finished job print its index and the last captured bowtie2 output
# line: the result tuple ends with the output lines, hence [-1][-1].
for i, x in enumerate(bowtie_jobs):
    if x.ready():
        print(i, x.r[-1][-1])

In [0]:
# Gather the (assembly, sam, cmd, elapsed, res) tuple of every job.
bowtie_results = [x.r for x in bowtie_jobs]

In [0]:
# Directory that receives the pickled alignment results.
demult_dir = "/gpfs_fs/home/eckertlab/gypsy_indiv/raw_demult_gbsx/"

In [0]:
# Persist the collected bowtie2 result tuples so they can be reloaded later.
with open(os.path.join(demult_dir, "bowtie_results.pkl"), "wb") as o:
    pickle.dump(bowtie_results, o, pickle.HIGHEST_PROTOCOL)

In [0]:
sam_files = !find . -type f | grep '.sam$' | grep 'masurca3'
sam_files = [os.path.abspath(x) for x in sam_files]
assert len(sam_files) == len(fastq_files)

In [0]:
sam_files

In [0]:
sam_bam_jobs = []
for f in sam_files:
    sam_bam_jobs.append(convert_sam_to_bam(os.path.abspath(f)))

In [0]:
u.get_async_progress(sam_bam_jobs)

In [0]:
sorted_bams = !find . -type f | grep '/masurca3' | grep 'sorted.bam$'
sorted_bams = [os.path.abspath(x) for x in sorted_bams if 'bam' in x]
assert len(sorted_bams) == len(fastq_files)

In [0]:
len(sorted_bams)

In [0]:
@lv.remote()
def get_lane_info(bam):
    res = !/home/cfriedline/g/src/samtools-1.3/samtools view $bam | tail -n1
    return bam, res[0].split("\t")[0]

In [0]:
get_lane_info(sorted_bams[0])

In [0]:
rg_info = []
for f in sorted_bams:
    rg_info.append(get_lane_info(f))

In [0]:
u.get_async_progress(rg_info)

In [0]:
lane_info = [x.r for x in rg_info]

In [0]:
# Build the read-group fields for every sorted BAM, keyed by BAM path.
rg_dict = {}
for bam, header in lane_info:
    # Sample name: the file name up to its first ".", with the final character dropped.
    # NOTE(review): presumably strips a one-character suffix added upstream — confirm against the file naming.
    sample = os.path.basename(bam).split(".")[0][:-1]
    # The read name must split into exactly seven ":"-separated fields,
    # otherwise this unpacking raises; only flowcell and lane are used.
    instr, run, flowcell, lane, tile, x, y = header.split(":")
    rg_dict[bam] = {"id": "{}.{}.{}".format(flowcell, lane, sample),
                   "pl": "ILLUMINA",
                   "lb": "{}.{}".format(flowcell, lane),
                   "sm": sample}

In [0]:
# Push rg_dict into the namespace of every engine in the direct view;
# add_rg_info_to_bam reads it there as a global.
dv['rg_dict'] = rg_dict

In [0]:
@hv.remote()
def add_rg_info_to_bam(bam):
    import os
    cmd = "java -jar /home/cfriedline/gpfs/src/picard-tools-1.112/AddOrReplaceReadGroups.jar"
    bam_rg = bam.replace(".bam", "_rg.bam")
    info = rg_dict[bam]
    rg_string = "RGID={0} RGLB={1} RGPL=ILLUMINA RGPU={1} RGSM={2}".format(info['id'],
                                                                           info['lb'],
                                                                           info['sm'])
    cmd = "{} INPUT={} OUTPUT={} {} CREATE_INDEX=true".format(cmd,
                                                              bam,
                                                              bam_rg,
                                                              rg_string)
#     if not os.path.exists(bam_rg):
    !$cmd
    return bam_rg, rg_string, cmd
    

In [0]:
add_rg = []
for f in sorted_bams:
    add_rg.append(add_rg_info_to_bam(f))

In [0]:
u.get_async_progress(add_rg)

In [0]:
rg_bams = !find . | grep 'rg.bam$' | grep 'masurca3'
rg_bams = sorted([os.path.abspath(x) for x in rg_bams if 'rg.bam' in x])
assert len(rg_bams) == len(fastq_files)

In [0]:
len(rg_bams)