In [0]:
import os, sys
from ipyparallel import Client
import matplotlib.pyplot as plt
%matplotlib inline
from subprocess import Popen, PIPE
from Bio import SeqIO
import pandas as pd
import pickle
import scandir

In [0]:
# Make the helper directory importable so that `include_utils` (imported in
# the next cell) can be found.
sys.path.append("/home/cfriedline/ipynb/include_utils")

In [0]:
import include_utils as u

In [0]:
# Base directory: holds fastq_files.txt and later receives bowtie.sh.
root = "/home/cfriedline/eckertlab/Mitra/SWWP_seq2"

# Read the list of FASTQ paths (one per line). The context manager closes the
# file handle, and blank lines are skipped so that no empty entry ends up in
# the comma-separated read list that is handed to bowtie2 later on.
with open(os.path.join(root, "fastq_files.txt")) as fastq_list:
    fastq_files = sorted(line.strip() for line in fastq_list if line.strip())

In [0]:
# Sanity check: number of FASTQ paths read from fastq_files.txt.
len(fastq_files)

In [0]:
# Reference that run_bowtie2 passes to bowtie2's -x option
# (presumably the bowtie2 index basename of the assembly -- TODO confirm).
assembly = "/gpfs_fs/home/eckertlab/SugarPine_genome/pila.v1.0.scafSeq"

In [0]:
# --very-fast-local
# Same as: -D 5 -R 1 -N 0 -L 25 -i S,1,2.00

# --fast-local
# Same as: -D 10 -R 2 -N 0 -L 22 -i S,1,1.75

# --sensitive-local
# Same as: -D 15 -R 2 -N 0 -L 20 -i S,1,0.75 (default in --local mode)

# --very-sensitive-local
# Same as: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50

#@lview.remote()
def run_bowtie2(args, cpus=64):
    """Build the bowtie2 command that maps all reads against the assembly.

    Despite its name, this function does not launch bowtie2. It only assembles
    the command line; the notebook writes that command into an SGE job script
    which is submitted separately.

    The alignment options match the ``--very-sensitive-local`` preset listed
    in the comments above, except that ``-N 1`` is used instead of ``-N 0``.

    Parameters
    ----------
    args : tuple
        ``(assembly, reads, outdir)``. ``assembly`` is passed to ``-x``,
        ``reads`` to ``-U`` (a comma-separated list of files), and ``outdir``
        is the directory in which ``all_fastq.sam`` is placed.
    cpus : int, optional
        Value for bowtie2's ``-p`` option. Defaults to 64, the slot count
        requested by ``-pe smp 64`` in the job script.

    Returns
    -------
    tuple
        ``(assembly, sam, cmd, elapsed, res)``. ``res`` is the same string as
        ``cmd`` because nothing is executed here; ``elapsed`` only covers the
        time spent building the command.
    """
    import os, stopwatch, socket
    print(socket.gethostname())
    timer = stopwatch.Timer()
    assembly, reads, outdir = args
    sam = os.path.join(outdir, "%s.sam" % ("all_fastq"))
    cmd = "/home/cfriedline/g/src/bowtie2-2.2.9/bowtie2 --local -D 20 -R 3 -N 1 -L 20 -i S,1,0.50 -p %d -x %s -U %s -S %s" % (cpus,
                                                               assembly,
                                                               reads,
                                                               sam)
    # The command is returned rather than run, so the "result" is the command.
    res = cmd
    timer.stop()
    return assembly, sam, cmd, timer.elapsed, res

In [0]:
# Output directory handed to run_bowtie2; the SAM file is placed here.
sam_outdir = "/gpfs_fs/home/eckertlab/Mitra/SWWP_seq2/dedupe/mapping"

In [0]:
!mkdir $sam_outdir

In [0]:
# Build one bowtie2 command covering every FASTQ file (comma-separated for
# -U); element 2 of the returned tuple is the command string.
cmd = run_bowtie2((assembly, ','.join(fastq_files), sam_outdir))[2]

In [0]:
# Display the generated bowtie2 command for inspection.
cmd

In [0]:
# Display the directory into which bowtie.sh is written below.
root

In [0]:
pwd

In [0]:
# Write the SGE submission script for the bowtie2 command into `root`.
# The backslash directly after the opening quotes suppresses the leading
# newline, so `#!/bin/bash` is the very first line of the file; a shebang is
# only honoured in that position.
with open(os.path.join(root, "bowtie.sh"), "w") as o:
    s = """\
#!/bin/bash

#$ -S /bin/bash
#$ -N bowtie
#$ -pe smp 64
#$ -q godel199@godel97
#$ -V
#$ -cwd
#$ -e bowtie.err
#$ -o bowtie.out
%s""" % cmd
    o.write("%s\n" % s)

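## Run the bowtie command

Run these commands from the directory that contains `bowtie.sh` (the `root` directory defined above):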

```bash
chmod +x bowtie.sh
qsub bowtie.sh
```

In [0]:
@lview.remote()
def convert_sam_to_bam(sam):
    import stopwatch, multiprocessing, os
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    bam = sam.replace(".sam", ".bam")
    bam_sorted = "%s_sorted.bam" % bam.replace(".bam", "")
    bam_index = bam_sorted.replace(".bam", ".bai")
    if not os.path.exists(bam):
        !/home/cfriedline/gpfs/src/samtools-1.2/samtools view -bS $sam > $bam
        !/home/cfriedline/gpfs/src/samtools-1.2/samtools sort -@ $cpus -f $bam $bam_sorted
        !/home/cfriedline/gpfs/src/samtools-1.2/samtools index $bam_sorted $bam_index
    timer.stop()
    return bam, bam_sorted, bam_index, timer.elapsed

In [0]:
# Number of FASTQ inputs, for comparison with the job and file counts below.
len(fastq_files)

In [0]:
# NOTE(review): `highview` is not defined in the visible cells -- presumably
# an ipyparallel view created elsewhere; this cell fails on a fresh kernel.
len(highview)

In [0]:
# Show the captured stdout of every bowtie job.
# NOTE(review): `bowtie_jobs` is not defined in the visible cells -- confirm
# where it is created.
[x.stdout for x in bowtie_jobs]

In [0]:
hosts = !qhost | grep godel
hosts = [x.split()[0] for x in hosts]
for h in hosts:
    if h not in ['godel21', 'godel37', 'godel200']: 
        res = !ssh $h ps aux | grep cfriedline | grep bowtie2
        if res:
            print(h, res)

In [0]:
# Collect the `.r` attribute (presumably the job result) of every bowtie job.
# NOTE(review): `bowtie_jobs` is not defined in the visible cells.
bowtie_results = [x.r for x in bowtie_jobs]

In [0]:
# Directory in which the pickled bowtie results are stored.
# NOTE(review): this path is under gypsy_indiv, not the SWWP_seq2 tree used
# by the other cells -- confirm it is the intended location.
demult_dir = "/gpfs_fs/home/eckertlab/gypsy_indiv/raw_demult"

In [0]:
# Persist the collected bowtie results as a pickle inside demult_dir.
bowtie_results_pkl = os.path.join(demult_dir, "bowtie_results.pkl")
with open(bowtie_results_pkl, "wb") as o:
    pickle.dump(bowtie_results, o, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
sam_files = !find . -type f | grep '.sam$' | grep 'masurca3'
sam_files = [os.path.abspath(x) for x in sam_files]
assert len(sam_files) == len(fastq_files)

In [0]:
# Call convert_sam_to_bam once per SAM file and keep what each call returns.
sam_bam_jobs = [convert_sam_to_bam(os.path.abspath(f)) for f in sam_files]

In [0]:
# Hand the conversion jobs to the include_utils helper
# (presumably reports how many have finished -- helper not visible here).
u.get_async_progress(sam_bam_jobs)

In [0]:
# Inspect the `.r` attribute (presumably the result) of the first conversion job.
sam_bam_jobs[0].r

In [0]:
@lview.remote()
def add_rg_info_to_bam(bam):
    import os
    lane_map = {"I1": 1, "I3":2}
    cmd = "java -jar /home/cfriedline/gpfs/src/picard-tools-1.112/AddOrReplaceReadGroups.jar"
    base = os.path.basename(bam).split(".")
    bam_rg = bam.replace(".bam", "_rg.bam")
    rglb = os.path.basename(os.path.dirname(os.path.dirname(bam)))
    rgpu = base[0].split("_")[-1]
    rgsm = base[0]
    rgid = "FLOWCELL.LANE%d.%s" % (lane_map[rglb],rgsm)
    rg_string = "RGID=%s RGLB=%s RGPL=illumina RGPU=%s RGSM=%s" % (rgid,
                                                                   rglb,
                                                                   rgpu,
                                                                   rgsm)
    cmd = "%s INPUT=%s OUTPUT=%s %s CREATE_INDEX=true" % (cmd,
                                                          bam,
                                                          bam_rg,
                                                          rg_string)
#     if not os.path.exists(bam_rg):
    !$cmd
    return bam_rg, rg_string, cmd
    

In [0]:
sorted_bams = !find . -type f | grep '/masurca3' | grep 'sorted.bam$'
sorted_bams = [os.path.abspath(x) for x in sorted_bams if 'bam' in x]
assert len(sorted_bams) == len(fastq_files)

In [0]:
# Call add_rg_info_to_bam once per sorted BAM and keep what each call returns.
add_rg = [add_rg_info_to_bam(f) for f in sorted_bams]

In [0]:
# Hand the read-group jobs to the include_utils helper
# (presumably reports how many have finished -- helper not visible here).
u.get_async_progress(add_rg)

In [0]:
rg_bams = !find . | grep 'rg.bam$' | grep 'masurca3'
rg_bams = sorted([os.path.abspath(x) for x in rg_bams if 'rg.bam' in x])
assert len(rg_bams) == len(fastq_files)

In [0]:
# Number of read-group BAM files found.
len(rg_bams)