In [0]:
import sys

sys.path.append("../include_utils/")

#from IPython.parallel import Client
import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE, STDOUT, check_output
import seaborn as sns
from IPython.display import FileLink
import urllib.request as urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp
from ipyparallel import Client
import shutil

In [0]:
cd "~/eckertlab/Mitra/mapping/split_parallel/collapsed/"

In [0]:
# Output directory for this analysis, created under bam_dir.
bam_dir = "."
analysis_dir = os.path.join(bam_dir, "samtools1.3")
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(analysis_dir, exist_ok=True)
# Fail early if the path exists but is not a directory.
assert os.path.isdir(analysis_dir)

In [0]:
# Read the BAM paths listed one per line in bams.txt, sort them, and make them
# absolute. The context manager closes the file; blank lines are skipped so an
# empty entry is not turned into the current directory by abspath("").
with open("bams.txt") as handle:
    bam_files = sorted(line.strip() for line in handle if line.strip())
bam_files = [os.path.abspath(x) for x in bam_files]

In [0]:
# Sanity check on the number of BAMs listed in bams.txt; this displays
# True/False rather than failing. 381 is presumably the expected sample
# count — TODO confirm.
len(bam_files) == 381

In [0]:
# Absolute, user-specific install locations of the external tools.
# samtools is the one passed to the helper functions in this notebook section.
samtools = "/home/cfriedline/bin/samtools"
bcftools = "/home/cfriedline/bin/bcftools"
picard = "/home/cfriedline/gpfs/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/g/src/jdk1.8.0_92/bin/java"
perl = "/home/cfriedline/gpfs/opt/ActivePerl-5.18/bin/perl"

In [0]:
def remove_header(args):
    """Write a BAM out as SAM via ``samtools view`` (no ``-h``, so no header).

    Parameters
    ----------
    args : tuple
        ``(samtools, bam)``: path to the samtools executable and the BAM file.

    Returns
    -------
    str
        Path of the SAM file written, ``<bam>_nosq.sam``.

    Notes
    -----
    The command string is split on whitespace, so neither path may contain
    spaces. ``check_output`` raises if samtools exits with a non-zero status.
    """
    samtools_exe, bam_path = args
    sam_path = "%s_nosq.sam" % bam_path
    command = "%s view %s -o %s" % (samtools_exe, bam_path, sam_path)
    # stderr is folded into stdout so it travels with any raised error.
    check_output(command.split(), stderr=STDOUT)
    return sam_path

In [0]:
# Connect to the ipyparallel cluster running under the "sge" profile.
rc = Client(profile="sge")
dv = rc[:]  # direct view over all engines (used for imports and pushing helpers)
lv = rc.load_balanced_view()  # load-balanced view used to submit the jobs

In [0]:
# Number of engines in the direct view.
len(dv)

In [0]:
# Perform these imports on every engine as well as locally, so the helper
# functions shipped to the engines can resolve these names when they run.
with dv.sync_imports():
    from subprocess import check_output, call, Popen, STDOUT, PIPE
    import os, socket
    import tempfile
    import shutil

In [0]:
# Put every engine in the same directory the notebook kernel was moved to,
# so relative file names resolve identically on the engines.
%px cd "~/eckertlab/Mitra/mapping/split_parallel/collapsed/"

In [0]:
# Publish remove_header under that name in each engine's namespace.
dv['remove_header'] = remove_header

In [0]:
# Submit one remove_header job per BAM to the load-balanced view and keep the
# AsyncResult handles for polling. remove_header is the helper defined in this
# notebook; apply_async matches the other submission cells.
jobs = []
for b in bam_files:
    j = lv.apply_async(remove_header, (samtools, b))
    jobs.append(j)

In [0]:
# Number of jobs submitted.
len(jobs)

In [0]:
# Number of submitted jobs that have finished so far (compare with len(jobs)).
np.sum([x.ready() for x in jobs])

In [0]:
# Files in the current directory whose names end in nosq.sam.
nosq_sam = !ls *nosq.sam

In [0]:
# Reference assembly FASTA; add_back_sq passes "<assembly>.fai" to samtools.
assembly = "/gpfs_fs/home/eckertlab/SugarPine_genome/pila.v1.0.scafSeq_mapped.fasta"

In [0]:
def add_back_sq(args):
    """Run ``samtools view -h -t <assembly>.fai`` on a SAM file.

    Parameters
    ----------
    args : tuple
        ``(samtools, sam, assembly)``: samtools executable, input SAM file,
        and the assembly FASTA whose ``.fai`` file is passed to ``-t``.

    Returns
    -------
    str
        The command string that was executed (not the output path). The
        output is written to ``<sam>_newsq.sam``.

    Notes
    -----
    The command string is split on whitespace, so paths must not contain
    spaces. ``check_output`` raises if samtools exits with a non-zero status.
    """
    samtools_exe, sam_path, assembly_fasta = args
    newsq_path = "%s_newsq.sam" % sam_path
    command = "%s view -h -t %s.fai %s -o %s" % (
        samtools_exe,
        assembly_fasta,
        sam_path,
        newsq_path,
    )
    check_output(command.split(), stderr=STDOUT)
    return command

In [0]:
# Publish add_back_sq under that name in each engine's namespace.
dv['add_back_sq'] = add_back_sq

In [0]:
# One add_back_sq job per nosq SAM file; the AsyncResult handles are kept for polling.
jobs = [lv.apply_async(add_back_sq, (samtools, n, assembly)) for n in nosq_sam]

In [0]:
# Number of add_back_sq jobs that have finished so far (compare with len(jobs)).
np.sum([x.ready() for x in jobs])

In [0]:
# Files in the current directory whose names end in newsq.sam.
newsq_sam = !ls | grep 'newsq.sam$'

In [0]:
def rewrite_rg(args):
    """Rebuild a sorted BAM whose header carries the original read group.

    Steps: read the first ``@RG`` line from the original BAM, dump the header
    of ``sam`` to a temporary file and append that ``@RG`` line, convert
    ``sam`` to BAM, replace its header with ``samtools reheader``, then sort.

    Parameters
    ----------
    args : tuple
        ``(samtools, sam)``: path to the samtools executable and a SAM file
        named like ``ATH163_rg_sorted_mapped.bam_nosq.sam_newsq.sam``. The
        original BAM name is derived by stripping ``_nosq.sam_newsq.sam``.

    Returns
    -------
    str
        Path of the sorted BAM, ``<sam>.bam_reheader.bam_sorted.bam``.

    Raises
    ------
    IndexError
        If no ``@RG`` line can be read from the original BAM header.
    RuntimeError
        If a samtools step exits with a non-zero status.

    Notes
    -----
    Uses ``Popen``, ``PIPE``, ``call``, ``tempfile`` and ``os`` from the
    executing namespace. Commands are split on whitespace, so paths must not
    contain spaces. Everything is kept inside this one function so it can be
    shipped to an engine on its own.
    """
    samtools, sam = args

    # sam: ATH163_rg_sorted_mapped.bam_nosq.sam_newsq.sam

    # get RG from original bam file
    bam = sam.replace("_nosq.sam_newsq.sam", "")
    cmd = "%s view -H %s | grep '^@RG'" % (samtools, bam)
    p = Popen(cmd, stdout=PIPE, shell=True)
    # communicate() drains the pipe while waiting; wait() on its own can
    # deadlock once the child fills the OS pipe buffer.
    stdout, _ = p.communicate()
    if not stdout:
        raise IndexError("no @RG line found in the header of %s" % bam)
    # Only the first @RG line is used.
    rg = stdout.split(b"\n", 1)[0].decode().strip()

    # dump header to temp file, append rg
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        header_path = handle.name
    try:
        cmd = "%s view -H %s -o %s" % (samtools, sam, header_path)
        status = call(cmd.split())
        if status != 0:
            raise RuntimeError("command failed with status %s: %s" % (status, cmd))
        with open(header_path, "a") as o:
            o.write("%s\n" % rg)

        # convert sam to bam
        newbam = "%s.bam" % sam
        cmd = "%s view -h -b %s -o %s" % (samtools, sam, newbam)
        status = call(cmd.split())
        if status != 0:
            raise RuntimeError("command failed with status %s: %s" % (status, cmd))

        # use reheader; it writes the BAM to stdout, which is sent straight
        # to the output file instead of going through a shell redirect
        reheader = "%s_reheader.bam" % newbam
        cmd = "%s reheader -P %s %s" % (samtools, header_path, newbam)
        with open(reheader, "wb") as out:
            status = call(cmd.split(), stdout=out)
        if status != 0:
            raise RuntimeError("command failed with status %s: %s" % (status, cmd))
    finally:
        # The temporary header is only needed up to the reheader step.
        os.remove(header_path)

    # sort bam
    sorted_bam = "%s_sorted.bam" % reheader
    cmd = "%s sort %s -o %s" % (samtools, reheader, sorted_bam)
    status = call(cmd.split())
    if status != 0:
        raise RuntimeError("command failed with status %s: %s" % (status, cmd))
    return sorted_bam


In [0]:
# Publish rewrite_rg under that name in each engine's namespace.
dv['rewrite_rg'] = rewrite_rg

In [0]:
# One rewrite_rg job per newsq SAM file; the AsyncResult handles are kept for polling.
jobs = [lv.apply_async(rewrite_rg, (samtools, n)) for n in newsq_sam]

In [0]:
# Number of rewrite_rg jobs that have finished so far (compare with len(jobs)).
np.sum([x.ready() for x in jobs])

In [0]:
# Files in the current directory whose names end in reheader.bam_sorted.bam,
# i.e. the naming pattern rewrite_rg returns.
working = !ls | grep 'reheader.bam_sorted.bam$'

In [0]:
!mkdir work

In [0]:
def copy_file(args):
    """Copy one file with ``shutil.copy``.

    Parameters
    ----------
    args : tuple
        ``(src, dst)``: source path and destination path.

    Returns
    -------
    None
    """
    source_path, target_path = args
    shutil.copy(source_path, target_path)

In [0]:
# Publish copy_file under that name in each engine's namespace.
dv['copy_file'] = copy_file

In [0]:
# Copy each sorted BAM into work/ under a short name: the text before the
# first underscore, upper-cased, plus ".bam".
jobs = []
for w in working:
    short_name = "%s.bam" % w.split("_")[0].upper()
    jobs.append(lv.apply_async(copy_file, (w, os.path.join("work", short_name))))

In [0]:
# Number of copy jobs that have finished so far (compare with len(jobs)).
np.sum([x.ready() for x in jobs])

In [0]:
cd work

In [0]:
# Move every engine into work/ as well, so the relative BAM names submitted
# from this directory resolve on the engines.
%px cd work

In [0]:
# BAM files in the current directory.
working_bams = !ls *.bam

In [0]:
# Number of BAM files found.
len(working_bams)

In [0]:
def index_bam(args):
    """Run ``samtools index`` on one BAM file.

    Parameters
    ----------
    args : tuple
        ``(samtools, bam)``: path to the samtools executable and the BAM file.

    Returns
    -------
    None
        The exit status from ``call`` is not checked or returned.

    Notes
    -----
    The command string is split on whitespace, so paths must not contain spaces.
    """
    samtools_exe, bam_path = args
    command = "%s index %s" % (samtools_exe, bam_path)
    call(command.split())

In [0]:
# One index_bam job per BAM file; the AsyncResult handles are kept for polling.
jobs = [lv.apply_async(index_bam, (samtools, b)) for b in working_bams]

In [0]:
# Number of indexing jobs that have finished so far (compare with len(jobs)).
np.sum([x.ready() for x in jobs])

In [0]:
pwd