In [0]:
import sys

sys.path.append("../include_utils/")

#from IPython.parallel import Client
import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
import seaborn as sns
from IPython.display import FileLink
import urllib.request as urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp

In [0]:
def setup_r():
    """Point the process environment at the local R installation.

    Sets ``R_HOME`` to the hard-coded R install directory and rebuilds
    ``LD_LIBRARY_PATH`` as ``$R_HOME/lib:<previous value>:/home/cfriedline/lib64``.

    When ``LD_LIBRARY_PATH`` is not set yet, the previous value is simply
    omitted instead of raising ``KeyError``.
    """
    r_home = '/home/cfriedline/g/R3/lib64/R'
    os.environ['R_HOME'] = r_home

    search_path = ["%s/lib" % r_home]
    previous = os.environ.get('LD_LIBRARY_PATH', '')
    # Skip an empty previous value: an empty entry in LD_LIBRARY_PATH is
    # interpreted by the dynamic loader as the current working directory.
    if previous:
        search_path.append(previous)
    search_path.append("/home/cfriedline/lib64")
    os.environ['LD_LIBRARY_PATH'] = ":".join(search_path)

In [0]:
# setup_r() runs before the rpy2 import so that R_HOME / LD_LIBRARY_PATH are
# already set when rpy2 is loaded.
setup_r()
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Enable rpy2's pandas conversion layer.
pandas2ri.activate()
# Short alias for the rpy2 R interface object.
r = robjects.r

In [0]:
# Notebook configuration: reload edited modules automatically, render
# matplotlib figures inline, and load the rpy2 IPython extension.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext rpy2.ipython

In [0]:
# Cluster client obtained from the project helper module; "sge" is presumably
# the name of an ipyparallel profile -- confirm in include_utils.
rc = u.get_client("sge")

In [0]:
# Two views over the cluster engines from the project helper (the names suggest
# a direct view and a load-balanced view -- confirm in include_utils).
dview, lview = u.get_views(rc)
# Displayed as the cell output: the size of the direct view.
len(dview)

In [0]:
# Perform these imports locally and on every engine in dview, so functions
# pushed to the engines can rely on the modules being available there.
with dview.sync_imports():
    import os
    import sys
    import socket
    import stopwatch
    from subprocess import Popen, PIPE
    import tempfile
    import shutil

In [0]:
# View used below to submit the MarkDuplicates jobs. Built by a project helper;
# the name suggests one engine per host -- TODO confirm in include_utils.
hlview = u.get_single_host_lview(rc, "all")
len(hlview)

In [0]:
# Root directory of the analysis and the sub-directory that receives the
# duplicate-marked BAMs, the ploidy file and the generated SNP-calling script.
bam_dir = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis"
analysis_dir = os.path.join(bam_dir, "samtools1.2_masurca3")
# exist_ok=True replaces the check-then-create sequence, so re-running the cell
# (or a concurrent creator) cannot fail between the check and the mkdir.
os.makedirs(analysis_dir, exist_ok=True)
assert os.path.isdir(analysis_dir)

In [0]:
# Collect input BAMs: paths under the masked directory that contain 'masurca3',
# end in 'rg.bam', and do not contain 'OTIS'.
bam_files = !find /home/cfriedline/eckertlab/gypsy_indiv/masked | grep masurca3 | grep 'rg.bam$' | grep -v OTIS
# Normalise to absolute paths; the '.bam' check drops any stray non-BAM lines
# from the shell output.
bam_files = [os.path.abspath(x) for x in bam_files if '.bam' in x]
len(bam_files)

In [0]:
# Absolute, cluster-specific locations of the external tools.
# `java` and `picard` are passed to mark_duplicates; `bcftools` is passed to
# call_snps through `args`.
samtools = "/home/cfriedline/gpfs/src/samtools-1.2/samtools"
bcftools = "/home/cfriedline/gpfs/src/bcftools-1.2/bcftools"
picard = "/home/cfriedline/gpfs/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/g/src/jdk1.8.0_60/bin/java"
perl = "/home/cfriedline/gpfs/opt/ActivePerl-5.16/bin/perl"

In [0]:
def mark_duplicates(args):
    """Run Picard MarkDuplicates on one BAM file and store the result.

    Picard writes to a scratch file first; the scratch file is moved to its
    final location only after the command has finished successfully, so a
    failed run never leaves an empty or partial BAM at the output path.

    Parameters
    ----------
    args : tuple
        ``(java, picard, bam_file, analysis_dir)``: the java executable, the
        Picard jar, the input BAM, and the directory that receives the output.

    Returns
    -------
    tuple
        ``(cmd, out_bam)``: the command line that was run (as a string) and the
        output path ``<analysis_dir>/<basename(bam_file)>_dedup.bam``. Picard's
        metrics are written to ``<out_bam>.metrics``.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero status.
    """
    # Local imports keep the function self-contained when it is pushed to
    # remote engines, matching the other pushed functions in this notebook.
    import os
    import shutil
    import subprocess
    import tempfile

    java, picard, bam_file, analysis_dir = args
    out_bam = os.path.join(analysis_dir, "%s_dedup.bam" % os.path.basename(bam_file))

    # Only the scratch path is needed; close the handle right away so no file
    # descriptor is leaked while Picard overwrites the file.
    scratch = tempfile.NamedTemporaryFile(delete=False)
    scratch.close()

    # Argument list (no shell), so paths containing spaces or shell
    # metacharacters are passed through unchanged.
    argv = [java, "-jar", picard, "MarkDuplicates",
            "INPUT=%s" % bam_file,
            "OUTPUT=%s" % scratch.name,
            "METRICS_FILE=%s.metrics" % out_bam]
    cmd = " ".join(argv)
    print(cmd)
    try:
        proc = subprocess.Popen(argv,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
        output, _ = proc.communicate()
        # Re-emit the tool's log through print so it stays visible in the
        # caller's captured stdout.
        print(output)
        if proc.returncode != 0:
            raise RuntimeError("MarkDuplicates failed with exit status %d for %s"
                               % (proc.returncode, bam_file))
        shutil.move(scratch.name, out_bam)
    finally:
        # After a successful move the scratch path no longer exists; on any
        # failure remove the leftover scratch file.
        if os.path.exists(scratch.name):
            os.remove(scratch.name)
    return cmd, out_bam

In [0]:
# Push the function into the namespace of every engine in dview.
dview['mark_duplicates'] = mark_duplicates

In [0]:
# Submit one MarkDuplicates job per input BAM and keep the async results so
# their progress can be checked afterwards.
rmdup_jobs = [
    hlview.apply_async(mark_duplicates, (java, picard, bam_path, analysis_dir))
    for bam_path in bam_files
]

In [0]:
# Check on the submitted MarkDuplicates jobs (project helper; the exact
# reporting behaviour is defined in include_utils).
u.get_async_progress(rmdup_jobs)

In [0]:
# Assembly FASTA; passed to call_snps as the reference for `samtools mpileup -f`.
assembly = "/home/cfriedline/eckertlab/projects/gypsy_moth/assemblies/masurca3/CA/10-gapclose/genome.ctg.fasta"

In [0]:
# List the files in analysis_dir whose names end in 'dedup.bam' (the outputs of
# mark_duplicates), excluding names containing 'OTIS'.
bam_rmdup_files = !ls {analysis_dir} | grep 'dedup.bam$' | grep -v OTIS
# `ls` yields bare file names; turn them into sorted full paths.
bam_rmdup_files = sorted([os.path.join(analysis_dir, x) for x in bam_rmdup_files])
len(bam_rmdup_files)

In [0]:
def create_ploidy_file(args):
    """Create the sample/ploidy table ``all.ploidy`` unless it already exists.

    ``args`` is ``(bam_files, analysis_dir)``. The file is written into
    ``analysis_dir`` with one tab-separated line per BAM: the BAM's base name
    truncated at the first ``.fastq``, followed by a ploidy of 2. An existing
    file is left untouched. The path of the ploidy file is returned.
    """
    import os
    bam_files, analysis_dir = args
    ploidy_file = os.path.join(analysis_dir, "%s.ploidy" % "all")

    # An existing table is reused as-is.
    if os.path.exists(ploidy_file):
        return ploidy_file

    diploid = 2
    with open(ploidy_file, "w") as handle:
        handle.writelines(
            "%s\t%d\n" % ("%s" % os.path.basename(bam_path).split(".fastq")[0], diploid)
            for bam_path in bam_files
        )
    return ploidy_file

In [0]:
# Push the function into the namespace of every engine in dview.
dview['create_ploidy_file'] = create_ploidy_file

In [0]:
def call_snps(args):
    """Build the ``samtools mpileup | bcftools call`` command for a set of BAMs.

    The command is only assembled and returned; this function does not run it.
    As a side effect the ploidy file is created in the output directory via
    ``create_ploidy_file`` (if it does not exist yet) and the host name is
    printed.

    Parameters
    ----------
    args : sequence
        ``(samtools, reference, bam_sorted, bcftools, raw_vcf, out_dir,
        contig_names)``:

        * ``samtools`` / ``bcftools`` -- paths to the executables.
        * ``reference`` -- reference FASTA given to ``mpileup -f``.
        * ``bam_sorted`` -- list of BAM paths, joined with spaces.
        * ``raw_vcf`` -- output file name; only its base name is used and it is
          placed in ``out_dir``.
        * ``out_dir`` -- output directory; when empty, ``$TMPDIR`` is used, or
          the system temporary directory if ``TMPDIR`` is not set.
        * ``contig_names`` -- text inserted verbatim into the mpileup options
          (presumably region/contig selection; may be an empty string).

    Returns
    -------
    tuple
        ``(pileup, elapsed)``: the shell pipeline as a string and the time
        measured by the stopwatch timer while building it.
    """
    import os
    import socket
    import tempfile

    import stopwatch

    print(socket.gethostname())
    timer = stopwatch.Timer()
    samtools, reference, bam_sorted, bcftools, raw_vcf, out_dir, contig_names = args
    if not out_dir:
        # Fall back to the system temp directory instead of raising KeyError
        # when TMPDIR is not defined in the environment.
        out_dir = os.environ.get('TMPDIR') or tempfile.gettempdir()
    raw_vcf = os.path.join(out_dir, os.path.basename(raw_vcf))
    ploidy_file = create_ploidy_file((bam_sorted, out_dir))
    pileup = "%s mpileup -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR %s -ugf %s %s | %s call -f GP,GQ -S %s -vmO z -o %s" % (
        samtools,
        contig_names,
        reference,
        ' '.join(bam_sorted),
        bcftools,
        ploidy_file,
        raw_vcf)

    # The pipeline is intentionally not executed here; the caller decides how
    # to run the returned command string.
    timer.stop()
    return pileup, timer.elapsed

In [0]:
# Left disabled: call_snps is invoked directly in this notebook rather than on
# the engines, so it is not pushed to dview.
#dview['call_snps'] = call_snps

In [0]:
# Positional arguments for call_snps, in the order it unpacks them:
# samtools, reference, BAM list, bcftools, output VCF name, output directory,
# contig_names (empty here).
# NOTE(review): the samtools path is a samtools-1.3 literal, not the `samtools`
# variable defined earlier (samtools-1.2), and the output directory literal
# equals `analysis_dir` -- confirm both are intended.
args = ["/home/cfriedline/g/src/samtools-1.3/samtools", 
        assembly, 
        bam_rmdup_files, 
        bcftools, 
        "samtools_1.3.vcf.gz", 
        '/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_masurca3',
        ""]

In [0]:
# Build the SNP-calling pipeline; call_snps returns the command string (and the
# time taken to build it) without executing the pipeline.
pileup, elapsed = call_snps(args)

In [0]:
# Save the generated pipeline as a bash script in analysis_dir.
with open(os.path.join(analysis_dir, "samtools.sh"), "w") as o:
    o.writelines(["#!/bin/bash\n", "%s\n" % pileup])

## Run on SGE

Submit the generated `samtools.sh` script to the SGE scheduler with `qsub`:
```bash
cd /gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_masurca3
qsub -cwd -N samtools -j y -V samtools.sh
```