In [0]:
import sys

sys.path.append("../include_utils/")

#from IPython.parallel import Client
import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE, STDOUT, check_output, call
import seaborn as sns
from IPython.display import FileLink
import urllib.request as urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp
import glob

In [0]:
# Root working directory for the split/parallel mapping analysis.
# NOTE(review): the path starts with "~", so it only works where the consumer
# expands it (e.g. the cd magic); plain os.* calls would need os.path.expanduser.
workdir = "~/eckertlab/Mitra/mapping/split_parallel/collapsed/work"

In [0]:
# Make workdir the notebook's current directory; later cells use relative paths.
cd $workdir

In [0]:
# Directory (relative to the BAM directory) set aside for the samtools 1.3 results.
bam_dir = "."
analysis_dir = os.path.join(bam_dir, "samtools1.3")
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(analysis_dir, exist_ok=True)
# Fail early if the path exists but is not a directory.
assert os.path.isdir(analysis_dir)

In [0]:
# Capture the names of all BAM files in the current directory (output of ls, one per entry).
bam_files = !ls *.bam

In [0]:
# Sanity check: the analysis expects exactly 381 input BAM files. Asserting (rather
# than just displaying True/False) stops a Run-All when the inputs are incomplete.
assert len(bam_files) == 381, "expected 381 BAM files, found %d" % len(bam_files)

In [0]:
# Locations of the external tools used by this notebook.
freebayes = "/home/cfriedline/g/src/freebayes/bin/freebayes"
freebayes_parallel = "/home/cfriedline/g/src/freebayes/scripts/freebayes-parallel"
fasta_generate_regions = "/home/cfriedline/g/src/freebayes/scripts/fasta_generate_regions.py"
# `samtools` is used by the faidx, BAM-splitting and quickcheck cells but was never
# defined, so a fresh kernel failed with NameError.
# NOTE(review): this resolves samtools through PATH; replace with the absolute path of
# the intended samtools build if PATH on the notebook host or engines differs.
samtools = "samtools"

In [0]:
# Reference FASTA that the freebayes commands built below are run against.
assembly = "/gpfs_fs/home/eckertlab/SugarPine_genome/pila.v1.0.scafSeq_mapped.fasta"

In [0]:
# Index the reference with `samtools faidx`; the freebayes command built later reads
# "<assembly>.fai". IPython substitutes $samtools and {assembly} before the shell runs.
!$samtools faidx {assembly}

In [0]:
def create_split_beds(nodes, bed):
    """Split a BED file into at most ``nodes`` chunks named ``contig.bed.NNN``.

    The chunks are written to the current working directory by the external
    ``split`` command, using three-digit numeric suffixes.

    Parameters
    ----------
    nodes : int
        Maximum number of chunk files to produce (must be >= 1).
    bed : str
        Path of the BED file to split.

    Raises
    ------
    ValueError
        If ``nodes`` is smaller than 1.
    RuntimeError
        If ``split`` exits with a non-zero status.
    """
    if nodes < 1:
        raise ValueError("nodes must be >= 1, got %r" % (nodes,))
    # Count lines with a context manager so the file handle is always closed.
    with open(bed) as handle:
        lines = sum(1 for _ in handle)
    # Ceiling division keeps the number of chunks <= nodes (floor division produced
    # extra chunks); the max() guards against `split -l 0` when lines < nodes.
    per_bed = max(1, -(-lines // nodes))
    print(lines, per_bed)
    # An argument list (instead of str.split) keeps paths containing spaces intact.
    cmd = ["split", "-a", "3", "-d", "-l", str(per_bed), bed, "contig.bed."]
    status = call(cmd)
    if status != 0:
        raise RuntimeError("split exited with status %d for %s" % (status, bed))
# Split the contig BED file with nodes=150; chunk files land in the current directory.
create_split_beds(150, "../contigs.bed")

In [0]:
# Collect the names of the BED chunks written by the split step.
beds = !ls contig.bed.*

In [0]:
# Display how many BED chunks were found.
len(beds)

In [0]:
# Connect to the ipyparallel cluster running under the "sge" profile and obtain two
# views through the project helper module.
# NOTE(review): presumably dv is a direct view and lv a load-balanced view — confirm
# in include_utils.
rc = u.get_client(profile="sge")
dv, lv = u.get_views(rc)
# Display the size of dv (presumably the number of connected engines).
len(dv)

In [0]:
# Perform these imports locally and on the engines behind dv, so that functions
# shipped to the engines can use os, call, etc.
with dv.sync_imports():
    from subprocess import Popen, PIPE, STDOUT, check_output, call
    import os, sys, socket, glob

In [0]:
# Run the same cd on the parallel engines so relative paths resolve there as well.
%px cd $workdir

In [0]:
def create_parallel_bams(args, workdir="/home/cfriedline/eckertlab/Mitra/mapping/split_parallel/collapsed/work/"):
    """Extract the part of one BAM file that falls inside one BED chunk.

    Runs ``samtools view -L <bed_file> -b <bam_file>`` and stores the result as
    ``<bam_file>.<suffix>``, where ``<suffix>`` is the text after the last dot of
    ``bed_file``. An output file that already exists is left untouched.

    The output is first written to a hidden ``.<name>.partial`` file and renamed
    only when samtools exits with status 0. Previously the exit status was
    ignored, so a truncated file from a failed run was kept and then skipped
    forever by the existence check.

    Parameters
    ----------
    args : sequence
        ``(samtools, bam_file, bed_file)`` packed into one object so the function
        can be used with ``map``.
    workdir : str, optional
        Directory to change into before running; defaults to the analysis
        working directory.

    Returns
    -------
    str
        The output file name, whether or not the file was (re)created.
    """
    os.chdir(workdir)
    samtools, bam_file, bed_file = args
    num = bed_file.split(".")[-1]
    out = "%s.%s" % (bam_file, num)
    if not os.path.exists(out):
        # Leading dot: the temporary file is not matched by globs such as `*.bam.*`.
        partial = os.path.join(os.path.dirname(out),
                               ".%s.partial" % os.path.basename(out))
        # Argument list instead of str.split so paths with whitespace survive.
        status = call([samtools, "view", "-L", bed_file, "-b", bam_file, "-o", partial])
        if status == 0 and os.path.exists(partial):
            os.rename(partial, out)
        else:
            # Drop the incomplete output so a later run retries this chunk.
            if os.path.exists(partial):
                os.remove(partial)
            sys.stderr.write("samtools view failed (exit %s) for %s\n" % (status, out))
    return out

In [0]:
# Publish the function under the same name through dv (dictionary-style assignment).
dv['create_parallel_bams'] = create_parallel_bams

In [0]:
# One work item per (BAM, BED chunk) pair, in BAM-major order; each item is the
# [samtools, bam, bed] list that create_parallel_bams unpacks.
jobs = []
args = [[samtools, bam_name, bed_name]
        for bam_name in bam_files
        for bed_name in beds]

In [0]:
# Display the expected number of work items (every BAM paired with every BED chunk).
len(bam_files)*len(beds)

In [0]:
# Submit all BAM-splitting work items asynchronously through dv; `jobs` is the handle
# used below to watch progress.
jobs = dv.map_async(create_parallel_bams, args)

In [0]:
# Display the progress attribute of the running map; re-run this cell to refresh it.
jobs.progress

In [0]:
# Show the notebook's current working directory.
pwd

In [0]:
def check_bams(args, workdir="/home/cfriedline/eckertlab/Mitra/mapping/split_parallel/collapsed/work"):
    """Run ``samtools quickcheck`` on one BAM file and return its exit status.

    Parameters
    ----------
    args : sequence
        ``(samtools, bam)`` packed into one object so the function can be used
        with ``map``.
    workdir : str, optional
        Directory to change into before running; defaults to the analysis
        working directory.

    Returns
    -------
    int
        Exit status of the samtools process.
    """
    samtools, bam = args
    os.chdir(workdir)
    # Argument list instead of str.split so paths with whitespace stay intact.
    return call([samtools, "quickcheck", bam])

In [0]:
# Publish the function under the same name through dv (dictionary-style assignment).
dv['check_bams'] = check_bams

In [0]:
par_bams = !ls *.bam.* | grep -v '.bai$'

In [0]:
# Display how many split BAM files were found.
len(par_bams)

In [0]:
# One (samtools, bam) work item per split BAM, in par_bams order, for check_bams.
args = [(samtools, split_bam) for split_bam in par_bams]

In [0]:
# Submit the quickcheck work items asynchronously through lv; results are read below.
jobs = lv.map_async(check_bams, args)

In [0]:
# Report every split BAM whose quickcheck exit status is non-zero, together with its
# name (the bare status alone does not say which file is broken).
# Pairing relies on map_async yielding results in submission order, which is the
# ipyparallel default (ordered=True) — confirm if the view was configured otherwise.
for (_, checked_bam), status in zip(args, jobs):
    if status != 0:
        print(checked_bam, status)

In [0]:
# Group the split BAMs by chunk suffix (the text after the last dot), so each entry
# holds every sample's BAM for one chunk.
job_map = {}
for split_bam in par_bams:
    chunk_id = split_bam.split(".")[-1]
    job_map.setdefault(chunk_id, []).append(split_bam)

In [0]:
# Put each chunk's BAM list into sorted order (keys keep their existing order).
job_map = {chunk_id: sorted(chunk_bams) for chunk_id, chunk_bams in job_map.items()}

In [0]:
## split assembly for freebayes mapping
# NOTE(review): this cell only builds and displays the command string; nothing is
# executed here.
"pyfasta split -n 150 {}".format(assembly)

In [0]:
# One argument tuple per chunk for call_snps:
# (freebayes-parallel script, region generator, reference, BAMs of the chunk,
#  output directory, chunk suffix).
# The loop variable is deliberately NOT named `bam_files`: rebinding that name here
# replaced the full list of input BAMs with the last chunk's split BAMs for every
# later cell that reads `bam_files`.
snp_args = []
for num in job_map:
    chunk_bams = job_map[num]
    a = (freebayes_parallel,
         fasta_generate_regions,
         assembly,
         chunk_bams,
         "freebayes",
         num)
    snp_args.append(a)

In [0]:
def call_snps(args):
    """Build (without running) the shell command line for one freebayes-parallel job.

    Parameters
    ----------
    args : sequence
        ``(freebayes_parallel_script, region_script, reference_fasta, bam_files,
        out_dir, num)``. ``bam_files`` is an iterable of BAM names; ``num`` is
        appended as a suffix to the output VCF name.

    Returns
    -------
    str
        A command that uses ``<(...)`` process substitution and redirects the
        output to ``<out_dir>/freebayes.vcf.<num>``.
    """
    runner, region_script, reference, bams, out_dir, num = args
    vcf_path = os.path.join(out_dir, "freebayes.vcf")
    # Shape follows the upstream usage line:
    # freebayes-parallel <(fasta_generate_regions.py ref.fa.fai 100000) 36 -f ref.fa aln.bam >out.vcf
    template = (
        "{runner} <({regions} {ref}.fai 200000) 3 "
        "--use-best-n-alleles 4 -f {ref} {bams} > {vcf}.{num}"
    )
    return template.format(
        runner=runner,
        regions=region_script,
        ref=reference,
        bams=" ".join(bams),
        vcf=vcf_path,
        num=num,
    )

In [0]:
# SGE directives shared by every per-chunk job script; the three %d slots all take the
# job index (job name, stdout log, stderr log).
header_template = """#!/bin/bash
#$ -N fb%d
#$ -cwd
#$ -V
#$ -o freebayes/fb%d.out
#$ -e freebayes/fb%d.err
#$ -q all.q"""

# Write one job script per chunk plus a driver script that qsubs each of them.
with open("freebayes_jobs.sh", "w") as submit_script:
    submit_script.write("#!/bin/bash\n")
    for index, job_args in enumerate(snp_args):
        run_script = "freebayes_run%d.sh" % index
        submit_script.write("qsub %s\n" % run_script)
        with open(run_script, "w") as job_script:
            job_script.write(header_template % (index, index, index) + "\n")
            job_script.write(call_snps(job_args) + "\n")

In [0]:
# Single-job variant: one freebayes-parallel command over `bam_files`, writing
# freebayes/freebayes.vcf.all.
# NOTE(review): this expects `bam_files` to be the complete list of input BAMs; verify
# it has not been rebound to a per-chunk list before running this cell.
args_all = (freebayes_parallel,
            fasta_generate_regions,
            assembly,
            bam_files,
            "freebayes",
            "all")

# Use a dedicated label for the job name and log files. The previous hard-coded index 0
# produced "fb0", which is also the name and log path of freebayes_run0.sh.
all_label = "all"
header = """#!/bin/bash
#$ -N fb_%s
#$ -cwd
#$ -V
#$ -o freebayes/fb_%s.out
#$ -e freebayes/fb_%s.err
#$ -q all.q""" % (all_label, all_label, all_label)

with open("freebayes_all.sh", "w") as j:
    j.write("%s\n" % header)
    j.write("%s\n" % call_snps(args_all))

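## Run on SGE

Run the following in a shell on the cluster rather than in the notebook. The `freebayes` directory must exist before submitting, because the job scripts write their logs and VCF output into it.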
```bash
cd /gpfs_fs/home/eckertlab/Mitra/mapping/split_parallel/collapsed/work
mkdir freebayes
chmod +x freebayes*.sh
./freebayes_jobs.sh
```

In [0]:
# Move into the samtools1.3 results directory (relative path, so this cell only works
# once per session from the working directory).
cd samtools1.3/

In [0]:
vcfs = !ls samtools_1.3.vcf.gz.*

In [0]:
# Display the collected file names for a visual check.
vcfs

In [0]:
# Locations of the VCF tools: vcf-concat (vcftools 0.1.14) and tabix (htslib 1.3).
# NOTE(review): `tabix` is not referenced by any code cell shown here; the indexing
# step below calls a bare `tabix` from the shell.
vcf_concat = "/home/cfriedline/g/src/vcftools-0.1.14/src/perl/vcf-concat"
tabix = "/home/cfriedline/g/src/htslib-1.3/tabix"

### Index the per-chunk VCF files with tabix

```
ls samtools_1.3.vcf.* | parallel tabix
```

In [0]:
# Write a script that concatenates all per-chunk VCFs into concat.vcf using
# vcf-concat with `-s 50`.
with open("concat.sh", "w") as o:
    # Shebang added for consistency with the generated freebayes scripts, so that
    # ./concat.sh runs under bash regardless of the invoking shell.
    o.write("#!/bin/bash\n")
    o.write("%s -s 50 %s > concat.vcf\n" % (vcf_concat, " ".join(vcfs)))

### Concatenate the VCF files

```
chmod +x concat.sh
./concat.sh
```

In [0]:
# Print vcf-concat's usage text for reference.
!$vcf_concat --help