In [0]:
import os, sys
from IPython.parallel import Client
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
cd /data7/eckertlab/gypsy_indiv

In [0]:
fastq_files = !find . -type f | grep '.fastq$' | grep -v HiSeq
fastq_files = sorted([os.path.abspath(x) for x in fastq_files])

In [0]:
# Create the IPython.parallel client for the cluster.
rc = Client()
# View obtained by slicing the client over all of its engines.
dview = rc[:]
# Load-balanced view; the @lview.remote() functions below submit work through it.
lview = rc.load_balanced_view()
# Report the size of the client (presumably the number of connected engines -- confirm).
print(len(rc))

In [0]:
# Contig FASTA used as the reference: indexed by bowtie2-build, passed to
# run_bowtie2 as the -x index, and handed to freebayes via -f.
assembly = "/home/cfriedline/data7/assemblies/gypsy/masurca/CA/10-gapclose/genome.ctg.fasta"

In [0]:
# Build the bowtie2 index for the assembly; the same path is given twice, so
# the FASTA path doubles as the index name that run_bowtie2 passes to -x.
!/home/cfriedline/data7/src/bowtie2-2.2.2/bowtie2-build -f $assembly $assembly

In [0]:
@lview.remote()
def run_bowtie2(args):
    import os, stopwatch, multiprocessing
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    assembly, reads = args
    sam = "%s.sam" % reads
    cmd = "/home/cfriedline/data7/src/bowtie2-2.2.2/bowtie2 --local --very-sensitive-local -p %d -x %s -U %s -S %s" % (cpus, assembly, reads, sam)
    if not os.path.exists(sam):
        !$cmd
    timer.stop()
    return assembly, sam, cmd, timer.elapsed


In [0]:
@lview.remote()
def convert_sam_to_bam(sam):
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    bam = sam.replace(".sam", ".bam")
    bam_sorted = "%s_sorted.bam" % bam.replace(".bam", "")
    bam_index = bam_sorted.replace(".bam", ".bai")
    if not os.path.exists(bam):
        !/home/cfriedline/data7/src/samtools-0.1.19/samtools view -bS $sam > $bam
        !/home/cfriedline/data7/src/samtools-0.1.19/samtools sort -@ $cpus -f $bam $bam_sorted
        !/home/cfriedline/data7/src/samtools-0.1.19/samtools index $bam_sorted $bam_index
    timer.stop()
    return bam, bam_sorted, bam_index, timer.elapsed

In [0]:
# Submit one bowtie2 alignment per FASTQ file; the returned objects are
# polled with .ready() in the next cell.
bowtie_jobs = [run_bowtie2((assembly, fastq)) for fastq in fastq_files]

In [0]:
# Progress check: finished alignment jobs out of the total submitted.
bowtie_ready = sum(1 for job in bowtie_jobs if job.ready())
print("%d/%d" % (bowtie_ready, len(bowtie_jobs)))

In [0]:
sam_bam_jobs = []
sam_files = !find . -type f | grep '.sam$'
sam_files = [os.path.abspath(x) for x in sam_files]
for f in sam_files:
    sam_bam_jobs.append(convert_sam_to_bam(os.path.abspath(f)))

In [0]:
# Progress check: finished SAM -> BAM conversions out of the total submitted.
sam_bam_count = sum(1 for job in sam_bam_jobs if job.ready())
print("%d/%d" % (sam_bam_count, len(sam_bam_jobs)))

In [0]:
@lview.remote()
def mark_duplicates(bam_file):
    output = bam_file.replace(".bam", "_dedup.bam")
    metrics = output.replace(".bam", ".metrics")
    prog = "/home/cfriedline/data7/src/picard-tools-1.112/MarkDuplicates.jar"
    cmd = "java -jar %s INPUT=%s OUTPUT=%s METRICS_FILE=%s REMOVE_DUPLICATES=true CREATE_INDEX=true" % (prog,
                                                               bam_file,
                                                               output,
                                                               metrics)
    !$cmd
    return output, metrics

In [0]:
@lview.remote()
def index_bam(bam_file):
    bam_index = bam_file.replace(".bam", ".bai")
    !/home/cfriedline/data7/src/samtools-0.1.19/samtools index $bam_file $bam_index
    return bam_index

In [0]:
sorted_bams = !find . -type f | grep 'sorted.bam$'
sorted_bams = sorted([os.path.abspath(x) for x in sorted_bams])

In [0]:
# Submit one MarkDuplicates job per sorted BAM.
dedup_jobs = [mark_duplicates(bam) for bam in sorted_bams]

In [0]:
# Progress check: finished MarkDuplicates jobs out of the total submitted.
dedup_count = sum(1 for job in dedup_jobs if job.ready())
print("%d/%d" % (dedup_count, len(dedup_jobs)))

In [0]:
# For every MarkDuplicates job that has finished, submit an indexing job for
# its output BAM (first element of the job's result). Jobs that are not ready
# yet are skipped.
index_jobs = [index_bam(job.r[0]) for job in dedup_jobs if job.ready()]

In [0]:
# Progress check: finished indexing jobs out of the total submitted.
index_count = sum(1 for job in index_jobs if job.ready())
print("%d/%d" % (index_count, len(index_jobs)))

In [0]:
dedup_bams = !find . -type f | grep 'dedup.bam$'
dedup_bams = sorted([os.path.abspath(x) for x in dedup_bams])

In [0]:
@lview.remote()
def run_realigner_target_creator(bam, assembly):
    import os, socket, sys
    java = "/home/cfriedline/jdk1.7.0_25/bin/java"
    gatk = "%s -Xmx4g -jar /home/cfriedline/data7/src/GATK-3.1.1/GenomeAnalysisTK.jar" % java
    intervals = "%s.intervals" % bam
    cmd = "%s -T RealignerTargetCreator -R %s -I %s -o %s -nt 8" % (gatk,
                                                              assembly,
                                                              bam,
                                                              intervals)
    print socket.gethostname(), cmd
    !$cmd
    return cmd, socket.gethostname(), intervals

In [0]:
@lview.remote()
def add_rg_info_to_bam(bam):
    import os
    lane_map = {"I1": 1, "I3":2}
    cmd = "java -jar /home/cfriedline/data7/src/picard-tools-1.112/AddOrReplaceReadGroups.jar"
    base = os.path.basename(bam).split(".")
    bam_rg = bam.replace(".bam", "_rg.bam")
    rglb = os.path.basename(os.path.dirname(bam))
    rgpu = base[0].split("_")[-1]
    rgsm = base[0]
    rgid = "FLOWCELL.LANE%d.%s" % (lane_map[os.path.basename(os.path.dirname(bam))],
                                   rgsm)
    rg_string = "RGID=%s RGLB=%s RGPL=illumina RGPU=%s RGSM=%s" % (rgid,
                                                                   rglb,
                                                                   rgpu,
                                                                   rgsm)
    cmd = "%s INPUT=%s OUTPUT=%s %s CREATE_INDEX=true" % (cmd,
                                                          bam,
                                                          bam_rg,
                                                          rg_string)
    !$cmd
    return bam_rg, rg_string, cmd
    

In [0]:
# Submit one read-group tagging job per sorted BAM.
add_rg = [add_rg_info_to_bam(bam) for bam in sorted_bams]


In [0]:
# Progress check: finished read-group jobs out of the total submitted.
rg_count = sum(1 for job in add_rg if job.ready())
print("%d/%d" % (rg_count, len(add_rg)))

In [0]:
rg_bams = !find . -type f | grep 'rg.bam$'
rg_bams = sorted([os.path.abspath(x) for x in rg_bams])

In [0]:
# Submit one MarkDuplicates job per read-group-tagged BAM.
rg_dedup = [mark_duplicates(bam) for bam in rg_bams]

In [0]:
# Progress check: count the finished jobs and print the captured stdout of
# every job that is still running.
rg_dedup_count = 0
for job in rg_dedup:
    if not job.ready():
        print(job.stdout)
        continue
    rg_dedup_count += 1
print("%d/%d" % (rg_dedup_count, len(rg_dedup)))

## Get the deduplicated BAM files with read-group (RG) info added

In [0]:
rg_dedup_bams = !find . -type f | grep 'rg_dedup.bam$'
rg_dedup_bams = sorted([os.path.abspath(x) for x in rg_dedup_bams])

In [0]:
# Display the collected paths as the cell output for a visual check.
rg_dedup_bams

In [0]:
# Compare on-disk sizes (bytes / 1e6, i.e. MB) of each read-group BAM before
# and after duplicate removal. The pre-dedup path is recovered by stripping
# "_dedup" from the deduplicated file's path.
rg_sizes = []
rgd_sizes = []
for rgd in rg_dedup_bams:
    rg = rgd.replace("_dedup", "")
    rg_sizes.append(os.path.getsize(rg) / 1e6)
    rgd_sizes.append(os.path.getsize(rgd) / 1e6)
# Label both series and the axes so the figure can be read on its own.
plt.plot(rg_sizes, label="sorted_rg")
plt.plot(rgd_sizes, label="sorted_rg_dedup")
plt.xlabel("BAM file (position in rg_dedup_bams)")
plt.ylabel("File size (MB)")
plt.legend()
plt.show()

## Setting up for freebayes

In [0]:
# Write the deduplicated, read-group-tagged BAM paths to a list file, one per
# line; the file is handed to freebayes through -L.
bamlist_out = "bamlist.txt"
with open(bamlist_out, "w") as bam_list:
    bam_list.writelines("%s\n" % path for path in rg_dedup_bams)

freebayes = "/home/cfriedline/data7/src/freebayes/bin/freebayes"
# The VCF path sits in the same directory as the BAM list.
vcf_out = os.path.abspath(os.path.join(os.path.dirname(bamlist_out), "gypsy_freebayes_out.vcf"))
cmd = "%s -L %s -f %s -v %s" % (freebayes, os.path.abspath(bamlist_out), assembly, vcf_out)
# The command is only printed here, not executed.
print(cmd)
