In [1]:
from __future__ import print_function
import json
from matplotlib import pyplot as plt
import os
import sys
import glob
from time import time
import itertools
from itertools import islice, chain
from struct import *
import shutil
import pandas as pd
import argparse
import errno
import math
import subprocess

# import pickle
import dill as pickle 
import gffutils
import pysam
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool
from collections import defaultdict
# from collections import OrderedDict

# from modules import create_splice_graph as splice_graph
# from modules import graph_chainer 

from uLTRA.modules import create_augmented_gene as augmented_gene 
from uLTRA.modules import mem_wrapper 
from uLTRA.modules import colinear_solver 
from uLTRA.modules import help_functions
from uLTRA.modules import classify_read_with_mams
from uLTRA.modules import classify_alignment2
from uLTRA.modules import sam_output
from uLTRA.modules import align
from uLTRA.modules import prefilter_genomic_reads

from new_modules import functions
from new_modules import evaluate_exons
from new_modules import evaluate_splice_annotations
from new_modules import get_diff_loc_reads
from uLTRA.evaluation import plot_correctness_per_exon_size
from uLTRA.evaluation import plots
from uLTRA.evaluation import venn_diagram

In [7]:
from new_modules.arguments import arguments

def initialize_dump(outfolder):
    """Reset the output location `outfolder`.

    If `outfolder` currently is a directory, it is removed together with
    everything inside it.  Afterwards ``help_functions.mkdir_p(outfolder)`` is
    called (presumably to create the directory again -- the helper lives in
    uLTRA and is not shown in this notebook).
    """
    # os.path.isdir() is already False for paths that do not exist, so a
    # separate existence check is not required.
    stale_dir_present = os.path.isdir(outfolder)
    if stale_dir_present:
        shutil.rmtree(outfolder)
    help_functions.mkdir_p(outfolder)
    
def create_args(ref, gtf, bed, reads, ont, isoseq, disable_infer, name, tool_name, mm2, desalt_index):
    """Build the argument object for one aligner run and apply per-technology presets.

    All parameters are forwarded, in order, to the project helper
    ``arguments(...)``; the object it returns is then adjusted and returned.

    Adjustments made here:

    * ``args.name`` is always replaced by the reads file name without its
      directory and without its last extension
      (``data/DROS/DROS_processed.fastq`` -> ``DROS_processed``).  The `name`
      parameter is therefore only seen by ``arguments(...)`` itself.
    * if ``args.ont`` is truthy: ``min_mem = 17``, ``min_acc = 0.6``,
      ``mm2_ksize = 14``.
    * if ``args.isoseq`` is truthy: ``min_mem = 20``, ``min_acc = 0.8``.
      This check runs after the ONT one, so when both flags are set the
      Iso-Seq values win for ``min_mem``/``min_acc``.

    Returns
    -------
    The (mutated) object produced by ``arguments(...)``.
    """
    args = arguments(ref, gtf, bed, reads, ont, isoseq, disable_infer, name, tool_name, mm2, desalt_index)

    # The previous implementation sliced with str.rindex('/') and
    # str.rindex('.'), which raises ValueError for a reads path without a
    # directory component ("reads.fa") or without an extension ("dir/reads").
    # os.path handles both; for ordinary "dir/sample.ext" paths the result is
    # the same as before.
    args.name = os.path.splitext(os.path.basename(args.reads))[0]

    if args.ont:
        args.min_mem = 17
        args.min_acc = 0.6
        args.mm2_ksize = 14
    if args.isoseq:
        args.min_mem = 20
        args.min_acc = 0.8
    # Note: an ``alignment_threshold = 0.5`` override existed for both presets
    # but was commented out by the author; it is intentionally not applied.
    return args

In [12]:
def set_desalt_args(args):
    """Assemble the ``desalt aln`` command line for one run.

    Reads from `args`:
      * ``desalt_index`` -- index location passed as first positional argument
      * ``reads``        -- reads file passed as second positional argument
      * ``ref``          -- only inspected for the substring "SIRV"
      * ``gtf``          -- when not None, appended as ``--gtf <path>``
      * ``outfolder``    -- the SAM output goes to ``<outfolder>/reads.sam``

    Returns
    -------
    list of str
        Argument vector suitable for ``subprocess.check_call``.

    The previous version repeated the same list literal in four branches
    (SIRV / non-SIRV x with / without GTF).  The branches only differed in the
    two intron-related values and in the optional ``--gtf`` pair, so they are
    built once here; the resulting token order is unchanged.  The local
    ``l = 14`` was defined but never passed to the command and is dropped.
    """
    d = 10
    s = 2

    # SIRV references get a smaller --noncan value and a shorter maximum
    # intron length than every other reference.
    if "SIRV" in args.ref:
        noncan, max_intron_length = 4, 200000
    else:
        noncan, max_intron_length = 9, 500000

    cmd = [
        "desalt", "aln", args.desalt_index, args.reads,
        "-d", str(d), "-s", str(s),
        "--noncan", str(noncan), "--max-intron-len", str(max_intron_length),
    ]
    if args.gtf is not None:
        cmd += ["--gtf", args.gtf]
    cmd += ["-o", os.path.join(args.outfolder, "reads.sam")]
    return cmd


def deSALT(args):
    """Run deSALT on ``args.reads`` unless ``<outfolder>/reads.sam`` already exists.

    When the SAM file is missing this function

    1. resets ``args.outfolder`` via ``initialize_dump``,
    2. builds an index with ``desalt index`` into ``<outfolder>/index`` if
       ``args.desalt_index`` is unset or does not exist on disk, and points
       ``args.desalt_index`` at the freshly built index,
    3. runs the command returned by ``set_desalt_args(args)``,
    4. writes ``{"dataset": args.name, "total_time": <seconds>}`` to
       ``<outfolder>/stats.json`` (the time covers index building, if any,
       plus alignment).

    When the SAM file is already present nothing is executed.

    Raises
    ------
    subprocess.CalledProcessError
        If either deSALT invocation exits with a non-zero status.

    Fixes relative to the previous version:
      * a trailing, unconditional ``subprocess.check_call(["desalt", "aln"])``
        (the aligner invoked with no arguments at all) ran after every call,
        even when results were cached; it has been removed;
      * a freshly built index was written to ``<outfolder>/index`` but the
        alignment still used the non-existent ``args.desalt_index``;
      * the stats file handle was never closed.
    """
    sam_path = os.path.join(args.outfolder, "reads.sam")
    if os.path.exists(sam_path):
        # Alignment output from an earlier run is reused as-is.
        return

    stats = {"dataset": args.name}
    initialize_dump(args.outfolder)
    deSALT_start = time()

    if not args.desalt_index or not os.path.exists(args.desalt_index):
        built_index = os.path.join(args.outfolder, "index")
        subprocess.check_call(["desalt", "index", args.ref, built_index])
        # set_desalt_args() reads args.desalt_index, so it must point at the
        # index that actually exists now.
        args.desalt_index = built_index

    subprocess.check_call(set_desalt_args(args), env=os.environ)
    stats["total_time"] = time() - deSALT_start

    with open(os.path.join(args.outfolder, "stats.json"), "w") as stats_file:
        json.dump(stats, stats_file)

In [13]:
# Create a dict with the paths of each dataset.
# Each entry maps a dataset name to the inputs read by the pipeline functions:
# reference FASTA ("ref"), reads file ("reads"), annotation ("gtf", "bed"),
# deSALT index location ("desalt_index") and the technology flags
# ("ont", "isoseq") plus "disable_infer".
dataset_dict = {
    # NOTE: this disabled entry has no "desalt_index" key, which
    # real_data_pipeline() reads -- add one before re-enabling it.
    # "test" : {
    #     "ref": "uLTRA/test/SIRV_genes.fasta" ,
    #     "reads": "uLTRA/test/reads.fa" ,
    #     "gtf": "uLTRA/test/SIRV_genes_C_170612a.gtf" ,
    #     "bed":"uLTRA/test/SIRV_genes_C_170612a.bed",
    #     "ont": False,
    #     "isoseq": True,
    #     "disable_infer": False,
    # },
    "DROS": {
        "ref": "data/genome/DROS/DROS.BDGP6.28.all.fa",
        "reads": "data/DROS/DROS_processed.fastq",
        "gtf": "data/genome/annotations/Drosophila_melanogaster.BDGP6.28.102.gtf",
        "bed": "data/genome/annotations/bed/Drosophila_melanogaster.BDGP6.28.102.bed",
        # The literal used to list "desalt_index" twice ("…/dmIndex" here and
        # "…/dm" at the end).  Python keeps the last duplicate, so "…/dm" was
        # always the effective value; only that one is kept.
        "desalt_index": "data/genome/indexes/dm",
        "ont": True,
        "isoseq": False,
        "disable_infer": True,
    },
}

In [14]:
def real_data_pipeline(dataset, name=None):
    """Run deSALT, without a GTF annotation, on one real dataset.

    Parameters
    ----------
    dataset : dict
        Must contain the keys "ref", "bed", "reads", "ont", "isoseq",
        "disable_infer" and "desalt_index".  A "gtf" entry may be present but
        is deliberately not forwarded: ``None`` is passed to ``create_args``
        in its place, and the tool name is "deSALT".
    name : str, optional
        Dataset name forwarded to ``create_args``.  When omitted, the
        notebook-level variable ``dataset_name`` (set by the driver loop) is
        used, which is what this function always did before the parameter
        existed.  Passing it explicitly removes that hidden dependency.

    Raises
    ------
    KeyError
        If a required key is missing from `dataset`.
    NameError
        If `name` is omitted and no global ``dataset_name`` exists.
    """
    if name is None:
        # Legacy behaviour: rely on the loop variable of the calling cell.
        name = dataset_name

    toolname = "deSALT"
    args = create_args(
        dataset["ref"],
        None,  # no GTF for the plain deSALT run
        dataset["bed"],
        dataset["reads"],
        dataset["ont"],
        dataset["isoseq"],
        dataset["disable_infer"],
        name,
        toolname,
        True,
        dataset["desalt_index"],
    )
    deSALT(args)

In [15]:
# Run every configured dataset through the matching pipeline.
# `dataset_name` is intentionally a notebook-level loop variable:
# real_data_pipeline() reads it as a global.
for dataset_name, dataset in dataset_dict.items():
    # A dataset is treated as real data when it is flagged as ONT or Iso-Seq.
    is_real = dataset["ont"] or dataset["isoseq"]
    if not is_real:
        # NOTE(review): sim_data_pipeline is not defined in the cells shown
        # here -- confirm it exists before adding a simulated dataset.
        sim_data_pipeline(dataset)
    else:
        real_data_pipeline(dataset)

creating output/DROS/deSALT


[Main] deSALT - De Bruijn graph-based Spliced Aligner for Long Transcriptome reads
[Param-INFO] deSALT parameters:index-kmer:22	seed-lmer:15	hash-kmer:8	thread:4	strand_diff:10	identify junction:both_strand
[Phase-INFO] Loading Index and Reads
[Phase-INFO] Seeding and Chaining Phase (first-pass)
[Skeleton-generation] Generating skeletons of 655350 reads, total 413218919 bases in 31.866877 seconds
[Skeleton-generation] Generating skeletons of 655350 reads, total 435255562 bases in 29.593379 seconds
[Skeleton-generation] Generating skeletons of 155782 reads, total 103536043 bases in 7.059926 seconds
[Phase-INFO] Total 1466482 reads were processed in 68.532 seconds (first-pass)
[Phase-INFO] Refined Alignment Phase (second-pass)
[Phase-INFO] Exons inference by skeletons of all reads
[Phase-INFO] Inferring total 56690 isolated regions (pseudo-exons) after merging and filtering
[Phase-INFO] Refining pseudo-exons by scoring matrix
[Loop-ProcessReads] The 0st loop of refined alignment procedur