Skip to content

Commit

Permalink
Merge pull request #200 from althonos/feat-pyrodigal
Browse files Browse the repository at this point in the history
Use Pyrodigal instead of Prodigal for ORF prediction
  • Loading branch information
raphenya committed Dec 7, 2022
2 parents 752b9d4 + 58064cb commit ed0d289
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 5 deletions.
5 changes: 5 additions & 0 deletions app/MainBase.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ def main_args(self):
choices=['wgs', 'plasmid', 'chromosome', 'NA'],
help = "specify a data-type (default = NA)")
parser.add_argument('-v','--version', action='version', version="{}".format(SOFTWARE_VERSION), help = "prints software version number")
parser.add_argument('-O', '--orf_finder', dest="orf_finder",
type=str.upper,
choices=['PRODIGAL', 'PYRODIGAL'],
default='PYRODIGAL',
help="specify ORF finding tool (default = PYRODIGAL)")
parser.add_argument('--split_prodigal_jobs', dest="split_prodigal_jobs", action="store_true", help="run multiple prodigal jobs simultaneously for contigs in a fasta file (default: False)")
return parser

Expand Down
54 changes: 52 additions & 2 deletions app/ORF.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from app.settings import os, SeqIO, logger
import tempfile, time, fileinput, math, multiprocessing, shutil
import contextlib, tempfile, time, fileinput, math, multiprocessing, shutil
from multiprocessing.pool import ThreadPool
import pyrodigal
from Bio import SeqIO

class ORF(object):
"""Class to find open reading frames from nucleotide sequence."""
Expand All @@ -25,7 +28,7 @@ def contig_to_orf(self):
self.orf_prodigal()

def min_max_sequence_length(self):
"""Returns minimum and maximun sequence length in multi-fasta inputs"""
"""Returns minimum and maximun sequence length in multi-fasta inputs"""
sequences = []
for record in SeqIO.parse(self.input_file, "fasta"):
sequences.append(len(record.seq))
Expand Down Expand Up @@ -214,4 +217,51 @@ def get_character_len(self,file_path):



class PyORF(object):
"""Class to find open reading frames using Pyrodigal."""

def __init__(self,input_file, threads, clean=True, working_directory=None, low_quality=False, training_file=None, split_prodigal_jobs=False):
"""Creates ORF object for finding open reading frames."""
self.input_file = input_file
self.clean = clean
self.working_directory = working_directory
self.low_quality = low_quality
self.training_file = training_file
self.threads = threads
self.split_prodigal_jobs = split_prodigal_jobs

def __repr__(self):
"""Returns ORF class full object."""
return "PyORF({}".format(self.__dict__)

def contig_to_orf(self):
"""Find open reading frames in contigs."""
# load sequences to be processed
records = list(SeqIO.parse(self.input_file, "fasta"))
sequences = [ str(record.seq) for record in records ]
minimum_sequence_length = min((len(seq) for seq in sequences), default=0)

# create an ORF finder in single or meta mode based on configuration
if self.training_file is not None:
with open(self.training_file, "rb") as src:
training_info = pyrodigal.TrainingInfo.load(src)
orf_finder = pyrodigal.OrfFinder(meta=False, mask=True, training_info=training_info)
elif self.low_quality or minimum_sequence_length < 20000:
orf_finder = pyrodigal.OrfFinder(meta=True, mask=True)
else:
orf_finder = pyrodigal.OrfFinder(meta=False, mask=True)
orf_finder.train(*sequences, force_nonsd=True)

with contextlib.ExitStack() as stack:
# open result files
filename = os.path.basename(self.input_file)
trans_filename = os.path.join(self.working_directory, "{}.temp.contig.fsa".format(filename))
trans_file = stack.enter_context(open(trans_filename, "w"))
nuc_filename = os.path.join(self.working_directory, "{}.temp.contigToORF.fsa".format(filename))
nuc_file = stack.enter_context(open(nuc_filename, "w"))
# prepare a pool to run ORF detection in parallel
pool = stack.enter_context(ThreadPool(self.threads))
# detect genes and save results to output files
for record, genes in zip(records, pool.map(orf_finder.find_genes, sequences)):
genes.write_genes(nuc_file, record.id)
genes.write_translations(trans_file, record.id)
13 changes: 10 additions & 3 deletions app/RGI.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from app.Database import Database
from app.Blast import Blast
from app.Diamond import Diamond
from app.ORF import ORF
from app.ORF import ORF, PyORF
from app.Filter import Filter

import filetype
Expand All @@ -17,7 +17,7 @@ class RGI(RGIBase):
"""Class to predict resistome(s) from protein or nucleotide data based on CARD detection models."""

def __init__(self,input_type='contig',input_sequence=None,threads=32,output_file=None,loose=False, \
clean=True,data='na',aligner='blast',galaxy=None, local_database=False, low_quality=False, debug=False, split_prodigal_jobs=False, include_nudge=False, keep=False):
clean=True,data='na',aligner='blast',galaxy=None, local_database=False, low_quality=False, debug=False, split_prodigal_jobs=False, include_nudge=False, keep=False, orf_finder='pyrodigal'):
"""Creates RGI object for resistome(s) prediction."""

o_f_path, o_f_name = os.path.split(os.path.abspath(output_file))
Expand All @@ -31,6 +31,7 @@ def __init__(self,input_type='contig',input_sequence=None,threads=32,output_file
self.clean = clean
self.data = data
self.aligner = aligner.lower()
self.orf_finder = orf_finder.lower()
self.database = galaxy
self.low_quality = low_quality

Expand Down Expand Up @@ -341,7 +342,13 @@ def process_protein(self):
def process_contig(self):
"""Process nuclotide sequence(s)."""
file_name = os.path.basename(self.input_sequence)
orf_obj = ORF(input_file=self.input_sequence, threads=self.threads, clean=self.clean, working_directory=self.working_directory, low_quality=self.low_quality, split_prodigal_jobs=self.split_prodigal_jobs)

if self.orf_finder == "pyrodigal":
orf_finder_cls = PyORF
elif self.orf_finder == "prodigal":
orf_finder_cls = ORF

orf_obj = orf_finder_cls(input_file=self.input_sequence, threads=self.threads, clean=self.clean, working_directory=self.working_directory, low_quality=self.low_quality, split_prodigal_jobs=self.split_prodigal_jobs)
orf_obj.contig_to_orf()
contig_fsa_file = os.path.join(self.working_directory,"{}.temp.contig.fsa".format(file_name))
blast_results_xml_file = os.path.join(self.working_directory,"{}.temp.contig.fsa.blastRes.xml".format(file_name))
Expand Down
1 change: 1 addition & 0 deletions conda_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies:
- blast 2.9.0
- zlib
- prodigal 2.6.3
- pyrodigal >=2.0.0
- diamond 0.8.36
- oligoarrayaux 3.8
- samtools 1.9
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ beautifulsoup4>=4.9.3
requests==2.24.0
lxml>=4.9.1
dask[dataframe]
pyrodigal>=2.0.0

0 comments on commit ed0d289

Please sign in to comment.