Merge pull request #200 from althonos/feat-pyrodigal

Use Pyrodigal instead of Prodigal for ORF prediction
arpcard · Dec 7, 2022 · ed0d289 · ed0d289
2 parents 752b9d4 + 58064cb
commit ed0d289
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 5 deletions.
diff --git a/app/MainBase.py b/app/MainBase.py
@@ -115,6 +115,11 @@ def main_args(self):
                 choices=['wgs', 'plasmid', 'chromosome', 'NA'],
                 help = "specify a data-type (default = NA)")
         parser.add_argument('-v','--version', action='version', version="{}".format(SOFTWARE_VERSION), help = "prints software version number")
+        parser.add_argument('-O',  '--orf_finder', dest="orf_finder",
+            type=str.upper,
+            choices=['PRODIGAL', 'PYRODIGAL'],
+            default='PYRODIGAL',
+            help="specify ORF finding tool (default = PYRODIGAL)")
         parser.add_argument('--split_prodigal_jobs', dest="split_prodigal_jobs", action="store_true", help="run multiple prodigal jobs simultaneously for contigs in a fasta file (default: False)")
         return parser
 

diff --git a/app/ORF.py b/app/ORF.py
@@ -1,5 +1,8 @@
 from app.settings import os, SeqIO, logger
-import tempfile, time, fileinput, math, multiprocessing, shutil
+import contextlib, tempfile, time, fileinput, math, multiprocessing, shutil
+from multiprocessing.pool import ThreadPool
+import pyrodigal
+from Bio import SeqIO
 
 class ORF(object):
 	"""Class to find open reading frames from nucleotide sequence."""
@@ -25,7 +28,7 @@ def contig_to_orf(self):
 			self.orf_prodigal()
 
 	def min_max_sequence_length(self):
-		"""Returns minimum and maximun sequence length in multi-fasta inputs""" 
+		"""Returns minimum and maximun sequence length in multi-fasta inputs"""
 		sequences = []
 		for record in SeqIO.parse(self.input_file, "fasta"):
 			sequences.append(len(record.seq))
@@ -214,4 +217,51 @@ def get_character_len(self,file_path):
 
 
 
+class PyORF(object):
+	"""Class to find open reading frames using Pyrodigal."""
 
+	def __init__(self,input_file, threads, clean=True, working_directory=None, low_quality=False, training_file=None, split_prodigal_jobs=False):
+		"""Creates ORF object for finding open reading frames."""
+		self.input_file = input_file
+		self.clean = clean
+		self.working_directory = working_directory
+		self.low_quality = low_quality
+		self.training_file = training_file
+		self.threads = threads
+		self.split_prodigal_jobs = split_prodigal_jobs
+
+	def __repr__(self):
+		"""Returns ORF class full object."""
+		return "PyORF({}".format(self.__dict__)
+
+	def contig_to_orf(self):
+		"""Find open reading frames in contigs."""
+		# load sequences to be processed
+		records = list(SeqIO.parse(self.input_file, "fasta"))
+		sequences = [ str(record.seq) for record in records ]
+		minimum_sequence_length = min((len(seq) for seq in sequences), default=0)
+
+		# create an ORF finder in single or meta mode based on configuration
+		if self.training_file is not None:
+			with open(self.training_file, "rb") as src:
+				training_info = pyrodigal.TrainingInfo.load(src)
+			orf_finder = pyrodigal.OrfFinder(meta=False, mask=True, training_info=training_info)
+		elif self.low_quality or minimum_sequence_length < 20000:
+			orf_finder = pyrodigal.OrfFinder(meta=True, mask=True)
+		else:
+			orf_finder = pyrodigal.OrfFinder(meta=False, mask=True)
+			orf_finder.train(*sequences, force_nonsd=True)
+
+		with contextlib.ExitStack() as stack:
+			# open result files
+			filename = os.path.basename(self.input_file)
+			trans_filename = os.path.join(self.working_directory, "{}.temp.contig.fsa".format(filename))
+			trans_file = stack.enter_context(open(trans_filename, "w"))
+			nuc_filename = os.path.join(self.working_directory, "{}.temp.contigToORF.fsa".format(filename))
+			nuc_file = stack.enter_context(open(nuc_filename, "w"))
+			# prepare a pool to run ORF detection in parallel
+			pool = stack.enter_context(ThreadPool(self.threads))
+			# detect genes and save results to output files
+			for record, genes in zip(records, pool.map(orf_finder.find_genes, sequences)):
+				genes.write_genes(nuc_file, record.id)
+				genes.write_translations(trans_file, record.id)
diff --git a/app/RGI.py b/app/RGI.py
@@ -2,7 +2,7 @@
 from app.Database import Database
 from app.Blast import Blast
 from app.Diamond import Diamond
-from app.ORF import ORF
+from app.ORF import ORF, PyORF
 from app.Filter import Filter
 
 import filetype
@@ -17,7 +17,7 @@ class RGI(RGIBase):
 	"""Class to predict resistome(s) from protein or nucleotide data based on CARD detection models."""
 
 	def __init__(self,input_type='contig',input_sequence=None,threads=32,output_file=None,loose=False, \
-				clean=True,data='na',aligner='blast',galaxy=None, local_database=False, low_quality=False, debug=False, split_prodigal_jobs=False, include_nudge=False, keep=False):
+				clean=True,data='na',aligner='blast',galaxy=None, local_database=False, low_quality=False, debug=False, split_prodigal_jobs=False, include_nudge=False, keep=False, orf_finder='pyrodigal'):
 		"""Creates RGI object for resistome(s) prediction."""
 
 		o_f_path, o_f_name = os.path.split(os.path.abspath(output_file))
@@ -31,6 +31,7 @@ def __init__(self,input_type='contig',input_sequence=None,threads=32,output_file
 		self.clean = clean
 		self.data = data
 		self.aligner = aligner.lower()
+		self.orf_finder = orf_finder.lower()
 		self.database = galaxy
 		self.low_quality = low_quality
 
@@ -341,7 +342,13 @@ def process_protein(self):
 	def process_contig(self):
 		"""Process nuclotide sequence(s)."""
 		file_name = os.path.basename(self.input_sequence)
-		orf_obj = ORF(input_file=self.input_sequence, threads=self.threads, clean=self.clean, working_directory=self.working_directory, low_quality=self.low_quality, split_prodigal_jobs=self.split_prodigal_jobs)
+
+		if self.orf_finder == "pyrodigal":
+			orf_finder_cls = PyORF
+		elif self.orf_finder == "prodigal":
+			orf_finder_cls = ORF
+
+		orf_obj = orf_finder_cls(input_file=self.input_sequence, threads=self.threads, clean=self.clean, working_directory=self.working_directory, low_quality=self.low_quality, split_prodigal_jobs=self.split_prodigal_jobs)
 		orf_obj.contig_to_orf()
 		contig_fsa_file = os.path.join(self.working_directory,"{}.temp.contig.fsa".format(file_name))
 		blast_results_xml_file = os.path.join(self.working_directory,"{}.temp.contig.fsa.blastRes.xml".format(file_name))

diff --git a/conda_env.yml b/conda_env.yml
@@ -21,6 +21,7 @@ dependencies:
   - blast 2.9.0
   - zlib
   - prodigal 2.6.3
+  - pyrodigal >=2.0.0
   - diamond 0.8.36
   - oligoarrayaux 3.8
   - samtools 1.9

diff --git a/requirements.txt b/requirements.txt
@@ -11,3 +11,4 @@ beautifulsoup4>=4.9.3
 requests==2.24.0
 lxml>=4.9.1
 dask[dataframe]
+pyrodigal>=2.0.0