Skip to content

Commit

Permalink
moved count_fastq_reads() to util/file.py; refactored to use grep cou…
Browse files Browse the repository at this point in the history
…nting on non-gzip files

moved count_fastq_reads() to util/file.py; refactored to use grep
counting on non-gzip files which seems to be faster on Pythons <3.5.
changed fasta_length() to use grep_count() which is also used by
count_fastq_reads()
  • Loading branch information
tomkinsc committed Jun 16, 2016
1 parent c98f1e9 commit d40d511
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 17 deletions.
17 changes: 5 additions & 12 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,6 @@ def __init__(self, n_start, n_trimmed, n_rmdup, n_purge, n_subsamp):
n_start, n_trimmed, n_rmdup, n_purge, n_subsamp))


def count_fastq_reads(inFastq):
''' Maybe move this to util.file one day '''
with util.file.open_or_gzopen(inFastq, 'rt') as inf:
n = sum(1 for line in inf if line.startswith('@'))
return n


def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
''' Take reads through Trimmomatic, Prinseq, and subsampling.
This should probably move over to read_utils or taxon_filter.
Expand All @@ -69,7 +62,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
# BAM -> fastq
infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq']))
tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1])
n_input = count_fastq_reads(infq[0])
n_input = util.file.count_fastq_reads(infq[0])

# Trimmomatic
trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq']))
Expand All @@ -80,7 +73,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb)
os.unlink(infq[0])
os.unlink(infq[1])
n_trim = count_fastq_reads(trimfq[0])
n_trim = util.file.count_fastq_reads(trimfq[0])

# Prinseq
rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq']))
Expand All @@ -91,7 +84,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
read_utils.rmdup_prinseq_fastq(trimfq[i], rmdupfq[i])
os.unlink(trimfq[0])
os.unlink(trimfq[1])
n_rmdup = count_fastq_reads(rmdupfq[0])
n_rmdup = util.file.count_fastq_reads(rmdupfq[0])

# Purge unmated
purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq']))
Expand All @@ -102,7 +95,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
os.unlink(rmdupfq[0])
os.unlink(rmdupfq[1])
n_purge = count_fastq_reads(purgefq[0])
n_purge = util.file.count_fastq_reads(purgefq[0])

# Log count
log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n_purge)
Expand All @@ -127,7 +120,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
util.misc.run_and_print(cmd, check=True)
os.unlink(purgefq[0])
os.unlink(purgefq[1])
n_subsamp = count_fastq_reads(subsampfq[0])
n_subsamp = util.file.count_fastq_reads(subsampfq[0])

# Fastq -> BAM
# Note: this destroys RG IDs! We should instead frun the BAM->fastq step in a way
Expand Down
57 changes: 52 additions & 5 deletions util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,13 +390,60 @@ def string_to_file_name(string_value):

return string_value

def fasta_length(fasta_path):
if not os.path.isfile(fasta_path) or os.path.getsize(fasta_path)==0:
def grep_count(file_path, to_match, additional_flags=None, fixed_mode=True, starts_with=False):
'''
This uses grep for fast counting of strings in a file
'''
if not os.path.isfile(file_path) or os.path.getsize(file_path)==0:
return 0

env = os.environ.copy()
env['LC_ALL'] = 'C' #use C locale rather than UTF8 for faster grep

# grep for record start characters, in fixed-string mode (no regex)
number_of_seqs = util.misc.run_and_print(["grep", "-cF", ">", fasta_path], silent=False, check=True, env=env)

cmd = ["grep"]
# '-c' returns the match count
cmd.append("-c")
if additional_flags:
cmd.extend(additional_flags)

# fixed mode cannot be used with starts_with, since it does not match regular expressions
# only add the fixed_mode flag if we're not using starts_with
if not starts_with:
if fixed_mode:
cmd.append("-F")
cmd.append(to_match)
else:
cmd.append("^"+to_match)

cmd.append(file_path)

number_of_seqs = util.misc.run_and_print(cmd, silent=False, check=True, env=env)
return int(number_of_seqs.stdout.decode("utf-8").rstrip(os.linesep))

def count_str_in_file(in_file, query_str, starts_with=False):
if not os.path.isfile(in_file) or os.path.getsize(in_file)==0:
return 0

if in_file.endswith('.gz'):
n = 0
with gzip.open(in_file, 'rt') as inf:
if starts_with:
n = sum(1 for line in inf if line[0]==query_str)
else:
n = sum(1 for line in inf if query_str in line)
return n
# use grep count for non-gzip files since it seems to be faster than native on Pythons <3.5
else:
return grep_count(in_file, query_str, starts_with=starts_with)

def fasta_length(fasta_path):
'''
Count number of records in fasta file
'''
return count_str_in_file(fasta_path, '>', starts_with=True)

def count_fastq_reads(inFastq):
'''
Count number of reads in fastq file
'''
return count_str_in_file(inFastq, '@', starts_with=True)

0 comments on commit d40d511

Please sign in to comment.