Skip to content

Commit

Permalink
Merge 0472cad into 453001a
Browse files Browse the repository at this point in the history
  • Loading branch information
amnona committed Aug 15, 2018
2 parents 453001a + 0472cad commit bc66e50
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 11 deletions.
2 changes: 2 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
## Version 1.0.4-dev

### Bug fixes
* Fixed problem causing deblur to ignore the indel error uper bound and use the mismatch error upper bound instead. This caused deblur to use the mismatch depenent default error rate instead of the default indel error rate (which is constant for up to 3 indels). Also increased the default indel error rate to 0.02. See [issue #178](https://github.com/biocore/deblur/issues/178) for more details.

* Fixed problem running deblur on new python versions (due to python treating the sparse matrix size (1E9) as float rather than int).

## Version 1.0.4
Expand Down
2 changes: 1 addition & 1 deletion deblur/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

__version__ = "1.0.4-dev"
__version__ = "1.0.5-dev"
14 changes: 10 additions & 4 deletions deblur/deblurring.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def get_sequences(input_seqs):

def deblur(input_seqs, mean_error=0.005,
error_dist=None,
indel_prob=0.01, indel_max=3):
indel_prob=0.02, indel_max=3):
"""Deblur the reads
Parameters
Expand All @@ -86,7 +86,7 @@ def deblur(input_seqs, mean_error=0.005,
amount of hamming distances taken into account. Default: None, use
the default error profile (from get_default_error_profile() )
indel_prob : float, optional
Indel probability (same for N indels). Default: 0.01
Indel probability (same for N indels). Default: 0.02
indel_max : int, optional
The maximal number of indels expected by errors. Default: 3
Expand Down Expand Up @@ -122,7 +122,7 @@ def deblur(input_seqs, mean_error=0.005,
mod_factor = pow((1 - mean_error), seqs[0].unaligned_length)
error_dist = np.array(error_dist) / mod_factor

max_h_dist = len(error_dist)-1
max_h_dist = len(error_dist) - 1

for seq_i in seqs:
# no need to remove neighbors if freq. is <=0
Expand Down Expand Up @@ -158,13 +158,19 @@ def deblur(input_seqs, mean_error=0.005,
# the insertions/deletions
length = min(seq_i_len, len(seq_j.sequence.rstrip('-')))
sub_seq_i = seq_i.np_sequence[:length]
sub_seq_j = seq_i.np_sequence[:length]
sub_seq_j = seq_j.np_sequence[:length]

mask = (sub_seq_i != sub_seq_j)
# find all indels
mut_is_indel = np.logical_or(sub_seq_i[mask] == 4,
sub_seq_j[mask] == 4)
num_indels = mut_is_indel.sum()
if num_indels > 0:
# need to account for indel in one sequence not solved in the other
# (so we have '-' at the end. Need to ignore it in the total count)
h_dist = np.count_nonzero(np.not_equal(seq_i.np_sequence[:length],
seq_j.np_sequence[:length]))

num_substitutions = h_dist - num_indels

correction_value = num_err[num_substitutions]
Expand Down
19 changes: 15 additions & 4 deletions deblur/test/test_deblurring.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,21 @@ def test_deblur_indel(self):
for chead, cseq in seqs:
tseq = cseq[:10] + '-' + cseq[10:]
newseqs.append((chead, tseq))
# now add a sequence with an A insertion
tseq = cseq[:10] + 'A' + cseq[10:-1]+'-'

# now add a sequence with an A insertion at the expected freq. (30 < 0.02 * (720 / 0.47) where 0.47 is the mod_factor) so should be removed
cseq = newseqs[0][1]
tseq = cseq[:10] + 'A' + cseq[11:-1] + '-'
chead = '>indel1-read;size=30;'
newseqs.append((chead, tseq))

# and add a sequence with an A insertion but at higher freq. (not expected by indel upper bound - (31 > 0.02 * (720 / 0.47) so should not be removed)
cseq = newseqs[0][1]
tseq = cseq[:10] + 'A' + cseq[11:-1] + '-'
chead = '>indel2-read;size=31;'
newseqs.append((chead, tseq))

obs = deblur(newseqs)

# remove the '-' (same as in launch_workflow)
for s in obs:
s.sequence = s.sequence.replace('-', '')
Expand All @@ -132,10 +142,11 @@ def test_deblur_indel(self):
"tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
"ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
"gcaagcttgagtctcgtagaggggggcagaattccag")]
# make sure we get 1 sequence as output
self.assertEqual(len(obs), 1)
# make sure we get 2 sequences as output - the original and the indel2 (too many reads for the expected indel probabilty)
self.assertEqual(len(obs), 2)
# and that it is the correct sequence
self.assertEqual(obs[0].sequence, exp[0].sequence)
self.assertEqual(obs[1].label, '>indel2-read;size=31;')

def test_deblur_with_non_default_error_profile(self):
error_dist = [1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005,
Expand Down
4 changes: 2 additions & 2 deletions scripts/deblur
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def deblur_cmds():
help="A comma separated list of error probabilities for each "
"hamming distance. The length of the list determines the "
"number of hamming distances taken into account.")
@click.option('--indel-prob', '-i', required=False, type=float, default=0.01,
@click.option('--indel-prob', '-i', required=False, type=float, default=0.02,
help='Insertion/deletion (indel) probability '
'(same for N indels)')
@click.option('--indel-max', required=False, type=int, default=3,
Expand Down Expand Up @@ -479,7 +479,7 @@ def build_biom_table(seqs_fp, output_biom_fp, min_reads, file_type, log_level,
help="A comma separated list of error probabilities for each "
"Hamming distance. The length of the list determines the "
"number of Hamming distances taken into account.")
@click.option('--indel-prob', '-i', required=False, type=float, default=0.01,
@click.option('--indel-prob', '-i', required=False, type=float, default=0.02,
show_default=True,
help=('Insertion/deletion (indel) probability. This probability '
'consistent for multiple indels; there is not an added '
Expand Down

0 comments on commit bc66e50

Please sign in to comment.