Browse files

hmmer3.pm: Fixed a bug where an homology line ending in CS or RF

aminoacids made "next_seq" think it was the consensus structure
track, braking the parser. Added a test that show this
  • Loading branch information...
1 parent 8fe9c61 commit 41b4de2894f7711f24b89ba23bc0fcfefa90e76a @fjossandon fjossandon committed Aug 6, 2014
Showing with 146 additions and 43 deletions.
  1. +12 −3 Bio/SearchIO/hmmer3.pm
  2. +87 −40 t/SearchIO/hmmer.t
  3. +47 −0 t/data/hmmscan_sec_struct.out
View
15 Bio/SearchIO/hmmer3.pm
@@ -461,7 +461,7 @@ sub next_result {
elsif ( /inclusion threshold/ ) {
while ( defined( $_ = $self->_readline ) ) {
if ( /Domain( and alignment)? annotation for each/
- || /Internal pipeline statistics summary/
+ || /Internal pipeline statistics summary/
|| /Annotation for each hit\s+\(and alignments\)/
)
{
@@ -676,6 +676,12 @@ sub next_result {
}
elsif ($_ =~ m/^$/ )
{
+ # Reset these scalars on empty lines to help
+ # distinguish between the consensus structure/reference
+ # tracks (CS|RF lines) and homology lines ending in
+ # CS or RF aminoacids
+ $align_offset = 0;
+ $align_length = 0;
next;
}
@@ -693,8 +699,11 @@ sub next_result {
$pline = $$hsp[-1];
$lastdomain = $name;
}
- # model data track, some reports don't have
- elsif ( $_ =~ m/\s+\S+\s(?:CS|RF)$/ ) {
+ # Consensus Structure or Reference track, some reports
+ # don't have it. Since it appears on top of the alignment,
+ # the reset of $align_length to 0 between alignment blocks
+ # avoid confusing homology lines with it.
+ elsif ( $_ =~ m/\s+\S+\s(?:CS|RF)$/ and $align_length == 0 ) {
my @data = split( " ", $_ );
$csline .= $data[-2];
$max_count++;
View
127 t/SearchIO/hmmer.t
@@ -7,7 +7,7 @@ BEGIN {
use lib '.';
use Bio::Root::Test;
- test_begin( -tests => 798 );
+ test_begin( -tests => 816 );
use_ok('Bio::SearchIO');
}
@@ -1173,50 +1173,97 @@ $searchio = Bio::SearchIO->new(
]
);
+my $result_counter = 0;
while ( $result = $searchio->next_result ) {
- is( ref($result),
- 'Bio::Search::Result::HMMERResult',
- 'Check for the correct result reference type'
- );
- is( $result->algorithm, 'HMMSCAN', 'Check algorithm' );
- is( $result->algorithm_version, '3.0', 'Check algorithm version' );
- is( $result->hmm_name, 'Pfam-A.hmm', 'Check hmm_name' );
- is( $result->sequence_file, 'BA000019.orf8.fasta',
- 'Check sequence_file' );
- is( $result->query_name, 'BA000019.orf8', 'Check query_name' );
- is( $result->query_length, '348', 'Check query_length' );
- is( $result->query_description, '', 'Check query_description' );
- is( $result->num_hits(), 3, 'Check num_hits' );
- my ( $hsp, $hit );
+ $result_counter++;
+ if ($result_counter == 1) {
+ is( ref($result),
+ 'Bio::Search::Result::HMMERResult',
+ 'Check for the correct result reference type'
+ );
+ is( $result->algorithm, 'HMMSCAN', 'Check algorithm' );
+ is( $result->algorithm_version, '3.0', 'Check algorithm version' );
+ is( $result->hmm_name, 'Pfam-A.hmm', 'Check hmm_name' );
+ is( $result->sequence_file, 'BA000019.orf8.fasta', 'Check sequence_file' );
+ is( $result->query_name, 'BA000019.orf8', 'Check query_name' );
+ is( $result->query_length, 348, 'Check query_length' );
+ is( $result->query_description, '', 'Check query_description' );
+ is( $result->num_hits(), 3, 'Check num_hits' );
+ my ( $hsp, $hit );
- while ( $hit = $result->next_model ) {
- if ($hit->name eq 'PKSI-KS_m3') {
- # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER.
- # When is not known, sometimes it can be deduced from domain data '[]'
- is( $hit->length, 16, 'Check hit length' );
- is( $hit->frac_aligned_query, 0.09 );
- is( $hit->frac_aligned_hit, '1.00' );
+ while ( $hit = $result->next_model ) {
+ if ($hit->name eq 'PKSI-KS_m3') {
+ # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER.
+ # When is not known, sometimes it can be deduced from domain data '[]'
+ is( $hit->length, 16, 'Check hit length' );
+ is( $hit->frac_aligned_query, 0.09 );
+ is( $hit->frac_aligned_hit, '1.00' );
+ }
+ my @expected = @{ shift @multi_hits };
+ is( ref($hit), 'Bio::Search::Hit::HMMERHit',
+ 'Check for the correct hit reference type' );
+ is( $hit->name, shift @expected, 'Check hit name' );
+ is( $hit->description, shift @expected, 'Check for hit description' );
+ is( $hit->raw_score, shift @expected, 'Check hit raw_score' );
+ float_is(
+ $hit->significance,
+ shift @expected,
+ 'Check hit significance'
+ );
+ is( $hit->num_hsps, shift @expected, 'Check num_hsps' );
+ my @hsp_list = @{ shift @expected };
+
+ while ( defined( $hsp = $hit->next_domain ) ) {
+ my @hsp_exp = @{ shift @hsp_list };
+ is( ref($hsp), 'Bio::Search::HSP::HMMERHSP',
+ 'Check for correct hsp reference type' );
+ is( $hsp->hit_string, shift @hsp_exp, 'Check hit sequence' );
+ is( $hsp->query_string, shift @hsp_exp, 'Check query sequence' );
+ }
}
- my @expected = @{ shift @multi_hits };
- is( ref($hit), 'Bio::Search::Hit::HMMERHit',
- 'Check for the correct hit reference type' );
- is( $hit->name, shift @expected, 'Check hit name' );
- is( $hit->description, shift @expected, 'Check for hit description' );
- is( $hit->raw_score, shift @expected, 'Check hit raw_score' );
- float_is(
- $hit->significance,
- shift @expected,
- 'Check hit significance'
+ }
+ elsif ($result_counter == 2) {
+ is( ref($result),
+ 'Bio::Search::Result::HMMERResult',
+ 'Check for the correct result reference type'
);
- is( $hit->num_hsps, shift @expected, 'Check num_hsps' );
- my @hsp_list = @{ shift @expected };
+ is( $result->algorithm, 'HMMSCAN', 'Check algorithm' );
+ is( $result->algorithm_version, '3.0', 'Check algorithm version' );
+ is( $result->query_name, 'lcl|aorf_00010|P1', 'Check query_name' );
+ is( $result->query_length, 132, 'Check query_length' );
+ is( $result->query_description, 'IS481.original transposase', 'Check query_description' );
+ is( $result->num_hits(), 1, 'Check num_hits' );
+ my ( $hsp, $hit );
- while ( defined( $hsp = $hit->next_domain ) ) {
- my @hsp_exp = @{ shift @hsp_list };
- is( ref($hsp), 'Bio::Search::HSP::HMMERHSP',
- 'Check for correct hsp reference type' );
- is( $hsp->hit_string, shift @hsp_exp, 'Check hit sequence' );
- is( $hsp->query_string, shift @hsp_exp, 'Check query sequence' );
+ while ( $hit = $result->next_model ) {
+ is( ref($hit), 'Bio::Search::Hit::HMMERHit',
+ 'Check for the correct hit reference type' );
+ is( $hit->name, 'IS481.original.hmm', 'Check hit name' );
+ is( $hit->description, '', 'Check for hit description' );
+ is( $hit->raw_score, '130.0', 'Check hit raw_score' );
+ float_is( $hit->significance, 3.4e-040, 'Check hit significance' );
+ is( $hit->num_hsps, 1, 'Check num_hsps' );
+
+ while ( defined( $hsp = $hit->next_domain ) ) {
+ is( ref($hsp), 'Bio::Search::HSP::HMMERHSP',
+ 'Check for correct hsp reference type' );
+ is( $hsp->query_string,
+ 'GEIETAHPSYLGSQDTFYVGNITGAGR----------------------------IYQQTFVDTYSKWDSTKLYTTKTPITAADLLNDRVLSFFA-EQGMGIIRLLTDRSTEYCSKA--ETQDYELCLALNDIEHTKTKVYHPQTNDICRRFHKA',
+ 'Check for query string'
+ );
+ is( $hsp->hit_string,
+ 'kRYErdhPgeLvhmDvkklgripdgGgvkighRwrgrtrgrgkrtnqsrnrglgkayvitaiDDhSRfayaeilsdettttaadfllraaayfygkigeeiitrvlTDnGaayrskkrsakhdFqealaelGIkhilTrprsPqTNGKiERFhrT',
+ 'Check for hit string'
+ );
+ is( $hsp->homology_string,
+ '+++E++hP +L+++D++++g+i + G+ +y++t++D++S+ +++++++t++taad l++ ++ f+ ++++i r lTD+ ++y+sk ++ d+ +la ++I+h++T++++PqTN ++ RFh+ ',
+ 'Check for homology string'
+ );
+ is( $hsp->posterior_string,
+ '579*******************88888............................****************************************.********************8..**********************************95',
+ 'Check for posterior probability string'
+ );
+ }
}
}
}
View
47 t/data/hmmscan_sec_struct.out
@@ -91,3 +91,50 @@ Domain search space (domZ): 2 [number of targets reported over t
# CPU time: 0.24u 0.15s 00:00:00.39 Elapsed: 00:00:00.27
# Mc/sec: 2782.58
//
+Query: lcl|aorf_00010|P1 [L=132]
+Description: IS481.original transposase
+Scores for complete sequence (score includes all domains):
+ --- full sequence --- --- best 1 domain --- -#dom-
+ E-value score bias E-value score bias exp N Model Description
+ ------- ------ ----- ------- ------ ----- ---- -- -------- -----------
+ 3.4e-40 130.0 0.4 3.8e-40 129.9 0.3 1.0 1 IS481.original.hmm
+
+
+Domain annotation for each model (and alignments):
+>> IS481.original.hmm
+ # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc
+ --- ------ ----- --------- --------- ------- ------- ------- ------- ------- ------- ----
+ 1 ! 129.9 0.3 6.6e-42 3.8e-40 127 281 .. 7 130 .. 2 132 .] 0.97
+
+ Alignments for each domain:
+ == domain 1 score: 129.9 bits; conditional E-value: 6.6e-42
+ IS481.original.hmm 127 kRYErdhPgeLvhmDvkklgripdgGgvkighRwrgrtrgrgkrtnqsrnrglgkayvitaiDDhSRfayaeilsd 202
+ +++E++hP +L+++D++++g+i + G+ +y++t++D++S+ ++++++
+ lcl|aorf_00010|P1 7 GEIETAHPSYLGSQDTFYVGNITGAGR----------------------------IYQQTFVDTYSKWDSTKLYTT 54
+ 579*******************88888............................********************* PP
+
+ IS481.original.hmm 203 ettttaadfllraaayfygkigeeiitrvlTDnGaayrskkrsakhdFqealaelGIkhilTrprsPqTNGKiERF 278
+ +t++taad l++ ++ f+ ++++i r lTD+ ++y+sk ++ d+ +la ++I+h++T++++PqTN ++ RF
+ lcl|aorf_00010|P1 55 KTPITAADLLNDRVLSFFA-EQGMGIIRLLTDRSTEYCSKA--ETQDYELCLALNDIEHTKTKVYHPQTNDICRRF 127
+ *******************.********************8..********************************* PP
+
+ IS481.original.hmm 279 hrT 281
+ h+
+ lcl|aorf_00010|P1 128 HKA 130
+ *95 PP
+
+
+
+Internal pipeline statistics summary:
+-------------------------------------
+Query sequence(s): 1 (132 residues)
+Target model(s): 116 (57162 nodes)
+Passed MSV filter: 4 (0.0344828); expected 2.3 (0.02)
+Passed bias filter: 4 (0.0344828); expected 2.3 (0.02)
+Passed Vit filter: 3 (0.0258621); expected 0.1 (0.001)
+Passed Fwd filter: 1 (0.0172414); expected 0.0 (1e-05)
+Initial search space (Z): 116 [actual number of targets]
+Domain search space (domZ): 1 [number of targets reported over threshold]
+# CPU time: 0.06u 0.00s 00:00:00.06 Elapsed: 00:00:00.06
+# Mc/sec: 117.90
+//

0 comments on commit 41b4de2

Please sign in to comment.