Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Hmmer3: For Hmmsearch and Hmmscan, "$hit->length" method will

return the correct length instead of 0 if the "Domain annotation"
line shows a hit coverage of '.]' or '[]'. In this case,
the END position is inferred as the length of the sequence
and the information is added accordingly. Also removed a little
code redundancy. Tests added.
  • Loading branch information...
commit f04d7c7e3c3496743a6cb2a4b8a846fcee7f3e3f 1 parent a8dd080
@fjossandon fjossandon authored
Showing with 116 additions and 87 deletions.
  1. +92 −80 Bio/SearchIO/hmmer3.pm
  2. +24 −7 t/SearchIO/hmmer.t
View
172 Bio/SearchIO/hmmer3.pm
@@ -543,88 +543,98 @@ sub next_result {
}
# Grab hsp data from table, push into @hsp;
- if ($self->{'_reporttype'} =~ m/(?:HMMSCAN|HMMSEARCH)/) {
- if (my ($domain_num, $score, $bias,
- $ceval, $ieval, $hmmstart,
- $hmmstop, $qalistart, $qalistop,
- $envstart, $envstop, $envbound,
- $acc
- )
- = m|^\s+(\d+)\s\!*\?*\s+ # domain number
- (\S+)\s+(\S+)\s+ # score, bias
- (\S+)\s+(\S+)\s+ # c-eval, i-eval
- (\d+)\s+(\d+).+? # hmm start, stop
- (\d+)\s+(\d+).+? # query start, stop
- (\d+)\s+(\d+).+? # env start, stop
- (\S+) # Accession
- \s*$|ox
- )
- {
- # Keep it simple for now. let's customize later
- my @vals = (
- $hmmstart, $hmmstop,
- $qalistart, $qalistop,
- $score, $ceval,
- '', '',
- '', '',
- ''
- );
- my $info = $hit_list[ $hitinfo{"$name.$annot_counter"} ];
- if ( !defined $info ) {
- $self->warn(
- "Incomplete sequence information; can't find $name, hitinfo says $hitinfo{$name}\n"
- );
- next;
+ if ($self->{'_reporttype'} =~ m/(?:HMMSCAN|HMMSEARCH|NHMMER)/) {
+ my ( $domain_num, $score, $bias,
+ $ceval, $ieval,
+ $hmm_start, $hmm_stop, $hmm_cov,
+ $seq_start, $seq_stop, $seq_cov,
+ $env_start, $env_stop, $env_cov,
+ $hitlength, $acc );
+ my @vals;
+
+ if ( # HMMSCAN & HMMSEARCH
+ ( $domain_num, $score, $bias,
+ $ceval, $ieval,
+ $hmm_start, $hmm_stop, $hmm_cov,
+ $seq_start, $seq_stop, $seq_cov,
+ $env_start, $env_stop, $env_cov,
+ $acc ) = (
+ m|^\s+(\d+)\s\!*\?*\s+ # domain number
+ (\S+)\s+(\S+)\s+ # score, bias
+ (\S+)\s+(\S+)\s+ # c-eval, i-eval
+ (\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
+ (\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
+ (\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
+ (\S+) # posterior probability accuracy
+ \s*$|ox
+ )
+ ) {
+ # Values assigned when IF succeeded
+
+ # Try to get the Hit length from the alignment information
+ $hitlength = 0;
+ if ($self->{'_reporttype'} eq 'HMMSEARCH') {
+ # For Hmmsearch, if seq coverage ends in ']' it means that the alignment
+ # runs until the end. In that case add the END coordinate to @hitinfo
+ # to use it as Hit Length
+ if ( $seq_cov =~ m/\]$/ ) {
+ $hitlength = $seq_stop;
+ }
+ }
+ elsif ($self->{'_reporttype'} eq 'HMMSCAN') {
+ # For Hmmscan, if hmm coverage ends in ']' it means that the alignment
+ # runs until the end. In that case add the END coordinate to @hitinfo
+ # to use it as Hit Length
+ if ( $hmm_cov =~ m/\]$/ ) {
+ $hitlength = $hmm_stop;
+ }
}
- $domaincounter{"$name.$annot_counter"}++;
- my $hsp_key
- = $name . "_" . $domaincounter{"$name.$annot_counter"};
- push @hsp_list, [ $name, @vals ];
- $hspinfo{"$hsp_key.$annot_counter"} = $#hsp_list;
- }
- else {
- print "Missed this line: $_\n";
}
- }
- elsif ($self->{'_reporttype'} eq 'NHMMER') {
- if (my ($score, $bias, $eval,
- $hmmstart, $hmmstop, $hitstart,
- $hitstop, $envstart, $envstop,
- $hitlength, $acc
+ elsif ( # NHMMER
+ ( $score, $bias, $ceval,
+ $hmm_start, $hmm_stop, $hmm_cov,
+ $seq_start, $seq_stop, $seq_cov,
+ $env_start, $env_stop, $env_cov,
+ $hitlength, $acc ) = (
+ m|^\s+[!?]\s+
+ (\S+)\s+(\S+)\s+(\S+)\s+ # score, bias, evalue
+ (\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
+ (\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
+ (\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
+ (\d+)\s+(\S+) # target length, pp accuracy
+ .*$|ox
)
- = m|^\s+[!?]\s+
- (\S+)\s+(\S+)\s+(\S+)\s+ # score, bias, evalue
- (\d+)\s+(\d+)\s+[.\[\]]*\s+ # hmm start, stop
- (\d+)\s+(\d+)\s+[.\[\]]*\s+ # query start, stop
- (\d+)\s+(\d+)\s+[.\[\]]*\s+ # env start, stop
- (\d+)\s+(\S+) # target length, Accession
- .*$|ox
- )
- {
- my @vals = (
- $hitstart, $hitstop,
- $hmmstart, $hmmstop,
- $score, $eval,
- $hitlength, '',
- '', '',
- '', ''
- );
- my $info = $hit_list[ $hitinfo{"$name.$annot_counter"} ];
- if ( !defined $info ) {
- $self->warn(
- "Incomplete information: can't find HSP $name in list of hits\n"
- );
- next;
- }
- $domaincounter{"$name.$annot_counter"}++;
- my $hsp_key
- = $name . "_" . $domaincounter{"$name.$annot_counter"};
- push @hsp_list, [ $name, @vals ];
- $hspinfo{"$hsp_key.$annot_counter"} = $#hsp_list;
+ ) {
+ # Values assigned when IF succeeded
}
else {
print "Missed this line: $_\n";
+ next;
}
+
+ my $info = $hit_list[ $hitinfo{"$name.$annot_counter"} ];
+ if ( !defined $info ) {
+ $self->warn(
+ "Incomplete information: can't find HSP $name in list of hits\n"
+ );
+ next;
+ }
+
+ $domaincounter{"$name.$annot_counter"}++;
+ my $hsp_key
+ = $name . "_" . $domaincounter{"$name.$annot_counter"};
+
+ # Keep it simple for now. let's customize later
+ @vals = (
+ $hmm_start, $hmm_stop,
+ $seq_start, $seq_stop,
+ $score, $ceval,
+ $hitlength, '',
+ '', '',
+ '', ''
+ );
+ push @hsp_list, [ $name, @vals ];
+ $hspinfo{"$hsp_key.$annot_counter"} = $#hsp_list;
}
}
}
@@ -830,8 +840,7 @@ sub next_result {
}
);
}
- if ( $self->{'_reporttype'} eq 'HMMSCAN'
- or $self->{'_reporttype'} eq 'NHMMER') {
+ if ( $self->{'_reporttype'} eq 'HMMSCAN' ) {
$self->element(
{ 'Name' => 'Hsp_hit-from',
'Data' => shift @$hsp
@@ -853,7 +862,9 @@ sub next_result {
}
);
}
- else { # hmmsearch
+ elsif ( $self->{'_reporttype'} eq 'HMMSEARCH'
+ or $self->{'_reporttype'} eq 'NHMMER'
+ ) {
$self->element(
{ 'Name' => 'Hsp_query-from',
'Data' => shift @$hsp
@@ -885,10 +896,11 @@ sub next_result {
'Data' => shift @$hsp
}
);
- if ( $self->{'_reporttype'} eq 'NHMMER' ) {
+ my $hitlength = shift @$hsp;
+ if ( $hitlength != 0 ) {
$self->element(
{ 'Name' => 'Hit_len',
- 'Data' => shift @$hsp
+ 'Data' => $hitlength
}
);
}
View
31 t/SearchIO/hmmer.t
@@ -7,7 +7,7 @@ BEGIN {
use lib '.';
use Bio::Root::Test;
- test_begin( -tests => 748 );
+ test_begin( -tests => 754 );
use_ok('Bio::SearchIO');
}
@@ -487,7 +487,7 @@ while ( $result = $searchio->next_result ) {
# Query and Hit lengths are usually unknown in HMMER,
# but sometimes they can be deduced from domain data '[]'
- is( $hit->length, 0, 'Check hit length absence' );
+ is( $hit->length, 0, 'Check hit length absence' );
is( $hit->frac_aligned_query, '1.00' );
is( $hit->frac_aligned_hit, undef );
@@ -644,7 +644,8 @@ while ( $result = $searchio->next_result ) {
float_is( $hit->significance, 6e-30, 'Check hit significance' );
is( $hit->num_hsps, 1, 'Check num_hsps' );
- # Hit length is unknown for HMMSCAN and HMMSEARCH but not for NHMMER
+ # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER.
+ # When is not known, sometimes it can be deduced from domain data '[]'
is( $hit->length, 0, 'Check hit length absence' );
is( $hit->frac_aligned_query, 0.87 );
is( $hit->frac_aligned_hit, undef );
@@ -954,7 +955,8 @@ while ( $result = $searchio->next_result ) {
float_is( $hit->significance, 9.3e-189, 'Check hit significance' );
is( $hit->num_hsps, 1, 'Check num_hsps' );
- # Hit length is unknown for HMMSCAN and HMMSEARCH but not for NHMMER
+ # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER.
+ # When is not known, sometimes it can be deduced from domain data '[]'
is( $hit->length, 0, 'Check hit length absence' );
is( $hit->frac_aligned_query, 0.93 );
is( $hit->frac_aligned_hit, undef );
@@ -1071,6 +1073,13 @@ while ( $result = $searchio->next_result ) {
my ( $hsp, $hit );
while ( $hit = $result->next_model ) {
+ if ($hit->name eq 'HemolysinCabind') {
+ # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER.
+ # When is not known, sometimes it can be deduced from domain data '[]'
+ is( $hit->length, 18, 'Check hit length' );
+ is( $hit->frac_aligned_query, 0.03 );
+ is( $hit->frac_aligned_hit, '1.00' );
+ }
my @expected = @{ shift @multi_hits };
is( ref($hit), 'Bio::Search::Hit::HMMERHit',
'Check for the correct hit reference type' );
@@ -1160,6 +1169,13 @@ while ( $result = $searchio->next_result ) {
my ( $hsp, $hit );
while ( $hit = $result->next_model ) {
+ if ($hit->name eq 'PKSI-KS_m3') {
+ # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER.
+ # When is not known, sometimes it can be deduced from domain data '[]'
+ is( $hit->length, 16, 'Check hit length' );
+ is( $hit->frac_aligned_query, 0.09 );
+ is( $hit->frac_aligned_hit, '1.00' );
+ }
my @expected = @{ shift @multi_hits };
is( ref($hit), 'Bio::Search::Hit::HMMERHit',
'Check for the correct hit reference type' );
@@ -1374,7 +1390,8 @@ is( $result->num_hits(), 2, 'Check num_hits' );
float_is( $hit->significance, 3.2e-48, 'Check nhmmer hit significance' );
is( $hit->num_hsps, 1, 'Check num_hsps' );
- # Hit length is unknown for HMMSCAN and HMMSEARCH but not for NHMMER
+ # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER.
+ # When is not known, sometimes it can be deduced from domain data '[]'
is( $hit->length, 151, 'Check nhmmer hit length' );
is( $hit->frac_aligned_query, 0.09 );
is( $hit->frac_aligned_hit, '1.00' );
@@ -1471,8 +1488,8 @@ is( $result->num_hits(), 2, 'Check num_hits' );
is( $hsp->length('hit'), 59, 'Check for hsp hit length' );
is( $hsp->length('total'), 59, 'Check for hsp total length' );
is( $hsp->gaps('query'), 0, 'Check for hsp query gaps' );
- is( $hsp->gaps('hit'), 0, 'Check for hsp hit gaps' );
- is( $hsp->gaps('total'), 0, 'Check for hsp total gaps' );
+ is( $hsp->gaps('hit'), 0, 'Check for hsp hit gaps' );
+ is( $hsp->gaps('total'), 0, 'Check for hsp total gaps' );
($hit->length == 0) ?
is( $hsp->{HIT_LENGTH}, $hsp->hit->length, 'Check hit length consistency' )
Please sign in to comment.
Something went wrong with that request. Please try again.