Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Hmmer2: Removed code redundancy by merging the parsing of

Hmmsearch and Hmmpfam into the same block, removed some unused
variables, and also fixed an previously unnoticed bug where
the query/hit coordinates were flipped for Hmmsearch results
with Histogram lines. Updated tests accordingly.
  • Loading branch information...
commit 76822a4560a9a3fd0c72faecd53843b0b22683d7 1 parent de6fe01
Francisco J. Ossandon fjossandon authored

Showing 2 changed files with 313 additions and 528 deletions. Show diff stats Hide diff stats

  1. +251 514 Bio/SearchIO/hmmer2.pm
  2. +62 14 t/SearchIO/hmmer.t
765 Bio/SearchIO/hmmer2.pm
@@ -161,6 +161,7 @@ sub next_result {
161 161 local ($_);
162 162 while ( defined( $_ = $self->_readline ) ) {
163 163 my $lineorig = $_;
  164 +
164 165 chomp;
165 166 if (/^HMMER\s+(\S+)\s+\((.+)\)/o) {
166 167 my ( $prog, $version ) = split;
@@ -216,7 +217,7 @@ sub next_result {
216 217 }
217 218 );
218 219 }
219   - elsif (s/^Query(\s+(sequence|HMM))?(?:\s+\d+)?:\s+//o) {
  220 + elsif (s/^Query(?:\s+(?:sequence|HMM))?(?:\s+\d+)?:\s+//o) {
220 221 if ( !$seentop ) {
221 222
222 223 # we're in a multi-query report
@@ -252,20 +253,24 @@ sub next_result {
252 253 }
253 254 );
254 255 }
255   - elsif ( defined $self->{'_reporttype'}
256   - && $self->{'_reporttype'} eq 'HMMSEARCH' )
257   - {
258   -
259   - # PROCESS HMMSEARCH RESULTS HERE
260   - if (/^Scores for complete sequences/o) {
  256 + elsif ( defined $self->{'_reporttype'}
  257 + && ( $self->{'_reporttype'} eq 'HMMSEARCH'
  258 + || $self->{'_reporttype'} eq 'HMMPFAM' )
  259 + ) {
  260 + # PROCESS RESULTS HERE
  261 + if (/^Scores for (?:complete sequences|sequence family)/o) {
261 262 while ( defined( $_ = $self->_readline ) ) {
262 263 last if (/^\s+$/);
263   - next if ( /^Sequence\s+Description/o || /^\-\-\-/o );
  264 + next if ( /^Model\s+Description/o
  265 + || /^Sequence\s+Description/o
  266 + || /^\-\-\-/o );
  267 +
  268 + chomp;
264 269 my @line = split;
265   - my ( $name, $n, $evalue, $score ) =
  270 + my ( $name, $domaintotal, $evalue, $score ) =
266 271 ( shift @line, pop @line, pop @line, pop @line );
267 272 my $desc = join( ' ', @line );
268   - push @hitinfo, [ $name, $desc, $score, $evalue, $n ];
  273 + push @hitinfo, [ $name, $desc, $score, $evalue, $domaintotal ];
269 274 $hitinfo{$name} = $#hitinfo;
270 275 }
271 276 }
@@ -278,11 +283,11 @@ sub next_result {
278 283 $self->_pushback($_);
279 284 last;
280 285 }
281   - next if ( /^(Model|Sequence)\s+Domain/ || /^\-\-\-/ );
  286 + next if ( /^(?:Model|Sequence)\s+Domain/ || /^\-\-\-/ );
282 287
283 288 chomp;
284 289 if (
285   - my ( $n, $domainnum, $domainct,
  290 + my ( $name, $domainct, $domaintotal,
286 291 $seq_start, $seq_end, $seq_cov,
287 292 $hmm_start, $hmm_end, $hmm_cov,
288 293 $score, $evalue ) = (
@@ -296,74 +301,118 @@ sub next_result {
296 301 (\S+) # evalue
297 302 \s*$!ox
298 303 )
299   - )
300   - {
301   - my $hindex = $hitinfo{$n};
  304 + ) {
  305 + my $hindex = $hitinfo{$name};
302 306 if ( !defined $hindex ) {
303 307 push @hitinfo,
304   - [ $n, '', $score, $evalue, $domainct ];
305   - $hitinfo{$n} = $#hitinfo;
  308 + [ $name, '', $score, $evalue, $domaintotal ];
  309 + $hitinfo{$name} = $#hitinfo;
306 310 $hindex = $#hitinfo;
307 311 }
  312 +
308 313 my $info = $hitinfo[$hindex];
309 314 if ( !defined $info ) {
310   - $self->warn(
311   - "Incomplete Sequence information, can't find $n hitinfo says $hitinfo{$n}"
312   - );
  315 + if ($self->{'_reporttype'} eq 'HMMSEARCH') {
  316 + $self->warn(
  317 + "Incomplete Sequence information, can't find $name hitinfo says $hitinfo{$name}"
  318 + );
  319 + }
  320 + elsif ($self->{'_reporttype'} eq 'HMMPFAM') {
  321 + $self->warn(
  322 + "Incomplete Domain information, can't find $name hitinfo says $hitinfo{$name}"
  323 + );
  324 + }
313 325 next;
314 326 }
315   - # For Hmmsearch, if seq coverage ends in ']' it means that the alignment
316   - # runs until the end. In that case add the END coordinate to @hitinfo
317   - # to use it as Hit Length
318   - if ( $seq_cov =~ m/\]$/
319   - and scalar @{ $hitinfo[$hindex] } == 5
320   - ) {
321   - push @{ $hitinfo[$hindex] }, $seq_end ;
  327 +
  328 + # Try to get HMM and Sequence lengths from the alignment information
  329 + if ($self->{'_reporttype'} eq 'HMMSEARCH') {
  330 + # For Hmmsearch, if seq coverage ends in ']' it means that the alignment
  331 + # runs until the end. In that case add the END coordinate to @hitinfo
  332 + # to use it as Hit Length
  333 + if ( $seq_cov =~ m/\]$/
  334 + and scalar @{ $hitinfo[$hindex] } == 5
  335 + ) {
  336 + push @{ $hitinfo[$hindex] }, $seq_end ;
  337 + }
  338 + # For Hmmsearch, if hmm coverage ends in ']', it means that the alignment
  339 + # runs until the end. In that case use the END coordinate as Query Length
  340 + if ( $hmm_cov =~ m/\]$/
  341 + and not exists $self->{_values}->{'RESULT-query_length'}
  342 + ) {
  343 + $self->element(
  344 + { 'Name' => 'HMMER_query-len',
  345 + 'Data' => $hmm_end
  346 + }
  347 + );
  348 + }
322 349 }
323   - # For Hmmsearch, if hmm coverage ends in ']', it means that the alignment
324   - # runs until the end. In that case use the END coordinate as Query Length
325   - if ( $hmm_cov =~ m/\]$/
326   - and not exists $self->{_values}->{'RESULT-query_length'}
327   - ) {
328   - $self->element(
329   - { 'Name' => 'HMMER_query-len',
330   - 'Data' => $hmm_end
331   - }
332   - );
  350 + elsif ($self->{'_reporttype'} eq 'HMMPFAM') {
  351 + # For Hmmpfam, if hmm coverage ends in ']' it means that the alignment
  352 + # runs until the end. In that case add the END coordinate to @hitinfo
  353 + # to use it as Hit Length
  354 + if ( $hmm_cov =~ m/\]$/
  355 + and scalar @{ $hitinfo[$hindex] } == 5
  356 + ) {
  357 + push @{ $hitinfo[$hindex] }, $hmm_end ;
  358 + }
  359 + # For Hmmpfam, if seq coverage ends in ']', it means that the alignment
  360 + # runs until the end. In that case use the END coordinate as Query Length
  361 + if ( $seq_cov =~ m/\]$/
  362 + and not exists $self->{_values}->{'RESULT-query_length'}
  363 + ) {
  364 + $self->element(
  365 + { 'Name' => 'HMMER_query-len',
  366 + 'Data' => $seq_end
  367 + }
  368 + );
  369 + }
333 370 }
  371 +
334 372 my @vals = ($seq_start, $seq_end,
335 373 $hmm_start, $hmm_end,
336 374 $score, $evalue);
337   - push @hspinfo, [ $n, @vals ];
  375 + push @hspinfo, [ $name, @vals ];
338 376 }
339 377 }
340 378 }
341 379 elsif (/^Alignments of top/o) {
342   - my ( $prelength, $lastdomain, $count, $width );
  380 + my ( $prelength, $count, $width );
343 381 $count = 0;
344 382 my %domaincounter;
345 383 my $second_tier = 0;
  384 +
346 385 while ( defined( $_ = $self->_readline ) ) {
347   - next if ( /^Align/o
348   - || /^\s+RF\s+[x\s]+$/o );
349   - if ( /^Histogram/o || m!^//!o ) {
  386 + next if ( /^Align/o
  387 + || ( $count != 1 && /^\s+RF\s+[x\s]+$/o )
  388 + );
  389 +
  390 + # fix for bug 2632
  391 + next if ($_ =~ m/^\s+CS\s+/o && $count == 0);
  392 +
  393 + if ( m/^Histogram/o
  394 + || m!^//!o
  395 + || m/^Query(?:\s+(?:sequence|HMM))?(?:\s+\d+)?:/o
  396 + ) {
350 397 if ( $self->in_element('hsp') ) {
351 398 $self->end_element( { 'Name' => 'Hsp' } );
352 399 }
353 400 if ( $self->within_element('hit') ) {
354 401 $self->end_element( { 'Name' => 'Hit' } );
355 402 }
  403 + $self->_pushback($_);
356 404 last;
357 405 }
358   - chomp;
359 406
  407 + chomp;
360 408 if (
361   - m/^\s*(.+):\s+domain\s+(\d+)\s+of\s+(\d+)\,\s+
362   - from\s+(\d+)\s+to\s+(\d+)/x
363   - )
364   - {
365   - my ( $name, $domainct, $domaintotal, $from, $to ) =
366   - ( $1, $2, $3, $4, $5 );
  409 + my ( $name, $domainct, $domaintotal,
  410 + $from, $to ) = (
  411 + m/^\s*(.+):
  412 + \s+ domain \s+ (\d+) \s+ of \s+ (\d+) ,
  413 + \s+ from \s+ (\d+) \s+ to \s+ (\d+)/x
  414 + )
  415 + ) {
367 416 $domaincounter{$name}++;
368 417 if ( $self->within_element('hit') ) {
369 418 if ( $self->within_element('hsp') ) {
@@ -372,20 +421,25 @@ sub next_result {
372 421 $self->end_element( { 'Name' => 'Hit' } );
373 422 }
374 423
375   - $self->start_element( { 'Name' => 'Hit' } );
376   - my $info = [
377   - @{
378   - $hitinfo[ $hitinfo{$name} ] || $self->throw(
379   -"Could not find hit info for $name: Insure that your database contains only unique sequence names"
380   - )
381   - }
382   - ];
383   - if ( $info->[0] ne $name ) {
384   - $self->throw(
385   -"Somehow the Model table order does not match the order in the domains (got "
386   - . $info->[0]
387   - . ", expected $name)" );
  424 + my $info = [ @{ $hitinfo[ $hitinfo{$name} ] } ];
  425 + if ( !defined $info
  426 + || $info->[0] ne $name
  427 + ) {
  428 + $self->warn(
  429 + "Somehow the Model table order does not match the order in the domains (got "
  430 + . $info->[0]
  431 + . ", expected $name). We're back loading this from the alignment information instead"
  432 + );
  433 + $info = [
  434 + $name, '',
  435 + /score \s+ ([^,\s]+), \s+E\s+=\s+ (\S+)/ox,
  436 + $domaintotal
  437 + ];
  438 + push @hitinfo, $info;
  439 + $hitinfo{$name} = $#hitinfo;
388 440 }
  441 +
  442 + $self->start_element( { 'Name' => 'Hit' } );
389 443 $self->element(
390 444 {
391 445 'Name' => 'Hit_id',
@@ -410,12 +464,12 @@ sub next_result {
410 464 'Data' => shift @{$info}
411 465 }
412 466 );
413   - my $dom_ct = shift @{$info};
414   - if (my $hmm_end = shift @{$info}) {
  467 + my $dom_total = shift @{$info};
  468 + if (my $hit_end = shift @{$info}) {
415 469 $self->element(
416 470 {
417 471 'Name' => 'Hit_len',
418   - 'Data' => $hmm_end
  472 + 'Data' => $hit_end
419 473 }
420 474 );
421 475 }
@@ -426,434 +480,65 @@ sub next_result {
426 480
427 481 if ( $id ne $name ) {
428 482 $self->throw(
429   -"Somehow the domain list details do not match the table (got $id, expected $name)"
  483 + "Somehow the domain list details do not match "
  484 + . "the table (got $id, expected $name)"
430 485 );
431 486 }
432   - if ( $domaincounter{$name} == $domaintotal ) {
433   - $hitinfo[ $hitinfo{$name} ] = undef;
434   - }
435   - $self->element(
436   - {
437   - 'Name' => 'Hsp_hit-from',
438   - 'Data' => shift @$HSPinfo
439   - }
440   - );
441   - $self->element(
442   - {
443   - 'Name' => 'Hsp_hit-to',
444   - 'Data' => shift @$HSPinfo
445   - }
446   - );
447   - $self->element(
448   - {
449   - 'Name' => 'Hsp_query-from',
450   - 'Data' => shift @$HSPinfo
451   - }
452   - );
453   - $self->element(
454   - {
455   - 'Name' => 'Hsp_query-to',
456   - 'Data' => shift @$HSPinfo
457   - }
458   - );
459   - $self->element(
460   - {
461   - 'Name' => 'Hsp_score',
462   - 'Data' => shift @$HSPinfo
463   - }
464   - );
465   - $self->element(
466   - {
467   - 'Name' => 'Hsp_evalue',
468   - 'Data' => shift @$HSPinfo
469   - }
470   - );
471   - $lastdomain = $name;
472   - }
473   - else {
474 487
475   - # Might want to change this so that it
476   - # accumulates all the of the alignment lines into
477   - # three array slots and then tests for the
478   - # end of the line
479   - if (/^(\s+\*\-\>)(\S+)/o) { # start of domain
480   - $prelength = CORE::length($1);
481   - $width = 0;
482   -
483   - # deal with fact that start en stop is on same line
484   - my $data = $2;
485   - if ($data =~ s/\<\-?\*?\s*$//)
486   - {
487   - $width = CORE::length($data);
488   - }
489   -
  488 + if ($self->{'_reporttype'} eq 'HMMSEARCH') {
490 489 $self->element(
491 490 {
492   - 'Name' => 'Hsp_qseq',
493   - 'Data' => $data
  491 + 'Name' => 'Hsp_hit-from',
  492 + 'Data' => shift @$HSPinfo
494 493 }
495 494 );
496   - $count = 0;
497   - $second_tier = 0;
498   - }
499   - elsif (/^(\s+)(\S+)\<\-\*\s*$/o) { #end of domain
500 495 $self->element(
501 496 {
502   - 'Name' => 'Hsp_qseq',
503   - 'Data' => $2
  497 + 'Name' => 'Hsp_hit-to',
  498 + 'Data' => shift @$HSPinfo
504 499 }
505 500 );
506   - $width = CORE::length($2);
507   - $count = 0;
508   - }
509   - elsif (( $count != 1 && /^\s+$/o )
510   - || CORE::length($_) == 0
511   - || /^\s+\-?\*\s*$/ )
512   - {
513   - next;
514   - }
515   - elsif ( $count == 0 ) {
516   - $prelength -= 3 unless ( $second_tier++ );
517   - unless ( defined $prelength ) {
518   -
519   - # $self->warn("prelength not set");
520   - next;
521   - }
522 501 $self->element(
523 502 {
524   - 'Name' => 'Hsp_qseq',
525   - 'Data' => substr( $_, $prelength )
  503 + 'Name' => 'Hsp_query-from',
  504 + 'Data' => shift @$HSPinfo
526 505 }
527 506 );
528   - }
529   - elsif ( $count == 1 ) {
530   - if ( !defined $prelength ) {
531   - $self->warn("prelength not set");
532   - }
533   - if ($width) {
534   - $self->element(
535   - {
536   - 'Name' => 'Hsp_midline',
537   - 'Data' =>
538   - substr( $_, $prelength, $width )
539   - }
540   - );
541   - }
542   - else {
543   - $self->element(
544   - {
545   - 'Name' => 'Hsp_midline',
546   - 'Data' => substr( $_, $prelength )
547   - }
548   - );
549   - }
550   - }
551   - elsif ( $count == 2 ) {
552   - if (/^\s+(\S+)\s+(\d+|\-)\s+(\S*)\s+(\d+|\-)/o) {
553   - $self->element(
554   - {
555   - 'Name' => 'Hsp_hseq',
556   - 'Data' => $3
557   - }
558   - );
559   - }
560   - else {
561   - $self->warn("unrecognized line: $_\n");
562   - }
563   - }
564   - $count = 0 if $count++ >= 2;
565   - }
566   - }
567   - }
568   - elsif ( /^Histogram/o || m!^//!o ) {
569   - while ( my $HSPinfo = shift @hspinfo ) {
570   - my $id = shift @$HSPinfo;
571   - my $info = [ @{ $hitinfo[ $hitinfo{$id} ] } ];
572   - next unless defined $info;
573   - $self->start_element( { 'Name' => 'Hit' } );
574   - $self->element(
575   - {
576   - 'Name' => 'Hit_id',
577   - 'Data' => shift @{$info}
578   - }
579   - );
580   - $self->element(
581   - {
582   - 'Name' => 'Hit_desc',
583   - 'Data' => shift @{$info}
584   - }
585   - );
586   - $self->element(
587   - {
588   - 'Name' => 'Hit_score',
589   - 'Data' => shift @{$info}
590   - }
591   - );
592   - $self->element(
593   - {
594   - 'Name' => 'Hit_signif',
595   - 'Data' => shift @{$info}
596   - }
597   - );
598   - my $dom_ct = shift @{$info};
599   - if (my $hmm_end = shift @{$info}) {
600   - $self->element(
601   - {
602   - 'Name' => 'Hit_len',
603   - 'Data' => $hmm_end
604   - }
605   - );
606   - }
607   -
608   - $self->start_element( { 'Name' => 'Hsp' } );
609   - $self->element(
610   - {
611   - 'Name' => 'Hsp_query-from',
612   - 'Data' => shift @$HSPinfo
613   - }
614   - );
615   - $self->element(
616   - {
617   - 'Name' => 'Hsp_query-to',
618   - 'Data' => shift @$HSPinfo
619   - }
620   - );
621   - $self->element(
622   - {
623   - 'Name' => 'Hsp_hit-from',
624   - 'Data' => shift @$HSPinfo
625   - }
626   - );
627   - $self->element(
628   - {
629   - 'Name' => 'Hsp_hit-to',
630   - 'Data' => shift @$HSPinfo
631   - }
632   - );
633   - $self->element(
634   - {
635   - 'Name' => 'Hsp_score',
636   - 'Data' => shift @$HSPinfo
637   - }
638   - );
639   - $self->element(
640   - {
641   - 'Name' => 'Hsp_evalue',
642   - 'Data' => shift @$HSPinfo
643   - }
644   - );
645   - $self->end_element( { 'Name' => 'Hsp' } );
646   - $self->end_element( { 'Name' => 'Hit' } );
647   - }
648   - @hitinfo = ();
649   - %hitinfo = ();
650   - last;
651   - }
652   - }
653   - elsif ( defined $self->{'_reporttype'}
654   - && $self->{'_reporttype'} eq 'HMMPFAM' )
655   - {
656   - # process HMMPFAM results here
657   - if (/^Scores for sequence family/o) {
658   - while ( defined( $_ = $self->_readline ) ) {
659   - last if (/^\s+$/);
660   - next if ( /^Model\s+Description/o || /^\-\-\-/o );
661   - chomp;
662   - my @line = split;
663   - my ( $model, $n, $evalue, $score ) =
664   - ( shift @line, pop @line, pop @line, pop @line );
665   - my $desc = join( ' ', @line );
666   - push @hitinfo, [ $model, $desc, $score, $evalue, $n ];
667   - $hitinfo{$model} = $#hitinfo;
668   - }
669   - }
670   - elsif (/^Parsed for domains:/o) {
671   - @hspinfo = ();
672   - while ( defined( $_ = $self->_readline ) ) {
673   - last if (/^\s+$/);
674   - if (m!^//!) {
675   - $self->_pushback($_);
676   - last;
677   - }
678   - next if ( /^Model\s+Domain/o || /^\-\-\-/o );
679   - chomp;
680   - if (
681   - my ( $n, $domainnum, $domainct,
682   - $seq_start, $seq_end, $seq_cov,
683   - $hmm_start, $hmm_end, $hmm_cov,
684   - $score, $evalue ) = (
685   - m!^(\S+)\s+ # domain name
686   - (\d+)/(\d+)\s+ # domain num out of num
687   - (\d+)\s+(\d+)\s+ # seq start, end
688   - (\S+)\s+ # seq coverage
689   - (\d+)\s+(\d+)\s+ # hmm start, end
690   - (\S+)\s+ # hmm coverage
691   - (\S+)\s+ # score
692   - (\S+) # evalue
693   - \s*$!ox
694   - )
695   - )
696   - {
697   - my $hindex = $hitinfo{$n};
698   - if ( !defined $hindex ) {
699   - push @hitinfo,
700   - [ $n, '', $score, $evalue, $domainct ];
701   - $hitinfo{$n} = $#hitinfo;
702   - $hindex = $#hitinfo;
703   - }
704   - my $info = $hitinfo[$hindex];
705   - if ( !defined $info ) {
706   - $self->warn(
707   - "Incomplete Domain information, can't find $n hitinfo says $hitinfo{$n}"
  507 + $self->element(
  508 + {
  509 + 'Name' => 'Hsp_query-to',
  510 + 'Data' => shift @$HSPinfo
  511 + }
708 512 );
709   - next;
710   - }
711   - # For Hmmpfam, if hmm coverage ends in ']' it means that the alignment
712   - # runs until the end. In that case add the END coordinate to @hitinfo
713   - # to use it as Hit Length
714   - if ( $hmm_cov =~ m/\]$/
715   - and scalar @{ $hitinfo[$hindex] } == 5
716   - ) {
717   - push @{ $hitinfo[$hindex] }, $hmm_end ;
718 513 }
719   - # For Hmmpfam, if seq coverage ends in ']', it means that the alignment
720   - # runs until the end. In that case use the END coordinate as Query Length
721   - if ( $seq_cov =~ m/\]$/
722   - and not exists $self->{_values}->{'RESULT-query_length'}
723   - ) {
  514 + elsif ($self->{'_reporttype'} eq 'HMMPFAM') {
724 515 $self->element(
725   - { 'Name' => 'HMMER_query-len',
726   - 'Data' => $seq_end
  516 + {
  517 + 'Name' => 'Hsp_query-from',
  518 + 'Data' => shift @$HSPinfo
727 519 }
728 520 );
729   - }
730   - my @vals = ($seq_start, $seq_end,
731   - $hmm_start, $hmm_end,
732   - $score, $evalue);
733   - push @hspinfo, [ $n, @vals ];
734   - }
735   - }
736   - }
737   - elsif (/^Alignments of top/o) {
738   - my ( $prelength, $lastdomain, $count, $width );
739   - $count = 0;
740   - my $second_tier = 0;
741   - while ( defined( $_ = $self->_readline ) ) {
742   - next
743   - if (
744   - /^Align/o
745   - || ( $count != 1
746   - && /^\s+RF\s+[x\s]+$/o )
747   - );
748   - # fix for bug 2632
749   - next if ($_ =~ m/^\s+CS\s+/o && $count == 0);
750   - if ( /^Histogram/o || m!^//!o || /^Query sequence/o ) {
751   - if ( $self->in_element('hsp') ) {
752   - $self->end_element( { 'Name' => 'Hsp' } );
753   - }
754   - if ( $self->in_element('hit') ) {
755   - $self->end_element( { 'Name' => 'Hit' } );
756   - }
757   - $self->_pushback($_);
758   - last;
759   - }
760   - chomp;
761   - if (m/(\S+):.*from\s+(\d+)\s+to\s+(\d+)/o) {
762   - my ( $name, $from, $to ) = ( $1, $2, $3 );
763   -
764   - if ( $self->within_element('hit') ) {
765   - if ( $self->in_element('hsp') ) {
766   - $self->end_element( { 'Name' => 'Hsp' } );
767   - }
768   - $self->end_element( { 'Name' => 'Hit' } );
769   - }
770   - my $info = [ @{ $hitinfo[ $hitinfo{$name} ] } ];
771   - if ( !defined $info
772   - || $info->[0] ne $name )
773   - {
774   - $self->warn(
775   - "Somehow the Model table order does not match the order in the domains (got "
776   - . $info->[0]
777   - . ", expected $name). We're back loading this from the alignment information instead"
  521 + $self->element(
  522 + {
  523 + 'Name' => 'Hsp_query-to',
  524 + 'Data' => shift @$HSPinfo
  525 + }
778 526 );
779   - $info = [
780   - $name, '',
781   - /score\s+([^,\s]+),\s+E\s+=\s+(\S+)/ox
782   - ];
783   - push @hitinfo, $info;
784   - $hitinfo{$name} = $#hitinfo;
785   - }
786   - $self->start_element( { 'Name' => 'Hit' } );
787   -
788   - $self->element(
789   - {
790   - 'Name' => 'Hit_id',
791   - 'Data' => shift @{$info}
792   - }
793   - );
794   - $self->element(
795   - {
796   - 'Name' => 'Hit_desc',
797   - 'Data' => shift @{$info}
798   - }
799   - );
800   - $self->element(
801   - {
802   - 'Name' => 'Hit_score',
803   - 'Data' => shift @{$info}
804   - }
805   - );
806   - $self->element(
807   - {
808   - 'Name' => 'Hit_signif',
809   - 'Data' => shift @{$info}
810   - }
811   - );
812   - my $dom_ct = shift @{$info};
813   - if (my $hmm_end = shift @{$info}) {
814 527 $self->element(
815 528 {
816   - 'Name' => 'Hit_len',
817   - 'Data' => $hmm_end
  529 + 'Name' => 'Hsp_hit-from',
  530 + 'Data' => shift @$HSPinfo
818 531 }
819 532 );
820   - }
821   -
822   - $self->start_element( { 'Name' => 'Hsp' } );
823   - my $HSPinfo = shift @hspinfo;
824   - my $id = shift @$HSPinfo;
825   -
826   - if ( $id ne $name ) {
827   - $self->throw(
828   -"Somehow the domain list details do not match the table (got $id, expected $name)"
  533 + $self->element(
  534 + {
  535 + 'Name' => 'Hsp_hit-to',
  536 + 'Data' => shift @$HSPinfo
  537 + }
829 538 );
830 539 }
831 540 $self->element(
832 541 {
833   - 'Name' => 'Hsp_query-from',
834   - 'Data' => shift @$HSPinfo
835   - }
836   - );
837   - $self->element(
838   - {
839   - 'Name' => 'Hsp_query-to',
840   - 'Data' => shift @$HSPinfo
841   - }
842   - );
843   - $self->element(
844   - {
845   - 'Name' => 'Hsp_hit-from',
846   - 'Data' => shift @$HSPinfo
847   - }
848   - );
849   - $self->element(
850   - {
851   - 'Name' => 'Hsp_hit-to',
852   - 'Data' => shift @$HSPinfo
853   - }
854   - );
855   - $self->element(
856   - {
857 542 'Name' => 'Hsp_score',
858 543 'Data' => shift @$HSPinfo
859 544 }
@@ -864,49 +549,74 @@ sub next_result {
864 549 'Data' => shift @$HSPinfo
865 550 }
866 551 );
867   - $lastdomain = $name;
  552 +
  553 + if ( $domaincounter{$name} == $domaintotal ) {
  554 + $hitinfo[ $hitinfo{$name} ] = undef;
  555 + }
868 556 }
869 557 else {
870   - if (/^(\s+\*\-\>)(\S+)/o) {
871 558
  559 + # Might want to change this so that it
  560 + # accumulates all the of the alignment lines into
  561 + # three array slots and then tests for the
  562 + # end of the line
  563 + if (/^(\s+ \*->) (\S+)/ox) {
872 564 # start of domain
873 565 $prelength = CORE::length($1);
874 566 $width = 0;
875 567
876   - # deal with fact that start en stop is on same line
  568 + # deal with fact that start and stop is on same line
877 569 my $data = $2;
878   - if ($data =~ s/\<\-?\*?\s*$//)
  570 + if ($data =~ s/<-?\*?\s*$//)
879 571 {
880   - $width = CORE::length($data);
  572 + $width = CORE::length($data);
881 573 }
882 574
883   - $self->element(
884   - {
885   - 'Name' => 'Hsp_hseq',
886   - 'Data' => $data
887   - }
888   - );
  575 + if ($self->{'_reporttype'} eq 'HMMSEARCH') {
  576 + $self->element(
  577 + {
  578 + 'Name' => 'Hsp_qseq',
  579 + 'Data' => $data
  580 + }
  581 + );
  582 + }
  583 + elsif ($self->{'_reporttype'} eq 'HMMPFAM') {
  584 + $self->element(
  585 + {
  586 + 'Name' => 'Hsp_hseq',
  587 + 'Data' => $data
  588 + }
  589 + );
  590 + }
889 591 $count = 0;
890 592 $second_tier = 0;
891   -
892 593 }
893   - elsif (/^(\s+)(\S+)\<\-?\*?\s*$/o) {
894   -
895   - #end of domain
  594 + elsif (/^(\s+) (\S+) <-?\*? \s*$/ox) {
  595 + # end of domain
896 596 $prelength -= 3 unless ( $second_tier++ );
897   - $self->element(
898   - {
899   - 'Name' => 'Hsp_hseq',
900   - 'Data' => $2
901   - }
902   - );
  597 + if ($self->{'_reporttype'} eq 'HMMSEARCH') {
  598 + $self->element(
  599 + {
  600 + 'Name' => 'Hsp_qseq',
  601 + 'Data' => $2
  602 + }
  603 + );
  604 + }
  605 + elsif ($self->{'_reporttype'} eq 'HMMPFAM') {
  606 + $self->element(
  607 + {
  608 + 'Name' => 'Hsp_hseq',
  609 + 'Data' => $2
  610 + }
  611 + );
  612 + }
903 613 $width = CORE::length($2);
904 614 $count = 0;
905 615 }
906   - elsif (CORE::length($_) == 0
907   - || ( $count != 1 && /^\s+$/o )
908   - || /^\s+\-?\*\s*$/
909   - || /^\s+\S+\s+\-\s+\-\s*$/ )
  616 + elsif ( ( $count != 1 && /^\s+$/o )
  617 + || CORE::length($_) == 0
  618 + || /^\s+\-?\*\s*$/
  619 + || /^\s+\S+\s+\-\s+\-\s*$/ )
910 620 {
911 621 next;
912 622 }
@@ -917,12 +627,22 @@ sub next_result {
917 627 # $self->warn("prelength not set");
918 628 next;
919 629 }
920   - $self->element(
921   - {
922   - 'Name' => 'Hsp_hseq',
923   - 'Data' => substr( $_, $prelength )
924   - }
925   - );
  630 + if ($self->{'_reporttype'} eq 'HMMSEARCH') {
  631 + $self->element(
  632 + {
  633 + 'Name' => 'Hsp_qseq',
  634 + 'Data' => substr( $_, $prelength )
  635 + }
  636 + );
  637 + }
  638 + elsif ($self->{'_reporttype'} eq 'HMMPFAM') {
  639 + $self->element(
  640 + {
  641 + 'Name' => 'Hsp_hseq',
  642 + 'Data' => substr( $_, $prelength )
  643 + }
  644 + );
  645 + }
926 646 }
927 647 elsif ( $count == 1 ) {
928 648 if ( !defined $prelength ) {
@@ -947,19 +667,26 @@ sub next_result {
947 667 }
948 668 }
949 669 elsif ( $count == 2 ) {
950   - if ( /^\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)/o
951   - || /^\s+(\S+)\s+(\-)\s+(\S*)\s+(\-)/o )
952   - {
953   - $self->element(
954   - {
955   - 'Name' => 'Hsp_qseq',
956   - 'Data' => $3
957   - }
958   - );
  670 + if ( /^\s+(\S+)\s+(\d+|\-)\s+(\S*)\s+(\d+|\-)/o) {
  671 + if ($self->{'_reporttype'} eq 'HMMSEARCH') {
  672 + $self->element(
  673 + {
  674 + 'Name' => 'Hsp_hseq',
  675 + 'Data' => $3
  676 + }
  677 + );
  678 + }
  679 + elsif ($self->{'_reporttype'} eq 'HMMPFAM') {
  680 + $self->element(
  681 + {
  682 + 'Name' => 'Hsp_qseq',
  683 + 'Data' => $3
  684 + }
  685 + );
  686 + }
959 687 }
960 688 else {
961   - $self->throw(
962   - "unrecognized line ($count): $_\n");
  689 + $self->warn("unrecognized line ($count): $_\n");
963 690 }
964 691 }
965 692 $count = 0 if $count++ >= 2;
@@ -967,11 +694,15 @@ sub next_result {
967 694 }
968 695 }
969 696 elsif ( /^Histogram/o || m!^//!o ) {
  697 + my %domaincounter;
970 698
971 699 while ( my $HSPinfo = shift @hspinfo ) {
972 700 my $id = shift @$HSPinfo;
  701 + $domaincounter{$id}++;
  702 +
973 703 my $info = [ @{ $hitinfo[ $hitinfo{$id} ] } ];
974 704 next unless defined $info;
  705 +
975 706 $self->start_element( { 'Name' => 'Hit' } );
976 707 $self->element(
977 708 {
@@ -997,38 +728,40 @@ sub next_result {
997 728 'Data' => shift @{$info}
998 729 }
999 730 );
1000   - my $dom_ct = shift @{$info};
1001   - if (my $hmm_end = shift @{$info}) {
  731 + my $domaintotal = shift @{$info};
  732 + if (my $hit_end = shift @{$info}) {
1002 733 $self->element(
1003 734 {
1004 735 'Name' => 'Hit_len',
1005   - 'Data' => $hmm_end
  736 + 'Data' => $hit_end
1006 737 }
1007 738 );
1008 739 }
1009 740
  741 + # Histogram is exclusive of Hmmsearch, not found in Hmmpfam,
  742 + # so just use Hmmsearch start/end order (first hit, then query)
1010 743 $self->start_element( { 'Name' => 'Hsp' } );
1011 744 $self->element(
1012 745 {
1013   - 'Name' => 'Hsp_query-from',
  746 + 'Name' => 'Hsp_hit-from',
1014 747 'Data' => shift @$HSPinfo
1015 748 }
1016   - );
  749 + );
1017 750 $self->element(
1018 751 {
1019   - 'Name' => 'Hsp_query-to',
  752 + 'Name' => 'Hsp_hit-to',
1020 753 'Data' => shift @$HSPinfo
1021 754 }
1022 755 );
1023 756 $self->element(
1024 757 {
1025   - 'Name' => 'Hsp_hit-from',
  758 + 'Name' => 'Hsp_query-from',
1026 759 'Data' => shift @$HSPinfo
1027 760 }
1028 761 );
1029 762 $self->element(
1030 763 {
1031   - 'Name' => 'Hsp_hit-to',
  764 + 'Name' => 'Hsp_query-to',
1032 765 'Data' => shift @$HSPinfo
1033 766 }
1034 767 );
@@ -1046,6 +779,10 @@ sub next_result {
1046 779 );
1047 780 $self->end_element( { 'Name' => 'Hsp' } );
1048 781 $self->end_element( { 'Name' => 'Hit' } );
  782 +
  783 + if ( $domaincounter{$id} == $domaintotal ) {
  784 + $hitinfo[ $hitinfo{$id} ] = undef;
  785 + }
1049 786 }
1050 787 @hitinfo = ();
1051 788 %hitinfo = ();
76 t/SearchIO/hmmer.t
@@ -7,7 +7,7 @@ BEGIN {
7 7 use lib '.';
8 8 use Bio::Root::Test;
9 9
10   - test_begin( -tests => 709 );
  10 + test_begin( -tests => 740 );
11 11
12 12 use_ok('Bio::SearchIO');
13 13 }
@@ -37,7 +37,7 @@ while ( $result = $searchio->next_result ) {
37 37 is( $result->num_hits(), 2, 'Check num_hits' );
38 38 my ( $hsp, $hit );
39 39
40   - if ( $hit = $result->next_model ) {
  40 + if ( defined( $hit = $result->next_model ) ) {
41 41 is( ref($hit), 'Bio::Search::Hit::HMMERHit',
42 42 'Check for the correct hit reference type' );
43 43 is( $hit->name, 'SEED', 'Check hit name' );
@@ -251,28 +251,76 @@ while ( $result = $searchio->next_result ) {
251 251 is( $hsp->query->seq_id(), 'SEED', 'Check for query seq_id' );
252 252 is( $hsp->hit->seq_id(), 'Q91581', 'Check for hit seq_id' );
253 253
254   - is( $hsp->hit->start, 1, 'Check for hit hmmfrom value' );
255   - is( $hsp->hit->end, 77, 'Check for hit hmm to value' );
256   - is( $hsp->query->start, 18, 'Check for query alifrom value' );
257   - is( $hsp->query->end, 89, 'Check for query ali to value' );
  254 + is( $hsp->hit->start, 18, 'Check for hit hmmfrom value' );
  255 + is( $hsp->hit->end, 89, 'Check for hit hmm to value' );
  256 + is( $hsp->query->start, 1, 'Check for query alifrom value' );
  257 + is( $hsp->query->end, 77, 'Check for query ali to value' );
258 258 is( $hsp->score, 119.7, 'Check for hsp score' );
259 259 float_is( $hsp->evalue, 2e-31, 'Check for hsp c-Evalue' );
260 260
261   - is( $hsp->length('query'), 72, 'Check for hsp query length' );
262   - is( $hsp->length('hit'), 77, 'Check for hsp hit length' );
  261 + is( $hsp->length('query'), 77, 'Check for hsp query length' );
  262 + is( $hsp->length('hit'), 72, 'Check for hsp hit length' );
263 263 is( $hsp->length('total'), 0, 'Check for hsp total length' );
264 264 is( $hsp->gaps('query'), 0, 'Check for hsp query gaps' );
265 265 is( $hsp->gaps('hit'), 0, 'Check for hsp hit gaps' );
266 266 is( $hsp->gaps('total'), 0, 'Check for hsp total gaps' );
267 267
  268 + my $example_counter = 0;
268 269 while ($hit = $result->next_model) {
269 270 if ($hit->name eq 'Q61954') {
270   - # Query and Hit lengths are usually unknown in HMMER,
271   - # but sometimes they can be deduced from domain data '[]'
272   - is( $hit->length, 153, 'Check hit length' );
273   - is( $hit->frac_aligned_query, 0.83 );
274   - is( $hit->frac_aligned_hit, '0.50' );
275   - last;
  271 + $example_counter++;
  272 + if ($example_counter == 1) {
  273 + # Query and Hit lengths are usually unknown in HMMER,
  274 + # but sometimes they can be deduced from domain data '[]'
  275 + is( $hit->length, 153, 'Check hit length' );
  276 + is( $hit->frac_aligned_query, '1.00' );
  277 + is( $hit->frac_aligned_hit, 0.42 );
  278 +
  279 + $hsp = $hit->next_domain;
  280 + is( $hsp->query->seq_id(), 'SEED', 'Check for query seq_id' );
  281 + is( $hsp->hit->seq_id(), 'Q61954', 'Check for hit seq_id' );
  282 +
  283 + is( $hsp->hit->start, 26, 'Check for hit hmmfrom value' );
  284 + is( $hsp->hit->end, 89, 'Check for hit hmm to value' );
  285 + is( $hsp->query->start, 1, 'Check for query alifrom value' );
  286 + is( $hsp->query->end, 77, 'Check for query ali to value' );
  287 + is( $hsp->score, 72.9, 'Check for hsp score' );
  288 + float_is( $hsp->evalue, 2.4e-17, 'Check for hsp c-Evalue' );
  289 +
  290 + is( $hsp->length('query'), 77, 'Check for hsp query length' );
  291 + is( $hsp->length('hit'), 64, 'Check for hsp hit length' );
  292 + is( $hsp->length('total'), 0, 'Check for hsp total length' );
  293 + is( $hsp->gaps('query'), 0, 'Check for hsp query gaps' );
  294 + is( $hsp->gaps('hit'), 0, 'Check for hsp hit gaps' );
  295 + is( $hsp->gaps('total'), 0, 'Check for hsp total gaps' );
  296 + }
  297 + elsif ($example_counter == 2) {
  298 + # Query and Hit lengths are usually unknown in HMMER,
  299 + # but sometimes they can be deduced from domain data '[]'
  300 + is( $hit->length, 153, 'Check hit length' );
  301 + is( $hit->frac_aligned_query, '1.00' );
  302 + is( $hit->frac_aligned_hit, 0.34 );
  303 +
  304 + $hsp = $hit->next_domain;
  305 + is( $hsp->query->seq_id(), 'SEED', 'Check for query seq_id' );
  306 + is( $hsp->hit->seq_id(), 'Q61954', 'Check for hit seq_id' );
  307 +
  308 + is( $hsp->hit->start, 102, 'Check for hit hmmfrom value' );
  309 + is( $hsp->hit->end, 153, 'Check for hit hmm to value' );
  310 + is( $hsp->query->start, 1, 'Check for query alifrom value' );
  311 + is( $hsp->query->end, 77, 'Check for query ali to value' );
  312 + is( $hsp->score, 3.3, 'Check for hsp score' );
  313 + float_is( $hsp->evalue, 1.9, 'Check for hsp c-Evalue' );
  314 +
  315 + is( $hsp->length('query'), 77, 'Check for hsp query length' );
  316 + is( $hsp->length('hit'), 52, 'Check for hsp hit length' );
  317 + is( $hsp->length('total'), 0, 'Check for hsp total length' );
  318 + is( $hsp->gaps('query'), 0, 'Check for hsp query gaps' );
  319 + is( $hsp->gaps('hit'), 0, 'Check for hsp hit gaps' );
  320 + is( $hsp->gaps('total'), 0, 'Check for hsp total gaps' );
  321 +
  322 + last;
  323 + }
276 324 }
277 325 }
278 326 }

0 comments on commit 76822a4

Please sign in to comment.
Something went wrong with that request. Please try again.