diff --git a/.travis.yml b/.travis.yml index 4e22586177..a2ccf8f0a5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ install: - "cpanm XML::SAX XML::SAX::Writer XML::Simple XML::LibXML XML::Twig XML::Writer 2>&1 | tail -n 1" - "cpanm PostScript::TextBlock Set::Scalar Sort::Naturally YAML | tail -n 1" - "cpanm Math::Random SOAP::Lite Spreadsheet::ParseExcel | tail -n 1" - #- "cpanm Bio::ASN1::EntrezGene | tail -n 1" + - "cpanm Bio::ASN1::EntrezGene | tail -n 1" - "cpanm Bio::Phylo | tail -n 1" #for some reason tests and deps aren't skipped here. Will have to look into it more... #git repos, seems to only work for simple checkouts, so pure perl only (TODO: look into before_script for more detail) @@ -34,7 +34,12 @@ script: #TODO - send emails to bioperl-guts-l notifications: - email: false + email: + recipients: + - bioperl-guts-l@lists.open-bio.org + - cjfields1@gmail.com + on_success: change + on_failure: change # whitelist branches branches: diff --git a/Bio/Align/Utilities.pm b/Bio/Align/Utilities.pm index 90bc80c547..b2815185fd 100644 --- a/Bio/Align/Utilities.pm +++ b/Bio/Align/Utilities.pm @@ -1,3 +1,20 @@ +package Bio::Align::Utilities; +use strict; +use warnings; +use Carp; +use Bio::Root::Version; + +use Exporter 'import'; +our @EXPORT_OK = qw( + aa_to_dna_aln + bootstrap_replicates + cat + bootstrap_replicates_codons + dna_to_aa_aln + most_common_sequences +); +our %EXPORT_TAGS = (all => \@EXPORT_OK); + # # BioPerl module for Bio::Align::Utilities # @@ -87,28 +104,9 @@ Internal methods are usually preceded with a _ =cut -#' keep my emacs happy -# Let the code begin... - -package Bio::Align::Utilities; -use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS ); -use strict; -use Carp; -use Bio::Root::Version; -require Exporter; - -use base qw(Exporter); - -@EXPORT = qw(); -@EXPORT_OK = - qw(aa_to_dna_aln bootstrap_replicates cat bootstrap_replicates_codons dna_to_aa_aln); -%EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); - -BEGIN { - use constant CODONSIZE => 3; - $GAP = '-'; - $CODONGAP = $GAP x CODONSIZE; -} +use constant CODONSIZE => 3; +our $GAP = '-'; +our $CODONGAP = $GAP x CODONSIZE; =head2 aa_to_dna_aln @@ -503,4 +501,41 @@ sub cat { return $aln; } + +=head2 most_common_sequences + + Title : most_common_sequences + Usage : @common = most_common_sequences ($align, $case_sensitivity) + Function : Returns an array of the sequences that appear most often in the + alignment (although this probably makes more sense when there is + only a single most common sequence). Sequences are compared after + removing any "-" (gap characters), and ambiguous units (e.g., R + for purines) are only compared to themselves. The returned + sequence is also missing the "-" since they don't actually make + part of the sequence. + Returns : Array of text strings. + Arguments : Optional argument defining whether the comparison between sequences + to find the most common should be case sensitive. Defaults to + false, i.e, not case sensitive. + +=cut + +sub most_common_sequences { + my $align = shift + or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences"); + my $case_sensitive = shift; # defaults to false (we get undef if nothing) + + ## We keep track of the max on this loop. Saves us having to + ## transverse the hash table later to find the maximum value. + my $max = 0; + my %counts; + foreach ($align->each_seq) { + (my $seq = $_->seq) =~ tr/-//d; + $seq = uc ($seq) unless $case_sensitive; + $max++ if (++$counts{$seq} > $max); + } + my @common = grep ($counts{$_} == $max, keys %counts); + return @common; +} + 1; diff --git a/Bio/Assembly/Singlet.pm b/Bio/Assembly/Singlet.pm index 92ee21a734..4f648a816b 100644 --- a/Bio/Assembly/Singlet.pm +++ b/Bio/Assembly/Singlet.pm @@ -157,6 +157,10 @@ sub _seq_to_singlet { -start => 1, #-end => we let Bio::LocatableSeq calculate it (Seq and LocatableSeq) ); + # Get End from $seq if $lseq can't figure it out (e.g. phrap output) + if (not defined $lseq->end) { + $lseq->end($seq->end); + } # Add new sequence and its coordinates to the contig my $lcoord = Bio::SeqFeature::Generic->new( -start => $lseq->start, -end => $lseq->end ); diff --git a/Bio/DB/Flat/BDB.pm b/Bio/DB/Flat/BDB.pm index c2aa7c91ea..1aa25b9e84 100644 --- a/Bio/DB/Flat/BDB.pm +++ b/Bio/DB/Flat/BDB.pm @@ -241,6 +241,7 @@ sub _index_file { my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!"); my $offset = 0; my $count = 0; + while (!eof($fh)) { my ($ids,$adjustment) = $self->parse_one_record($fh) or next; $adjustment ||= 0; # prevent uninit variable warning diff --git a/Bio/DB/Flat/BDB/fasta.pm b/Bio/DB/Flat/BDB/fasta.pm index 3e68bd6748..b7967ed7f6 100644 --- a/Bio/DB/Flat/BDB/fasta.pm +++ b/Bio/DB/Flat/BDB/fasta.pm @@ -85,7 +85,7 @@ sub parse_one_record { my $fh = shift; # fasta parses by changing $/ to '\n>', need to adjust accordingly - my $adj = ( $^O =~ /mswin/i ) ? -2 : -1; + my $adj = -1; my $parser = $self->{cached_parsers}{fileno($fh)} ||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format); diff --git a/Bio/DB/Flat/BinarySearch.pm b/Bio/DB/Flat/BinarySearch.pm index e6d8c60890..dc12cfdb77 100644 --- a/Bio/DB/Flat/BinarySearch.pm +++ b/Bio/DB/Flat/BinarySearch.pm @@ -840,6 +840,16 @@ sub _index_file { my %secondary_id; my $last_one; + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($fh); + my $curr_line = <$fh>; + my $pos_diff = tell($fh) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $fh, $init_pos, 0; # Rewind position to proceed to read the file + while (<$fh>) { $last_one = $_; $self->{alphabet} ||= $self->guess_alphabet($_); @@ -848,7 +858,7 @@ sub _index_file { $id = $new_primary_entry; $self->{alphabet} ||= $self->guess_alphabet($_); - my $tmplen = ( tell $fh ) - length($_); + my $tmplen = ( tell $fh ) - length($_) - $correction; $length = $tmplen - $pos; diff --git a/Bio/DB/IndexedBase.pm b/Bio/DB/IndexedBase.pm index 2d536dbeed..2abbe5bedc 100644 --- a/Bio/DB/IndexedBase.pm +++ b/Bio/DB/IndexedBase.pm @@ -747,9 +747,18 @@ sub _calc_termination_length { # Account for crlf-terminated Windows and Mac files my ($self, $file) = @_; my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!"); - my $line = <$fh>; + + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($fh); + my $curr_line = <$fh>; + my $pos_diff = tell($fh) - $init_pos; + my $correction = $pos_diff - length $curr_line; close $fh; - $self->{termination_length} = ($line =~ /\r\n$/) ? 2 : 1; + + $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction; return $self->{termination_length}; } @@ -1056,9 +1065,18 @@ sub NEXTKEY { sub DESTROY { my $self = shift; + + # Close filehandles + while (my ($file, $fh) = each %{ $self->{fhcache} }) { + if (defined $fh) { + $fh->close; + } + } + $self->_close_index($self->{offsets}); + if ( $self->{clean} || $self->{indexing} ) { - # Indexing aborted or cleaning requested. Delete the index file. - unlink $self->{index_name}; + # Indexing aborted or cleaning requested. Delete the index file. + unlink $self->{index_name}; } return 1; } diff --git a/Bio/DB/Qual.pm b/Bio/DB/Qual.pm index a836434214..62c87e020f 100644 --- a/Bio/DB/Qual.pm +++ b/Bio/DB/Qual.pm @@ -378,7 +378,10 @@ sub header { my $fh = $self->_fh($id) or return; seek($fh, $offset, 0); read($fh, $data, $headerlen); - chomp $data; + # On Windows chomp remove '\n' but leaves '\r' + # when reading '\r\n' in binary mode + $data =~ s/\n//g; + $data =~ s/\r//g; substr($data, 0, 1) = ''; return $data; } diff --git a/Bio/DB/SeqFeature/Store/DBI/SQLite.pm b/Bio/DB/SeqFeature/Store/DBI/SQLite.pm index 1eb5753adc..213d623d42 100644 --- a/Bio/DB/SeqFeature/Store/DBI/SQLite.pm +++ b/Bio/DB/SeqFeature/Store/DBI/SQLite.pm @@ -146,7 +146,7 @@ use base 'Bio::DB::SeqFeature::Store::DBI::mysql'; use Bio::DB::SeqFeature::Store::DBI::Iterator; use DBI qw(:sql_types); use Memoize; -use Cwd 'abs_path'; +use Cwd qw(abs_path getcwd); use Bio::DB::GFF::Util::Rearrange 'rearrange'; use Bio::SeqFeature::Lite; use File::Spec; @@ -254,6 +254,10 @@ sub init { $dbh->do("PRAGMA synchronous = OFF;"); # makes writes much faster $dbh->do("PRAGMA temp_store = MEMORY;"); # less disk I/O; some speedup $dbh->do("PRAGMA cache_size = 20000;"); # less disk I/O; some speedup + # Keep track of database file location + my $cwd = getcwd; + my ($db_file) = ($dsn =~ m/(?:db(?:name)?|database)=(.+)$/); + $self->{dbh_file} = "$cwd/$db_file"; } $self->{dbh} = $dbh; $self->{is_temp} = $is_temporary; @@ -1196,6 +1200,16 @@ sub _dump_update_location_index { print $fh join("\t",@args),"\n"; } +sub DESTROY { + my $self = shift; + # Remove filehandles, so temporal files can be properly deleted + if (%DBI::installed_drh) { + DBI->disconnect_all; + %DBI::installed_drh = (); + } + undef $self->{dbh}; +} + 1; =head1 AUTHOR diff --git a/Bio/DB/SeqFeature/Store/LoadHelper.pm b/Bio/DB/SeqFeature/Store/LoadHelper.pm index 46e8ddcd70..b852efa8ef 100644 --- a/Bio/DB/SeqFeature/Store/LoadHelper.pm +++ b/Bio/DB/SeqFeature/Store/LoadHelper.pm @@ -59,6 +59,13 @@ sub new { sub DESTROY { my $self = shift; + # Destroy all filehandle references + # before trying to delete files and folder + %DBHandles = (); + undef $self->{IndexIt}; + undef $self->{TopLevel}; + undef $self->{Local2Global}; + undef $self->{Parent2Child}; rmtree $self->{tmppath}; # File::Temp::cleanup() unless $self->{keep}; } diff --git a/Bio/DB/SeqFeature/Store/Loader.pm b/Bio/DB/SeqFeature/Store/Loader.pm index 8d7e97809e..8bbc427649 100644 --- a/Bio/DB/SeqFeature/Store/Loader.pm +++ b/Bio/DB/SeqFeature/Store/Loader.pm @@ -720,6 +720,28 @@ sub unescape { sub DESTROY { my $self = shift; + # Close filehandles, so temporal files can be properly deleted + my $store = $self->store; + if ( $store->isa('Bio::DB::SeqFeature::Store::memory') + or $store->isa('Bio::DB::SeqFeature::Store::berkeleydb3') + ) { + $store->private_fasta_file->close; + + if ($store->{fasta_db}) { + while (my ($file, $fh) = each %{ $store->{fasta_db}->{fhcache} }) { + $fh->close; + } + $store->{fasta_db}->_close_index($store->{fasta_db}->{offsets}); + } + } + elsif ($store->isa('Bio::DB::SeqFeature::Store::DBI::SQLite')) { + if (%DBI::installed_drh) { + DBI->disconnect_all; + %DBI::installed_drh = (); + } + undef $store->{dbh}; + } + if (my $ld = $self->{temp_load}) { unlink $ld; } @@ -755,5 +777,3 @@ This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut - - diff --git a/Bio/DB/SeqFeature/Store/berkeleydb.pm b/Bio/DB/SeqFeature/Store/berkeleydb.pm index c2219f5c40..53f34b8c58 100644 --- a/Bio/DB/SeqFeature/Store/berkeleydb.pm +++ b/Bio/DB/SeqFeature/Store/berkeleydb.pm @@ -621,6 +621,7 @@ sub _open_databases { return if $ignore_errors; # autoindex set, so defer this $self->throw("Couldn't tie: ".$self->_features_path . " $!"); } + if ($create) { %h = (); $h{'.next_id'} = 1; @@ -701,6 +702,7 @@ sub _close_databases { $self->db(undef); $self->dna_db(undef); $self->notes_db(undef); + $self->parentage_db(undef); $self->index_db($_=>undef) foreach $self->_index_files; } @@ -1449,6 +1451,7 @@ sub db_version { sub DESTROY { my $self = shift; $self->_close_databases(); + $self->private_fasta_file->close; rmtree($self->directory,0,1) if $self->temporary && -e $self->directory; } @@ -1575,4 +1578,3 @@ This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut - diff --git a/Bio/DB/TFBS/transfac_pro.pm b/Bio/DB/TFBS/transfac_pro.pm index a3d932980f..fbbce58b47 100644 --- a/Bio/DB/TFBS/transfac_pro.pm +++ b/Bio/DB/TFBS/transfac_pro.pm @@ -1884,4 +1884,15 @@ sub _species_to_taxid { return $ncbi_taxid; } +sub DESTROY { + my $self = shift; + # Destroy tied references to close filehandles + # and allow proper temporary files deletion + undef $self->{_tax_db}->{'_nodes'}; + undef $self->{_tax_db}->{'_id2name'}; + undef $self->{_tax_db}->{'_name2id'}; + undef $self->{_tax_db}->{'_parent2children'}; + undef $self->{_tax_db}->{'_parentbtree'}; +} + 1; diff --git a/Bio/Index/Abstract.pm b/Bio/Index/Abstract.pm index acc34d964a..ab1e5dbe20 100644 --- a/Bio/Index/Abstract.pm +++ b/Bio/Index/Abstract.pm @@ -773,8 +773,13 @@ sub add_record { =cut sub pack_record { - my( $self, @args ) = @_; - return join "\034", @args; + my( $self, @args ) = @_; + # Silence undefined warnings + @args = map { + $_ = (defined $_) ? $_ : ''; + $_ ; + } @args; + return join "\034", @args; } =head2 unpack_record diff --git a/Bio/Index/Blast.pm b/Bio/Index/Blast.pm index c857f86a24..50298d0ee0 100644 --- a/Bio/Index/Blast.pm +++ b/Bio/Index/Blast.pm @@ -218,20 +218,30 @@ sub _index_file { my $lastline = 0; my $prefix = ''; + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($BLAST); + my $curr_line = <$BLAST>; + my $pos_diff = tell($BLAST) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $BLAST, $init_pos, 0; # Rewind position to proceed to read the file + # fencepost problem: we basically just find the top and the query - while( <$BLAST> ) { + while( my $line = <$BLAST> ) { # in recent RPS-BLAST output the only delimiter between result # sections is '^Query=' - in other BLAST outputs you # can use '^(RPS-|T?)BLAST(P?|N?|X?)' - if ( /^(RPS-|T?)BLAST(P?|N?|X?)/ ) { + if ( $line =~ /^(RPS-|T?)BLAST(P?|N?|X?)/ ) { $prefix = $1; - $indexpoint = tell($BLAST) - length $_; + $indexpoint = tell($BLAST) - length($line) - $correction; } - if ( /^Query=\s*([^\n]+)$/ ) { + if ( $line =~ /^Query=\s*([^\n]+)$/ ) { - $indexpoint = tell($BLAST) - length $_ if ( $prefix eq 'RPS-' ); + $indexpoint = tell($BLAST) - length($line) - $correction if ( $prefix eq 'RPS-' ); foreach my $id ($self->id_parser()->($1)) { $self->debug("id is $id, begin is $indexpoint\n"); diff --git a/Bio/Index/BlastTable.pm b/Bio/Index/BlastTable.pm index b33043697d..42bc62b77b 100644 --- a/Bio/Index/BlastTable.pm +++ b/Bio/Index/BlastTable.pm @@ -194,18 +194,29 @@ sub _index_file { my $lastline = 0; my $last_query = ''; my $is_m9; + + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($BLAST); + my $curr_line = <$BLAST>; + my $pos_diff = tell($BLAST) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $BLAST, $init_pos, 0; # Rewind position to proceed to read the file + while( <$BLAST> ) { if (m{^#}) { $is_m9 ||= 1; if(m{^#\s+T?BLAST[PNX]}i ) { - $indexpoint = tell($BLAST) - length($_); + $indexpoint = tell($BLAST) - length($_) - $correction; } next } if (/^(?:([^\t]+)\t)(?:[^\t]+\t){7,}/) { next if $last_query eq $1; - $indexpoint = tell($BLAST) - length($_) unless $is_m9; + $indexpoint = tell($BLAST) - length($_) - $correction unless $is_m9; foreach my $id ($self->id_parser()->($1)) { $self->debug("id is $id, begin is $indexpoint\n"); $self->add_record($id, $i, $indexpoint); diff --git a/Bio/Index/EMBL.pm b/Bio/Index/EMBL.pm index cb8ea268b2..c92a44a6a0 100644 --- a/Bio/Index/EMBL.pm +++ b/Bio/Index/EMBL.pm @@ -146,6 +146,16 @@ sub _index_file { open my $EMBL, '<', $file or $self->throw("Can't open file for read : $file"); + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($EMBL); + my $curr_line = <$EMBL>; + my $pos_diff = tell($EMBL) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $EMBL, $init_pos, 0; # Rewind position to proceed to read the file + # Main indexing loop $id = undef; @accs = (); @@ -170,7 +180,7 @@ sub _index_file { $id = $1; # not sure if I like this. Assummes tell is in bytes. # we could tell before each line and save it. - $begin = tell($EMBL) - length( $_ ); + $begin = tell($EMBL) - length( $_ ) - $correction; } elsif (/^AC\s+(.*)?/) { push @accs , split (/[; ]+/, $1); @@ -204,13 +214,3 @@ sub _file_format{ 1; - - - - - - - - - - diff --git a/Bio/Index/Fasta.pm b/Bio/Index/Fasta.pm index c2261e7726..bae5c39c88 100644 --- a/Bio/Index/Fasta.pm +++ b/Bio/Index/Fasta.pm @@ -180,7 +180,15 @@ sub _index_file { open my $FASTA, '<', $file or $self->throw("Can't open file for read : $file"); - my $offset = ( $^O =~ /mswin/i ) ? 1 : 0; + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($FASTA); + my $curr_line = <$FASTA>; + my $pos_diff = tell($FASTA) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $FASTA, $init_pos, 0; # Rewind position to proceed to read the file # Main indexing loop while (<$FASTA>) { @@ -189,7 +197,7 @@ sub _index_file { # the following was fixed to allow validation - cjfields # $begin is the position of the first character after the '>' - $begin = tell($FASTA) - length( $_ ) - $offset; + $begin = tell($FASTA) - length( $_ ) - $correction; foreach my $id (&$id_parser($_)) { $self->add_record($id, $i, $begin); diff --git a/Bio/Index/Fastq.pm b/Bio/Index/Fastq.pm index 81304174a4..ba511b1ea4 100644 --- a/Bio/Index/Fastq.pm +++ b/Bio/Index/Fastq.pm @@ -157,10 +157,21 @@ sub _index_file { my $id_parser = $self->id_parser; my $c = 0; open my $FASTQ, '<', $file or $self->throw("Can't open file for read : $file"); + + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($FASTQ); + my $curr_line = <$FASTQ>; + my $pos_diff = tell($FASTQ) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $FASTQ, $init_pos, 0; # Rewind position to proceed to read the file + # Main indexing loop while (<$FASTQ>) { if (/^@/) { - my $begin = tell($FASTQ) - length( $_ ); + my $begin = tell($FASTQ) - length( $_ ) - $correction; foreach my $id (&$id_parser($_)) { $self->add_record($id, $i, $begin); $c++; diff --git a/Bio/Index/GenBank.pm b/Bio/Index/GenBank.pm index a9a606c74c..b9bedfe1b5 100644 --- a/Bio/Index/GenBank.pm +++ b/Bio/Index/GenBank.pm @@ -153,9 +153,20 @@ sub _index_file { $self->throw("Can't open file for read : $file"); my %done_ids; + + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($GENBANK); + my $curr_line = <$GENBANK>; + my $pos_diff = tell($GENBANK) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $GENBANK, $init_pos, 0; # Rewind position to proceed to read the file + while (<$GENBANK>) { if (/^LOCUS/) { - $begin = tell($GENBANK) - length($_); + $begin = tell($GENBANK) - length($_) - $correction; } for my $id (&$id_parser($_)) { next if exists $done_ids{$id}; diff --git a/Bio/Index/Qual.pm b/Bio/Index/Qual.pm index 9796af1d59..656ee63846 100755 --- a/Bio/Index/Qual.pm +++ b/Bio/Index/Qual.pm @@ -180,12 +180,20 @@ sub _index_file { open my $QUAL, '<', $file or $self->throw("Can't open file for read : $file"); + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($QUAL); + my $curr_line = <$QUAL>; + my $pos_diff = tell($QUAL) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $QUAL, $init_pos, 0; # Rewind position to proceed to read the file + # Main indexing loop while (<$QUAL>) { if (/^>/) { - # $begin is the position of the first character after the '>' - my $offset = ( $^O =~ /mswin/i ) ? 0 : 1; - my $begin = tell($QUAL) - length( $_ ) + $offset; + my $begin = tell($QUAL) - length( $_ ) + 1 - $correction; foreach my $id (&$id_parser($_)) { $self->add_record($id, $i, $begin); diff --git a/Bio/Index/Stockholm.pm b/Bio/Index/Stockholm.pm index 89907cbdba..f508e42077 100644 --- a/Bio/Index/Stockholm.pm +++ b/Bio/Index/Stockholm.pm @@ -215,10 +215,19 @@ sub _index_file { my %done_ids; + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($STOCKHOLM); + my $curr_line = <$STOCKHOLM>; + my $pos_diff = tell($STOCKHOLM) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $STOCKHOLM, $init_pos, 0; # Rewind position to proceed to read the file + while (<$STOCKHOLM>) { - if ( /^#\sSTOCKHOLM/ ) { - $begin = tell($STOCKHOLM) - length($_); + $begin = tell($STOCKHOLM) - length($_) - $correction; } for my $id ( &$id_parser($_) ) { diff --git a/Bio/Index/SwissPfam.pm b/Bio/Index/SwissPfam.pm index 12c1c91705..604461555d 100644 --- a/Bio/Index/SwissPfam.pm +++ b/Bio/Index/SwissPfam.pm @@ -138,12 +138,22 @@ sub _index_file { open my $SP, '<', $file or $self->throw("Can't open file for read : $file"); + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($SP); + my $curr_line = <$SP>; + my $pos_diff = tell($SP) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $SP, $init_pos, 0; # Rewind position to proceed to read the file + # Main indexing loop while (<$SP>) { if (/^>(\S+)\s+\|=*\|\s+(\S+)/) { $nid = $1; $nacc = $2; - my $new_begin = tell($SP) - length( $_ ); + my $new_begin = tell($SP) - length( $_ ) - $correction; $end = $new_begin - 1; if( $id ) { diff --git a/Bio/Index/Swissprot.pm b/Bio/Index/Swissprot.pm index b4be1c22de..ae113ac290 100644 --- a/Bio/Index/Swissprot.pm +++ b/Bio/Index/Swissprot.pm @@ -152,9 +152,20 @@ sub _index_file { open my $SWISSPROT,'<',$file or $self->throw("Can't read file: $file"); my %done_ids; + + # In Windows, text files have '\r\n' as line separator, but when reading in + # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", + # "length $_" will report 4 although the line is 5 bytes in length. + # We assume that all lines have the same line separator and only read current line. + my $init_pos = tell($SWISSPROT); + my $curr_line = <$SWISSPROT>; + my $pos_diff = tell($SWISSPROT) - $init_pos; + my $correction = $pos_diff - length $curr_line; + seek $SWISSPROT, $init_pos, 0; # Rewind position to proceed to read the file + while (<$SWISSPROT>) { if (/^ID\s+\S+/) { - $begin = tell($SWISSPROT) - length( $_ ); + $begin = tell($SWISSPROT) - length( $_ ) - $correction; } for my $id (&$id_parser($_)) { next if exists $done_ids{$id}; diff --git a/Bio/SearchIO/SearchResultEventBuilder.pm b/Bio/SearchIO/SearchResultEventBuilder.pm index 4682a3d26a..a78a7cd5cf 100644 --- a/Bio/SearchIO/SearchResultEventBuilder.pm +++ b/Bio/SearchIO/SearchResultEventBuilder.pm @@ -350,7 +350,15 @@ sub start_hit{ =cut sub end_hit{ - my ($self,$type,$data) = @_; + my ($self,$type,$data) = @_; + + # Skip process unless there is HSP data or Hit Significance (e.g. a bl2seq with no similarity + # gives a hit with the subject, but shows a "no hits found" message instead + # of the alignment data and don't have a significance value). + # This way, we avoid false positives + my @hsp_data = grep { /^HSP/ } keys %{$data}; + return unless (scalar @hsp_data > 0 or exists $data->{'HIT-significance'}); + my %args = map { my $v = $data->{$_}; s/HIT//; ($_ => $v); } grep { /^HIT/ } keys %{$data}; # I hate special cases, but this is here because NCBI BLAST XML diff --git a/Bio/SearchIO/blast.pm b/Bio/SearchIO/blast.pm index f490118fc5..b06df23ff0 100644 --- a/Bio/SearchIO/blast.pm +++ b/Bio/SearchIO/blast.pm @@ -258,7 +258,7 @@ BEGIN { { 'RESULT-statistics' => 'num_successful_extensions' }, 'Statistics_length_adjustment' => { 'RESULT-statistics' => 'length_adjustment' }, 'Statistics_number_of_hsps_better_than_expect_value_cutoff_without_gapping' => - { 'RESULT-statistics' => 'number_of_hsps_better_than_expect_value_cutoff_without_gapping' }, + { 'RESULT-statistics' => 'number_of_hsps_better_than_expect_value_cutoff_without_gapping' }, 'Statistics_number_of_hsps_gapped' => { 'RESULT-statistics' => 'number_of_hsps_gapped' }, 'Statistics_number_of_hsps_successfully_gapped' => { 'RESULT-statistics' => 'number_of_hsps_successfully_gapped' }, @@ -652,15 +652,15 @@ sub next_result { } ) if $self->{'_blsdb_letters'}; } - # added check for WU-BLAST -echofilter option (bug 2388) - elsif (/^>Unfiltered[+-]1$/) { - # skip all of the lines of unfiltered sequence - while($_ !~ /^Database:/) { - $self->debug("Bypassing features line: $_"); - $_ = $self->_readline; - } - $self->_pushback($_); - } + # added check for WU-BLAST -echofilter option (bug 2388) + elsif (/^>Unfiltered[+-]1$/) { + # skip all of the lines of unfiltered sequence + while($_ !~ /^Database:/) { + $self->debug("Bypassing features line: $_"); + $_ = $self->_readline; + } + $self->_pushback($_); + } elsif (/Sequences producing significant alignments:/) { $self->debug("blast.pm: Processing NCBI-BLAST descriptions\n"); $flavor = 'ncbi'; @@ -891,10 +891,10 @@ sub next_result { elsif ( ( $self->in_element('hit') || $self->in_element('hsp') ) && # paracel genewise BTK - m/Score\s*=\s*(\S+)\s*bits\s* # Bit score - (?:\((\d+)\))?, # Raw score - \s+Log\-Length\sScore\s*=\s*(\d+) # Log-Length score - /ox + m/Score\s*=\s*(\S+)\s*bits\s* # Bit score + (?:\((\d+)\))?, # Raw score + \s+Log\-Length\sScore\s*=\s*(\d+) # Log-Length score + /ox ) { $self->in_element('hsp') @@ -928,10 +928,10 @@ sub next_result { elsif ( ( $self->in_element('hit') || $self->in_element('hsp') ) && # paracel hframe BTK - m/Score\s*=\s*([^,\s]+), # Raw score - \s*Expect\s*=\s*([^,\s]+), # E-value - \s*P(?:\(\S+\))?\s*=\s*([^,\s]+) # P-value - /ox + m/Score\s*=\s*([^,\s]+), # Raw score + \s*Expect\s*=\s*([^,\s]+), # E-value + \s*P(?:\(\S+\))?\s*=\s*([^,\s]+) # P-value + /ox ) { $self->in_element('hsp') @@ -1174,29 +1174,60 @@ sub next_result { elsif ( $self->in_element('hsp') && /Frame\s*=\s*([\+\-][1-3])\s*(\/\s*([\+\-][1-3]))?/ ) { - + my $frame1 = $1 || 0; + my $frame2 = $2 || 0; # this is for bl2seq only - unless ( defined $reporttype ) { + if ( not defined $reporttype ) { $bl2seq_fix = 1; - if ( $1 && $2 ) { $reporttype = 'TBLASTX' } + if ( $frame1 && $frame2 ) { + $reporttype = 'TBLASTX' + } else { - $reporttype = 'BLASTX'; + # We can distinguish between BLASTX and TBLASTN from the report + # (and assign $frame1 properly) by using the start/end from query. + # If the report is BLASTX, the coordinates distance from query + # will be 3 times the length of the alignment shown (coordinates in nt, + # alignment in aa); if not then subject is the nucleotide sequence (TBLASTN). + # Will have to fast-forward to query alignment line and then go back. + my $fh = $self->_fh; + my $file_pos = tell $fh; + + my $a_position = ''; + my $ali_length = ''; + my $b_position = ''; + while (my $line = <$fh>) { + if ($line =~ m/^(?:Query|Sbjct):?\s+(\-?\d+)?\s*(\S+)\s+(\-?\d+)?/) { + $a_position = $1; + my $alignment = $2; + $b_position = $3; + + use Bio::LocatableSeq; + my $gap_symbols = $Bio::LocatableSeq::GAP_SYMBOLS; + $alignment =~ s/[$gap_symbols]//g; + $ali_length = length($alignment); + last; + } + } + my $coord_length = ($a_position < $b_position) ? ($b_position - $a_position + 1) + : ($a_position - $b_position + 1); + ($coord_length == ($ali_length * 3)) ? ($reporttype = 'BLASTX') : ($reporttype = 'TBLASTN'); - # we can't distinguish between BLASTX and TBLASTN straight from the report } + # Rewind filehandle to its original position to continue parsing + seek $fh, $file_pos, 0; } $self->{'_reporttype'} = $reporttype; } my ( $queryframe, $hitframe ); if ( $reporttype eq 'TBLASTX' ) { - ( $queryframe, $hitframe ) = ( $1, $2 ); + ( $queryframe, $hitframe ) = ( $frame1, $frame2 ); $hitframe =~ s/\/\s*//g; } elsif ( $reporttype eq 'TBLASTN' || $reporttype eq 'PSITBLASTN') { - ( $hitframe, $queryframe ) = ( $1, 0 ); + ( $hitframe, $queryframe ) = ( $frame1, 0 ); } elsif ( $reporttype eq 'BLASTX' || $reporttype eq 'RPS-BLAST(BLASTP)') { - ( $queryframe, $hitframe ) = ( $1, 0 ); + ( $queryframe, $hitframe ) = ( $frame1, 0 ); # though NCBI doesn't report it, this is a special BLASTX-like # RPS-BLAST; should be handled differently if ($reporttype eq 'RPS-BLAST(BLASTP)') { @@ -1307,7 +1338,7 @@ sub next_result { } elsif ( $blast eq 'wublast' ) { - # warn($_); + # warn($_); if (/E=(\S+)/) { $self->element( { @@ -1520,7 +1551,7 @@ sub next_result { } elsif ( m/^\s+Time to generate neighborhood:\s+ - (\S+\s+\S+\s+\S+)/x + (\S+\s+\S+\s+\S+)/x ) { $self->element( @@ -1539,9 +1570,9 @@ sub next_result { ); } elsif ( - m/^\s+(\S+)\s+cpu\s+time:\s+# cputype - (\S+\s+\S+\s+\S+) # cputime - \s+Elapsed:\s+(\S+)/x + m/^\s+(\S+)\s+cpu\s+time:\s+ # cputype + (\S+\s+\S+\s+\S+) # cputime + \s+Elapsed:\s+(\S+)/x ) { my $cputype = lc($1); @@ -1672,7 +1703,7 @@ sub next_result { } elsif ( m/Gap\s+Penalties:\s+Existence:\s+(\d+)\, - \s+Extension:\s+(\d+)/ox + \s+Extension:\s+(\d+)/ox ) { $self->element( @@ -1771,7 +1802,7 @@ sub next_result { } elsif ( m/frameshift\s+window\, - \s+decay\s+const:\s+(\d+)\,\s+([\.\d]+)/x + \s+decay\s+const:\s+(\d+)\,\s+([\.\d]+)/x ) { $self->element( @@ -1805,7 +1836,7 @@ sub next_result { } elsif ( m/^Number\s+of\s+successful\s+extensions:\s+ - (\S+)/ox + (\S+)/ox ) { $self->element( @@ -1817,7 +1848,7 @@ sub next_result { } elsif ( m/^Number\s+of\s+sequences\s+better\s+than\s+ - (\S+):\s+(\d+)/ox + (\S+):\s+(\d+)/ox ) { $self->element( @@ -1863,8 +1894,8 @@ sub next_result { for ( my $i = 0 ; defined($_) && $i < 3 ; $i++ ) { # $self->debug("$i: $_") if $v; if ( ( $i == 0 && /^\s+$/) || - /^\s*(?:Lambda|Minus|Plus|Score)/i ) - { + /^\s*(?:Lambda|Minus|Plus|Score)/i + ) { $self->_pushback($_) if defined $_; $self->end_element( { 'Name' => 'Hsp' } ); last; @@ -2146,7 +2177,7 @@ sub end_element { $self->{'_last_data'} = ''; # remove read data if we are at # end of an element $self->{'_result'} = $rc if ( defined $type && $type eq 'result' ); - $self->{'_seen_hsp_features'} = 0; + $self->{'_seen_hsp_features'} = 0; return $rc; } diff --git a/Bio/SearchIO/blasttable.pm b/Bio/SearchIO/blasttable.pm index 7a6ae0617a..868cd2ecbe 100644 --- a/Bio/SearchIO/blasttable.pm +++ b/Bio/SearchIO/blasttable.pm @@ -270,6 +270,11 @@ sub next_result{ 'Data' => $evalue}); } my $identical = $hsp_len - $mismatches - $gapsm; + # If $positives value is absent, try to recover it from $percent_pos, + # this is better than letting the program to assume "conserved == identical" + if (not defined $positives and defined $percent_pos) { + $positives = sprintf "%d", ($percent_pos * $hsp_len / 100); + } $self->start_element({'Name' => 'Hsp'}); $self->element({'Name' => 'Hsp_evalue', 'Data' => $evalue}); diff --git a/Bio/SeqIO/fasta.pm b/Bio/SeqIO/fasta.pm index b65652a01f..100e80c9c8 100644 --- a/Bio/SeqIO/fasta.pm +++ b/Bio/SeqIO/fasta.pm @@ -123,7 +123,9 @@ sub next_seq { local $/ = "\n>"; return unless my $entry = $self->_readline; - chomp($entry); + # Replacing chomp for s///, since chomp is not working in some cases + $entry =~ s/\n$//; + $entry =~ s/\r$//; if ($entry =~ m/\A\s*\Z/s) { # very first one return unless $entry = $self->_readline; chomp($entry); diff --git a/Bio/SeqIO/interpro.pm b/Bio/SeqIO/interpro.pm index d46c53b351..a61fe0fba2 100644 --- a/Bio/SeqIO/interpro.pm +++ b/Bio/SeqIO/interpro.pm @@ -137,8 +137,8 @@ sub next_seq { $xml_fragment .= $finishedline; last if $finishedline =~ m!!; } - - return unless $xml_fragment =~ / but not other similar elements like + return unless $xml_fragment =~ /]/; $self->_parse_xml($xml_fragment); @@ -229,7 +229,8 @@ sub _initialize { my $line = undef; # fast forward to first record. while($line = $self->_readline()){ - if($line =~ / but not other similar elements like + if($line =~ /]/){ $self->_pushback($line); last; } diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index f0af533af5..dd075e6333 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -1,3 +1,12 @@ +package Bio::SimpleAlign; +use strict; +use warnings; +use Bio::LocatableSeq; # uses Seq's as list +use Bio::Seq; +use Bio::SeqFeature::Generic; + +use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI); + # BioPerl module for SimpleAlign # # Please direct questions and support issues to @@ -145,51 +154,16 @@ methods. Internal methods are usually preceded with a _ =cut -# 'Let the code begin... - -package Bio::SimpleAlign; -use vars qw(%CONSERVATION_GROUPS); -use strict; - -use Bio::LocatableSeq; # uses Seq's as list +## This data should probably be in a more centralized module... +## it is taken from Clustalw documentation. +## These are all the positively scoring groups that occur in the +## Gonnet Pam250 matrix. The strong and weak groups are +## defined as strong score >0.5 and weak score =<0.5 respectively. +our %CONSERVATION_GROUPS = ( + 'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )], + 'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)], +); -use Bio::Seq; -use Bio::SeqFeature::Generic; - -BEGIN { - # This data should probably be in a more centralized module... - # it is taken from Clustalw documentation. - # These are all the positively scoring groups that occur in the - # Gonnet Pam250 matrix. The strong and weak groups are - # defined as strong score >0.5 and weak score =<0.5 respectively. - - %CONSERVATION_GROUPS = ( - 'strong' => [ qw( - STA - NEQK - NHQK - NDEQ - QHRK - MILV - MILF - HY - FYW )], - 'weak' => [ qw( - CSA - ATV - SAG - STNK - STPA - SGND - SNDEQK - NDEQHK - NEQHRK - FVLIM - HFY )],); -} - -use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI - Bio::FeatureHolderI); =head2 new @@ -1140,11 +1114,17 @@ sub slice { my $slice_seq = $seq->subseq($start, $seq_end); $new_seq->seq( $slice_seq ); - $slice_seq =~ s/\W//g; + # Allowed extra characters in string + my $allowed_chars = ''; + if (exists $self->{_mask_char}) { + $allowed_chars = $self->{_mask_char}; + $allowed_chars = quotemeta $allowed_chars; + } + $slice_seq =~ s/[^\w$allowed_chars]//g; - if ($start > 1) { + if ($start > 1) { my $pre_start_seq = $seq->subseq(1, $start - 1); - $pre_start_seq =~ s/\W//g; + $pre_start_seq =~ s/[^\w$allowed_chars]//g; if (!defined($seq->strand)) { $new_seq->start( $seq->start + CORE::length($pre_start_seq) ); } elsif ($seq->strand < 0){ @@ -3286,6 +3266,8 @@ sub mask_columns { $new_seq->seq($new_dna_string); $aln->add_seq($new_seq); } + # Preserve chosen mask character, it may be need later (like in 'slice') + $aln->{_mask_char} = $mask_char; return $aln; } diff --git a/Bio/Tools/Alignment/Consed.pm b/Bio/Tools/Alignment/Consed.pm index 247c25b0d2..f7a390a72a 100644 --- a/Bio/Tools/Alignment/Consed.pm +++ b/Bio/Tools/Alignment/Consed.pm @@ -148,7 +148,8 @@ sub new { # this is special to UNIX and should probably use catfile : DONE! # if (!($self->{'filename'} =~ m{/})) { # $self->{'filename'} = "./".$self->{'filename'}; -# } +# } +# $self->{'filename'} =~ s#\\#\/#g if $^O =~ m/mswin/i; # $self->{'filename'} =~ m/(.*\/)(.*)ace.*$/; # $self->{'path'} = $1; @@ -977,6 +978,7 @@ sub set_singlets { $self->debug("Bio::Tools::Alignment::Consed Adding singlets to the contig hash...\n"); my $full_filename = $self->{'filename'}; $self->debug("Bio::Tools::Alignment::Consed::set_singlets: \$full_filename is $full_filename\n"); + $full_filename =~ s#\\#\/#g if $^O =~ m/mswin/i; $full_filename =~ m/(.*\/)(.*ace.*)$/; my ($base_path,$filename) = ($1,$2); $self->debug("Bio::Tools::Alignment::Consed::set_singlets: singlets filename is $filename and \$base_path is $base_path\n"); @@ -1105,6 +1107,7 @@ sub set_quality_by_name { sub set_singlet_quality { my $self = shift; my $full_filename = $self->{'filename'}; + $full_filename =~ s#\\#\/#g if $^O =~ m/mswin/i; $full_filename =~ m/(.*\/)(.*)ace.*$/; my ($base_path,$filename) = ($1,"$2"."qual"); my $singletsfile = $base_path.$filename; @@ -1156,6 +1159,7 @@ sub set_contig_quality { my $full_filename = $self->{'filename'}; # Run_SRC3700_2000-08-01_73+74.fasta.screen.contigs.qual # from Consed.pm + $full_filename =~ s#\\#\/#g if $^O =~ m/mswin/i; $full_filename =~ m/(.*\/)(.*)ace.*$/; my ($base_path,$filename) = ($1,"$2"."contigs.qual"); my $singletsfile = $base_path.$filename; @@ -1846,4 +1850,3 @@ sub show_missing_sequence() { 1; - diff --git a/Build.PL b/Build.PL index 56fa3dfb13..c364cc6bf8 100644 --- a/Build.PL +++ b/Build.PL @@ -176,7 +176,7 @@ my $build = Bio::Root::Build->new( dist_author => 'BioPerl Team ', dist_abstract => 'Bioinformatics Toolkit', license => 'perl', - no_index => {'dir' => [qw(examples/root/lib)]}, + no_index => {'x_dir' => [qw(examples/root/lib)]}, requires => { 'perl' => '5.6.1', 'IO::String' => 0, # why is this required? @@ -256,22 +256,22 @@ my $build = Bio::Root::Build->new( my $accept = $build->args('accept'); # how much do I hate this? Let me count the ways..... -if (!$build->feature('EntrezGene')) { - warn <feature('EntrezGene')) { +# warn <feature('Bio::DB::GFF') || $build->feature('MySQL Tests') || diff --git a/Changes b/Changes index 27ae050052..1b69013c7f 100644 --- a/Changes +++ b/Changes @@ -17,6 +17,16 @@ Bugs and requested features list: CPAN releases are branched from 'master'. --------------------------------------------------------- +1.6.923 + + * Major Windows support updates! [fjossandon] + * MAKER update to allow for stricter standard codon table [cjfields] + * Better support for circular sequences [fjossandon] + * Fixes for some complex location types [fjossandon] + * Address CONTIG bug in GenBank format, bug #3448 [cjfields] + * Fix bug #2978 related to BLAST report type [fjossandon] + * Deobfuscator fixes [DaveMessina] + 1.6.922 * Address CPAN test failures [cjfields] diff --git a/t/Align/SimpleAlign.t b/t/Align/SimpleAlign.t index edd4fe557e..fc336d7cc7 100644 --- a/t/Align/SimpleAlign.t +++ b/t/Align/SimpleAlign.t @@ -426,11 +426,9 @@ my @slice_lens = qw(1 1 2 2); for my $feature ( $aln->get_SeqFeatures ) { for my $loc ( $feature->location->each_Location ) { my $masked = $aln->mask_columns( $loc->start, $loc->end, '?'); - TODO: { - local $TODO = "This should pass but dies; see bug 2842"; - $masked->verbose(2); - lives_ok {my $fslice = $masked->slice( $loc->start, $loc->end )}; - } + $masked->verbose(2); + lives_ok {my $fslice = $masked->slice( $loc->start, $loc->end )}; + $masked->verbose(-1); my $fslice = $masked->slice( $loc->start, $loc->end ); is( $fslice->length, $slice_lens[ $i++ ], "slice $i len" ); diff --git a/t/AlignIO/bl2seq.t b/t/AlignIO/bl2seq.t index b4eb3e6f24..d5cbb9753a 100644 --- a/t/AlignIO/bl2seq.t +++ b/t/AlignIO/bl2seq.t @@ -4,12 +4,12 @@ use strict; BEGIN { - use lib '.'; + use lib '.'; use Bio::Root::Test; - test_begin(-tests => 3); - - use_ok('Bio::AlignIO::bl2seq'); + test_begin(-tests => 7); + + use_ok('Bio::AlignIO::bl2seq'); } my $DEBUG = test_debug(); @@ -17,11 +17,20 @@ my $DEBUG = test_debug(); my ($str,$aln,$strout,$status); # BL2SEQ -$str = Bio::AlignIO->new( - '-file' => test_input_file("bl2seq.out"), - '-format' => 'bl2seq', - '-report_type' => 'blastp'); +$str = Bio::AlignIO->new(-file => test_input_file("bl2seq.out"), + -format => 'bl2seq', + -report_type => 'blastp'); $aln = $str->next_aln(); isa_ok($aln,'Bio::Align::AlignI'); -is $aln->get_seq_by_pos(2)->get_nse, 'ALEU_HORVU/60-360', - "BLAST bl2seq format test"; \ No newline at end of file +is $aln->get_seq_by_pos(2)->get_nse, 'ALEU_HORVU/60-360', "BLAST bl2seq format test"; + +# Bug 2978, test report_type guessing for TBLASTN and correct Frame assignment to HitFrame +$str = Bio::AlignIO->new(-file => test_input_file("bl2seq.tblastn.out"), + -format => 'bl2seq'); +$aln = $str->next_aln(); +isa_ok($aln,'Bio::Align::AlignI'); +foreach my $seq ( $aln->each_seq_with_id('WAN03UHTX_1') ) { + is $seq->start(), 946; + is $seq->end(), 990; + is $seq->strand(), -1; +} diff --git a/t/LocalDB/BioDBGFF.t b/t/LocalDB/BioDBGFF.t index f6175e3d2d..8a5e7e1577 100644 --- a/t/LocalDB/BioDBGFF.t +++ b/t/LocalDB/BioDBGFF.t @@ -9,8 +9,7 @@ BEGIN { use lib '.'; use Bio::Root::Test; - test_begin(-tests => 275, - -excludes_os => 'mswin'); + test_begin(-tests => 275); use_ok('Bio::DB::GFF'); } diff --git a/t/LocalDB/Fasta.t b/t/LocalDB/Fasta.t index 133a91c725..9b409a0944 100644 --- a/t/LocalDB/Fasta.t +++ b/t/LocalDB/Fasta.t @@ -172,6 +172,9 @@ my $test_files = [ is $db3->file('AW057336'), '3.fa'; is $db1->file('AW057231'), '1.fa'; is $db4->file('AW057410'), '3.fa'; + unlink $db1->index_name; + unlink $db2->index_name; + unlink $db3->index_name; } diff --git a/t/LocalDB/Index/Index.t b/t/LocalDB/Index/Index.t index 3f8a416a9e..64f4ea9d81 100644 --- a/t/LocalDB/Index/Index.t +++ b/t/LocalDB/Index/Index.t @@ -24,8 +24,8 @@ BEGIN { } my $ind = Bio::Index::Fasta->new(-filename => 'Wibbl', - -write_flag => 1, - -verbose => 0); + -write_flag => 1, + -verbose => 0); $ind->make_index(test_input_file('multifa.seq')); $ind->make_index(test_input_file('seqs.fas')); @@ -47,15 +47,15 @@ $seq = $stream->next_seq; isa_ok $seq, 'Bio::PrimarySeqI'; $ind = Bio::Index::Fasta->new(-filename => 'multifa_index', - -write_flag => 1, - -verbose => 0); + -write_flag => 1, + -verbose => 0); $ind->make_index(test_input_file('multifa.seq.qual')); ok ( -e "multifa_index" ); $ind = Bio::Index::Qual->new(-filename => 'multifa_qual_index', - -write_flag => 1, - -verbose => 0); + -write_flag => 1, + -verbose => 0); $ind->make_index(test_input_file('multifa.seq.qual')); ok ( -e "multifa_qual_index" ); @@ -75,19 +75,19 @@ $seq = $ind->fetch('NONEXISTENT_SEQ'); ok(! defined $seq); $ind = Bio::Index::SwissPfam->new(-filename => 'Wibbl2', - -write_flag =>1); + -write_flag =>1); $ind->make_index(test_input_file('swisspfam.data')); ok ( -e "Wibbl2" || -e "Wibbl2.pag" ); $ind = Bio::Index::EMBL->new(-filename => 'Wibbl3', - -write_flag =>1); + -write_flag =>1); $ind->make_index(test_input_file('test.embl')); ok ( -e "Wibbl3" || -e "Wibbl3.pag" ); is ($ind->fetch('AL031232')->length, 4870); $ind = Bio::Index::Swissprot->new(-filename => 'Wibbl4', - -write_flag => 1); + -write_flag => 1); $ind->make_index(test_input_file('roa1.swiss')); ok ( -e "Wibbl4" || -e "Wibbl4.pag" ); $seq = $ind->fetch('ROA1_HUMAN'); @@ -97,7 +97,7 @@ is ($seq->display_id(), 'ROA1_HUMAN'); # test id_parser $ind = Bio::Index::Swissprot->new(-filename => 'Wibbl4', - -write_flag => 1); + -write_flag => 1); $ind->id_parser(\&get_id); $ind->make_index(test_input_file('roa1.swiss')); ok ( -e "Wibbl4" || -e "Wibbl4.pag" ); @@ -106,8 +106,8 @@ is ($seq->length,371); my $gb_ind = Bio::Index::GenBank->new(-filename => 'Wibbl5', - -write_flag =>1, - -verbose => 0); + -write_flag =>1, + -verbose => 0); $gb_ind->make_index(test_input_file('roa1.genbank')); ok ( -e "Wibbl5" || -e "Wibbl5.pag" ); $seq = $gb_ind->fetch('AI129902'); @@ -124,8 +124,8 @@ SKIP: { test_skip(-tests => 22, -requires_module => 'Bio::DB::FileCache'); $cache = Bio::DB::FileCache->new(-seqdb => $gb_ind, - -keep => 1, - -file => 'filecache.idx'); + -keep => 1, + -file => 'filecache.idx'); # problem: my $seq = $cache->get_Seq_by_id('AI129902'); ok ( $seq); @@ -147,8 +147,8 @@ SKIP: { $cache = undef; $cache = Bio::DB::FileCache->new(-seqdb => $gb_ind, - -keep => 0, - -file => 'filecache.idx'); + -keep => 0, + -file => 'filecache.idx'); $seq = $cache->get_Seq_by_id('AI129902'); ok ( $seq); is ( $seq->length, 37); @@ -170,8 +170,8 @@ SKIP: { # test id_parser $gb_ind = Bio::Index::GenBank->new(-filename => 'Wibbl5', - -write_flag =>1, - -verbose => 0); + -write_flag =>1, + -verbose => 0); $gb_ind->id_parser(\&get_id); $gb_ind->make_index(test_input_file('roa1.genbank')); ok ( -e "Wibbl5" || -e "Wibbl5.pag" ); @@ -180,8 +180,8 @@ is ($seq->length,141); # test Stockholm my $st_ind = Bio::Index::Stockholm->new(-filename => 'Wibbl6', - -write_flag => 1, - -verbose => 0); + -write_flag => 1, + -verbose => 0); isa_ok $st_ind, 'Bio::Index::Stockholm'; $st_ind->make_index(test_input_file('testaln.stockholm')); ok ( -e "Wibbl6" ); @@ -192,20 +192,20 @@ isa_ok($aln,'Bio::SimpleAlign'); sub get_id { - my $line = shift; - return $1 if ($line =~ /product="([^"]+)"/); - return $1 if ($line =~ /^DR\s+EMBL;\s+([^;]+)/); + my $line = shift; + return $1 if ($line =~ /product="([^"]+)"/); + return $1 if ($line =~ /^DR\s+EMBL;\s+([^;]+)/); } END { - cleanup(); + cleanup(); } sub cleanup { - for my $root ( qw( Wibbl Wibbl2 Wibbl3 Wibbl4 Wibbl5 Wibbl6 + for my $root ( qw( Wibbl Wibbl2 Wibbl3 Wibbl4 Wibbl5 Wibbl6 multifa_index multifa_qual_index ) ) { - unlink $root if( -e $root ); - unlink "$root.pag" if( -e "$root.pag"); - unlink "$root.dir" if( -e "$root.dir"); - } + unlink $root if( -e $root ); + unlink "$root.pag" if( -e "$root.pag"); + unlink "$root.dir" if( -e "$root.dir"); + } } diff --git a/t/LocalDB/SeqFeature.t b/t/LocalDB/SeqFeature.t index bd4e90f201..76f88890ab 100644 --- a/t/LocalDB/SeqFeature.t +++ b/t/LocalDB/SeqFeature.t @@ -319,9 +319,10 @@ is(scalar @results,2,'keyword search; 2 terms'); my $fasta_dir = make_fasta_testdir(); my $dbfa = Bio::DB::Fasta->new($fasta_dir, -reindex => 1); ok($dbfa); + ok(my $contig1=$dbfa->seq('Contig1')); -$db = Bio::DB::SeqFeature::Store->new(@args,-fasta=>$dbfa); +$db = Bio::DB::SeqFeature::Store->new(@args,-fasta=>$dbfa); $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store=>$db); ok($loader->load($gff_file)); @@ -334,6 +335,18 @@ ok(my $contig2 = $dbfa->seq('Contig2')); my $length = $f->length; ok(substr($contig2,0,$length) eq $f->dna); +# DESTROY for $dbfa sometimes is not being called at script end, +# so call it explicitly to close temporal filehandles +# and allow their deletion +$dbfa->DESTROY; + +# Remove temporal database file used for SQLite tests +if ($db->isa('Bio::DB::SeqFeature::Store::DBI::SQLite')) { + $db->DESTROY; + unlink $db->{dbh_file}; +} + + # testing namespaces for mysql and Pg adaptor SKIP: { diff --git a/t/RemoteDB/EntrezGene.t b/t/RemoteDB/EntrezGene.t index 61a7c51622..eaffa04760 100755 --- a/t/RemoteDB/EntrezGene.t +++ b/t/RemoteDB/EntrezGene.t @@ -38,4 +38,3 @@ SKIP: { is $seq->display_id, "RP"; is $seq->accession_number, 6099; } - diff --git a/t/RemoteDB/MeSH.t b/t/RemoteDB/MeSH.t index 959d45a176..04eb537db1 100755 --- a/t/RemoteDB/MeSH.t +++ b/t/RemoteDB/MeSH.t @@ -30,4 +30,3 @@ SKIP: { is $t->description, "Thrombus formation in an intracranial venous sinus, including the superior sagittal, cavernous, lateral, and petrous sinuses. Etiologies include thrombosis due to infection, DEHYDRATION, coagulation disorders (see THROMBOPHILIA), and CRANIOCEREBRAL TRAUMA."; is $t->id, "D012851"; } - diff --git a/t/RemoteDB/Query/GenBank.t b/t/RemoteDB/Query/GenBank.t index cbd11f94e3..35bf2da355 100755 --- a/t/RemoteDB/Query/GenBank.t +++ b/t/RemoteDB/Query/GenBank.t @@ -82,4 +82,3 @@ $seq = $seqio = undef; $query = Bio::DB::Query::GenBank->new('-query' => 'AF303112', '-ids' => [qw(J00522 AF303112 2981014)]); is $query->query, 'J00522[PACC]|AF303112[PACC]|2981014[UID]'; - diff --git a/t/Root/HTTPget.t b/t/Root/HTTPget.t index 51c8cb9599..b6683edfa1 100644 --- a/t/Root/HTTPget.t +++ b/t/Root/HTTPget.t @@ -102,4 +102,3 @@ $newobj->proxy('http', $TEST_PROXY); $newobj->authentication(@TEST_AUTHENTICATION); is ($newobj->proxy(), $TEST_PROXY); is_deeply([$newobj->authentication], \@TEST_AUTHENTICATION); - diff --git a/t/SeqIO/interpro.t b/t/SeqIO/interpro.t index 9d194cbb90..d4ba5dbe21 100644 --- a/t/SeqIO/interpro.t +++ b/t/SeqIO/interpro.t @@ -8,7 +8,7 @@ BEGIN { use Bio::Root::Test; test_begin(-tests => 20, - -requires_module => 'XML::DOM::XPath'); + -requires_module => 'XML::DOM::XPath'); use_ok('Bio::SeqIO::interpro'); } @@ -16,9 +16,9 @@ BEGIN { my $verbose = test_debug(); my $t_file = test_input_file('test.interpro'); -my $a_in = Bio::SeqIO->new( -file => $t_file, - -verbose => $verbose, - -format => 'interpro'); +my $a_in = Bio::SeqIO->new( -file => $t_file, + -verbose => $verbose, + -format => 'interpro'); isa_ok($a_in, 'Bio::SeqIO'); my $seq = $a_in->next_seq(); @@ -38,9 +38,9 @@ ok(!($seq = $a_in->next_seq()), 'there is no next_seq (correctly)'); # Bug 1908 (enhancement) $t_file = test_input_file('interpro_ebi.xml'); -my $b_in = Bio::SeqIO->new( -file => $t_file, - -verbose => $verbose, - -format => 'interpro'); +my $b_in = Bio::SeqIO->new( -file => $t_file, + -verbose => $verbose, + -format => 'interpro'); $seq = $b_in->next_seq(); ok($seq, 'bug 1908'); @@ -56,13 +56,12 @@ is($dblinks[1]->primary_id, 'IPR009366', 'first primary_id'); is($dblinks[2]->primary_id, 'PF06257.1', 'second primary_id'); my $other_t_file = test_input_file('test.interpro-go.xml'); -my $ipr_in = Bio::SeqIO->new( -file => $other_t_file, +my $ipr_in = Bio::SeqIO->new( -file => $other_t_file, -verbose => $verbose, - -format => 'interpro'); + -format => 'interpro'); $seq = $ipr_in->next_seq(); @features = $seq->get_SeqFeatures; @dblinks = $features[0]->annotation->get_Annotations('dblink'); is(scalar @dblinks, 4, 'right number of dblinks'); is($dblinks[3]->primary_id, 'GO:0003677', 'primary_id via dblinks'); - diff --git a/t/Tree/TreeIO/nhx.t b/t/Tree/TreeIO/nhx.t index 09bc7235c8..9be974b9ea 100644 --- a/t/Tree/TreeIO/nhx.t +++ b/t/Tree/TreeIO/nhx.t @@ -86,6 +86,7 @@ sub read_file { $string = ; close IN; $string =~ s/\n//g; + $string =~ s/\r//g; # For files with Windows line-endings #print STDERR "STR: $string\n"; return $string; } diff --git a/t/data/bl2seq.tblastn.out b/t/data/bl2seq.tblastn.out new file mode 100644 index 0000000000..3238f1c8a1 --- /dev/null +++ b/t/data/bl2seq.tblastn.out @@ -0,0 +1,90 @@ +Query= + (15 letters) + +>WAN03UHTX_1 pSMED2_VHP_GVHS-001 Homo sapiens + Length = 2367 + + Score = 42.4 bits (98), Expect = 4e-09 + Identities = 15/15 (100%), Positives = 15/15 (100%) + Frame = -1 + +Query: 1 EPKSCDKTHTCPPCP 15 + EPKSCDKTHTCPPCP +Sbjct: 990 EPKSCDKTHTCPPCP 946 + + + + Score = 19.6 bits (39), Expect = 0.026 + Identities = 5/9 (55%), Positives = 6/9 (66%) + Frame = -1 + +Query: 7 KTHTCPPCP 15 + + H C PCP +Sbjct: 390 ENHRCTPCP 364 + + + + Score = 16.9 bits (32), Expect = 0.17 + Identities = 5/10 (50%), Positives = 6/10 (60%) + Frame = +2 + +Query: 5 CDKTHTCPPC 14 + C + TC PC +Sbjct: 542 CWRPCTCTPC 571 + + + + Score = 16.5 bits (31), Expect = 0.22 + Identities = 6/12 (50%), Positives = 6/12 (50%) + Frame = +3 + +Query: 2 PKSCDKTHTCPP 13 + P SC T C P +Sbjct: 2154 PDSCSCT*ECTP 2189 + + + + Score = 15.8 bits (29), Expect = 0.37 + Identities = 5/12 (41%), Positives = 5/12 (41%) + Frame = +1 + +Query: 4 SCDKTHTCPPCP 15 + SC C P P +Sbjct: 250 SCSPAAHCSPTP 285 + + +Lambda K H + 0.316 0.138 0.541 + +Gapped +Lambda K H + 0.267 0.0410 0.140 + + +Matrix: BLOSUM62 +Gap Penalties: Existence: 11, Extension: 1 +Number of Hits to DB: 303 +Number of Sequences: 0 +Number of extensions: 5 +Number of successful extensions: 5 +Number of sequences better than 10.0: 2 +Number of HSP's better than 10.0 without gapping: 4 +Number of HSP's successfully gapped in prelim test: 0 +Number of HSP's that attempted gapping in prelim test: 0 +Number of HSP's gapped (non-prelim): 5 +length of query: 15 +length of database: 789 +effective HSP length: 0 +effective length of query: 24 +effective length of database: 789 +effective search space: 18936 +effective search space used: 18936 +frameshift window, decay const: 50, 0.1 +T: 13 +A: 40 +X1: 16 ( 7.3 bits) +X2: 38 (14.6 bits) +X3: 64 (24.7 bits) +S1: 17 (10.6 bits) +S2: 17 (11.2 bits) +