Merge branch 'master' into topic/cjfields_psl_fixes

bioperl · Dec 18, 2013 · a458d35 · a458d35
2 parents 0cfee80 + ef72a65
commit a458d35
Show file tree

Hide file tree

Showing 46 changed files with 610 additions and 236 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -21,7 +21,7 @@ install:
     - "cpanm XML::SAX XML::SAX::Writer XML::Simple XML::LibXML XML::Twig XML::Writer 2>&1 | tail -n 1"
     - "cpanm PostScript::TextBlock Set::Scalar Sort::Naturally YAML | tail -n 1"
     - "cpanm Math::Random SOAP::Lite Spreadsheet::ParseExcel | tail -n 1"
-    #- "cpanm Bio::ASN1::EntrezGene | tail -n 1"
+    - "cpanm Bio::ASN1::EntrezGene | tail -n 1"
     - "cpanm Bio::Phylo | tail -n 1"
     #for some reason tests and deps aren't skipped here.  Will have to look into it more...
     #git repos, seems to only work for simple checkouts, so pure perl only (TODO: look into before_script for more detail)
@@ -34,7 +34,12 @@ script:
 
 #TODO - send emails to bioperl-guts-l
 notifications:
-  email: false
+  email: 
+    recipients:
+      - bioperl-guts-l@lists.open-bio.org
+      - cjfields1@gmail.com
+    on_success: change
+    on_failure: change
 
 # whitelist branches
 branches:

diff --git a/Bio/Align/Utilities.pm b/Bio/Align/Utilities.pm
@@ -1,3 +1,20 @@
+package Bio::Align::Utilities;
+use strict;
+use warnings;
+use Carp;
+use Bio::Root::Version;
+
+use Exporter 'import';
+our @EXPORT_OK = qw(
+  aa_to_dna_aln
+  bootstrap_replicates
+  cat
+  bootstrap_replicates_codons
+  dna_to_aa_aln
+  most_common_sequences
+);
+our %EXPORT_TAGS = (all => \@EXPORT_OK);
+
 #
 # BioPerl module for Bio::Align::Utilities
 #
@@ -87,28 +104,9 @@ Internal methods are usually preceded with a _
 
 =cut
 
-#' keep my emacs happy
-# Let the code begin...
-
-package Bio::Align::Utilities;
-use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS );
-use strict;
-use Carp;
-use Bio::Root::Version;
-require Exporter;
-
-use base qw(Exporter);
-
-@EXPORT = qw();
-@EXPORT_OK =
-  qw(aa_to_dna_aln bootstrap_replicates cat bootstrap_replicates_codons dna_to_aa_aln);
-%EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
-
-BEGIN {
-    use constant CODONSIZE => 3;
-    $GAP       = '-';
-    $CODONGAP  = $GAP x CODONSIZE;
-}
+use constant CODONSIZE => 3;
+our $GAP       = '-';
+our $CODONGAP  = $GAP x CODONSIZE;
 
 =head2 aa_to_dna_aln
 
@@ -503,4 +501,41 @@ sub cat {
     return $aln;
 }
 
+
+=head2 most_common_sequences
+
+ Title     : most_common_sequences
+ Usage     : @common = most_common_sequences ($align, $case_sensitivity)
+ Function  : Returns an array of the sequences that appear most often in the
+             alignment (although this probably makes more sense when there is
+             only a single most common sequence).  Sequences are compared after
+             removing any "-" (gap characters), and ambiguous units (e.g., R
+             for purines) are only compared to themselves.  The returned
+             sequence is also missing the "-" since they don't actually make
+             part of the sequence.
+ Returns   : Array of text strings.
+ Arguments : Optional argument defining whether the comparison between sequences
+             to find the most common should be case sensitive. Defaults to
+             false, i.e, not case sensitive.
+
+=cut
+
+sub most_common_sequences {
+    my $align = shift
+      or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences");
+    my $case_sensitive = shift; # defaults to false (we get undef if nothing)
+
+    ## We keep track of the max on this loop. Saves us having to
+    ## transverse the hash table later to find the maximum value.
+    my $max = 0;
+    my %counts;
+    foreach ($align->each_seq) {
+        (my $seq = $_->seq) =~ tr/-//d;
+        $seq = uc ($seq) unless $case_sensitive;
+        $max++ if (++$counts{$seq} > $max);
+    }
+    my @common = grep ($counts{$_} == $max, keys %counts);
+    return @common;
+}
+
 1;
diff --git a/Bio/Assembly/Singlet.pm b/Bio/Assembly/Singlet.pm
@@ -157,6 +157,10 @@ sub _seq_to_singlet {
         -start  => 1,
         #-end   => we let Bio::LocatableSeq calculate it (Seq and LocatableSeq)
     );
+    # Get End from $seq if $lseq can't figure it out (e.g. phrap output)
+    if (not defined $lseq->end) {
+        $lseq->end($seq->end);
+    }
     # Add new sequence and its coordinates to the contig
     my $lcoord = Bio::SeqFeature::Generic->new( -start => $lseq->start,
                                                 -end   => $lseq->end    );

diff --git a/Bio/DB/Flat/BDB.pm b/Bio/DB/Flat/BDB.pm
@@ -241,6 +241,7 @@ sub _index_file {
   my $fh     = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!");
   my $offset = 0;
   my $count  = 0;
+
   while (!eof($fh)) {
     my ($ids,$adjustment)  = $self->parse_one_record($fh) or next;
     $adjustment ||= 0;  # prevent uninit variable warning

diff --git a/Bio/DB/Flat/BDB/fasta.pm b/Bio/DB/Flat/BDB/fasta.pm
@@ -85,7 +85,7 @@ sub parse_one_record {
     my $fh    = shift;
 
     # fasta parses by changing $/ to '\n>', need to adjust accordingly
-    my $adj = ( $^O =~ /mswin/i ) ? -2 : -1; 
+    my $adj = -1;
     my $parser =
       $self->{cached_parsers}{fileno($fh)}
         ||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format);

diff --git a/Bio/DB/Flat/BinarySearch.pm b/Bio/DB/Flat/BinarySearch.pm
@@ -840,6 +840,16 @@ sub _index_file {
     my %secondary_id;
     my $last_one;
 
+    # In Windows, text files have '\r\n' as line separator, but when reading in
+    # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
+    # "length $_" will report 4 although the line is 5 bytes in length.
+    # We assume that all lines have the same line separator and only read current line.
+    my $init_pos   = tell($fh);
+    my $curr_line  = <$fh>;
+    my $pos_diff   = tell($fh) - $init_pos;
+    my $correction = $pos_diff - length $curr_line;
+    seek $fh, $init_pos, 0; # Rewind position to proceed to read the file
+
     while (<$fh>) {
         $last_one = $_;
         $self->{alphabet} ||= $self->guess_alphabet($_);
@@ -848,7 +858,7 @@ sub _index_file {
                 $id = $new_primary_entry;
                 $self->{alphabet} ||= $self->guess_alphabet($_);
 
-                my $tmplen = ( tell $fh ) - length($_);
+                my $tmplen = ( tell $fh ) - length($_) - $correction;
 
                 $length = $tmplen - $pos;
 

diff --git a/Bio/DB/IndexedBase.pm b/Bio/DB/IndexedBase.pm
@@ -747,9 +747,18 @@ sub _calc_termination_length {
     # Account for crlf-terminated Windows and Mac files
     my ($self, $file) = @_;
     my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
-    my $line = <$fh>;
+
+    # In Windows, text files have '\r\n' as line separator, but when reading in
+    # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
+    # "length $_" will report 4 although the line is 5 bytes in length.
+    # We assume that all lines have the same line separator and only read current line.
+    my $init_pos   = tell($fh);
+    my $curr_line  = <$fh>;
+    my $pos_diff   = tell($fh) - $init_pos;
+    my $correction = $pos_diff - length $curr_line;
     close $fh;
-    $self->{termination_length} = ($line =~ /\r\n$/) ? 2 : 1;
+
+    $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
     return $self->{termination_length};
 }
 
@@ -1056,9 +1065,18 @@ sub NEXTKEY {
 
 sub DESTROY {
     my $self = shift;
+
+    # Close filehandles
+    while (my ($file, $fh) = each %{ $self->{fhcache} }) {
+        if (defined $fh) {
+            $fh->close;
+        }
+    }
+    $self->_close_index($self->{offsets});
+
     if ( $self->{clean} || $self->{indexing} ) {
-      # Indexing aborted or cleaning requested. Delete the index file.
-      unlink $self->{index_name};
+        # Indexing aborted or cleaning requested. Delete the index file.
+        unlink $self->{index_name};
     }
     return 1;
 }

diff --git a/Bio/DB/Qual.pm b/Bio/DB/Qual.pm
@@ -378,7 +378,10 @@ sub header {
     my $fh = $self->_fh($id) or return;
     seek($fh, $offset, 0);
     read($fh, $data, $headerlen);
-    chomp $data;
+    # On Windows chomp remove '\n' but leaves '\r'
+    # when reading '\r\n' in binary mode
+    $data =~ s/\n//g;
+    $data =~ s/\r//g;
     substr($data, 0, 1) = '';
     return $data;
 }

diff --git a/Bio/DB/SeqFeature/Store/DBI/SQLite.pm b/Bio/DB/SeqFeature/Store/DBI/SQLite.pm
@@ -146,7 +146,7 @@ use base 'Bio::DB::SeqFeature::Store::DBI::mysql';
 use Bio::DB::SeqFeature::Store::DBI::Iterator;
 use DBI qw(:sql_types);
 use Memoize;
-use Cwd 'abs_path';
+use Cwd qw(abs_path getcwd);
 use Bio::DB::GFF::Util::Rearrange 'rearrange';
 use Bio::SeqFeature::Lite;
 use File::Spec;
@@ -254,6 +254,10 @@ sub init {
     $dbh->do("PRAGMA synchronous = OFF;"); # makes writes much faster
     $dbh->do("PRAGMA temp_store = MEMORY;"); # less disk I/O; some speedup
     $dbh->do("PRAGMA cache_size = 20000;"); # less disk I/O; some speedup
+    # Keep track of database file location
+    my $cwd = getcwd;
+    my ($db_file) = ($dsn =~ m/(?:db(?:name)?|database)=(.+)$/);
+    $self->{dbh_file} = "$cwd/$db_file";
   }
   $self->{dbh}       = $dbh;
   $self->{is_temp}   = $is_temporary;
@@ -1196,6 +1200,16 @@ sub _dump_update_location_index {
     print $fh join("\t",@args),"\n";
 }
 
+sub DESTROY {
+    my $self = shift;
+    # Remove filehandles, so temporal files can be properly deleted
+    if (%DBI::installed_drh) {
+	DBI->disconnect_all;
+	%DBI::installed_drh = ();
+    }
+    undef $self->{dbh};
+}
+
 1;
 
 =head1 AUTHOR

diff --git a/Bio/DB/SeqFeature/Store/LoadHelper.pm b/Bio/DB/SeqFeature/Store/LoadHelper.pm
@@ -59,6 +59,13 @@ sub new {
 
 sub DESTROY {
     my $self = shift;
+    # Destroy all filehandle references
+    # before trying to delete files and folder
+    %DBHandles = ();
+    undef $self->{IndexIt};
+    undef $self->{TopLevel};
+    undef $self->{Local2Global};
+    undef $self->{Parent2Child};
     rmtree $self->{tmppath};
 #    File::Temp::cleanup() unless $self->{keep};
 }

diff --git a/Bio/DB/SeqFeature/Store/Loader.pm b/Bio/DB/SeqFeature/Store/Loader.pm
@@ -720,6 +720,28 @@ sub unescape {
 
 sub DESTROY {
     my $self = shift;
+    # Close filehandles, so temporal files can be properly deleted
+    my $store = $self->store;
+    if (   $store->isa('Bio::DB::SeqFeature::Store::memory')
+	or $store->isa('Bio::DB::SeqFeature::Store::berkeleydb3')
+	) {
+      $store->private_fasta_file->close;
+
+      if ($store->{fasta_db}) {
+	while (my ($file, $fh) = each %{ $store->{fasta_db}->{fhcache} }) {
+	  $fh->close;
+	}
+	$store->{fasta_db}->_close_index($store->{fasta_db}->{offsets});
+      }
+    }
+    elsif ($store->isa('Bio::DB::SeqFeature::Store::DBI::SQLite')) {
+      if (%DBI::installed_drh) {
+	DBI->disconnect_all;
+	%DBI::installed_drh = ();
+      }
+      undef $store->{dbh};
+    }
+
     if (my $ld = $self->{temp_load}) {
 	unlink $ld;
     }
@@ -755,5 +777,3 @@ This library is free software; you can redistribute it and/or modify
 it under the same terms as Perl itself.
 
 =cut
-
-
diff --git a/Bio/DB/SeqFeature/Store/berkeleydb.pm b/Bio/DB/SeqFeature/Store/berkeleydb.pm
@@ -621,6 +621,7 @@ sub _open_databases {
     return if $ignore_errors;  # autoindex set, so defer this
     $self->throw("Couldn't tie: ".$self->_features_path . " $!");
   }
+
   if ($create) {
     %h = ();
     $h{'.next_id'} = 1;
@@ -701,6 +702,7 @@ sub _close_databases {
   $self->db(undef);
   $self->dna_db(undef);
   $self->notes_db(undef);
+  $self->parentage_db(undef);
   $self->index_db($_=>undef) foreach $self->_index_files;
 }
 
@@ -1449,6 +1451,7 @@ sub db_version {
 sub DESTROY {
   my $self = shift;
   $self->_close_databases();
+  $self->private_fasta_file->close;
   rmtree($self->directory,0,1) if $self->temporary && -e $self->directory;
 }
 
@@ -1575,4 +1578,3 @@ This library is free software; you can redistribute it and/or modify
 it under the same terms as Perl itself.
 
 =cut
-
diff --git a/Bio/DB/TFBS/transfac_pro.pm b/Bio/DB/TFBS/transfac_pro.pm
@@ -1884,4 +1884,15 @@ sub _species_to_taxid {
     return $ncbi_taxid;
 }
 
+sub DESTROY {
+    my $self = shift;
+    # Destroy tied references to close filehandles
+    # and allow proper temporary files deletion
+    undef $self->{_tax_db}->{'_nodes'};
+    undef $self->{_tax_db}->{'_id2name'};
+    undef $self->{_tax_db}->{'_name2id'};
+    undef $self->{_tax_db}->{'_parent2children'};
+    undef $self->{_tax_db}->{'_parentbtree'};
+}
+
 1;
diff --git a/Bio/Index/Abstract.pm b/Bio/Index/Abstract.pm
@@ -773,8 +773,13 @@ sub add_record {
 =cut
 
 sub pack_record {
-	my( $self, @args ) = @_;
-	return join "\034", @args;
+    my( $self, @args ) = @_;
+    # Silence undefined warnings
+    @args = map {
+                 $_ = (defined $_) ? $_ : '';
+                 $_ ;
+                 } @args;
+    return join "\034", @args;
 }
 
 =head2 unpack_record