Merge branch 'master' of github.com:bioperl/bioperl-live

bioperl · Mar 5, 2015 · 557d24e · 557d24e
2 parents a6172de + 0c08f33
commit 557d24e
Show file tree

Hide file tree

Showing 56 changed files with 858 additions and 389 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,7 @@
 Build
 Build.bat
 _build*
+_Inline
 pm_to_blib*
 *.tar.gz
 .lwpcookies

diff --git a/.travis.yml b/.travis.yml
@@ -26,6 +26,8 @@ install:
     - "cpanm Bio::Phylo | tail -n 1"
     - "cpanm Test::Weaken | tail -n 1"
     - "cpanm Test::Memory::Cycle | tail -n 1"
+    #Test coverage from Coveralls
+    #- cpanm --quiet --notest Devel::Cover::Report::Coveralls
     #for some reason tests and deps aren't skipped here.  Will have to look into it more...
     #git repos, seems to only work for simple checkouts, so pure perl only (TODO: look into before_script for more detail)
     - "git clone https://github.com/bioperl/Bio-Root.git; export PERL5LIB=$( pwd )/Bio-Root/lib:$PERL5LIB"
@@ -34,6 +36,10 @@ install:
 
 script:
     - "./Build test"
+    #Devel::Cover coverage options are: statement, branch, condition, path, subroutine, pod, time, all and none
+    #- "./Build build && cover -test -report coveralls" #complete version coverage test
+    #- PERL5OPT=-MDevel::Cover=+ignore,prove,-coverage,statement,subroutine prove -lr t #limited version coverage test
+    #- cover -report coveralls
 
 #TODO - send emails to bioperl-guts-l
 notifications:

diff --git a/AUTHORS b/AUTHORS
@@ -20,6 +20,8 @@
 
 =item * Brian Osborne <bosborne at bioteam.net>
 
+=item * Francisco J. Ossandon <fco.j.ossandon at gmail.com>
+
 =item * Jason Stajich <jason at bioperl.org>
 
 =item * Lincoln Stein <lstein at cshl.org>

diff --git a/Bio/DB/BioFetch.pm b/Bio/DB/BioFetch.pm
@@ -26,14 +26,14 @@ Bio::DB::BioFetch - Database object interface to BioFetch retrieval
 
  $bf = Bio::DB::BioFetch->new();
 
- $seq = $bf->get_Seq_by_id('BUM');  # EMBL or SWALL ID
+ $seq = $bf->get_Seq_by_id('HSFOS');  # EMBL or SWALL ID
 
  # change formats, storage procedures
  $bf = Bio::DB::BioFetch->new(-format        => 'fasta',
  			     -retrievaltype => 'tempfile',
   			     -db            => 'EMBL');
 
- $stream = $bf->get_Stream_by_id(['BUM','J00231']);
+ $stream = $bf->get_Stream_by_id(['HSFOS','J00231']);
  while (my $s = $stream->next_seq) {
     print $s->seq,"\n";
  }
@@ -135,7 +135,7 @@ BEGIN {
 	    fasta     => 'fasta',
 	    namespace => 'uniprot',
 	},
-    'uniprot' => {
+	'uniprot' => {
 	    default   => 'swiss',
 	    swissprot => 'swiss',
 	    fasta     => 'fasta',

diff --git a/Bio/DB/EMBL.pm b/Bio/DB/EMBL.pm
@@ -23,7 +23,7 @@ Bio::DB::EMBL - Database object interface for EMBL entry retrieval
   $embl = Bio::DB::EMBL->new();
 
   # remember that EMBL_ID does not equal GenBank_ID!
-  $seq = $embl->get_Seq_by_id('BUM'); # EMBL ID
+  $seq = $embl->get_Seq_by_id('HSFOS'); # EMBL ID
   print "cloneid is ", $seq->id, "\n";
 
   # or changeing to accession number and Fasta format ...

diff --git a/Bio/DB/Fasta.pm b/Bio/DB/Fasta.pm
@@ -243,7 +243,6 @@ sub _calculate_offsets {
     return \%offsets;
 }
 
-
 =head2 seq
 
  Title   : seq, sequence, subseq
@@ -289,8 +288,8 @@ sub subseq {
 
     seek($fh, $filestart,0);
     read($fh, $data, $filestop-$filestart+1);
-    $data =~ s/\n//g;
-    $data =~ s/\r//g;
+
+    $data = Bio::DB::IndexedBase::_strip_crnl($data);
 
     if ($strand == -1) {
         # Reverse-complement the sequence
@@ -332,8 +331,7 @@ sub header {
     read($fh, $data, $headerlen);
     # On Windows chomp remove '\n' but leaves '\r'
     # when reading '\r\n' in binary mode
-    $data =~ s/\n//g;
-    $data =~ s/\r//g;
+    $data = Bio::DB::IndexedBase::_strip_crnl($data);
     substr($data, 0, 1) = '';
     return $data;
 }

diff --git a/Bio/DB/Flat.pm b/Bio/DB/Flat.pm
@@ -23,9 +23,9 @@ Bio::DB::Flat - Interface for indexed flat files
                            -write_flag => 1);
   $db->build_index('/usr/share/embl/primate.embl',
                    '/usr/share/embl/protists.embl');
-  $seq       = $db->get_Seq_by_id('BUM');
+  $seq       = $db->get_Seq_by_id('HSFOS');
   @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
-  $raw       = $db->fetch_raw('BUM');
+  $raw       = $db->fetch_raw('HSFOS');
 
 =head1 DESCRIPTION
 

diff --git a/Bio/DB/IndexedBase.pm b/Bio/DB/IndexedBase.pm
@@ -241,7 +241,7 @@ methods. Internal methods are usually preceded with a _
 package Bio::DB::IndexedBase;
 
 BEGIN {
-    @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File) 
+    @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
         if(!$INC{'AnyDBM_File.pm'});
 }
 
@@ -268,6 +268,46 @@ use constant DIE_ON_MISSMATCHED_LINES => 1;
 # you can avoid dying if you want but you may get incorrect results
 
 
+# Compiling the below regular expressions speeds up the Pure Perl
+# seq/subseq() from Bio::DB::Fasta by about 7% from 7.76s to 7.22s
+# over 32358 calls on Variant Effect Prediction data.
+my $nl = qr/\n/;
+my $cr = qr/\r/;
+
+# Remove carriage returns (\r) and newlines (\n) from a string.  When
+# called from subseq, this can take a signficiant portion of time, in
+# Variant Effect Prediction. Therefore we compile the match portion.
+sub _strip_crnl {
+    my $str = shift;
+    $str =~ s/$nl//g;
+    $str =~ s/$cr//g;
+    return $str;
+}
+
+# C can do perfrom _strip_crnl much faster. But this requires the
+# Inline::C module which we don't require people to have. So we make
+# this optional by wrapping the C code in an eval. If the eval works,
+# the Perl strip_crnl() function is overwritten.
+eval q{
+    use Inline C  => <<'END_OF_C_CODE';
+    /* Strip all new line (\n) and carriage return (\r) characters
+       from string str
+    */
+    char* _strip_crnl(char* str) {
+        char *s;
+        char *s2 = str;
+        for (s = str; *s; *s++) {
+            if (*s != '\n' && *s != '\r') {
+              *s2++ = *s;
+            }
+        }
+        *s2 = '\0';
+        return str;
+    }
+END_OF_C_CODE
+};
+
+
 =head2 new
 
  Title   : new
@@ -682,6 +722,8 @@ sub _close_index {
     return 1;
 }
 
+# Compiling the below regular expression speeds up _parse_compound_id
+my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;
 
 sub _parse_compound_id {
     # Handle compound IDs:
@@ -699,7 +741,7 @@ sub _parse_compound_id {
     if ( (not defined $start ) &&
          (not defined $stop  ) &&
          (not defined $strand) &&
-         ($id =~ /^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x) ) {
+         ($id =~ m{$compound_id}) ) {
         # Start, stop and strand not provided and ID looks like a compound ID
         ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
     }

diff --git a/Bio/DB/Qual.pm b/Bio/DB/Qual.pm
@@ -335,8 +335,7 @@ sub subqual {
     read($fh, $data, $filestop-$filestart+1);
 
     # Process quality score
-    $data =~ s/\n//g;
-    $data =~ s/\r//g;
+    Bio::DB::IndexedBase::_strip_crnl($data);
     my $subqual = 0;
     $subqual = 1 if ( $start || $stop );
     my @data;
@@ -379,9 +378,9 @@ sub header {
     seek($fh, $offset, 0);
     read($fh, $data, $headerlen);
     # On Windows chomp remove '\n' but leaves '\r'
-    # when reading '\r\n' in binary mode
-    $data =~ s/\n//g;
-    $data =~ s/\r//g;
+    # when reading '\r\n' in binary mode,
+    # _strip_crnl removes both
+    $data = Bio::DB::IndexedBase::_strip_crnl($data);
     substr($data, 0, 1) = '';
     return $data;
 }

diff --git a/Bio/DB/Registry.pm b/Bio/DB/Registry.pm
@@ -122,7 +122,9 @@ sub _load_registry {
    my $self = shift;
 	eval { $HOME = (getpwuid($>))[7]; } unless $HOME;
 	if ($@) {
-		$self->warn("This Perl doesn't implement function getpwuid(), no \$HOME");
+		# Windows can have Win32::LoginName to get the Username, so check if it works before giving up
+		 ( defined &Win32::LoginName ) ? ( $HOME =  Win32::LoginName() )
+		:                                 $self->warn("This Perl doesn't implement function getpwuid(), no \$HOME");
 	}
 	my @ini_files = $self->_get_ini_files();
 

diff --git a/Bio/DB/SeqFeature/Store/LoadHelper.pm b/Bio/DB/SeqFeature/Store/LoadHelper.pm
@@ -40,7 +40,7 @@ use File::Temp 'tempdir';
 use File::Spec;
 use Fcntl qw(O_CREAT O_RDWR);
 
-our $VERSION = '1.10';
+our $VERSION = '1.11';
 
 my %DBHandles;
 

diff --git a/Bio/DB/Taxonomy/flatfile.pm b/Bio/DB/Taxonomy/flatfile.pm
@@ -99,6 +99,9 @@ $DEFAULT_PARENT_INDEX  = 'parents';
 
 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
 
+# 8192 bytes; this seems to work to keep OS X from complaining
+$DB_HASH->{'bsize'} = 0x2000;
+
 @DIVISIONS =   ([qw(BCT Bacteria)],
                 [qw(INV Invertebrates)],
                 [qw(MAM Mammals)],

diff --git a/Bio/SeqFeatureI.pm b/Bio/SeqFeatureI.pm
@@ -522,12 +522,13 @@ sub spliced_seq {
     my @locset = $self->location->each_Location;
     my @locs;
     if ( not $nosort ) {
-        @locs = map { $_->[0] }
+#        @locs = map { $_->[0] }
         # sort so that most negative is first basically to order
         # the features on the opposite strand 5'->3' on their strand
         # rather than they way most are input which is on the fwd strand
 
-        sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
+#        sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
+		my @proc_locs =
         map {
              $fstrand = $_->strand unless defined $fstrand;
              $mixed   = 1 if defined $_->strand && $fstrand != $_->strand;
@@ -538,6 +539,16 @@ sub spliced_seq {
              [ $_, $_->start * ($_->strand || 1) ];
         } @locset;
 
+		my @sort_locs;
+		if ( $fstrand == 1 ) {
+			@sort_locs = sort { $a->[1] <=> $b->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
+		}elsif ( $fstrand == -1 ){
+			@sort_locs = sort { $b->[1] <=> $a->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
+		} else {
+			@sort_locs = @proc_locs;
+		}
+		@locs = map { $_->[0] } @sort_locs;
+
         if ( $mixed ) {
             $self->warn(  "Mixed strand locations, spliced seq using the "
                         . "input order rather than trying to sort");

diff --git a/Bio/SeqIO/embl.pm b/Bio/SeqIO/embl.pm
@@ -1463,7 +1463,7 @@ sub _write_line_EMBL_regex {
 
   CHUNK: while($line) {
         foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
-            if ($line =~ m/^(.{0,$subl})($pat)(.*)/ ) {
+            if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
                 my $l = $1.$2;
                 $l =~ s/#/ /g  # remove word wrap protection char '#'
                     if $pre1 eq "RA   ";

diff --git a/Bio/SeqIO/genbank.pm b/Bio/SeqIO/genbank.pm
@@ -499,16 +499,30 @@ sub next_seq {
                 }
             }
 
-            # Comments
+            # Comments may be plain text or Structured Comments.
+            # Structured Comments are made up of tag/value pairs and have beginning 
+            # and end delimiters like ##*-Data-START## and ##*-Data-END##
             elsif ($line =~ /^COMMENT\s+(\S.*)/) {
                 if ($annotation) {
                     my $comment = $1;
                     while ( defined( $line = $self->_readline ) ) {
                         last if ($line =~ /^\S/);
                         $comment .= $line;
                     }
-                    $comment =~ s/\n/ /g;
                     $comment =~ s/  +/ /g;
+                    # Structured Comment, do not remove returns in the tabular section
+                    if ( my ( $text, $table )= $comment
+                        =~ /([^#]*)(##\S+Data-START##.+?##\S+Data-END##)/is
+                        ) {
+                        $text    =~ s/\n/ /g if $text;
+                        $table   =~ s/START##/START##\n/;
+                        $table   =~ s/^\s+//gm;
+                        $comment = $text . "\n" . $table;
+                    }
+                    # Plain text, remove returns
+                    else {
+                        $comment =~ s/\n/ /g;
+                    }
                     $annotation->add_Annotation(
                         'comment',
                         Bio::Annotation::Comment->new(

diff --git a/Bio/Tools/Analysis/Protein/GOR4.pm b/Bio/Tools/Analysis/Protein/GOR4.pm
@@ -153,12 +153,12 @@ use Bio::SeqIO;
 use HTTP::Request::Common qw(POST);
 use Bio::SeqFeature::Generic;
 use Bio::Seq::Meta::Array;
-
+$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;
 
 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
 
 use constant MIN_STRUC_LEN => 3;
-my $URL = 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_gor4.pl';
+my $URL = 'https://npsa-prabi.ibcp.fr/cgi-bin/secpred_sopma.pl';
 my $ANALYSIS_NAME = 'GOR4';
 my $ANALYSIS_SPEC = {name => 'Gor4', type => 'Protein'};
 my $INPUT_SPEC    = [
@@ -366,11 +366,9 @@ sub  _run {
     my $out = 'http://npsa-pbil.ibcp.fr/'.$next;
     my $req2 = HTTP::Request->new(GET=>$out);
     my $resp2 = $self->request($req2);
-	$self->status('COMPLETED') if $resp2 ne '';
+    $self->status('COMPLETED') if $resp2 ne '';
     $self->{'_result'} = $resp2->content;
 }
 
 
-
-
 1;
diff --git a/Bio/Tools/Analysis/Protein/HNN.pm b/Bio/Tools/Analysis/Protein/HNN.pm
@@ -154,7 +154,7 @@ use Bio::SeqIO;
 use HTTP::Request::Common qw (POST);
 use Bio::SeqFeature::Generic;
 use Bio::Seq::Meta::Array;
-
+$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;
 
 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
 
@@ -195,9 +195,10 @@ sub  _run {
     # delay repeated calls by default by 3 sec, set delay() to change
     $self->sleep;
     $self->status('TERMINATED_BY_ERROR');
-    my $request = POST 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_hnn.pl',
+    my $request = POST 'https://npsa-prabi.ibcp.fr/cgi-bin/secpred_hnn.pl',
         Content_Type => 'form-data',
-            Content  => [title => "",
+            Content  => [
+			 title => "",
                          notice => $self->seq->seq,
                          ali_width => 70,
                         ];

diff --git a/Bio/Tools/Analysis/Protein/Sopma.pm b/Bio/Tools/Analysis/Protein/Sopma.pm
@@ -154,12 +154,12 @@ use Bio::SeqIO;
 use HTTP::Request::Common qw (POST);
 use Bio::SeqFeature::Generic;
 use Bio::Seq::Meta::Array;
-
+$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;
 
 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
 
 #extends array for 2struc.
-my $URL = 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_sopma.pl';
+my $URL = 'https://npsa-prabi.ibcp.fr/cgi-bin/secpred_sopma.pl';
 my $ANALYSIS_NAME= 'Sopma';
 my $ANALYSIS_SPEC= {name => 'Sopma', type => 'Protein'};
 my $INPUT_SPEC = [
@@ -450,7 +450,7 @@ sub  _run {
     # delay repeated calls by default by 3 sec, set delay() to change
     $self->sleep;
     $self->status('TERMINATED_BY_ERROR');
-    my $request = POST 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_sopma.pl',
+    my $request = POST 'https://npsa-prabi.ibcp.fr/cgi-bin/secpred_sopma.pl',
         Content_Type => 'form-data',
             Content  => [title     => "",
                          notice    => $self->seq->seq,
@@ -465,6 +465,7 @@ sub  _run {
 
     #### get text only version of results ## 
     my ($next) = $text =~ /Prediction.*?=(.*?)>/;
+    return $self unless $next;
     my $out    = "http://npsa-pbil.ibcp.fr/". "$next";
     my $req2   = HTTP::Request->new(GET=>$out);
     my $resp2  = $self->request ($req2);