Moved optimized \n and \r removal subroutine from Bio::DB::Fasta

to a more central location at Bio::DB::IndexedBase, from where its used by Bio::DB::Fasta and Bio::DB::Qual. Also made a small tweak to nhx.t and updated Changes.
bioperl · Jan 2, 2015 · 01ec10d · 01ec10d
1 parent 054a689
commit 01ec10d
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 49 deletions.
diff --git a/Bio/DB/Fasta.pm b/Bio/DB/Fasta.pm
@@ -139,46 +139,6 @@ use base qw(Bio::DB::IndexedBase);
 our $obj_class = 'Bio::PrimarySeq::Fasta';
 our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
 
-# Compiling the below regular expressions speeds up the Pure Perl
-# seq/subseq() by about 7% from 7.76s to 7.22s over 32358 calls on
-# Variant Effect Prediction data.
-my $nl = qr/\n/;
-my $cr = qr/\r/;
-
-# Remove carriage returns (\r) and newlines (\n) from a string.  When
-# called from subseq, this can take a signficiant portion of time, in
-# Variant Effect Prediction. Therefore we compile the match
-# portion.
-sub strip_crnl {
-    my $str = shift;
-    $str =~ s/$nl//g;
-    $str =~ s/$cr//g;
-    return $str;
-}
-
-# C can do perfrom strip_crnl much faster. But this requires the
-# Inline::C module which we don't require people to have. So we make
-# this optional by wrapping the C code in an eval. If the eval works,
-# the Perl strip_crnl() function is overwritten.
-eval q{
-    use Inline C  => <<'END_OF_C_CODE';
-    /* Strip all new line (\n) and carriage return (\r) characters
-       from string str
-    */
-    char* strip_crnl(char* str) {
-        char *s;
-        char *s2 = str;
-        for (s = str; *s; *s++) {
-	    if (*s != '\n' && *s != '\r') {
-              *s2++ = *s;
-	    }
-        }
-        *s2 = '\0';
-        return str;
-    }
-END_OF_C_CODE
-};
-
 
 =head2 new
 
@@ -329,7 +289,7 @@ sub subseq {
     seek($fh, $filestart,0);
     read($fh, $data, $filestop-$filestart+1);
 
-    $data = strip_crnl($data);
+    $data = Bio::DB::IndexedBase::_strip_crnl($data);
 
     if ($strand == -1) {
         # Reverse-complement the sequence
@@ -371,7 +331,7 @@ sub header {
     read($fh, $data, $headerlen);
     # On Windows chomp remove '\n' but leaves '\r'
     # when reading '\r\n' in binary mode
-    $data = strip_crnl($data);
+    $data = Bio::DB::IndexedBase::_strip_crnl($data);
     substr($data, 0, 1) = '';
     return $data;
 }

diff --git a/Bio/DB/IndexedBase.pm b/Bio/DB/IndexedBase.pm
@@ -268,6 +268,46 @@ use constant DIE_ON_MISSMATCHED_LINES => 1;
 # you can avoid dying if you want but you may get incorrect results
 
 
+# Compiling the below regular expressions speeds up the Pure Perl
+# seq/subseq() from Bio::DB::Fasta by about 7% from 7.76s to 7.22s
+# over 32358 calls on Variant Effect Prediction data.
+my $nl = qr/\n/;
+my $cr = qr/\r/;
+
+# Remove carriage returns (\r) and newlines (\n) from a string.  When
+# called from subseq, this can take a signficiant portion of time, in
+# Variant Effect Prediction. Therefore we compile the match portion.
+sub _strip_crnl {
+    my $str = shift;
+    $str =~ s/$nl//g;
+    $str =~ s/$cr//g;
+    return $str;
+}
+
+# C can do perfrom _strip_crnl much faster. But this requires the
+# Inline::C module which we don't require people to have. So we make
+# this optional by wrapping the C code in an eval. If the eval works,
+# the Perl strip_crnl() function is overwritten.
+eval q{
+    use Inline C  => <<'END_OF_C_CODE';
+    /* Strip all new line (\n) and carriage return (\r) characters
+       from string str
+    */
+    char* _strip_crnl(char* str) {
+        char *s;
+        char *s2 = str;
+        for (s = str; *s; *s++) {
+            if (*s != '\n' && *s != '\r') {
+              *s2++ = *s;
+            }
+        }
+        *s2 = '\0';
+        return str;
+    }
+END_OF_C_CODE
+};
+
+
 =head2 new
 
  Title   : new

diff --git a/Bio/DB/Qual.pm b/Bio/DB/Qual.pm
@@ -335,8 +335,7 @@ sub subqual {
     read($fh, $data, $filestop-$filestart+1);
 
     # Process quality score
-    $data =~ s/\n//g;
-    $data =~ s/\r//g;
+    Bio::DB::IndexedBase::_strip_crnl($data);
     my $subqual = 0;
     $subqual = 1 if ( $start || $stop );
     my @data;
@@ -379,9 +378,9 @@ sub header {
     seek($fh, $offset, 0);
     read($fh, $data, $headerlen);
     # On Windows chomp remove '\n' but leaves '\r'
-    # when reading '\r\n' in binary mode
-    $data =~ s/\n//g;
-    $data =~ s/\r//g;
+    # when reading '\r\n' in binary mode,
+    # _strip_crnl removes both
+    $data = Bio::DB::IndexedBase::_strip_crnl($data);
     substr($data, 0, 1) = '';
     return $data;
 }

diff --git a/Build.PL b/Build.PL
@@ -107,6 +107,7 @@ my %recommends = (
 
     'Inline::C'                => [0.67,
         'Speeding up code like Fasta Bio::DB::Fasta'],
+
     'IO::Scalar'               => [0,
         'Deal with non-seekable filehandles/Bio::Tools::GuessSeqFormat'],
 

diff --git a/Changes b/Changes
@@ -34,6 +34,8 @@ CPAN releases are branched from 'master'.
     * Issue #81: Small updates to make sure possible memory leaks are detected [cjfields]
     * Issue #84: EMBL format wrapping problem [nyamned]
     * Issue #90: Missing entries for translation tables 24 and 25 [fjossandon]
+    * Issue #95: Speed up of Bio::DB::Fasta::subseq by using a compiled regex
+      or compiled C code (when Inline::C is installed) [rocky]
     * Fix various Bio::Tools::Analysis remote server config problems [cjfields]
     * Added several missing 'Data::Stag' and 'LWP::UserAgent' requirements [fjossandon]
     * Added a workaround in Bio::DB::Registry to get Username in Windows [fjossandon]

diff --git a/t/Tree/TreeIO/nhx.t b/t/Tree/TreeIO/nhx.t
@@ -13,6 +13,8 @@ BEGIN {
 }
 
 my $verbose = 0; #test_debug();
+my $nl = qr/\n/;
+my $cr = qr/\r/;
 
 my $treeio = Bio::TreeIO->new(
   -format => 'nhx',
@@ -85,8 +87,8 @@ sub read_file {
   binmode $IN;
   $string = <$IN>;
   close $IN;
-  $string =~ s/\n//g;
-  $string =~ s/\r//g; # For files with Windows line-endings
+  $string =~ s/$nl//g;
+  $string =~ s/$cr//g; # For files with Windows line-endings
   #print STDERR "STR: $string\n";
   return $string;
 }