Skip to content

Commit

Permalink
Moved optimized \n and \r removal subroutine from Bio::DB::Fasta
Browse files Browse the repository at this point in the history
to a more central location at Bio::DB::IndexedBase, from where
its used by Bio::DB::Fasta and Bio::DB::Qual. Also made a small
tweak to nhx.t and updated Changes.
  • Loading branch information
fjossandon committed Jan 2, 2015
1 parent 054a689 commit 01ec10d
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 49 deletions.
44 changes: 2 additions & 42 deletions Bio/DB/Fasta.pm
Expand Up @@ -139,46 +139,6 @@ use base qw(Bio::DB::IndexedBase);
our $obj_class = 'Bio::PrimarySeq::Fasta';
our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';

# Compiling the below regular expressions speeds up the Pure Perl
# seq/subseq() by about 7% from 7.76s to 7.22s over 32358 calls on
# Variant Effect Prediction data.
my $nl = qr/\n/;
my $cr = qr/\r/;

# Remove carriage returns (\r) and newlines (\n) from a string. When
# called from subseq, this can take a signficiant portion of time, in
# Variant Effect Prediction. Therefore we compile the match
# portion.
sub strip_crnl {
my $str = shift;
$str =~ s/$nl//g;
$str =~ s/$cr//g;
return $str;
}

# C can do perfrom strip_crnl much faster. But this requires the
# Inline::C module which we don't require people to have. So we make
# this optional by wrapping the C code in an eval. If the eval works,
# the Perl strip_crnl() function is overwritten.
eval q{
use Inline C => <<'END_OF_C_CODE';
/* Strip all new line (\n) and carriage return (\r) characters
from string str
*/
char* strip_crnl(char* str) {
char *s;
char *s2 = str;
for (s = str; *s; *s++) {
if (*s != '\n' && *s != '\r') {
*s2++ = *s;
}
}
*s2 = '\0';
return str;
}
END_OF_C_CODE
};


=head2 new
Expand Down Expand Up @@ -329,7 +289,7 @@ sub subseq {
seek($fh, $filestart,0);
read($fh, $data, $filestop-$filestart+1);

$data = strip_crnl($data);
$data = Bio::DB::IndexedBase::_strip_crnl($data);

if ($strand == -1) {
# Reverse-complement the sequence
Expand Down Expand Up @@ -371,7 +331,7 @@ sub header {
read($fh, $data, $headerlen);
# On Windows chomp remove '\n' but leaves '\r'
# when reading '\r\n' in binary mode
$data = strip_crnl($data);
$data = Bio::DB::IndexedBase::_strip_crnl($data);
substr($data, 0, 1) = '';
return $data;
}
Expand Down
40 changes: 40 additions & 0 deletions Bio/DB/IndexedBase.pm
Expand Up @@ -268,6 +268,46 @@ use constant DIE_ON_MISSMATCHED_LINES => 1;
# you can avoid dying if you want but you may get incorrect results


# Compiling the below regular expressions speeds up the Pure Perl
# seq/subseq() from Bio::DB::Fasta by about 7% from 7.76s to 7.22s
# over 32358 calls on Variant Effect Prediction data.
my $nl = qr/\n/;
my $cr = qr/\r/;

# Remove carriage returns (\r) and newlines (\n) from a string. When
# called from subseq, this can take a signficiant portion of time, in
# Variant Effect Prediction. Therefore we compile the match portion.
sub _strip_crnl {
my $str = shift;
$str =~ s/$nl//g;
$str =~ s/$cr//g;
return $str;
}

# C can do perfrom _strip_crnl much faster. But this requires the
# Inline::C module which we don't require people to have. So we make
# this optional by wrapping the C code in an eval. If the eval works,
# the Perl strip_crnl() function is overwritten.
eval q{
use Inline C => <<'END_OF_C_CODE';
/* Strip all new line (\n) and carriage return (\r) characters
from string str
*/
char* _strip_crnl(char* str) {
char *s;
char *s2 = str;
for (s = str; *s; *s++) {
if (*s != '\n' && *s != '\r') {
*s2++ = *s;
}
}
*s2 = '\0';
return str;
}
END_OF_C_CODE
};


=head2 new
Title : new
Expand Down
9 changes: 4 additions & 5 deletions Bio/DB/Qual.pm
Expand Up @@ -335,8 +335,7 @@ sub subqual {
read($fh, $data, $filestop-$filestart+1);

# Process quality score
$data =~ s/\n//g;
$data =~ s/\r//g;
Bio::DB::IndexedBase::_strip_crnl($data);
my $subqual = 0;
$subqual = 1 if ( $start || $stop );
my @data;
Expand Down Expand Up @@ -379,9 +378,9 @@ sub header {
seek($fh, $offset, 0);
read($fh, $data, $headerlen);
# On Windows chomp remove '\n' but leaves '\r'
# when reading '\r\n' in binary mode
$data =~ s/\n//g;
$data =~ s/\r//g;
# when reading '\r\n' in binary mode,
# _strip_crnl removes both
$data = Bio::DB::IndexedBase::_strip_crnl($data);
substr($data, 0, 1) = '';
return $data;
}
Expand Down
1 change: 1 addition & 0 deletions Build.PL
Expand Up @@ -107,6 +107,7 @@ my %recommends = (

'Inline::C' => [0.67,
'Speeding up code like Fasta Bio::DB::Fasta'],

'IO::Scalar' => [0,
'Deal with non-seekable filehandles/Bio::Tools::GuessSeqFormat'],

Expand Down
2 changes: 2 additions & 0 deletions Changes
Expand Up @@ -34,6 +34,8 @@ CPAN releases are branched from 'master'.
* Issue #81: Small updates to make sure possible memory leaks are detected [cjfields]
* Issue #84: EMBL format wrapping problem [nyamned]
* Issue #90: Missing entries for translation tables 24 and 25 [fjossandon]
* Issue #95: Speed up of Bio::DB::Fasta::subseq by using a compiled regex
or compiled C code (when Inline::C is installed) [rocky]
* Fix various Bio::Tools::Analysis remote server config problems [cjfields]
* Added several missing 'Data::Stag' and 'LWP::UserAgent' requirements [fjossandon]
* Added a workaround in Bio::DB::Registry to get Username in Windows [fjossandon]
Expand Down
6 changes: 4 additions & 2 deletions t/Tree/TreeIO/nhx.t
Expand Up @@ -13,6 +13,8 @@ BEGIN {
}

my $verbose = 0; #test_debug();
my $nl = qr/\n/;
my $cr = qr/\r/;

my $treeio = Bio::TreeIO->new(
-format => 'nhx',
Expand Down Expand Up @@ -85,8 +87,8 @@ sub read_file {
binmode $IN;
$string = <$IN>;
close $IN;
$string =~ s/\n//g;
$string =~ s/\r//g; # For files with Windows line-endings
$string =~ s/$nl//g;
$string =~ s/$cr//g; # For files with Windows line-endings
#print STDERR "STR: $string\n";
return $string;
}

0 comments on commit 01ec10d

Please sign in to comment.