Browse files

use Text::Unidecode to remove diacritical marks

switch to Module::Build
updated test suite
  • Loading branch information...
1 parent 3329f01 commit 72e31f93cffebe5a0475feab3316de1e24e1e868 @bricas committed Apr 18, 2006
Showing with 122 additions and 152 deletions.
  1. +19 −0 Build.PL
  2. +4 −0 Changes
  3. +2 −2 MANIFEST
  4. +0 −9 Makefile.PL
  5. +0 −31 README
  6. +93 −106 lib/Text/Normalize/NACO.pm
  7. +4 −4 t/11-normalize_suite.t
View
19 Build.PL
@@ -0,0 +1,19 @@
+use strict;
+use warnings;
+use Module::Build;
+
+my $build = Module::Build->new(
+ module_name => 'Text::Normalize::NACO',
+ dist_author => 'Brian Cassidy <bricas@cpan.org>',
+ license => 'perl',
+ create_readme => 1, # README is generated from the module POD at dist time
+ create_makefile_pl => 'traditional', # also ship a Makefile.PL for ExtUtils::MakeMaker users
+ requires => { # runtime prerequisites
+ 'Text::Unidecode' => 0,
+ },
+ build_requires => { # needed only to build and test (key was misspelled build_requres)
+ 'Test::More' => 0,
+ }
+);
+
+$build->create_build_script;
View
4 Changes
@@ -1,5 +1,9 @@
Revision history for Perl extension Text::Normalize::NACO
+0.1 Tue Apr 18 2006
+ - use Text::Unidecode to remove diacritical marks
+ - switch to Module::Build
+
0.05 Thu Mar 03 12:46:20 2005
- added pod_coverage test
View
4 MANIFEST
@@ -1,8 +1,8 @@
+Build.PL
Changes
lib/Text/Normalize/NACO.pm
-Makefile.PL
MANIFEST
-README
+META.yml
t/01-use.t
t/10-normalize.t
t/11-normalize_suite.t
View
9 Makefile.PL
@@ -1,9 +0,0 @@
-use ExtUtils::MakeMaker;
-
-WriteMakefile(
- NAME => 'Text::Normalize::NACO',
- VERSION_FROM => 'lib/Text/Normalize/NACO.pm',
- PREREQ_PM => {
- 'Test::More' => 0
- }
-);
View
31 README
@@ -1,31 +0,0 @@
-Text::Normalize::NACO - Normalize text based on the NACO rules
-
-In general, normalization is defined as:
-
- To make (a text or language) regular and consistent, especially with respect to spelling or style.
-
-It is commonly used for comparative purposes. These particular normalization rules have been set out by the
-Name Authority Cooperative. The rules are described in detail at: http://www.loc.gov/catdir/pcc/naco/normrule.html
-
-
-INSTALLATION
-
-To install this module type the following:
-
- perl Makefile.PL
- make
- make test
- make install
-
-
-AUTHOR
-
-Brian Cassidy <bricas@cpan.org>
-
-
-COPYRIGHT AND LICENSE
-
-Copyright 2005 by Brian Cassidy
-
-This library is free software; you can redistribute it and/or modify
-it under the same terms as Perl itself.
View
199 lib/Text/Normalize/NACO.pm
@@ -6,58 +6,56 @@ Text::Normalize::NACO - Normalize text based on the NACO rules
=head1 SYNOPSIS
- # exported method
- use Text::Normalize::NACO qw( naco_normalize );
-
- $normalized = naco_normalize( $original );
-
- # as an object
- $naco = Text::Normalize::NACO->new;
- $normalized = $naco->normalize( $original );
-
- # normalize to lowercase
- $naco->case( 'lower' );
- $normalized = $naco->normalize( $original );
+ # exported method
+ use Text::Normalize::NACO qw( naco_normalize );
+
+ $normalized = naco_normalize( $original );
+
+ # as an object
+ $naco = Text::Normalize::NACO->new;
+ $normalized = $naco->normalize( $original );
+
+ # normalize to lowercase
+ $naco->case( 'lower' );
+ $normalized = $naco->normalize( $original );
=head1 DESCRIPTION
In general, normalization is defined as:
- To make (a text or language) regular and consistent, especially with respect to spelling or style.
+ To make (a text or language) regular and consistent, especially with respect to spelling or style.
It is commonly used for comparative purposes. These particular normalization rules have been set out by the
Name Authority Cooperative. The rules are described in detail at: http://www.loc.gov/catdir/pcc/naco/normrule.html
+=head1 INSTALLATION
+
+To install this module via Module::Build:
+
+ perl Build.PL
+ ./Build # or `perl Build`
+ ./Build test # or `perl Build test`
+ ./Build install # or `perl Build install`
+
+To install this module via ExtUtils::MakeMaker:
+
+ perl Makefile.PL
+ make
+ make test
+ make install
+
=cut
use base qw( Exporter );
use strict;
use warnings;
-our $VERSION = '0.05';
+use Text::Unidecode;
-our @EXPORT_OK = qw( naco_normalize );
+our $VERSION = '0.1';
-# LUT to convert diacritical and special characters
-# Modified from Pod::Escapes
-my ( %Latin1Code_to_fallback, %Latin1Char_to_fallback );
-
-@Latin1Code_to_fallback{ 0xA0..0xFF } = (
- ' ', ' ', 'C', ' ', ' ', 'Y', ' ', 'SS', ' ', ' ', 'a', ' ', ' ', '', ' ', ' ',
- ' ', ' ', '2', '3', '', 'u', 'P', ' ', ' ', '1', 'o', ' ', '1/4', '1/2', '3/4', ' ',
- 'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
- 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss',
- 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
- 'd', 'n', 'o', 'o', 'o', 'o', 'o', ' ', 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',
-);
-
-{
- my( $k, $v );
- while( ( $k, $v ) = each %Latin1Code_to_fallback ) {
- $Latin1Char_to_fallback{ chr $k } = $v;
- }
-}
+our @EXPORT_OK = qw( naco_normalize );
=head1 METHODS
@@ -67,121 +65,110 @@ Creates a new Text::Normalize::NACO object. You explicitly request
strings to be normalized in upper or lower-case by setting
the "case" option (defaults to "upper").
- my $naco = Text::Normalize::NACO->new( case => 'lower' );
+ my $naco = Text::Normalize::NACO->new( case => 'lower' );
=cut
sub new {
- my $class = shift;
- my %options = @_;
- my $self = bless {}, $class;
+ my $class = shift; # invocant: the class name
+ my %options = @_; # remaining arguments are key/value options
+ my $self = bless {}, $class;
- $self->case( $options{ case } || 'upper' );
+ $self->case( $options{ case } || 'upper' ); # a missing or false case option falls back to upper
- return $self;
+ return $self;
}
=head2 case( $case )
Accessor/Mutator for the case in which the string should be returned.
- # lower-case
- $naco->case( 'lower' );
+ # lower-case
+ $naco->case( 'lower' );
- # upper-case
- $naco->case( 'upper' );
+ # upper-case
+ $naco->case( 'upper' );
=cut
sub case {
- my $self = shift;
- my( $case ) = @_;
+ my $self = shift;
+ my( $case ) = @_;
- $self->{ _CASE } = $case if @_;
+ $self->{ _CASE } = $case if @_; # act as a setter only when an argument was passed
- return $self->{ _CASE };
+ return $self->{ _CASE }; # always return the currently stored case
}
=head2 naco_normalize( $text, { %options } )
Exported version of C<normalize>. You can specify any extra
options by passing a hashref after the string to be normalized.
- $normalized = naco_normalize( $original, { case => 'lower' } );
+ my $normalized = naco_normalize( $original, { case => 'lower' } );
=cut
sub naco_normalize {
- my $text = shift;
- my $options = shift;
- my $case = $options->{ case } || 'upper';
+ my $text = shift;
+ my $options = shift; # optional hashref of extra options
+ my $case = $options->{ case } || 'upper'; # a missing or false case option falls back to upper
- my $normalized = normalize( undef, $text );
+ my $normalized = normalize( undef, $text ); # undef invocant: normalize() skips its own case folding
- if( $case eq 'lower' ) {
- $normalized =~ tr/A-Z/a-z/;
- }
- else {
- $normalized =~ tr/a-z/A-Z/;
- }
+ if( $case eq 'lower' ) {
+ $normalized =~ tr/A-Z/a-z/;
+ }
+ else { # any value other than lower yields upper-case output
+ $normalized =~ tr/a-z/A-Z/;
+ }
- return $normalized;
+ return $normalized;
}
=head2 normalize( $text )
Normalizes $text and returns the new string.
+ my $normalized = $naco->normalize( $original );
+
=cut
sub normalize {
- my $self = shift;
- my $data = shift;
-
- # Rules taken from NACO Normalization
- # http://lcweb.loc.gov/catdir/pcc/naco/normrule.html
-
- # Convert special chars to spaces
- $data =~ s/[\Q!(){}<>-;:.?,\/\\@*%=\$^_~\E]/ /g;
-
- # Delete special chars
- $data =~ s/[\Q'[]|\E]//g;
-
- # Remove diacritical marks and convert special chars
- my @chars = split(//, $data);
- for ( my $i = 0; $i < @chars; $i++ ) {
- next unless ord( $chars[ $i ] ) >= 160 and ord( $chars[ $i ] ) <= 255;
- $chars[ $i ] = $Latin1Char_to_fallback{ $chars[ $i ] };
- }
- $data = join( '', @chars );
-
- # Convert lowercase to uppercase or vice-versa.
- if( $self ) {
- if( $self->case eq 'lower' ) {
- $data =~ tr/A-Z/a-z/;
- }
- else {
- $data =~ tr/a-z/A-Z/;
- }
- }
-
- # Remove leading and trailing spaces
- $data =~ s/^\s+|\s+$//g;
-
- # Condense multiple spaces
- $data =~ s/\s+/ /g;
-
- return $data;
+ my $self = shift; # object, or undef when called from naco_normalize
+ my $data = shift; # text to normalize
+
+ # Rules taken from NACO Normalization
+ # http://lcweb.loc.gov/catdir/pcc/naco/normrule.html
+
+ # Remove diacritical marks and convert special chars
+ unidecode( $data ); # void context: Text::Unidecode transliterates $data in place
+
+ # Convert special chars to spaces
+ $data =~ s/[\Q!(){}<>-;:.?,\/\\@*%=\$^_~\E]/ /g;
+
+ # Delete special chars
+ $data =~ s/[\Q'[]|\E]//g;
+
+ # Convert lowercase to uppercase or vice-versa.
+ if( $self ) { # skipped for an undef invocant; naco_normalize folds case itself
+ if( $self->case eq 'lower' ) {
+ $data =~ tr/A-Z/a-z/;
+ }
+ else {
+ $data =~ tr/a-z/A-Z/;
+ }
+ }
+
+ # Remove leading and trailing spaces
+ $data =~ s/^\s+|\s+$//g;
+
+ # Condense multiple spaces
+ $data =~ s/\s+/ /g;
+
+ return $data;
}
-=head1 TODO
-
-=over 4
-
-=item * Add a test suite
-
-=back
-
=head1 SEE ALSO
=over 4
@@ -200,7 +187,7 @@ sub normalize {
=head1 COPYRIGHT AND LICENSE
-Copyright 2005 by Brian Cassidy
+Copyright 2006 by Brian Cassidy
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
View
8 t/11-normalize_suite.t
@@ -10,16 +10,16 @@ BEGIN {
my $naco = Text::Normalize::NACO->new( case => 'lower' );
isa_ok( $naco, 'Text::Normalize::NACO' );
-for my $file ( <t/*.dat> ) {
- open( FILE, $file );
+for my $file ( glob( 't/*.dat' ) ) { # every tab-separated fixture file under t/
+ open( my $text, '<', $file ) or die "Cannot open $file: $!"; # three-arg open: mode can never come from the filename
- while( <FILE> ) {
+ while( <$text> ) { # one "original<TAB>expected" pair per line
s/[\r\n]//g;
my( $original, $normalized ) = split( /\t/, $_ );
is( $naco->normalize( $original ), $normalized, "\$naco->normalize( '$original' )" );
}
- close( FILE );
+ close( $text ) or die "Cannot close $file: $!";
}

0 comments on commit 72e31f9

Please sign in to comment.