/
NACO.pm
184 lines (117 loc) · 3.57 KB
/
NACO.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
package Text::Normalize::NACO;
=head1 NAME
Text::Normalize::NACO - Normalize text based on the NACO rules
=head1 SYNOPSIS
# exported method
use Text::Normalize::NACO qw( naco_normalize );
$normalized = naco_normalize( $original );
# as an object
$naco = Text::Normalize::NACO->new;
$normalized = $naco->normalize( $original );
# normalize to lowercase
$naco->case( 'lower' );
$normalized = $naco->normalize( $original );
=head1 DESCRIPTION
In general, normalization is defined as:
To make (a text or language) regular and consistent, especially with respect to spelling or style.
It is commonly used for comparative purposes. These particular normalization rules have been set out by the
Name Authority Cooperative. The rules are described in detail at: http://www.loc.gov/catdir/pcc/naco/normrule.html
=head1 INSTALLATION
perl Makefile.PL
make
make test
make install
=cut
use base qw( Exporter );
use strict;
use warnings;
use Text::Unidecode;
# Module version string.
our $VERSION = '0.11';
# Symbols that Exporter hands out only when the caller asks for them by name.
our @EXPORT_OK = qw( naco_normalize );
=head1 METHODS
=head2 new( %options )
Creates a new Text::Normalize::NACO object. You can explicitly request
that strings be normalized to upper-case or lower-case by setting
the "case" option (defaults to "upper").
my $naco = Text::Normalize::NACO->new( case => 'lower' );
=cut
# Constructor. Blesses an empty hash into $class and records the output
# case on it through the case() accessor; a missing or false "case"
# option falls back to 'upper'.
sub new {
    my ( $class, %options ) = @_;

    my $self = bless {}, $class;

    my $requested = $options{ case };
    $self->case( $requested ? $requested : 'upper' );

    return $self;
}
=head2 case( $case )
Accessor/Mutator for the case in which the string should be returned.
# lower-case
$naco->case( 'lower' );
# upper-case
$naco->case( 'upper' );
=cut
# Combined getter/setter for the output case stored under the _CASE key.
# With an argument (even an undefined one) the stored value is replaced
# first; the current value is returned either way.
sub case {
    my ( $self, @args ) = @_;

    if ( @args ) {
        $self->{ _CASE } = $args[ 0 ];
    }

    return $self->{ _CASE };
}
=head2 naco_normalize( $text, { %options } )
Exported version of C<normalize>. You can specify any extra
options by passing a hashref after the string to be normalized.
my $normalized = naco_normalize( $original, { case => 'lower' } );
=cut
# Functional interface to normalize(). Normalizes $text without an
# object, then folds ASCII letters according to the optional hashref:
# a "case" of 'lower' gives lower-case, anything else gives upper-case.
sub naco_normalize {
    my ( $text, $options ) = @_;

    my $want_lower = ( $options->{ case } || 'upper' ) eq 'lower';

    # No invocant is supplied, so normalize() leaves letter case alone
    # and the folding is applied here instead.
    my $normalized = normalize( undef, $text );

    if ( $want_lower ) {
        $normalized =~ tr/A-Z/a-z/;
        return $normalized;
    }

    $normalized =~ tr/a-z/A-Z/;
    return $normalized;
}
=head2 normalize( $text )
Normalizes $text and returns the new string.
my $normalized = $naco->normalize( $original );
=cut
# Normalizes a string following the NACO rules and returns the result.
#
# Parameters:
#   $self - the invocant. An object folds the text to the case stored on
#           it; a class name folds to the default upper-case; a false
#           value (as passed by naco_normalize) skips case folding.
#   $data - the string to normalize.
#
# Returns the normalized string, or undef when $data is undef.
sub normalize {
    my ( $self, $data ) = @_;

    # Nothing to normalize: hand undef straight back rather than emit an
    # "uninitialized value" warning from each substitution below.
    return $data unless defined $data;

    # Rules taken from NACO Normalization
    # http://www.loc.gov/catdir/pcc/naco/normrule.html

    # Remove diacritical marks and convert special chars. The call is
    # made in void context so that $data is transliterated in place
    # (see the Text::Unidecode documentation).
    unidecode( $data );

    # Convert special chars to spaces
    $data =~ s/[\Q!(){}<>-;:.?,\/\\@*%=\$^_~\E]/ /g;

    # Delete special chars
    $data =~ s/[\Q'[]|\E]//g;

    # Convert lowercase to uppercase or vice-versa.
    if ( $self ) {
        # Only a reference can carry a stored case. A plain class name
        # (class-method call) has none, so use the default instead of
        # dereferencing a string, which is fatal under strict refs.
        my $case = ref $self ? $self->case : 'upper';

        if ( defined $case && $case eq 'lower' ) {
            $data =~ tr/A-Z/a-z/;
        }
        else {
            $data =~ tr/a-z/A-Z/;
        }
    }

    # Remove leading and trailing spaces
    $data =~ s/^\s+|\s+$//g;

    # Condense multiple spaces
    $data =~ s/\s+/ /g;

    return $data;
}
=head1 SEE ALSO
=over 4
=item * http://www.loc.gov/catdir/pcc/naco/normrule.html
=back
=head1 AUTHOR
Brian Cassidy E<lt>bricas@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright 2007 by Brian Cassidy
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
1;