Permalink
Browse files

documentation

  • Loading branch information...
1 parent 9323d4d commit 89eb07c3eec05f1bc2faf0cd998209b0c40900ec @andrefs committed Jun 3, 2012
Showing with 90 additions and 51 deletions.
  1. +80 −51 lib/Lingua/EN/Tokenizer/Offsets.pm
  2. +6 −0 t/01-medium_text.t
  3. +4 −0 weaver.ini
@@ -19,7 +19,7 @@ our @EXPORT_OK = qw/
-=method get_offsets
+=method get_offsets($text)
Takes text input and returns a reference to an array containing pairs of character
offsets, corresponding to the tokens' start and end positions.
@@ -35,7 +35,7 @@ sub token_offsets {
}
-=method get_tokens
+=method get_tokens($text)
Takes text input and splits it into tokens.
@@ -50,7 +50,7 @@ sub get_tokens {
-=method adjust_offsets
+=method adjust_offsets($text,$offsets)
Minor adjustments to offsets (leading/trailing whitespace, etc.)
@@ -81,6 +81,12 @@ sub adjust_offsets {
return $new_offsets;
}
+=head2 initial_offsets($text)
+
+First naive delimitation of tokens.
+
+=cut
+
sub initial_offsets {
my ($text) = @_;
my $end;
@@ -89,60 +95,53 @@ sub initial_offsets {
# token patterns
my @patterns = (
- qr{([^\p{IsAlnum}\s\.\'\`\,\-])},
+ qr{([^\p{IsAlnum}\s\.\'\`\,\-])},
qr{(?<!\p{IsN})(,)(?!\d)},
qr{(?<=\p{IsN})(,)(?!\d)},
qr{(?<!\p{IsN})(,)(?=\d)},
- qr{(?<!\p{isAlpha})(['`])(?!\p{isAlpha})},
- qr{(?<!\p{isAlpha})(['`])(?=\p{isAlpha})},
- qr{(?<=\p{isAlpha})(['`])(?!\p{isAlpha})},
+ qr{(?<!\p{isAlpha})(['`’])(?!\p{isAlpha})},
+ qr{(?<!\p{isAlpha})(['`’])(?=\p{isAlpha})},
+ qr{(?<=\p{isAlpha})(['`’])(?!\p{isAlpha})},
+ qr{(?<=\p{isAlpha})()['`’](?=\p{isAlpha})},
qr{(?:^|\s)(\S+)(?:$|\s)},
qr{(?:^|[^\.])(\.\.+)(?:$|[^\.])},
qr{(?<=\p{isAlpha})['`]()(?=\p{isAlpha})},
);
- my $split = 1;
- while ($split){
- $split = 0;
- for my $pat (@patterns){
- my $size = @$offsets;
- for(my $i=0; $i<$size; $i++){
- my $start = $offsets->[$i][0];
- my $length = $offsets->[$i][1]-$start;
- my $s = substr($text,$start,$length);
-
- my $split_points = [];
-
- if($s =~ /^$pat(?!$)/g){
- my $first = $-[1];
- push @$split_points,[$start+$first,$start+$first];
- my $second = $+[1];
- push @$split_points,[$start+$second,$start+$second] if $first != $second;
- $split = 1;
- }
- while($s =~ /(?<!^)$pat(?!$)/g){
- my $first = $-[1];
- push @$split_points,[$start+$first,$start+$first];
- my $second = $+[1];
- push @$split_points,[$start+$second,$start+$second] if $first != $second;
- $split = 1;
- }
- if($s =~ /(?<!^)$pat$/g){
- my $first = $-[1];
- push @$split_points,[$start+$first,$start+$first];
- my $second = $+[1];
- push @$split_points,[$start+$second,$start+$second] if $first != $second;
- $split = 1;
- }
-
- _split_tokens($offsets,$i,[ sort { $a->[0] <=> $b->[0] } @$split_points ]) if @$split_points;
+ for my $pat (@patterns){
+ my $size = @$offsets;
+ for(my $i=0; $i<$size; $i++){
+ my $start = $offsets->[$i][0];
+ my $length = $offsets->[$i][1]-$start;
+ my $s = substr($text,$start,$length);
+
+ my $split_points = [];
+
+ if($s =~ /^$pat(?!$)/g){
+ my $first = $-[1];
+ push @$split_points,[$start+$first,$start+$first];
+ my $second = $+[1];
+ push @$split_points,[$start+$second,$start+$second] if $first != $second;
+ }
+ while($s =~ /(?<!^)$pat(?!$)/g){
+ my $first = $-[1];
+ push @$split_points,[$start+$first,$start+$first];
+ my $second = $+[1];
+ push @$split_points,[$start+$second,$start+$second] if $first != $second;
+ }
+ if($s =~ /(?<!^)$pat$/g){
+ my $first = $-[1];
+ push @$split_points,[$start+$first,$start+$first];
+ my $second = $+[1];
+ push @$split_points,[$start+$second,$start+$second] if $first != $second;
}
+
+ _split_tokens($offsets,$i,[ sort { $a->[0] <=> $b->[0] } @$split_points ]) if @$split_points;
}
}
return _nonbp($text,$offsets);
-#return $offsets;
}
sub _split_tokens {
@@ -158,7 +157,7 @@ sub _split_tokens {
}
-=method offsets2tokens
+=method offsets2tokens($text,$offsets)
Given a list of token boundary offsets and a text, returns an array with the text split into tokens.
@@ -192,12 +191,6 @@ sub _load_prefixes {
close($prefix);
}
-
-
-=method _nonbp
-
-=cut
-
sub _nonbp {
my ($text,$offsets) = @_;
my $nonbpref = {};
@@ -234,6 +227,42 @@ sub _nonbp {
return [ sort { $a->[0] <=> $b->[0] } (@$new_offsets,@$extra) ];
}
+1;
+__END__
+
+
+=head1 SYNOPSIS
+
+
+ use Lingua::EN::Tokenizer::Offsets qw/token_offsets get_tokens/;
+
+ my $str = <<END;
+ Hey! Mr. Tambourine Man, play a song for me.
+ I'm not sleepy and there is no place I’m going to.
+ END
+
+ my $offsets = token_offsets($str); ## Get the offsets.
+ foreach my $o (@$offsets) {
+ my $start = $o->[0];
+ my $length = $o->[1]-$o->[0];
+
+ my $token = substr($str,$start,$length); ## Get a token.
+ # ...
+ }
+
+ ### or
+
+ my $tokens = get_tokens($str);
+ foreach my $token (@$tokens) {
+ ## do something with $token
+ }
+
+=head1 ACKNOWLEDGEMENTS
+
+Based on the original tokenizer written by Josh Schroeder and provided by Europarl L<http://www.statmt.org/europarl/>.
+
+=head1 SEE ALSO
+
+L<Lingua::EN::Sentence::Offsets>, L<Lingua::FreeLing3::Tokenizer>
-1;
View
@@ -17,6 +17,7 @@ eq_or_diff "$got\n", $expected, "testing strings";
sub load_strings {
my $original = <<'END'
+I'm testing'this.
Real-time PCR assays using TaqMan or Molecular
Beacon probes were developed and optimized for the
quantification of total bacteria, the nitrite-oxidizing bacteria
@@ -64,6 +65,11 @@ END
;
my $expected = <<'END'
+I
+'m
+testing
+'this
+.
Real-time
PCR
assays
View
@@ -2,3 +2,7 @@
[-Transformer]
transformer = List
+
+[-Encoding]
+encoding = utf-8
+

0 comments on commit 89eb07c

Please sign in to comment.