Skip to content

Commit

Permalink
Add "similar"; add examples; version to 0.05
Browse files Browse the repository at this point in the history
  • Loading branch information
benkasminbullock committed Apr 29, 2021
1 parent 4e58610 commit 4a97ea6
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 16 deletions.
5 changes: 5 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
Revision history for Perl module Unicode::Confusables

0.05 2021-04-29

- Add "similar" which returns a list of similar characters
- Add example program using similar to obfuscate text

0.04 2021-04-29

- Document the version of confusables.txt used
Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.SKIP
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Exclude the author-only build scripts
(build|make-pod|clean|make-confusables)\.pl
(build|make-pod|clean|make-confusables|versionup)\.pl
# Exclude the built module
blib/.*
# Exclude all git files
Expand Down
36 changes: 36 additions & 0 deletions examples/obfuscate.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/home/ben/software/install/bin/perl
use warnings;
use strict;
use utf8;
use Unicode::Confuse ':all';

binmode STDOUT, ":encoding(utf8)";

sub obfuscate
{
for (@_) {
my @letters = split '', $_;
my $out = '';
my $ok;
for my $letter (@letters) {
my @similar = similar ($letter);
if (@similar) {
$ok = 1;
my $n = scalar (@similar);
my $r = int (rand ($n));
$out .= $similar[$r];
}
else {
$out .= $letter;
}
}
if (! $ok) {
print "No confusables in '$_'.\n";
}
else {
print "$_ -> $out\n";
}
}
}

obfuscate ('paypal', '月火水木金土日');
13 changes: 13 additions & 0 deletions examples/synopsis.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/home/ben/software/install/bin/perl
use warnings;
use strict;
use utf8;
use Unicode::Confuse ':all';
binmode STDOUT, ":encoding(utf8)";
if (confusable ('ρ')) {
my $canonical = canonical ('ρ');
print "'ρ' is confusable with $canonical.\n";
my @similar = similar ($canonical);
print "$canonical is also confusable with @similar.\n";
}

32 changes: 29 additions & 3 deletions lib/Unicode/Confuse.pm
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@ use Carp;
use utf8;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw/confusable canonical/;
our @EXPORT_OK = qw/
canonical
confusable
similar
/;
our %EXPORT_TAGS = (
all => \@EXPORT_OK,
);
our $VERSION = '0.04';
our $VERSION = '0.05';
use Unicode::Confuse::Regex;

my $re = $Unicode::Confuse::Regex::re;
Expand All @@ -29,9 +33,31 @@ sub confusable
sub canonical
{
my ($c) = @_;
my $r;
if ($c =~ $re) {
return $data->{confusables}{$c};
$r = $data->{confusables}{$c};
if (! defined $r) {
# $r is already the canonical form
$r = $c;
}
}
return $r;
}

sub similar
{
my ($c) = @_;
my $d = canonical ($c);
if (! $d) {
return ();
}
my @similar;
# The reverse data does not include the canonical form in its
# list.
push @similar, $d;
my $r = $data->{reverse}{$d};
push @similar, @$r;
return @similar;
}

1;
52 changes: 43 additions & 9 deletions lib/Unicode/Confuse.pod.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ This method was added in version [% version %] of the module.

=head1 SYNOPSIS

use [% info.colon %];
[% example("synopsis") %]

=head1 VERSION

Expand All @@ -31,7 +31,7 @@ corresponding to L<git commit [% commit.commit %]|[% info.repo

This Perl module incorporates [% metadata.title %] version [%
metadata.version %] dated [% metadata.date %], copyright [%
metadata.copyright %]. [% metadata.terms %]
metadata.copyright %]. [% metadata.terms %].

=head1 DESCRIPTION

Expand All @@ -45,27 +45,49 @@ different Unicode code points.

my $canonical = canonical ($c);

If C<$c> is a confusable, give the canonical form of C<$c>. If not,
returns the undefined value.
If C<$c> is a confusable, give the canonical form of C<$c>. If $c is
already the canonical form of itself, return $c. If C<$c> is not a
confusable, this returns the undefined value. "Canonical" here just
means the character which is used as a representative of the group of
confusables in the L</Unicode data files>.

=head2 confusable

if (confusable ($c)) {
# do something.
}

Is C<$c> confusable, yes or no? This matches C<$c> against the large
regex in L<Unicode::Confuse::Regex>.
This returns a true or false value depending on whether C<$c> is a
confusable. This matches C<$c> against a large regex in
L<Unicode::Confuse::Regex>.

=head2 similar

my @similar = similar ('p');

Return a list of confusables which are similar to the given input. If
the input is not a confusable, an empty list is returned.

The first character in C<@similar> is the canonical form, and the
remaining characters are the other confusables associated with that
canonical form. These remaining characters, if more than one, are
sorted by code point.

=head3 Example: obfuscate text

This example obfuscates strings by substituting confusable letters
with substitutes picked at random from the confusable data for the
letters.

[% example("obfuscate") %]

=head1 DEPENDENCIES

=over

=item L<File::Slurper>

This is used by the parsing module L<Unicode::Confuse::Parse>. You
don't actually need to use the parsing module to use this module with
the built-in version of the data.
This is used by the parsing module L<Unicode::Confuse::Parse>.

=item L<JSON::Parse>

Expand Down Expand Up @@ -134,6 +156,18 @@ The following links point to the latest data files:

=back

=item L<Unicode Utilities: Confusables|https://util.unicode.org/UnicodeJsps/confusables.jsp>

The utility says

=over

With this demo, you can supply an Input string and see the
combinations that are confusable with it, using data collected by the
Unicode consortium.

=back

=back

[% INCLUDE "author" %]
3 changes: 2 additions & 1 deletion lib/Unicode/Confuse/Parse.pm
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use utf8;
use Carp;
use File::Slurper 'read_text';

our $VERSION = '0.04';
our $VERSION = '0.05';

# A code point
my $cp = qr!([0-9A-F]{4,5})\s+!;
Expand Down Expand Up @@ -100,6 +100,7 @@ sub metadata
next;
}
if ($data =~ /terms of use/i) {
$data =~ s!(https?://.*\.html)!L<$1>!;
$md{terms} = $data;
next;
}
Expand Down
10 changes: 8 additions & 2 deletions make-pod.pl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
use Deploy qw/do_system older/;
use Getopt::Long;
use JSON::Parse 'read_json';
use ExtUtils::ParseXS::Utilities 'trim_whitespace';
use File::Slurper 'write_text';

my $ok = GetOptions (
'force' => \my $force,
Expand Down Expand Up @@ -61,13 +63,17 @@
my $output = $example;
$output =~ s/\.pl$/-out.txt/;
if (older ($output, $example) || $force) {
do_system ("perl -I$Bin/blib/lib -I$Bin/blib/arch $example > $output 2>&1", $verbose);
do_system ("perl -I$Bin/lib $example > $output 2>&1", $verbose);
}
}

chmod 0644, $output;
$tt->process ($input, \%vars, $output, binmode => 'utf8')
$tt->process ($input, \%vars, \my $outtext, binmode => 'utf8')
or die '' . $tt->error ();
trim_whitespace ($outtext);
$outtext .= "\n";
$outtext =~ s!\n\n+!\n\n!g;
write_text ($output, $outtext);
chmod 0444, $output;

exit;
Expand Down
4 changes: 4 additions & 0 deletions t/unicode-confuse.t
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ is (canonical ('Æ'), 'AE', "Got canonical form for Æ");
ok (confusable (''), "㈝ is confusable");
is (canonical (''), '(오전)', "Got canonical form for ㈝");

ok (confusable (''), "Minus sign is confusable");
is (canonical (''), '-', "Got canonical form for −");
ok (confusable ('~'), "Tildes");
is (canonical (''), '~', "Tildes");
done_testing ();

# Local variables:
Expand Down
53 changes: 53 additions & 0 deletions versionup.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/home/ben/software/install/bin/perl

# The CPAN perl-reversion script seems to be making a muddle of things
# sometimes, and it doesn't edit Changes, so I've made my own script.

use warnings;
use strict;
use utf8;
use FindBin '$Bin';
use File::Slurper qw!read_text write_text!;
use Deploy 'make_date';

my $newversion = '0.05';
my $version = '0.04';

my @pmfiles = qw!
lib/Unicode/Confuse.pm
lib/Unicode/Confuse/Parse.pm
!;

for my $file (@pmfiles) {
my $bfile = "$Bin/$file";
my $text = read_text ($bfile);
if ($text =~ s/\Q$version\E\b/$newversion/g) {
print "$file OK\n";
write_text ($bfile, $text);
}
elsif ($text =~ /\Q$newversion/) {
warn "$file already at $newversion";
}
else {
warn "$file failed";
}
}

my $date = make_date ('-');
my $changes = "$Bin/Changes";
my $text = read_text ($changes);
my $latest = <<EOF;
$newversion $date
-
EOF

if ($text =~ s/(\Q$version\E)/$latest$1/g) {
write_text ($changes, $text);
}
else {
warn "$changes failed";
}


0 comments on commit 4a97ea6

Please sign in to comment.