Add "similar"; add examples; version to 0.05

benkasminbullock · Apr 29, 2021 · 4a97ea6 · 4a97ea6
1 parent 4e58610
commit 4a97ea6
Show file tree

Hide file tree

Showing 10 changed files with 194 additions and 16 deletions.
diff --git a/Changes b/Changes
@@ -1,5 +1,10 @@
 Revision history for Perl module Unicode::Confusables
 
+0.05 2021-04-29
+
+- Add "similar" which returns a list of similar characters
+- Add example program using similar to obfuscate text
+
 0.04 2021-04-29
 
 - Document the version of confusables.txt used

diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP
@@ -1,5 +1,5 @@
 # Exclude the author-only build scripts
-(build|make-pod|clean|make-confusables)\.pl
+(build|make-pod|clean|make-confusables|versionup)\.pl
 # Exclude the built module
 blib/.*
 # Exclude all git files

diff --git a/examples/obfuscate.pl b/examples/obfuscate.pl
@@ -0,0 +1,36 @@
+#!/home/ben/software/install/bin/perl
+use warnings;
+use strict;
+use utf8;
+use Unicode::Confuse ':all';
+
+binmode STDOUT, ":encoding(utf8)";
+
+sub obfuscate
+{
+    for (@_) {
+	my @letters = split '', $_;
+	my $out = '';
+	my $ok;
+	for my $letter (@letters) {
+	    my @similar = similar ($letter);
+	    if (@similar) {
+		$ok = 1;
+		my $n = scalar (@similar);
+		my $r = int (rand ($n));
+		$out .= $similar[$r];
+	    }
+	    else {
+		$out .= $letter;
+	    }
+	}
+	if (! $ok) {
+	    print "No confusables in '$_'.\n";
+	}
+	else {
+	    print "$_ -> $out\n";
+	}
+    }
+}
+
+obfuscate ('paypal', '月火水木金土日');
diff --git a/examples/synopsis.pl b/examples/synopsis.pl
@@ -0,0 +1,13 @@
+#!/home/ben/software/install/bin/perl
+use warnings;
+use strict;
+use utf8;
+use Unicode::Confuse ':all';
+binmode STDOUT, ":encoding(utf8)";
+if (confusable ('ρ')) {
+    my $canonical = canonical ('ρ');
+    print "'ρ' is confusable with $canonical.\n";
+    my @similar = similar ($canonical);
+    print "$canonical is also confusable with @similar.\n";
+}
+
diff --git a/lib/Unicode/Confuse.pm b/lib/Unicode/Confuse.pm
@@ -5,11 +5,15 @@ use Carp;
 use utf8;
 require Exporter;
 our @ISA = qw(Exporter);
-our @EXPORT_OK = qw/confusable canonical/;
+our @EXPORT_OK = qw/
+    canonical
+    confusable
+    similar
+/;
 our %EXPORT_TAGS = (
     all => \@EXPORT_OK,
 );
-our $VERSION = '0.04';
+our $VERSION = '0.05';
 use Unicode::Confuse::Regex;
 
 my $re = $Unicode::Confuse::Regex::re;
@@ -29,9 +33,31 @@ sub confusable
 sub canonical
 {
     my ($c) = @_;
+    my $r;
     if ($c =~ $re) {
-	return $data->{confusables}{$c};
+	$r = $data->{confusables}{$c};
+	if (! defined $r) {
+	    # $r is already the canonical form
+	    $r = $c;
+	}
     }
+    return $r;
+}
+
+sub similar
+{
+    my ($c) = @_;
+    my $d = canonical ($c);
+    if (! $d) {
+	return ();
+    }
+    my @similar;
+    # The reverse data does not include the canonical form in its
+    # list.
+    push @similar, $d;
+    my $r = $data->{reverse}{$d};
+    push @similar, @$r;
+    return @similar;
 }
 
 1;
diff --git a/lib/Unicode/Confuse.pod.tmpl b/lib/Unicode/Confuse.pod.tmpl
@@ -21,7 +21,7 @@ This method was added in version [% version %] of the module.
 
 =head1 SYNOPSIS
 
-    use [% info.colon %];
+[% example("synopsis") %]
 
 =head1 VERSION
 
@@ -31,7 +31,7 @@ corresponding to L<git commit [% commit.commit %]|[% info.repo
 
 This Perl module incorporates [% metadata.title %] version [%
 metadata.version %] dated [% metadata.date %], copyright [%
-metadata.copyright %]. [% metadata.terms %]
+metadata.copyright %]. [% metadata.terms %].
 
 =head1 DESCRIPTION
 
@@ -45,27 +45,49 @@ different Unicode code points.
 
    my $canonical = canonical ($c);
 
-If C<$c> is a confusable, give the canonical form of C<$c>. If not,
-returns the undefined value.
+If C<$c> is a confusable, give the canonical form of C<$c>. If $c is
+already the canonical form of itself, return $c. If C<$c> is not a
+confusable, this returns the undefined value. "Canonical" here just
+means the character which is used as a representative of the group of
+confusables in the L</Unicode data files>.
 
 =head2 confusable
 
     if (confusable ($c)) {
         # do something.
     }
 
-Is C<$c> confusable, yes or no? This matches C<$c> against the large
-regex in L<Unicode::Confuse::Regex>.
+This returns a true or false value depending on whether C<$c> is a
+confusable. This matches C<$c> against a large regex in
+L<Unicode::Confuse::Regex>.
+
+=head2 similar
+
+    my @similar = similar ('p');
+
+Return a list of confusables which are similar to the given input. If
+the input is not a confusable, an empty list is returned.
+
+The first character in C<@similar> is the canonical form, and the
+remaining characters are the other confusables associated with that
+canonical form. These remaining characters, if more than one, are
+sorted by code point.
+
+=head3 Example: obfuscate text
+
+This example obfuscates strings by substituting confusable letters
+with substitutes picked at random from the confusable data for the
+letters.
+
+[% example("obfuscate") %]
 
 =head1 DEPENDENCIES
 
 =over
 
 =item L<File::Slurper>
 
-This is used by the parsing module L<Unicode::Confuse::Parse>. You
-don't actually need to use the parsing module to use this module with
-the built-in version of the data.
+This is used by the parsing module L<Unicode::Confuse::Parse>.
 
 =item L<JSON::Parse>
 
@@ -134,6 +156,18 @@ The following links point to the latest data files:
 
 =back
 
+=item L<Unicode Utilities: Confusables|https://util.unicode.org/UnicodeJsps/confusables.jsp>
+
+The utility says
+
+=over
+
+With this demo, you can supply an Input string and see the
+combinations that are confusable with it, using data collected by the
+Unicode consortium.
+
+=back
+
 =back
 
 [% INCLUDE "author" %]
diff --git a/lib/Unicode/Confuse/Parse.pm b/lib/Unicode/Confuse/Parse.pm
@@ -12,7 +12,7 @@ use utf8;
 use Carp;
 use File::Slurper 'read_text';
 
-our $VERSION = '0.04';
+our $VERSION = '0.05';
 
 # A code point
 my $cp = qr!([0-9A-F]{4,5})\s+!;
@@ -100,6 +100,7 @@ sub metadata
 	    next;
 	}
 	if ($data =~ /terms of use/i) {
+	    $data =~ s!(https?://.*\.html)!L<$1>!;
 	    $md{terms} = $data;
 	    next;
 	}

diff --git a/make-pod.pl b/make-pod.pl
@@ -8,6 +8,8 @@
 use Deploy qw/do_system older/;
 use Getopt::Long;
 use JSON::Parse 'read_json';
+use ExtUtils::ParseXS::Utilities 'trim_whitespace';
+use File::Slurper 'write_text';
 
 my $ok = GetOptions (
     'force' => \my $force,
@@ -61,13 +63,17 @@
     my $output = $example;
     $output =~ s/\.pl$/-out.txt/;
     if (older ($output, $example) || $force) {
-	do_system ("perl -I$Bin/blib/lib -I$Bin/blib/arch $example > $output 2>&1", $verbose);
+	do_system ("perl -I$Bin/lib $example > $output 2>&1", $verbose);
     }
 }
 
 chmod 0644, $output;
-$tt->process ($input, \%vars, $output, binmode => 'utf8')
+$tt->process ($input, \%vars, \my $outtext, binmode => 'utf8')
     or die '' . $tt->error ();
+trim_whitespace ($outtext);
+$outtext .= "\n";
+$outtext =~ s!\n\n+!\n\n!g;
+write_text ($output, $outtext);
 chmod 0444, $output;
 
 exit;

diff --git a/t/unicode-confuse.t b/t/unicode-confuse.t
@@ -20,6 +20,10 @@ is (canonical ('Æ'), 'AE', "Got canonical form for Æ");
 ok (confusable ('㈝'), "㈝ is confusable");
 is (canonical ('㈝'), '(오전)', "Got canonical form for ㈝");
 
+ok (confusable ('−'), "Minus sign is confusable");
+is (canonical ('–'), '-', "Got canonical form for −");
+ok (confusable ('~'), "Tildes");
+is (canonical ('∼'), '~', "Tildes");
 done_testing ();
 
 # Local variables:

diff --git a/versionup.pl b/versionup.pl
@@ -0,0 +1,53 @@
+#!/home/ben/software/install/bin/perl
+
+# The CPAN perl-reversion script seems to be making a muddle of things
+# sometimes, and it doesn't edit Changes, so I've made my own script.
+
+use warnings;
+use strict;
+use utf8;
+use FindBin '$Bin';
+use File::Slurper qw!read_text write_text!;
+use Deploy 'make_date';
+
+my $newversion = '0.05';
+my $version = '0.04';
+
+my @pmfiles = qw!
+    lib/Unicode/Confuse.pm
+    lib/Unicode/Confuse/Parse.pm
+!;
+
+for my $file (@pmfiles) {
+    my $bfile = "$Bin/$file";
+    my $text = read_text ($bfile);
+    if ($text =~ s/\Q$version\E\b/$newversion/g) {
+	print "$file OK\n";
+	write_text ($bfile, $text);
+    }
+    elsif ($text =~ /\Q$newversion/) {
+	warn "$file already at $newversion";
+    }
+    else {
+	warn "$file failed";
+    }
+}
+
+my $date = make_date ('-');
+my $changes = "$Bin/Changes";
+my $text = read_text ($changes);
+my $latest = <<EOF;
+$newversion $date
+
+-
+
+EOF
+
+if ($text =~ s/(\Q$version\E)/$latest$1/g) {
+    write_text ($changes, $text);
+}
+else {
+    warn "$changes failed";
+}
+
+