Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
benkasminbullock committed Jun 10, 2011
0 parents commit 9fa7f33
Show file tree
Hide file tree
Showing 3 changed files with 329 additions and 0 deletions.
8 changes: 8 additions & 0 deletions Makefile.PL
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build script for the Data::Kanji::Kanjidic distribution: hands the
# distribution name and the location of its version number to
# ExtUtils::MakeMaker, which writes the Makefile.
use warnings;
use strict;
use ExtUtils::MakeMaker;

# VERSION_FROM names the file from which MakeMaker extracts $VERSION.
my @makefile_args = (
    VERSION_FROM => 'lib/Data/Kanji/Kanjidic.pm',
    NAME         => 'Data::Kanji::Kanjidic',
);

WriteMakefile (@makefile_args);
312 changes: 312 additions & 0 deletions lib/Data/Kanji/Kanjidic.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,312 @@
=head1 Data::Kanji::Kanjidic

Data::Kanji::Kanjidic

=cut

package Data::Kanji::Kanjidic;

# Strictures come first so that they apply to every statement of the
# package, including the Exporter set-up below.
use strict;
use warnings;

# This file contains literal kana characters, so it has to be read as
# UTF-8 source.
use utf8;
use Encode;
#use encoding "euc-jp";

# Nothing is exported by default; each function must be requested by
# name.
require Exporter;
our @ISA = ('Exporter');
our @EXPORT_OK = qw(
    parse_kanjidic
    parse_entry
    kanji_dictionary_order
    grade_stroke_order
);

our $VERSION = 0.01;

# This is a list of the letter codes used in kanjidic, plus more
# meaningful long names which will be used as the names of columns in
# a database.
#
# The keys are the upper-case prefixes which start a kanjidic field
# (for example "S" in "S12"); the values are the long names.
#
# NOTE(review): several long names are shared by more than one code
# (E/DH => HENSHALL, H/DK => HALPERN, O/DO => ONEILL, and
# XJ/XO/XH/XI => CROSSREF, while X uses CROSS_REF). If the long names
# really are used as database column names these will collide --
# confirm with the code which consumes this table before renaming.

our %codes =
(
    'W'   => 'KOREAN',
    'Y'   => 'PINYIN',

#   '{'   => 'ENGLISH_START',
#   '}'   => 'ENGLISH_END',

    # Codes for kanji classification schemes.

    'B'   => 'BUSHU',
    'C'   => 'CLASSIC_RADICAL',
    'U'   => 'UNICODE',

    # The school grade at which the kanji is learnt. These need to be
    # checked.

    'G'   => 'GRADE',
    'Q'   => 'FOUR_CORNER',
    'S'   => 'STROKE_COUNT',
    'P'   => 'SKIP',

    # Japanese proficiency test level

    'J'   => 'JPROF',

    # Codes for various books.

    'N'   => 'NELSON',
    'V'   => 'NEW_NELSON',
    'L'   => 'HEISIG',

    # The numbers used in P.G. O'Neill's "Japanese Names".

    'O'   => 'ONEILL',
    'K'   => 'GAKKEN',
    'E'   => 'HENSHALL',
    'I'   => 'SPAHN_HADAMITZKY',
    'IN'  => 'SH_KANJI_KANA',

#   'M'   => 'MOROHASHI',

    'MP'  => 'MOROHASHI_PAGE',
    'MN'  => 'MOROHASHI_INDEX',
    'H'   => 'HALPERN',
    'F'   => 'FREQUENCY',

    'X'   => 'CROSS_REF',

    # D-type book-specific numbers:

    # The index numbers used in "Japanese For Busy People" vols I-III,
    # published by the AJLT. The codes are the volume.chapter.

    'DB'  => 'BUSY_PEOPLE',

    # The index numbers used in "The Kanji Way to Japanese Language
    # Power" by Dale Crowley.

    'DC'  => 'KANJI_WAY',

    # "Japanese Kanji Flashcards", by Max Hodges and Tomoko Okazaki
    # (White Rabbit Press).

    'DF'  => 'RABBIT',

    # The index numbers used in the "Kodansha Compact Kanji Guide".

    'DG'  => 'KODANSHA',

    # The index numbers used in the 3rd edition of "A Guide To Reading
    # and Writing Japanese" edited by Ken Henshall et al.

    'DH'  => 'HENSHALL',

    # The index numbers used in the "Kanji in Context" by Nishiguchi
    # and Kono.

    'DJ'  => 'KANJIINCONTEXT',

    # The index numbers used by Jack Halpern in his Kanji Learners
    # Dictionary, published by Kodansha in 1999. The numbers have been
    # provided by Mr Halpern.

    'DK'  => 'HALPERN',

    # French "Remembering the kanji"

    'DM'  => 'FRENCHHEISIG',

    # The index numbers used in P.G. O'Neill's Essential Kanji. The
    # numbers have been provided by Glenn Rosenthal.

    'DO'  => 'ONEILL',

    # These are the codes developed by Father Joseph De Roo, and
    # published in his book "2001 Kanji" (Bonjinsha). Fr De Roo has
    # given his permission for these codes to be included.

    'DR'  => 'DEROO',

    # The index numbers used in the early editions of "A Guide To
    # Reading and Writing Japanese" edited by Florence Sakade.

    'DS'  => 'SAKADE',

    # The index numbers used in the Tuttle Kanji Cards, compiled by
    # Alexander Kask.

    'DT'  => 'KASK',

    # Cross references:

    'XJ'  => 'CROSSREF',
    'XO'  => 'CROSSREF',
    'XH'  => 'CROSSREF',
    'XI'  => 'CROSSREF',
    'XN'  => 'NELSONCROSSREF',
    'XDR' => 'DEROOCROSSREF',
    'T'   => 'SPECIAL',

    # To-do: give these more meaningful names.
    #
    # Each Z code gets the lower-case suffix of its own key; "ZBP" used
    # to repeat the "rp" suffix of "ZRP", so that two different codes
    # had the same long name.

    'ZPP' => 'MISCLASSIFICATIONpp',
    'ZRP' => 'MISCLASSIFICATIONrp',
    'ZSP' => 'MISCLASSIFICATIONsp',
    'ZBP' => 'MISCLASSIFICATIONbp',
);

# Parse one string from kanjidic and return it in an associative array.
#
# Argument:
#
#   $input - one line of kanjidic.
#
# Returns a flat key/value list, to be assigned to a hash, with these
# keys:
#
#   kanji       - the first whitespace-separated token of the line
#   jiscode     - the second token of the line
#   <code>      - for every token starting with a letter code found in
#                 %codes (e.g. "S", "G", "DR"), the text after the
#                 code. Only the first value of a repeated code is
#                 kept, except that a second "S" is stored as "S2".
#   english     - array reference of the {...} meanings
#   kokuji      - 1 if one of the meanings contains "(kokuji)"
#   counter     - the text after "counter for " in a meaning of that
#                 form
#   onyomi      - array reference of the katakana readings
#   kunyomi     - array reference of the hiragana readings
#   radical     - the "C" value if there is one, otherwise the "B" value
#   line_number - the value of $. when the entry was parsed
#
# Tokens which are neither a known code nor kana cause a "Mystery
# entry" warning. All diagnostics go to STDERR via "warn".

#$| = 1;

sub parse_entry
{
    my ($input) = @_;

    my @english;
    my @onyomi;
    my @kunyomi;
    my %values;

    # Remove the English entries first, before the rest of the line is
    # split on whitespace.

    while ($input =~ s/\{([^\}]+)\}//) {
        my $meaning = $1;

        # Construct a list of "kokuji" (characters made in Japan).

        if ($meaning =~ m/\(kokuji\)/) {
            $values{"kokuji"} = 1;
        }

        # Construct a list of single-kanji counters.

        elsif ($meaning =~ m/^counter for (.*)$/) {
            # Copy the capture straight away so that it cannot be
            # clobbered before it is stored.
            my $counted = $1;
            if ($values{"counter"}) {
                # This is a diagnostic, so it goes to STDERR like the
                # "Mystery entry" message rather than to STDOUT.
                warn "Warning: two counters in $input\n";
            }
            $values{"counter"} = $counted;
        }
        else {
            push (@english, $meaning);
        }
    }

    (my $kanji, $values{"jiscode"}, my @entries) = split (" ", $input);
    $values{kanji} = $kanji;
    foreach my $entry (@entries) {
        my $found;
        if ($entry =~ m/(^[A-Z]+)(.*)/ ) {
            my ($field, $value) = ($1, $2);
            if ($codes{$field}) {
                if (!$values{$field}) {
                    $values{$field} = $value;
                }
                elsif ($field eq "S") {
                    # A second stroke count is kept under its own key.
                    $values{S2} = $value;
                }
                $found = 1;
            }
        }
        else {
            # Kanjidic contains hiragana, katakana, ".", "-" and "ー"
            # (Japanese "chouon") characters. The ranges start at the
            # small "ぁ" and "ァ", which come just before "あ" and "ア",
            # and the katakana range runs up to "ヴ", so that readings
            # containing those characters are not rejected.
            if ($entry =~ m/^([ぁ-ん\.-]+)$/) {
                push (@kunyomi, $entry);
                $found = 1;
            }
            elsif ($entry =~ m/^([ァ-ヴー\.-]+)$/) {
                push (@onyomi, $entry);
                $found = 1;
            }
        }
        if (! $found) {
            warn "$.: Mystery entry \"$entry\"\n";
        }
    }

    $values{"english"} = \@english;
    $values{"onyomi"} = \@onyomi;
    $values{"kunyomi"} = \@kunyomi;

    # Kanjidic uses the bogus radical numbers of Nelson rather than the
    # correct ones.

    $values{radical} = $values{B};
    $values{radical} = $values{C} if $values{C};

    # Just in case there is a problem in kanjidic, this will tell us the
    # line where the problem was:

    $values{"line_number"} = $.;
    return %values;
}

# Order of kanji in a kanji dictionary: compare by radical, then by
# stroke count ("S"), then by the JIS code read as a hexadecimal
# number.
#
# Arguments: a reference to a hash of entries (each a hash reference
# with "radical", "S" and "jiscode" fields) followed by the keys of
# the two entries to compare.
#
# Returns the difference of the first field which differs (negative
# if the first entry sorts first, positive if the second does), or 0
# if all three fields are the same.

sub kanji_dictionary_order
{
    my ($kanjidic_ref, $first_key, $second_key) = @_;
    my $first = $kanjidic_ref->{$first_key};
    my $second = $kanjidic_ref->{$second_key};

    # The two plain numeric fields are tried in order of priority.
    for my $field (qw/radical S/) {
        my $difference = $first->{$field} - $second->{$field};
        return $difference if $difference;
    }

    # Last resort: the JIS codes, which are hexadecimal strings.
    return hex ($first->{jiscode}) - hex ($second->{jiscode});
}

# Comparison function to sort by grade ("G"), then by stroke count
# ("S"), then by JIS code value if those are both the same.
#
# Arguments: a reference to a hash of entries (each a hash reference
# with "S" and "jiscode" fields and optionally "G") followed by the
# keys of the two entries to compare.
#
# An entry with a grade always sorts before an entry without one. The
# return value is negative if the first entry sorts first, positive
# if the second does, and 0 if they compare equal.

sub grade_stroke_order
{
    my ($kanjidic_ref, $first_key, $second_key) = @_;
    my $first = $kanjidic_ref->{$first_key};
    my $second = $kanjidic_ref->{$second_key};

    my $first_has_grade = $first->{G};
    my $second_has_grade = $second->{G};

    if ($first_has_grade && $second_has_grade) {
        my $grade_difference = $first->{G} - $second->{G};
        return $grade_difference if $grade_difference;
    }
    elsif ($first_has_grade) {
        return -1;
    }
    elsif ($second_has_grade) {
        return 1;
    }

    # Same grade, or neither entry has a grade: fall back on the
    # stroke count and then on the JIS code.
    my $stroke_difference = $first->{S} - $second->{S};
    return $stroke_difference if $stroke_difference;

    return hex ($first->{jiscode}) - hex ($second->{jiscode});
}

# Read a kanjidic file and store its entries in a hash.
#
# Arguments:
#
#   $kanjidic_ref - reference to the hash to fill in. Each entry is
#                   stored under its kanji as a reference to the hash
#                   made by parse_entry, with an extra "skip" field
#                   holding an array reference of the "-"-separated
#                   parts of the "P" code (empty if there is no "P"
#                   code).
#   $file_name    - name of the kanjidic file, which is read as EUC-JP.
#
# Lines starting with "#" and lines containing only whitespace are
# skipped. Dies if the file cannot be opened. There is no meaningful
# return value; the results are in %$kanjidic_ref.

sub parse_kanjidic
{
    my ($kanjidic_ref, $file_name) = @_;

    open my $KANJIDIC, "<:encoding(euc-jp)", $file_name
        or die "Could not open '$file_name': $!";

    # Side effect kept from the original: standard output is switched
    # to UTF-8 here.
    binmode STDOUT,"utf8";

    while (my $line = <$KANJIDIC>) {
        next if $line =~ m/^\#/;
        # A blank line has no kanji to file the entry under.
        next unless $line =~ m/\S/;
        my %values = parse_entry ($line);
        # Not every entry need have a "P" code; do not split undef.
        my @skip = defined $values{P} ? split ("-", $values{P}) : ();
        $values{skip} = \@skip;
        $kanjidic_ref->{$values{kanji}} = \%values;
    }
    close $KANJIDIC;
    return;
}

1;

__END__
=head1 NAME

Data::Kanji::Kanjidic - parse the "kanjidic" kanji data file

=head1 SYNOPSIS

=head1 FUNCTIONS

9 changes: 9 additions & 0 deletions t/Data-Kanji-Kanjidic.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Smoke test: the only check is that Data::Kanji::Kanjidic can be
# loaded.
use warnings;
use strict;
# Exactly one test is planned: the use_ok below.
use Test::More tests => 1;
# BEGIN makes the load attempt happen at compile time, ahead of the
# plain "use" on the next line.
BEGIN { use_ok('Data::Kanji::Kanjidic') };
use Data::Kanji::Kanjidic;

# Local variables:
# mode: perl
# End:

0 comments on commit 9fa7f33

Please sign in to comment.