Skip to content

Commit

Permalink
Add .json to repo, delete old unused code, version 0.002
Browse files (browse the repository at this point in the history)
  • Loading branch information
benkasminbullock committed Oct 10, 2017
1 parent f056a76 commit 8035373
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 45 deletions.
4 changes: 4 additions & 0 deletions Changes
@@ -1,3 +1,7 @@
0.002 2017-10-10

- Documentation

0.001 2017-02-02

- Initial version
2 changes: 1 addition & 1 deletion boot/WWWJDIC.pm.tmpl
Expand Up @@ -4,7 +4,7 @@ require Exporter;
@EXPORT_OK = qw/get_mirrors/;
use warnings;
use strict;
our $VERSION='0.001';
our $VERSION='0.002';
# to encode the results returned:
use Encode qw/encode decode/;
# need utf8 characters like 【 and 】 here
Expand Down
71 changes: 31 additions & 40 deletions boot/scrape.pl
Expand Up @@ -3,26 +3,12 @@
use strict;
use LWP::Simple;
use File::Versions 'make_backup';
my $toppage = "http://gengo.com/wwwjdic/cgi-data/wwwjdic?1C";
#http://www.edrdg.org/cgi-bin/wwwjdic/wwwjdic?1C";
if (0) {
my $source = "WWWJDIC.pm";
die "no $source" unless -f $source;
my %scraped_info;
my $docpage = "http://www.edrdg.org/wwwjdic/wwwjdicinf.html";
get_mirrors(\%scraped_info, $toppage);
get_codes(\%scraped_info, $docpage);
my $destination = "$source.backup";
replace_scrapes ($source, $destination, \%scraped_info);
my $backup = "backup/$source";
make_backup ($backup);
rename $source, "backup/$source" or die $!;
rename $destination, $source or die $!;
exit (0);
}
else {
get_mirrors_nice ($toppage);
}
my $verbose;
my $toppage = "http://www.edrdg.org/cgi-bin/wwwjdic/wwwjdic?1C";
#http://gengo.com/wwwjdic/cgi-data/wwwjdic?1C";
get_mirrors_nice ($toppage);
exit;

sub replace_scrapes
{
my ($source, $destination, $scraped_info) = @_;
Expand All @@ -34,26 +20,28 @@ sub replace_scrapes
my $outputting;
while (<$input>) {
if (/#\s*SCRAPE\s*([A-Z]+)/) {
print $output $_;
print $output $_;
$outputting = 1;
my $name = lc $1;
if ($scraped_info->{$name}) {
print $output "my \%$name = (\n";
print $output "my \%$name = (\n";
my $h = $scraped_info->{$name};
for my $key (sort keys %$h) {
my $out = $h->{$key};
$out =~ s/'/\\'/g;
print $output "'$key' => '$out',\n";
}
print $output ");\n";
} else {
print $output "'$key' => '$out',\n";
}
print $output ");\n";
}
else {
die "No scraped information for '$name'";
}
}
if ($outputting) {
if (/\#\s*END\s+SCRAPE/) {
$outputting = 0;
} else {
}
else {
next;
}
}
Expand All @@ -73,9 +61,7 @@ sub get_mirrors
my $mirrors;
my %mirrors;
for (@lines) {
# print;
if (/Dictionary:/ && /<select/i) {
# print "Found options\n";
$options = 1;
}
if ($options) {
Expand All @@ -92,7 +78,9 @@ sub get_mirrors
my $mirror = $1;
my $place = lc $2;
$mirror =~ s/\?1C//;
print "$place => $mirror\n";
if ($verbose) {
print "$place => $mirror\n";
}
$mirrors{$place} = $mirror;
}
}
Expand All @@ -104,16 +92,19 @@ sub get_mirrors
sub get_mirrors_nice
{
my ($url) = @_;
if ($verbose) {
print "Getting $url\n";
}
my $html = get ($url);
my @lines = split /\n/, $html;
my $options;
my %options;
my $mirrors;
my %mirrors;
for (@lines) {
# print;
# print;
if (/Dictionary:/ && /<select/i) {
# print "Found options\n";
# print "Found options\n";
$options = 1;
}
if ($options) {
Expand All @@ -130,7 +121,7 @@ sub get_mirrors_nice
my $mirror = $1;
my $place = lc $2;
$mirror =~ s/\?1C//;
print "'$place' => '$mirror',\n";
# print "'$place' => '$mirror',\n";
$mirrors{$place} = $mirror;
}
}
Expand All @@ -140,6 +131,9 @@ sub get_mirrors_nice
sub get_codes
{
my ($scraped_info, $url) = @_;
if ($verbose) {
print "Getting $url\n";
}
my $html = get ($url);
my @lines = split /\n/, $html;
my $dictionaries = 0;
Expand All @@ -149,15 +143,12 @@ sub get_codes
my $splitcodes = 0;
my $firstline;
for (@lines) {
# print;
if (/Dictionary File Codes/) {
# print "Found dictionaries\n";
$dictionaries = 1;
}
if ($dictionaries) {
if (m!<TD>\s*<B>\s*([A-Z0-9]+)\s*</B>\s*</TD>\s*<TD>(.*)</TD>!i) {
$dictionaries {$1} = $2;
# print "Dictionary code $1 dictionary '$2'\n";
}
$dictionaries = 0 if (/<\/table>/i);
}
Expand All @@ -167,7 +158,6 @@ sub get_codes
if ($codes) {
if (m!<TD><B>\s*(.*?)\s*</B></TD>\s*<TD>\s*(.*?)\s*</TD>!i) {
add_code (\%codes, $1, $2);
# print "Abbreviation: '$1' = '$2'\n";
}
$codes = 0 if (/<\/table>/i);
}
Expand All @@ -179,10 +169,10 @@ sub get_codes
if ($firstline) {
if (m!<TD>\s*(.*?)\s*</TD>!) {
add_code (\%codes, $firstline, $1);
# print "Abbreviation: '$firstline' = '$1'\n";
}
$firstline = 0;
} elsif (m!<TD><B>\s*(.*?)\s*</B></TD>!) {
}
elsif (m!<TD><B>\s*(.*?)\s*</B></TD>!) {
$firstline = $1;
}
}
Expand All @@ -199,7 +189,8 @@ sub add_code
return if ($code eq "-");
if ($codes->{$code}) {
print STDERR "Duplicate code for '$code'\n";
} else {
}
else {
$codes->{$code} = $meaning;
}
}
3 changes: 0 additions & 3 deletions lib/WWW/.gitignore

This file was deleted.

1 change: 1 addition & 0 deletions lib/WWW/WWWJDIC.json
@@ -0,0 +1 @@
{"options":{"M":"Japanese-Dutch","G":"Japanese-German (WaDoku)","2":"Japanese Names (ENAMDICT)","A":"Engineering/Science","O":"Japanese-Italian","I":"Japanese-Russian","B":"Linguistics","R":"Expanded Text-glossing","6":"Finance/Marketing","5":"Legal Terms","K":"Japanese-Hungarian","C":"River & Water Systems","Q":"Combined Jpn-Eng ","3":"Computing/Telecomms","L":"Japanese-Spanish","P":"Untranslated","J":"Japanese-Swedish","9":"Special Text-glossing","D":"Automobile Industry","4":"Life Sciences/Bio-Med","1":"Jpn-Eng General (EDICT)","7":"Buddhism","H":"Japanese-French","8":"Miscellaneous","E":"Japanese Wordnet","N":"Japanese-Slovenian","F":"Work-in-progress File"},"mirrors":{"canada":"http://www.ottix.net/cgi-bin/wwwjdic/wwwjdic","japan":"http://gengo.com/wwwjdic/cgi-data/wwwjdic","australia_monash":"http://nihongo.monash.edu/cgi-bin/wwwjdic","sweden":"http://wwwjdic.se/cgi-bin/wwwjdic.cgi","germany":"http://wwwjdic.biz/cgi-bin/wwwjdic","usa":"http://www.edrdg.org/cgi-bin/wwwjdic/wwwjdic","australia_melb":"http://nlp.cis.unimelb.edu.au/~jwb/cgi-bin/wwwjdic/wwwjdic"}}
2 changes: 1 addition & 1 deletion lib/WWW/WWWJDIC.pm
Expand Up @@ -4,7 +4,7 @@ require Exporter;
@EXPORT_OK = qw/get_mirrors/;
use warnings;
use strict;
our $VERSION = '0.001';
our $VERSION = '0.002';
use Encode qw/encode decode/;
use utf8;
use LWP::UserAgent;
Expand Down

0 comments on commit 8035373

Please sign in to comment.