diff --git a/Changes b/Changes index 1f67d50..63d40f6 100644 --- a/Changes +++ b/Changes @@ -1,5 +1,9 @@ Revision history for Perl module WWW::Wikipedia::LangTitles +0.02 2016-12-25 + +- Documentation + 0.01 2016-12-24 - Initial version diff --git a/examples/synopsis.pl b/examples/synopsis.pl new file mode 100644 index 0000000..cc72fe3 --- /dev/null +++ b/examples/synopsis.pl @@ -0,0 +1,13 @@ +#!/home/ben/software/install/bin/perl +use warnings; +use strict; +use utf8; +use WWW::Wikipedia::LangTitles 'get_wiki_titles'; +binmode STDOUT, ":encoding(utf8)"; +my $title = 'Three-phase electric power'; +my $links = get_wiki_titles ($title); +print "$title is '$links->{de}' in German.\n"; +my $film = '東京物語'; +my $flinks = get_wiki_titles ($film, lang => 'ja'); +print "映画「$film」はイタリア語で'$flinks->{it}'と名付けた。\n"; + diff --git a/lib/WWW/Wikipedia/LangTitles.pm b/lib/WWW/Wikipedia/LangTitles.pm index 57c9463..c518697 100644 --- a/lib/WWW/Wikipedia/LangTitles.pm +++ b/lib/WWW/Wikipedia/LangTitles.pm @@ -4,29 +4,47 @@ use strict; use Carp; require Exporter; our @ISA = qw(Exporter); -our @EXPORT_OK = qw/get_wiki_titles/; +our @EXPORT_OK = qw/get_wiki_titles make_wiki_url/; our %EXPORT_TAGS = ( all => \@EXPORT_OK, ); -our $VERSION = '0.01'; +our $VERSION = '0.02'; use LWP::UserAgent; -use URI::Escape 'uri_escape'; +use URI::Escape 'uri_escape_utf8'; use JSON::Parse 'parse_json'; +sub make_wiki_url +{ + my ($name, $lang) = @_; + if (! $lang) { + # Defaults to English. + $lang = 'en'; + } + # Have to say "enwiki" or "jawiki" in the URL, since it can be + # "enquote" or something. + if ($lang !~ /wiki$/) { + $lang .= 'wiki'; + } + my $safe_name = $name; + $safe_name = uri_escape_utf8 ($safe_name); + # The URL to get the information from. + my $url = "https://www.wikidata.org/w/api.php?action=wbgetentities&sites=$lang&titles=$safe_name&props=sitelinks/urls|datatype&format=json"; + return $url; +} + sub get_wiki_titles { - # The name of the article to fetch. my ($name, %options) = @_; + my $lang = $options{lang}; my $verbose = $options{verbose}; - my $safe_name = $name; - $safe_name = uri_escape ($safe_name); - # The URL to get the information from. - my $url = "https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=$safe_name&props=sitelinks/urls|datatype&format=json"; + my $url = make_wiki_url ($name, $lang); if ($verbose) { - print "Getting $safe_name from '$url'.\n"; + print "Getting $name from '$url'.\n"; } my $ua = LWP::UserAgent->new (); - my $agent = "Secret Agent Man"; + # Tell the server from what software this request originates, in + # case this module turns out to be problematic for them somehow. + my $agent = __PACKAGE__; $ua = LWP::UserAgent->new (agent => $agent); $ua->default_header ( 'Accept-Encoding' => scalar HTTP::Message::decodable() @@ -36,6 +54,9 @@ sub get_wiki_titles carp "Get $url failed: " . $response->status_line (); return; } + if ($verbose) { + print "$name data was retrieved successfully.\n"; + } my $json = $response->decoded_content (); my $data = parse_json ($json); my $array = $data->{entities}; @@ -44,6 +65,9 @@ sub get_wiki_titles my $sitelinks = $array->{$k}->{sitelinks}; for my $k (keys %$sitelinks) { my $lang = $k; + # Reject these? This is a legacy of the script that this + # used to be, it might be more useful for the CPAN module + # not to reject these. if ($lang =~ /wikiversity|simple|commons|wikiquote|wikibooks/) { next; } @@ -51,6 +75,9 @@ sub get_wiki_titles $lang2title{$lang} = $sitelinks->{$k}->{title}; } } + if ($verbose) { + print "$name operations complete.\n"; + } return \%lang2title; } diff --git a/lib/WWW/Wikipedia/LangTitles.pod.tmpl b/lib/WWW/Wikipedia/LangTitles.pod.tmpl index 38305a9..1a6e04e 100644 --- a/lib/WWW/Wikipedia/LangTitles.pod.tmpl +++ b/lib/WWW/Wikipedia/LangTitles.pod.tmpl @@ -8,7 +8,7 @@ produces output [% INCLUDE $out | xtidy %] -(This example is included as L|https://api.metacpan.org/source/BKB/WWW-Wikipedia-LangTitles-[% version %]/examples/[% pl %]> in the distribution.) +(This example is included as L|https://api.metacpan.org/source/BKB/WWW-Wikipedia-LangTitles-[% info.version %]/examples/[% pl %]> in the distribution.) [% END %] =encoding UTF-8 @@ -18,16 +18,20 @@ WWW::Wikipedia::LangTitles - get interwiki links from Wikipedia. =head1 SYNOPSIS - use WWW::Wikipedia::LangTitles 'get_wiki_titles'; +[% example('synopsis') %] =head1 VERSION -This documents version [% version %] of WWW::Wikipedia::LangTitles corresponding to git -commit [% commit.commit %] released on [% commit.date %]. +This documents version [% info.version %] of +WWW::Wikipedia::LangTitles corresponding to L released +on [% commit.date %]. =head1 DESCRIPTION -Get Wikipedia interwiki link titles. +This module retrieves the Wikipedia interwiki link titles from +wikidata.org. It can be used, for example, to translate a term in +English into other languages, or to get near equivalents. =head1 FUNCTIONS @@ -36,8 +40,61 @@ Get Wikipedia interwiki link titles. my $ref = get_wiki_titles ('Helium'); Returns a hash reference with all the articles in each language, -indexed by the language. For example C<$ref->{ja}> will be equal to ヘ -リウム, the Japanese title of the equivalent Wikipedia article. +indexed by the language. For example C<< $ref->{th} >> will be equal +to ฮีเลียม, the Thai title of the equivalent Wikipedia article. + +The language of the original page can be specified like this: + + use utf8; + my $from_th = get_wiki_titles ('ฮีเลียม', lang => 'th'); + +The URL is encoded using L, so you must +use character strings not byte strings (use "use utf8;" etc.) + +An option "verbose" switches on verbose messages with any true value: + + my $ref = get_wiki_titles ($name, verbose => 1); + +=head2 make_wiki_url + + my $url = make_wiki_url ('helium'); + +Make a URL for the Wikidata page. You will then need to retrieve the +page and parse the JSON yourself. Use a second argument to specify the +language of the page: + + use utf8; + my $url = make_wiki_url ('ฮีเลียม', 'th'); + +The default is C for English. + +=head1 SEE ALSO + +=over + +=item L + +You may be able to convert the language codes to and from the language +names using this module. (I have not tested it yet.) + +=back + +=head1 DEPENDENCIES + +=over + +=item Carp + +L is used to report errors + +=item LWP::UserAgent + +L is used to retrieve the data from Wikidata. + +=item JSON::Parse + +L is used to parse the JSON data from Wikidata. + +=back -=cut [% INCLUDE "author" %] diff --git a/make-pod.pl b/make-pod.pl index 0a013bf..8bda3ad 100755 --- a/make-pod.pl +++ b/make-pod.pl @@ -3,7 +3,7 @@ use strict; use Template; use FindBin '$Bin'; -use Perl::Build qw/get_version get_commit/; +use Perl::Build qw/get_info get_commit/; use Perl::Build::Pod ':all'; use Deploy qw/do_system older/; use Getopt::Long; @@ -19,7 +19,7 @@ base => $Bin, verbose => $verbose, ); -my $version = get_version (%pbv); +my $info = get_info (%pbv); my $commit = get_commit (%pbv); # Names of the input and output files containing the documentation. @@ -30,7 +30,7 @@ # Template toolkit variable holder my %vars = ( - version => $version, + info => $info, commit => $commit, ); diff --git a/xt/scrape.t b/xt/scrape.t new file mode 100644 index 0000000..73c10f5 --- /dev/null +++ b/xt/scrape.t @@ -0,0 +1,18 @@ +use warnings; +use strict; +use utf8; +use FindBin '$Bin'; +use Test::More; +my $builder = Test::More->builder; +binmode $builder->output, ":utf8"; +binmode $builder->failure_output, ":utf8"; +binmode $builder->todo_output, ":utf8"; +binmode STDOUT, ":encoding(utf8)"; +binmode STDERR, ":encoding(utf8)"; +use WWW::Wikipedia::LangTitles 'get_wiki_titles'; +my $out = get_wiki_titles ('Helium'); +is ($out->{ja}, 'ヘリウム', "Got Japanese title for helium"); +my $outja = get_wiki_titles ('ヘリウム', lang => 'ja'); +is ($outja->{en}, 'Helium', "Got English title with lang => ja and helium in ja"); + +done_testing ();