Skip to content

Commit

Permalink
Add more documentation and a new function
Browse files Browse the repository at this point in the history
The new function get_wiki_url gives the URL of the page to retrieve.

xt/scrape.t tests the scraping.

Use new info method for building the pod.
  • Loading branch information
benkasminbullock committed Dec 25, 2016
1 parent 638a1b9 commit 3c7983f
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 21 deletions.
4 changes: 4 additions & 0 deletions Changes
@@ -1,5 +1,9 @@
Revision history for Perl module WWW::Wikipedia::LangTitles

0.02 2016-12-25

- Documentation

0.01 2016-12-24

- Initial version
Expand Down
13 changes: 13 additions & 0 deletions examples/synopsis.pl
@@ -0,0 +1,13 @@
#!/home/ben/software/install/bin/perl
use warnings;
use strict;
use utf8;
use WWW::Wikipedia::LangTitles 'get_wiki_titles';
binmode STDOUT, ":encoding(utf8)";
my $title = 'Three-phase electric power';
my $links = get_wiki_titles ($title);
print "$title is '$links->{de}' in German.\n";
my $film = '東京物語';
my $flinks = get_wiki_titles ($film, lang => 'ja');
print "映画「$film」はイタリア語で'$flinks->{it}'と名付けた。\n";

47 changes: 37 additions & 10 deletions lib/WWW/Wikipedia/LangTitles.pm
Expand Up @@ -4,29 +4,47 @@ use strict;
use Carp;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw/get_wiki_titles/;
our @EXPORT_OK = qw/get_wiki_titles make_wiki_url/;
our %EXPORT_TAGS = (
all => \@EXPORT_OK,
);
our $VERSION = '0.01';
our $VERSION = '0.02';
use LWP::UserAgent;
use URI::Escape 'uri_escape';
use URI::Escape 'uri_escape_utf8';
use JSON::Parse 'parse_json';

sub make_wiki_url
{
my ($name, $lang) = @_;
if (! $lang) {
# Defaults to English.
$lang = 'en';
}
# Have to say "enwiki" or "jawiki" in the URL, since it can be
# "enquote" or something.
if ($lang !~ /wiki$/) {
$lang .= 'wiki';
}
my $safe_name = $name;
$safe_name = uri_escape_utf8 ($safe_name);
# The URL to get the information from.
my $url = "https://www.wikidata.org/w/api.php?action=wbgetentities&sites=$lang&titles=$safe_name&props=sitelinks/urls|datatype&format=json";
return $url;
}

sub get_wiki_titles
{
# The name of the article to fetch.
my ($name, %options) = @_;
my $lang = $options{lang};
my $verbose = $options{verbose};
my $safe_name = $name;
$safe_name = uri_escape ($safe_name);
# The URL to get the information from.
my $url = "https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=$safe_name&props=sitelinks/urls|datatype&format=json";
my $url = make_wiki_url ($name, $lang);
if ($verbose) {
print "Getting $safe_name from '$url'.\n";
print "Getting $name from '$url'.\n";
}
my $ua = LWP::UserAgent->new ();
my $agent = "Secret Agent Man";
# Tell the server from what software this request originates, in
# case this module turns out to be problematic for them somehow.
my $agent = __PACKAGE__;
$ua = LWP::UserAgent->new (agent => $agent);
$ua->default_header (
'Accept-Encoding' => scalar HTTP::Message::decodable()
Expand All @@ -36,6 +54,9 @@ sub get_wiki_titles
carp "Get $url failed: " . $response->status_line ();
return;
}
if ($verbose) {
print "$name data was retrieved successfully.\n";
}
my $json = $response->decoded_content ();
my $data = parse_json ($json);
my $array = $data->{entities};
Expand All @@ -44,13 +65,19 @@ sub get_wiki_titles
my $sitelinks = $array->{$k}->{sitelinks};
for my $k (keys %$sitelinks) {
my $lang = $k;
# Reject these? This is a legacy of the script that this
# used to be, it might be more useful for the CPAN module
# not to reject these.
if ($lang =~ /wikiversity|simple|commons|wikiquote|wikibooks/) {
next;
}
$lang =~ s/wiki$//;
$lang2title{$lang} = $sitelinks->{$k}->{title};
}
}
if ($verbose) {
print "$name operations complete.\n";
}
return \%lang2title;
}

Expand Down
73 changes: 65 additions & 8 deletions lib/WWW/Wikipedia/LangTitles.pod.tmpl
Expand Up @@ -8,7 +8,7 @@ produces output

[% INCLUDE $out | xtidy %]

(This example is included as L<F<[% pl %]>|https://api.metacpan.org/source/BKB/WWW-Wikipedia-LangTitles-[% version %]/examples/[% pl %]> in the distribution.)
(This example is included as L<F<[% pl %]>|https://api.metacpan.org/source/BKB/WWW-Wikipedia-LangTitles-[% info.version %]/examples/[% pl %]> in the distribution.)
[% END %]
=encoding UTF-8

Expand All @@ -18,16 +18,20 @@ WWW::Wikipedia::LangTitles - get interwiki links from Wikipedia.

=head1 SYNOPSIS

use WWW::Wikipedia::LangTitles 'get_wiki_titles';
[% example('synopsis') %]

=head1 VERSION

This documents version [% version %] of WWW::Wikipedia::LangTitles corresponding to git
commit [% commit.commit %] released on [% commit.date %].
This documents version [% info.version %] of
WWW::Wikipedia::LangTitles corresponding to L<git commit [%
commit.commit %]|[% info.repo %]/commit/[% commit.commit %]> released
on [% commit.date %].

=head1 DESCRIPTION

Get Wikipedia interwiki link titles.
This module retrieves the Wikipedia interwiki link titles from
wikidata.org. It can be used, for example, to translate a term in
English into other languages, or to get near equivalents.

=head1 FUNCTIONS

Expand All @@ -36,8 +40,61 @@ Get Wikipedia interwiki link titles.
my $ref = get_wiki_titles ('Helium');

Returns a hash reference with all the articles in each language,
indexed by the language. For example C<$ref->{ja}> will be equal to ヘ
リウム, the Japanese title of the equivalent Wikipedia article.
indexed by the language. For example C<< $ref->{th} >> will be equal
to ฮีเลียม, the Thai title of the equivalent Wikipedia article.

The language of the original page can be specified like this:

use utf8;
my $from_th = get_wiki_titles ('ฮีเลียม', lang => 'th');

The URL is encoded using L<URI::Escape/uri_escape_utf8>, so you must
use character strings not byte strings (use "use utf8;" etc.)

An option "verbose" switches on verbose messages with any true value:

my $ref = get_wiki_titles ($name, verbose => 1);

=head2 make_wiki_url

my $url = make_wiki_url ('helium');

Make a URL for the Wikidata page. You will then need to retrieve the
page and parse the JSON yourself. Use a second argument to specify the
language of the page:

use utf8;
my $url = make_wiki_url ('ฮีเลียม', 'th');

The default is C<en> for English.

=head1 SEE ALSO

=over

=item L<Locale::Codes>

You may be able to convert the language codes to and from the language
names using this module. (I have not tested it yet.)

=back

=head1 DEPENDENCIES

=over

=item Carp

L<Carp> is used to report errors

=item LWP::UserAgent

L<LWP::UserAgent> is used to retrieve the data from Wikidata.

=item JSON::Parse

L<JSON::Parse> is used to parse the JSON data from Wikidata.

=back

=cut
[% INCLUDE "author" %]
6 changes: 3 additions & 3 deletions make-pod.pl
Expand Up @@ -3,7 +3,7 @@
use strict;
use Template;
use FindBin '$Bin';
use Perl::Build qw/get_version get_commit/;
use Perl::Build qw/get_info get_commit/;
use Perl::Build::Pod ':all';
use Deploy qw/do_system older/;
use Getopt::Long;
Expand All @@ -19,7 +19,7 @@
base => $Bin,
verbose => $verbose,
);
my $version = get_version (%pbv);
my $info = get_info (%pbv);
my $commit = get_commit (%pbv);
# Names of the input and output files containing the documentation.

Expand All @@ -30,7 +30,7 @@
# Template toolkit variable holder

my %vars = (
version => $version,
info => $info,
commit => $commit,
);

Expand Down
18 changes: 18 additions & 0 deletions xt/scrape.t
@@ -0,0 +1,18 @@
use warnings;
use strict;
use utf8;
use FindBin '$Bin';
use Test::More;
my $builder = Test::More->builder;
binmode $builder->output, ":utf8";
binmode $builder->failure_output, ":utf8";
binmode $builder->todo_output, ":utf8";
binmode STDOUT, ":encoding(utf8)";
binmode STDERR, ":encoding(utf8)";
use WWW::Wikipedia::LangTitles 'get_wiki_titles';
my $out = get_wiki_titles ('Helium');
is ($out->{ja}, 'ヘリウム', "Got Japanese title for helium");
my $outja = get_wiki_titles ('ヘリウム', lang => 'ja');
is ($outja->{en}, 'Helium', "Got English title with lang => ja and helium in ja");

done_testing ();

0 comments on commit 3c7983f

Please sign in to comment.