Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Initial commit

  • Loading branch information...
commit 492a0badae98befa83b23cf8dd1897710e2c98f5 0 parents
@afiskon authored
8 META.info
@@ -0,0 +1,8 @@
+{
+ "name" : "Sitemap::XML::Parser",
+ "version" : "*",
+ "description" : "Module for parsing sitemap.xml files",
+ "depends" : [ "LWP::Simple", "URI", "XML::Parser::Tiny", "DateTime::Format::W3CDTF" ],
+ "source-type" : "git",
+ "source-url" : "git://github.com/afiskon/p6-sitemap-xml-parser.git"
+}
15 README
@@ -0,0 +1,15 @@
+A Perl 6 module for parsing sitemap.xml files.
+
+All files (unless noted otherwise) can be used, modified and redistributed
+under the terms of the Artistic License Version 2.
+
+To build and test this module, please get 'ufo' from
+http://github.com/masak/ufo and run
+
+ ufo
+ make
+ make test
+ make install
+
+Credits
+ Alexandr Alexeev <afiskon@gmail.com>
3  TODO.txt
@@ -0,0 +1,3 @@
+* tests for parse-url() and parse-file()
+* separate module for sitemap index parsing -- http://www.sitemaps.org/ru/protocol.html#index
+* gzip support
134 lib/Sitemap/XML/Parser.pm6
@@ -0,0 +1,134 @@
+use v6;
+use XML::Parser::Tiny;
+use DateTime::Format::W3CDTF;
+use LWP::Simple;
+use URI;
+
+# Parser for sitemap.xml documents. The semicolon form of the declaration
+# makes everything that follows in this file part of the class.
+class Sitemap::XML::Parser;
+
+# XML parser instance; parse() uses it to turn raw text into a nested structure.
+has $!parser = XML::Parser::Tiny.new;
+# W3C date/time parser; !check-lastmod uses it to convert 'lastmod' values.
+has $!w3cdtf = DateTime::Format::W3CDTF.new;
+
+# Fetch the sitemap at $url with LWP::Simple and hand the text to parse().
+# Returns whatever parse() returns for the downloaded content.
+method parse-url ( Str $url ) {
+    my $content = LWP::Simple.get($url);
+    return self.parse($content);
+}
+
+# Read the whole file named $fname and hand its text to parse().
+# Returns whatever parse() returns for the file content.
+method parse-file ( Str $fname ) {
+    my $content = slurp($fname);
+    return self.parse($content);
+}
+
+# Parse sitemap XML text and return an array holding one hash per <url> entry.
+#
+# Only the tags 'loc', 'lastmod', 'changefreq' and 'priority' are accepted
+# inside <url>; each collected hash is then run through the private
+# check-* methods, which validate and convert the individual values.
+#
+# Dies when the root tag is not 'urlset', when an unexpected tag is found,
+# or when a tag does not contain exactly one string value. Any failure
+# raised by the check-* methods propagates to the caller as well.
+method parse ( Str $data ) {
+    my $doc = $!parser.parse($data);
+    unless $doc<body><name> eq 'urlset' {
+        die "Tag 'urlset' is missing";
+    }
+
+    my @entries;
+    for @( $doc<body><data> ) -> %entry {
+        unless %entry<name> eq 'url' {
+            die "Unexpected tag '" ~ %entry<name> ~ "' found";
+        }
+
+        # Collect the raw string value of every child tag of this <url>.
+        my %fields;
+        for @( %entry<data> ) -> %tag {
+            die "Unexpected tag '" ~ %tag<name> ~ "' found"
+                unless %tag<name> eq any('loc', 'lastmod', 'changefreq', 'priority');
+            die "Invalid tag value for '" ~ %tag<name> ~ "'"
+                unless @( %tag<data> ).elems == 1 && %tag<data>[0].isa('Str');
+            %fields{ %tag<name> } = %tag<data>[0];
+        }
+
+        # Validate and convert the collected values, one field at a time.
+        %fields = self!check-loc(%fields);
+        %fields = self!check-lastmod(%fields);
+        %fields = self!check-changefreq(%fields);
+        %fields = self!check-priority(%fields);
+
+        # Itemize the hash so it is pushed as a single element.
+        @entries.push( $(%fields) );
+    }
+    return @entries;
+}
+
+# Validate the mandatory 'loc' entry and replace it with a URI object.
+#
+# Dies with "Tag 'loc' is missing" when the key is absent or its value is
+# the empty string. URI.new is called with is_validating => True, so an
+# unparsable URL presumably makes it die too (the test suite lists '%%%%%%'
+# as invalid input). Returns the updated copy of %info.
+method !check-loc ( %info is copy ) {
+    die "Tag 'loc' is missing"
+        unless any(%info.keys) eq 'loc' && %info<loc> ne '';
+
+    my $uri = URI.new(%info<loc>, is_validating => True);
+    %info<loc> = $uri;
+    return %info;
+}
+
+# Convert the optional 'lastmod' entry with the W3CDTF parser.
+#
+# When %info has no 'lastmod' key it is returned untouched; otherwise the
+# string value is replaced by whatever $!w3cdtf.parse returns for it.
+# Returns the (possibly updated) copy of %info.
+method !check-lastmod ( %info is copy ) {
+    if any(%info.keys) eq 'lastmod' {
+        %info<lastmod> = $!w3cdtf.parse(%info<lastmod>);
+    }
+    return %info;
+}
+
+# Validate the optional 'changefreq' entry.
+#
+# When %info has no 'changefreq' key it is returned as is. Otherwise the
+# value must be exactly one of: always hourly daily weekly monthly yearly
+# never (no surrounding whitespace); anything else makes the method die.
+#
+# The parameter is declared 'is copy', like in the other check-* methods,
+# so the hash returned here is never the caller's own container; parse()
+# assigns the result straight back onto the hash it passed in.
+method !check-changefreq ( %info is copy ) {
+    return %info unless any(%info.keys) eq 'changefreq';
+
+    unless %info{'changefreq'} eq any(qw/always hourly daily weekly monthly yearly never/) {
+        die "Invalid tag value '" ~ %info{'changefreq'} ~ "' for 'changefreq'";
+    }
+    return %info;
+}
+
+# Validate the optional 'priority' entry and store it as a number.
+#
+# When %info has no 'priority' key it is returned untouched. Otherwise the
+# value is coerced with .Real and must fall inside the range 0.0..1.0;
+# if it does not, the method dies and reports the original string value.
+# Returns the (possibly updated) copy of %info.
+method !check-priority ( %info is copy ) {
+    if any(%info.keys) eq 'priority' {
+        my $number = %info<priority>.Real;
+        die "Invalid tag value '" ~ %info<priority> ~ "' for 'priority'"
+            unless $number ~~ 0.0..1.0;
+        %info<priority> = $number;
+    }
+    return %info;
+}
+
+=begin pod
+
+=head1 NAME
+
+Sitemap::XML::Parser is a module for parsing sitemap.xml files.
+
+=head1 SYNOPSIS
+
+=begin code
+use Sitemap::XML::Parser;
+
+my $parser = Sitemap::XML::Parser.new;
+my $sitemap = $parser.parse-url('http://example.ru/sitemap.xml');
+# $sitemap == [
+# {
+# loc => URI.new('http://example/'),
+# lastmod => DateTime.new('2012-09-06T03:22:42Z'),
+# changefreq => 'daily',
+# priority => 1.0
+# },
+# ....
+# ];
+
+=end code
+
+=head1 DESCRIPTION
+
+Module for parsing sitemap.xml files
+
+=head1 METHODS
+
+=head2 parse-url( Str $url )
+=head2 parse-file( Str $fname )
+=head2 parse( Str $data )
+
+=head1 AUTHOR
+
+Alexandr Alexeev, <eax at cpan.org> (L<http://eax.me/>)
+
+=head1 COPYRIGHT
+
+Copyright 2012 Alexandr Alexeev
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Rakudo Perl 6 itself.
+
+=end pod
+
137 t/main.t
@@ -0,0 +1,137 @@
+use v6;
+use lib 'lib';
+use Test;
+use Sitemap::XML::Parser;
+
+# Documents that parse() must reject; the driver loop at the end of this
+# file wraps each one in dies_ok. The cases cover non-XML input, a wrong
+# root tag, unknown or empty tags, bad values for loc / changefreq /
+# priority / lastmod, and a <url> entry without <loc>.
+my @invalid = (
+ 'trash',
+ '<bebebe />',
+ '<urlset><bebebe /></urlset>',
+ '<urlset>data</urlset>',
+ '<urlset><![CDATA[data]]></urlset>',
+ '<urlset><url></url></urlset>',
+ '<urlset><url><bebebe /></url></urlset>',
+ '<urlset><url><loc></loc></url></urlset>',
+ '<urlset><url><loc>%%%%%%</loc></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><bebebe /></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><changefreq> monthly </changefreq></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><changefreq>bebebe</changefreq></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><changefreq><tag /></changefreq></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><priority /></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><priority></priority></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><priority>bebebe</priority></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><priority>-0.0000001</priority></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><priority>1.0000001</priority></url></urlset>',
+ '<urlset><url><loc>http://example.ru/</loc><lastmod>bebebe</lastmod></url></urlset>',
+ '<urlset><url><lastmod>2012-08-30T04:20:04Z</lastmod><changefreq>monthly</changefreq><priority>0.2</priority></url></urlset>',
+);
+
+# Parallel arrays: @valid_sitemaps[i] is the XML text and @valid_results[i]
+# is the structure expected from parsing it, after the driver loop has
+# stringified 'loc' and 'lastmod'.
+my ( @valid_sitemaps, @valid_results );
+# Fixture 1: a single <url> with all four supported tags.
+@valid_sitemaps.push:
+ q{<urlset>
+ <url>
+ <loc>http://example.ru/</loc>
+ <lastmod>2012-08-30T04:20:04+00:00</lastmod>
+ <changefreq>monthly</changefreq>
+ <priority>0.2</priority>
+ </url>
+ </urlset>};
+# Fixture 2: the same data with an XML declaration and the tags in a
+# different order.
+@valid_sitemaps.push:
+ q{<?xml ?><urlset>
+ <url>
+ <priority>0.2</priority>
+ <lastmod>2012-08-30T04:20:04+00:00</lastmod>
+ <loc>http://example.ru/</loc>
+ <changefreq>monthly</changefreq>
+ </url>
+ </urlset>};
+
+# Both fixtures above are expected to yield the same result, so the
+# expected structure is pushed twice.
+for 1..2 {
+ @valid_results.push: [
+ {
+ loc => 'http://example.ru/',
+ lastmod => '2012-08-30T04:20:04Z',
+ changefreq => 'monthly',
+ priority => 0.2
+ },
+ ];
+}
+
+# Fixture 3: a generator-style sitemap with an XML declaration, a stylesheet
+# processing instruction, comments and namespace attributes on <urlset>.
+# It holds three <url> entries carrying four, two and one tag respectively;
+# the first 'loc' includes credentials, a punycode host and a port.
+@valid_sitemaps.push:
+ q{<?xml version="1.0" encoding="UTF-8"?>
+ <?xml-stylesheet type="text/xsl" href="http://example.ru/wp-content/plugins/google-sitemap-generator/sitemap.xsl"?>
+<!-- generator="wordpress/3.4.1" -->
+<!-- sitemap-generator-url="http://www.arnebrachhold.de" sitemap-generator-version="3.2.8" -->
+<!-- generated-on="06.09.2012 03:23" -->
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+ xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url>
+ <loc>http://username:password@xn--d1abbgf6aiiy.xn--p1ai:8080/</loc>
+ <lastmod>2012-09-06T03:22:42+00:00</lastmod>
+ <changefreq>daily</changefreq>
+ <priority>1.0</priority>
+ </url>
+ <url>
+ <loc>http://example.ru/perl6-install/</loc>
+ <priority>0.2</priority>
+ </url>
+ <url>
+ <loc>http://example.ru/erlang/</loc>
+ </url>
+ </urlset>};
+# Expected result for fixture 3: one hash per <url>, containing only the
+# keys that were present in the document.
+@valid_results.push: [
+ {
+ loc => 'http://username:password@xn--d1abbgf6aiiy.xn--p1ai:8080/',
+ lastmod => '2012-09-06T03:22:42Z',
+ changefreq => 'daily',
+ priority => 1.0
+ },
+ {
+ loc => 'http://example.ru/perl6-install/',
+ priority => 0.2
+ },
+ {
+ loc => 'http://example.ru/erlang/',
+ },
+];
+
+# One minimal sitemap (and its expected result) per accepted 'changefreq'
+# value. The backslash after $changefreq in the qq{} string escapes the
+# following '<', so it is not read as a subscript on the interpolated
+# variable.
+my @changefreq_list = qw/always hourly daily weekly monthly yearly never/;
+for @changefreq_list -> $changefreq {
+ @valid_sitemaps.push:
+ qq{<urlset>
+ <url>
+ <loc>http://example.ru/</loc>
+ <changefreq>$changefreq\</changefreq>
+ </url>
+ </urlset>};
+ @valid_results.push: [
+ {
+ loc => 'http://example.ru/',
+ changefreq => $changefreq,
+ },
+ ];
+}
+
+# Test driver.
+#
+# Part 1: every document in @invalid must make parse() die.
+# Part 2: every document in @valid_sitemaps must parse into the structure
+# stored at the same index of @valid_results.
+#
+# The fixtures are paired by index rather than through a hash built with
+# zip(): a hash loses the fixture order, silently collapses two fixtures
+# that share the same XML text, and depends on zip() flattening into
+# key/value pairs. Every assertion carries a description so a failure can
+# be traced back to its fixture.
+my $parser = Sitemap::XML::Parser.new;
+for @invalid -> $sitemap {
+    dies_ok({ $parser.parse($sitemap) }, "parse dies on invalid sitemap: " ~ $sitemap);
+}
+
+for @valid_sitemaps.keys -> $i {
+    my $rslt   = $parser.parse(@valid_sitemaps[$i]);
+    my $struct = @valid_results[$i];
+
+    # Check the converted types, then stringify them so that the parsed
+    # structure can be compared with the plain-string expectations.
+    for $rslt.values -> $item is rw {
+        ok( $item{'loc'}.isa('URI'), "valid sitemap " ~ $i ~ ": 'loc' is a URI" );
+        $item{'loc'} = $item{'loc'}.Str;
+
+        next unless defined $item{'lastmod'};
+        ok( $item{'lastmod'}.isa('DateTime'), "valid sitemap " ~ $i ~ ": 'lastmod' is a DateTime" );
+        $item{'lastmod'} = $item{'lastmod'}.Str;
+    }
+
+    ok( $rslt eqv $struct, "valid sitemap " ~ $i ~ ": parsed structure matches the expected one" );
+}
+
+done;
+
Please sign in to comment.
Something went wrong with that request. Please try again.