Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Initial commit

  • Loading branch information...
commit 492a0badae98befa83b23cf8dd1897710e2c98f5 0 parents
Alexander Alexeev authored
8  META.info
... ...
@@ -0,0 +1,8 @@
  1
+{
  2
+    "name"        : "Sitemap::XML::Parser",
  3
+    "version"     : "*",
  4
+    "description" : "Module for parsing sitemap.xml files",
  5
+    "depends"     : [ "LWP::Simple", "URI", "XML::Parser::Tiny", "DateTime::Format::W3CDTF" ],
  6
+    "source-type" : "git",
  7
+    "source-url"  : "git://github.com/afiskon/p6-sitemap-xml-parser.git"
  8
+}
15  README
... ...
@@ -0,0 +1,15 @@
  1
+A Perl 6 module for parsing sitemap.xml files.
  2
+
  3
+All files (unless noted otherwise) can be used, modified and redistributed
  4
+under the terms of the Artistic License Version 2.
  5
+
  6
+To build and test this module, please get 'ufo' from
  7
+http://github.com/masak/ufo and run
  8
+
  9
+    ufo
  10
+    make
  11
+    make test
  12
+    make install
  13
+
  14
+Credits
  15
+    Alexandr Alexeev <afiskon@gmail.com>
3  TODO.txt
... ...
@@ -0,0 +1,3 @@
  1
+* tests for parse-url() and parse-file()
  2
+* separate module for sitemap index parsing -- http://www.sitemaps.org/ru/protocol.html#index
  3
+* gzip support
134  lib/Sitemap/XML/Parser.pm6
... ...
@@ -0,0 +1,134 @@
  1
+use v6;
  2
+use XML::Parser::Tiny;
  3
+use DateTime::Format::W3CDTF;
  4
+use LWP::Simple;
  5
+use URI;
  6
+
  7
+class Sitemap::XML::Parser;
  8
+
  9
+has $!parser = XML::Parser::Tiny.new;
  10
+has $!w3cdtf = DateTime::Format::W3CDTF.new;
  11
+
  12
+method parse-url ( Str $url ) { # Download the document at $url and parse it as a sitemap.
  13
+    my $data = LWP::Simple.get($url); # NOTE(review): no check that the fetch succeeded before parsing -- confirm what get() yields on failure
  14
+    self.parse($data); # last statement: the value of parse() is this method's result
  15
+}
  16
+
  17
+method parse-file ( Str $fname ) { # Read the whole file named $fname and parse its contents as a sitemap.
  18
+    my $data = slurp($fname);
  19
+    self.parse($data); # last statement: the value of parse() is this method's result
  20
+}
  21
+
  22
+method parse ( Str $data ) { # Validate sitemap XML text and return an array with one hash per <url> entry; dies on any invalid input.
  23
+    my $xml = $!parser.parse($data);
  24
+    die "Tag 'urlset' is missing" unless $xml{'body'}{'name'} eq 'urlset'; # the root element must be <urlset>
  25
+
  26
+    my @urls;
  27
+    for @( $xml{'body'}{'data'} ) -> %url { # each child of <urlset> is bound as a hash and must be named 'url'
  28
+        die "Unexpected tag '" ~ %url{'name'} ~ "' found" unless %url{'name'} eq 'url';
  29
+
  30
+        my %info; # collects tag name => raw string value for this <url>
  31
+        for @( %url{'data'} ) -> %item {
  32
+            unless %item{'name'} eq any('loc', 'lastmod', 'changefreq', 'priority') { # only these four child tags are accepted
  33
+                die "Unexpected tag '" ~ %item{'name'} ~ "' found";
  34
+            }
  35
+            unless @( %item{'data'} ).elems == 1 && %item{'data'}[0].isa('Str') { # the tag must hold exactly one item, and it must be a Str
  36
+                die "Invalid tag value for '" ~ %item{'name'} ~ "'";
  37
+            }
  38
+            %info{ %item{'name'} } = %item{'data'}[0]; # a repeated tag overwrites the earlier value
  39
+        }
  40
+        # Each check-* helper dies on an invalid value and returns the hash to keep.
  41
+        %info = self!check-loc(%info);
  42
+        %info = self!check-lastmod(%info);
  43
+        %info = self!check-changefreq(%info);
  44
+        %info = self!check-priority(%info);
  45
+
  46
+        @urls.push( $(%info) ); # $(...) pushes the hash as a single item rather than flattening it
  47
+    }
  48
+    return @urls;
  49
+}
  50
+
  51
+method !check-loc ( %info is copy ) { # Require a non-empty 'loc' entry and replace it with a URI object; returns the modified copy.
  52
+    unless any(%info.keys) eq 'loc' && %info{'loc'} ne '' { # key must be present and its value non-empty
  53
+        die "Tag 'loc' is missing";
  54
+    }
  55
+
  56
+    %info{'loc'} = URI.new(%info{'loc'}, is_validating => True); # presumably URI.new dies on a malformed URL when is_validating is set -- confirm against the URI module
  57
+    return %info;
  58
+}
  59
+
  60
+method !check-lastmod ( %info is copy ) { # If 'lastmod' is present, replace its string value with the parsed date; returns the (possibly modified) copy.
  61
+    return %info unless any(%info.keys) eq 'lastmod'; # 'lastmod' is optional
  62
+
  63
+    %info{'lastmod'} = $!w3cdtf.parse: %info{'lastmod'}; # t/main.t expects the stored value to be a DateTime
  64
+    return %info;
  65
+}
  66
+
  67
+method !check-changefreq ( %info ) { # If 'changefreq' is present, require one of the allowed keywords; returns %info unchanged.
  68
+    return %info unless any(%info.keys) eq 'changefreq'; # 'changefreq' is optional
  69
+
  70
+    unless %info{'changefreq'} eq any(qw/always hourly daily weekly monthly yearly never/) { # exact string match, so surrounding whitespace is rejected
  71
+        die "Invalid tag value '" ~ %info{'changefreq'} ~ "' for 'changefreq'"
  72
+    }
  73
+    return %info;
  74
+}
  75
+
  76
+method !check-priority ( %info is copy ) { # If 'priority' is present, convert it to a number within 0.0..1.0; returns the (possibly modified) copy.
  77
+    return %info unless any(%info.keys) eq 'priority'; # 'priority' is optional
  78
+
  79
+    my $value = %info{'priority'}.Real; # NOTE(review): non-numeric text is expected to fail here or in the range test below -- confirm
  80
+    unless $value ~~ 0.0..1.0 { # smart-match against the range 0.0..1.0
  81
+        die "Invalid tag value '" ~ %info{'priority'} ~ "' for 'priority'"
  82
+    }
  83
+    %info{'priority'} = $value; # store the numeric value in place of the original string
  84
+    return %info;
  85
+}
  86
+
  87
+=begin pod
  88
+
  89
+=head1 NAME
  90
+
  91
+Sitemap::XML::Parser is a module for parsing sitemap.xml files.
  92
+
  93
+=head1 SYNOPSIS
  94
+
  95
+=begin code
  96
+use Sitemap::XML::Parser;
  97
+
  98
+my $parser = Sitemap::XML::Parser.new;
  99
+my $sitemap = $parser.parse-url('http://example.ru/sitemap.xml');
  100
+# $sitemap == [
  101
+#    {
  102
+#        loc => URI.new('http://example/'),
  103
+#        lastmod => DateTime.new('2012-09-06T03:22:42Z'),
  104
+#        changefreq => 'daily',
  105
+#        priority => 1.0
  106
+#    },
  107
+#    ....
  108
+# ];
  109
+
  110
+=end code
  111
+
  112
+=head1 DESCRIPTION
  113
+
  114
+Module for parsing sitemap.xml files
  115
+
  116
+=head1 METHODS
  117
+
  118
+=head2 parse-url( Str $url )
  119
+=head2 parse-file( Str $fname )
  120
+=head2 parse( Str $data )
  121
+
  122
+=head1 AUTHOR
  123
+
  124
+Alexandr Alexeev, <eax at cpan.org> (L<http://eax.me/>)
  125
+
  126
+=head1 COPYRIGHT
  127
+
  128
+Copyright 2012 Alexandr Alexeev
  129
+
  130
+This program is free software; you can redistribute it and/or modify it
  131
+under the same terms as Rakudo Perl 6 itself.
  132
+
  133
+=end pod
  134
+
137  t/main.t
... ...
@@ -0,0 +1,137 @@
  1
+use v6;
  2
+use lib 'lib';
  3
+use Test;
  4
+use Sitemap::XML::Parser;
  5
+
  6
+my @invalid = (
  7
+    'trash',
  8
+    '<bebebe />',
  9
+    '<urlset><bebebe /></urlset>',
  10
+    '<urlset>data</urlset>',
  11
+    '<urlset><![CDATA[data]]></urlset>',
  12
+    '<urlset><url></url></urlset>',
  13
+    '<urlset><url><bebebe /></url></urlset>',
  14
+    '<urlset><url><loc></loc></url></urlset>',
  15
+    '<urlset><url><loc>%%%%%%</loc></url></urlset>',
  16
+    '<urlset><url><loc>http://example.ru/</loc><bebebe /></url></urlset>', 
  17
+    '<urlset><url><loc>http://example.ru/</loc><changefreq> monthly </changefreq></url></urlset>', 
  18
+    '<urlset><url><loc>http://example.ru/</loc><changefreq>bebebe</changefreq></url></urlset>', 
  19
+    '<urlset><url><loc>http://example.ru/</loc><changefreq><tag /></changefreq></url></urlset>', 
  20
+    '<urlset><url><loc>http://example.ru/</loc><priority /></url></urlset>', 
  21
+    '<urlset><url><loc>http://example.ru/</loc><priority></priority></url></urlset>', 
  22
+    '<urlset><url><loc>http://example.ru/</loc><priority>bebebe</priority></url></urlset>', 
  23
+    '<urlset><url><loc>http://example.ru/</loc><priority>-0.0000001</priority></url></urlset>', 
  24
+    '<urlset><url><loc>http://example.ru/</loc><priority>1.0000001</priority></url></urlset>', 
  25
+    '<urlset><url><loc>http://example.ru/</loc><lastmod>bebebe</lastmod></url></urlset>',
  26
+    '<urlset><url><lastmod>2012-08-30T04:20:04Z</lastmod><changefreq>monthly</changefreq><priority>0.2</priority></url></urlset>',
  27
+);
  28
+
  29
+my ( @valid_sitemaps, @valid_results );
  30
+@valid_sitemaps.push:
  31
+    q{<urlset>
  32
+        <url>
  33
+            <loc>http://example.ru/</loc>
  34
+            <lastmod>2012-08-30T04:20:04+00:00</lastmod>
  35
+            <changefreq>monthly</changefreq>
  36
+            <priority>0.2</priority>
  37
+        </url>
  38
+    </urlset>};
  39
+@valid_sitemaps.push:
  40
+    q{<?xml ?><urlset>
  41
+        <url>
  42
+            <priority>0.2</priority>
  43
+            <lastmod>2012-08-30T04:20:04+00:00</lastmod>
  44
+            <loc>http://example.ru/</loc>
  45
+            <changefreq>monthly</changefreq>
  46
+        </url>
  47
+    </urlset>};
  48
+
  49
+for 1..2 {
  50
+    @valid_results.push: [
  51
+        {
  52
+            loc => 'http://example.ru/',
  53
+            lastmod => '2012-08-30T04:20:04Z',
  54
+            changefreq => 'monthly',
  55
+            priority => 0.2
  56
+        },
  57
+    ];
  58
+}
  59
+
  60
+@valid_sitemaps.push:
  61
+    q{<?xml version="1.0" encoding="UTF-8"?>
  62
+      <?xml-stylesheet type="text/xsl" href="http://example.ru/wp-content/plugins/google-sitemap-generator/sitemap.xsl"?>
  63
+<!-- generator="wordpress/3.4.1" -->
  64
+<!-- sitemap-generator-url="http://www.arnebrachhold.de" sitemap-generator-version="3.2.8" -->
  65
+<!-- generated-on="06.09.2012 03:23" -->
  66
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  67
+        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
  68
+        xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">	<url>
  69
+		<loc>http://username:password@xn--d1abbgf6aiiy.xn--p1ai:8080/</loc>
  70
+		<lastmod>2012-09-06T03:22:42+00:00</lastmod>
  71
+		<changefreq>daily</changefreq>
  72
+		<priority>1.0</priority>
  73
+	</url>
  74
+	<url>
  75
+		<loc>http://example.ru/perl6-install/</loc>
  76
+		<priority>0.2</priority>
  77
+	</url>
  78
+	<url>
  79
+		<loc>http://example.ru/erlang/</loc>
  80
+	</url>
  81
+    </urlset>};
  82
+@valid_results.push: [
  83
+    {
  84
+        loc => 'http://username:password@xn--d1abbgf6aiiy.xn--p1ai:8080/',
  85
+        lastmod => '2012-09-06T03:22:42Z',
  86
+        changefreq => 'daily',
  87
+        priority => 1.0
  88
+    },
  89
+    {
  90
+        loc => 'http://example.ru/perl6-install/',
  91
+        priority => 0.2
  92
+    },
  93
+    {
  94
+        loc => 'http://example.ru/erlang/',
  95
+    },
  96
+];
  97
+
  98
+my @changefreq_list = qw/always hourly daily weekly monthly yearly never/;
  99
+for @changefreq_list -> $changefreq {
  100
+    @valid_sitemaps.push:
  101
+        qq{<urlset>
  102
+            <url>
  103
+                <loc>http://example.ru/</loc>
  104
+                <changefreq>$changefreq\</changefreq>
  105
+            </url>
  106
+          </urlset>};
  107
+    @valid_results.push: [
  108
+        {
  109
+            loc => 'http://example.ru/',
  110
+            changefreq => $changefreq,
  111
+        },
  112
+    ];
  113
+}
  114
+
  115
+my $parser = Sitemap::XML::Parser.new; # one parser instance is reused for every case below
  116
+for @invalid -> $sitemap { # every malformed document must make parse() die
  117
+    dies_ok({ $parser.parse($sitemap) });
  118
+}
  119
+
  120
+my %valid = zip(@valid_sitemaps, @valid_results); # sitemap text => expected structure
  121
+for %valid.kv -> $sitemap, $struct {
  122
+    my $rslt = $parser.parse($sitemap);
  123
+    # Check the object types, then stringify them so the result can be compared with the plain expected data.
  124
+    for $rslt.values -> $item is rw {
  125
+        ok( $item{'loc'}.isa('URI') );
  126
+        $item{'loc'} = $item{'loc'}.Str;
  127
+
  128
+        next unless defined $item{'lastmod'}; # 'lastmod' is optional in the fixtures
  129
+        ok( $item{'lastmod'}.isa('DateTime') );
  130
+        $item{'lastmod'} = $item{'lastmod'}.Str;
  131
+    }
  132
+
  133
+    ok( $rslt eqv $struct ); # structural comparison of the whole parsed result
  134
+}
  135
+
  136
+done;
  137
+

0 notes on commit 492a0ba

Please sign in to comment.
Something went wrong with that request. Please try again.