-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.pl
59 lines (55 loc) · 1.67 KB
/
index.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
use strict;
use warnings;
use Encode;
use HTML::TreeBuilder;
use HTML::SimpleLinkExtor;
use ElasticSearch;
# File-scoped ElasticSearch client handle; the $index routine below calls
# $es->index(...) on it for every news item it stores. The constructor is
# called without arguments, so whatever defaults the ElasticSearch module
# applies are used (NOTE(review): presumably a node on localhost -- confirm).
my $es = ElasticSearch->new();
# Collect the links to every archived issue.
#
# Fetches the devopsweekly.com archive page, extracts every link on it and
# keeps only those containing a /YYYY/MM/DD/[issue-]N path segment.
#
# Takes no arguments.
# Returns a reference to an array of the matching link strings, exactly as
# they appear on the archive page.
my $issues = sub {
    my $archive_url = "http://devopsweekly.com/archive";

    my $extractor = HTML::SimpleLinkExtor->new();
    $extractor->parse_url($archive_url);

    # Keep only links that look like dated issue pages.
    my @issue_links =
        grep { $_ =~ m"/\d{4}/\d{2}/\d{2}/(issue-)?\d+" } $extractor->links;

    return \@issue_links;
};
# Index the "News" section of each archived issue into ElasticSearch.
#
# Takes one argument: a reference to an array of issue links (each is
# appended to "http://devopsweekly.com" to form the page URL, so it is
# expected to start with '/').
#
# For every issue page:
#   * the issue date (YYYY/MM/DD) and issue number are parsed from the link;
#   * the page is fetched and parsed, and the first h1/h2 whose text is
#     exactly "News" is located;
#   * every following sibling <p> with no child elements is treated as one
#     news item; the <p> siblings directly after it that do contain child
#     elements contribute their links;
#   * one document per news item is written to index "weekly", type "devops".
#
# Issues that cannot be fetched, or that have no "News" heading, are reported
# on STDOUT and skipped. The return value is not meaningful.
my $index = sub {
    my $archive = shift;

    for my $link (@$archive) {
        my ($issue_date, $issue_num) =
            ($link =~ m"/(\d{4}/\d{2}/\d{2})/(?:issue-)?(\d+)");
        print $link . "\n";

        # new_from_url dies when the fetch fails or the response is not HTML;
        # trap that so one bad issue does not abort the whole run.
        my $tree = eval {
            HTML::TreeBuilder->new_from_url("http://devopsweekly.com$link"); # extra '/' results in failed GET
        };
        if (!defined($tree)) {
            print "Couldn't fetch $link\n";
            next;
        }

        my $news = $tree->look_down('_tag', qr/h(1|2)/, sub { $_[0]->as_text =~ /^News$/ }); # start from News
        if (!defined($news)) {
            print "Couldn't start indexing $link\n";
            $tree->delete;    # release the parse tree before moving on
            next;
        }

        for my $element ($news->right) {
            # right() returns bare text segments as plain strings as well as
            # element objects; only objects have ->tag.
            next unless ref $element;
            next unless $element->tag eq "p" and $element->descendants == 0;

            my $content = $element->as_text;

            # Gather links from the run of <p> siblings (those with child
            # elements) that immediately follows this item; stop at the first
            # sibling that is anything else.
            my @content_links;
            for my $sibling ($element->right) {
                last unless ref $sibling
                    and $sibling->tag eq "p"
                    and $sibling->descendants > 0;
                # extract_links yields [ url, element, attr, tag ] tuples.
                push @content_links, map { $_->[0] } @{ $sibling->extract_links };
            }

            # NOTE(review): if the tree already holds decoded characters,
            # decode_utf8 here would be a second decode -- confirm against
            # the fetched pages.
            $es->index(
                index => "weekly",
                type  => "devops",
                data  => {
                    "content"       => decode_utf8($content),
                    "content_links" => \@content_links,
                    "issue_date"    => $issue_date,
                    "issue_num"     => $issue_num
                });
        }

        # HTML::TreeBuilder trees must be deleted explicitly to free memory.
        $tree->delete;
    }
};
# Entry point: gather the archived issue links, then index each of them.
$index->( $issues->() );