forked from shriphani/pegasus
/
foo.clj
33 lines (23 loc) · 1.11 KB
/
foo.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
(ns pegasus.foo
(:require [pegasus.core :refer [crawl]]
[pegasus.dsl :refer :all])
(:import (java.io StringReader)))
(defn crawl-sp-blog
  "Starts a crawl seeded at http://blog.shriphani.com using the options
  below and returns whatever `crawl` returns.

  The crawl is capped at 20 documents and all crawl data is stored
  under /tmp/sp-blog-corpus/."
  []
  (let [crawl-config {:seeds       ["http://blog.shriphani.com"]
                      :user-agent  "Pegasus web crawler"
                      ;; stop after 20 documents
                      :corpus-size 20
                      ;; directory that receives all crawl data
                      :job-dir     "/tmp/sp-blog-corpus"}]
    (crawl crawl-config)))
(defn crawl-sp-blog-custom-extractor
  "Starts a crawl seeded at http://blog.shriphani.com with a custom link
  extractor and returns whatever `crawl` returns.

  The extractor is built with `defextractors` from two `extract` rules:
    * anchors selected by [:article :header :h2 :a]
    * anchors selected by [:ul.pagination :a]
  Each rule follows the :href attribute and is restricted by a regex on
  the blog's host name.

  The crawl is capped at 20 documents and all crawl data is stored
  under /tmp/sp-blog-corpus/."
  []
  ;; The dots are escaped so they match a literal `.` only. Unescaped, `.`
  ;; matches any character, so the filter would also accept hosts such as
  ;; `blogXshriphani.com`.
  ;; NOTE(review): how :with-regex is applied to a candidate link is defined
  ;; in pegasus.dsl and not visible here -- confirm it is a substring search.
  (let [blog-host-re #"blog\.shriphani\.com"]
    (crawl {:seeds ["http://blog.shriphani.com"]
            :user-agent "Pegasus web crawler"
            :extractor (defextractors
                         (extract :at-selector [:article :header :h2 :a]
                                  :follow :href
                                  :with-regex blog-host-re)
                         (extract :at-selector [:ul.pagination :a]
                                  :follow :href
                                  :with-regex blog-host-re))
            :corpus-size 20 ;; crawl 20 documents
            :job-dir "/tmp/sp-blog-corpus"}))) ;; store all crawl data in /tmp/sp-blog-corpus/