forked from oxffaa/gopher-parse-sitemap
/
sitemap.go
122 lines (107 loc) · 4.17 KB
/
sitemap.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// Package sitemap provides primitives for highly efficient parsing of huge
// sitemap files.
package sitemap
import (
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"os"
	"time"
)
// Entry is an interface that describes a single element (a URL) of a sitemap
// file. Keep in mind that it is implemented by a totally immutable entity, so
// you should minimize the number of calls: each call can produce additional
// memory allocations.
//
// GetLocation returns the URL of the page.
// GetLocation must return a non-empty string value.
//
// GetLastModified parses and returns the date and time of the last
// modification of the page.
// GetLastModified can return nil or a pointer to a valid time.Time instance.
// Be careful: each call returns a new time.Time instance.
//
// GetPriority returns the priority of the page.
// The valid value is between 0.0 and 1.0; the default value is 0.5.
//
// NOTE(review): earlier documentation of this type also described a
// GetChangeFrequency method (returning one of the Frequency constants), but
// the interface below does not declare it — confirm whether that method was
// removed on purpose.
//
// You shouldn't implement this interface in your own types.
type Entry interface {
GetLocation() string
GetLastModified() *time.Time
GetPriority() float32
}
// IndexEntry is an interface that describes a single element (the URL of a
// sitemap) of a sitemap index file. Keep in mind that it is implemented by a
// totally immutable entity, so you should minimize the number of calls: each
// call can produce additional memory allocations.
//
// GetLocation returns the URL of a sitemap file.
// GetLocation must return a non-empty string value.
//
// GetLastModified parses and returns the date and time of the last
// modification of the sitemap.
// GetLastModified can return nil or a pointer to a valid time.Time instance.
// Be careful: each call returns a new time.Time instance.
//
// You shouldn't implement this interface in your own types.
type IndexEntry interface {
GetLocation() string
GetLastModified() *time.Time
}
// EntryConsumer is the callback type that consumes parsed sitemap entries.
// It is the consumer argument accepted by Parse, ParseFromFile and
// ParseFromSite.
type EntryConsumer func(Entry) error
// Parse parses the sitemap data supplied by reader and calls consumer for
// each sitemap entry.
//
// The work is delegated to parseLoop, which is handed a per-element callback
// that forwards every start element to entryParser together with consumer.
// Parse returns whatever error parseLoop reports.
func Parse(reader io.Reader, consumer EntryConsumer) error {
	// Bind consumer into the element handler expected by parseLoop.
	handleElement := func(decoder *xml.Decoder, start *xml.StartElement) error {
		return entryParser(decoder, start, consumer)
	}

	return parseLoop(reader, handleElement)
}
// ParseFromFile reads a sitemap from the file at sitemapPath, parses it and
// calls consumer for each sitemap entry.
//
// If the file cannot be opened, the error from the os package is returned
// unchanged, so callers can still inspect it (for example with
// os.IsNotExist). Otherwise the result of Parse is returned. The file is
// closed before ParseFromFile returns.
func ParseFromFile(sitemapPath string, consumer EntryConsumer) error {
	// os.Open opens the file read-only. A permission argument is only
	// meaningful when a file is being created, so OpenFile with an explicit
	// mode is not needed here.
	sitemapFile, err := os.Open(sitemapPath)
	if err != nil {
		return err
	}
	defer sitemapFile.Close()

	return Parse(sitemapFile, consumer)
}
// ParseFromSite downloads a sitemap from url, parses it and calls consumer
// for each sitemap entry.
//
// The download is performed with http.Get, i.e. with http.DefaultClient,
// which does not set a request timeout. An error is returned when the request
// itself fails, when the server answers with a non-2xx status, or when Parse
// fails. The response body is closed before ParseFromSite returns.
func ParseFromSite(url string, consumer EntryConsumer) error {
	res, err := http.Get(url)
	if err != nil {
		return err
	}
	defer res.Body.Close()

	// http.Get reports only transport-level failures through err. A 404 or
	// 500 still yields a response, and its body is an error page rather than
	// a sitemap, so it must not be handed to the parser.
	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
		return fmt.Errorf("fetching sitemap %q: unexpected HTTP status %q", url, res.Status)
	}

	return Parse(res.Body, consumer)
}
// IndexEntryConsumer is the callback type that consumes parsed sitemap index
// entries. It is the consumer argument accepted by ParseIndex,
// ParseIndexFromFile and ParseIndexFromSite.
type IndexEntryConsumer func(IndexEntry) error
// ParseIndex parses the sitemap index data supplied by reader and calls
// consumer for each sitemap index entry.
//
// The work is delegated to parseLoop, which is handed a per-element callback
// that forwards every start element to indexEntryParser together with
// consumer. ParseIndex returns whatever error parseLoop reports.
func ParseIndex(reader io.Reader, consumer IndexEntryConsumer) error {
	// Bind consumer into the element handler expected by parseLoop.
	handleElement := func(decoder *xml.Decoder, start *xml.StartElement) error {
		return indexEntryParser(decoder, start, consumer)
	}

	return parseLoop(reader, handleElement)
}
// ParseIndexFromFile reads a sitemap index from the file at sitemapPath,
// parses it and calls consumer for each sitemap index entry.
//
// If the file cannot be opened, the error from the os package is returned
// unchanged, so callers can still inspect it (for example with
// os.IsNotExist). Otherwise the result of ParseIndex is returned. The file is
// closed before ParseIndexFromFile returns.
func ParseIndexFromFile(sitemapPath string, consumer IndexEntryConsumer) error {
	// os.Open opens the file read-only. A permission argument is only
	// meaningful when a file is being created, so OpenFile with an explicit
	// mode is not needed here.
	sitemapFile, err := os.Open(sitemapPath)
	if err != nil {
		return err
	}
	defer sitemapFile.Close()

	return ParseIndex(sitemapFile, consumer)
}
// ParseIndexFromSite downloads a sitemap index from sitemapURL, parses it and
// calls consumer for each sitemap index entry.
//
// The download is performed with http.Get, i.e. with http.DefaultClient,
// which does not set a request timeout. An error is returned when the request
// itself fails, when the server answers with a non-2xx status, or when
// ParseIndex fails. The response body is closed before ParseIndexFromSite
// returns.
func ParseIndexFromSite(sitemapURL string, consumer IndexEntryConsumer) error {
	res, err := http.Get(sitemapURL)
	if err != nil {
		return err
	}
	defer res.Body.Close()

	// http.Get reports only transport-level failures through err. A 404 or
	// 500 still yields a response, and its body is an error page rather than
	// a sitemap index, so it must not be handed to the parser.
	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
		return fmt.Errorf("fetching sitemap index %q: unexpected HTTP status %q", sitemapURL, res.Status)
	}

	return ParseIndex(res.Body, consumer)
}