/
document.go
113 lines (102 loc) · 2.14 KB
/
document.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
package index
import (
"compress/gzip"
"encoding/xml"
"fmt"
"io"
"log"
"os"
"path/filepath"
)
// Document is one entry of a Wikipedia abstract dump.
//
// The three string fields carry XML tags and are populated when an entry is
// decoded with encoding/xml; the two integer fields carry no tag.
type Document struct {
	// Title is read from the <title> child element.
	Title string `xml:"title"`
	// URL is read from the <url> child element.
	URL string `xml:"url"`
	// Text is read from the <abstract> child element.
	Text string `xml:"abstract"`
	// Timestamp has no XML tag; LoadDocuments and LoadDocumentStream leave it
	// untouched.
	Timestamp int
	// ID is the sequence number the loader assigns after decoding.
	ID int
}
// LoadDocuments reads the gzip-compressed Wikipedia abstract dump at path and
// returns every <doc> child of its root element as a Document.
// Dump example from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz
//
// The whole dump is decoded into memory before returning. Each document's ID
// is set to its zero-based position in the returned slice.
//
// An error is returned, unwrapped, if the path cannot be made absolute, the
// file cannot be opened, the gzip header is invalid, or the XML cannot be
// decoded.
func LoadDocuments(path string) ([]Document, error) {
	abspath, err := filepath.Abs(path)
	if err != nil {
		return nil, err
	}
	f, err := os.Open(abspath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	gz, err := gzip.NewReader(f)
	if err != nil {
		return nil, err
	}
	defer gz.Close()

	dump := struct {
		Documents []Document `xml:"doc"`
	}{}
	// Decode advances to the first start element by itself, skipping any XML
	// declaration, comments or whitespace. No token is consumed beforehand:
	// doing so would swallow the root element whenever the dump starts with it
	// directly, leaving Decode to unmarshal only the first <doc> and yield no
	// documents at all.
	if err := xml.NewDecoder(gz).Decode(&dump); err != nil {
		return nil, err
	}

	docs := dump.Documents
	for i := range docs {
		docs[i].ID = i
	}
	return docs, nil
}
// LoadDocumentStream opens the gzip-compressed XML dump at path and streams
// its <doc> elements over the returned channel from a background goroutine.
//
// Documents are numbered in decoding order starting at 1 (LoadDocuments, by
// contrast, numbers from 0). After the last document a nil pointer is sent as
// an end-of-stream marker and the channel is then closed, so a consumer may
// either stop at the first nil or range until the channel closes.
//
// An error is returned only if the path cannot be made absolute, the file
// cannot be opened, or the gzip header cannot be read. A decoding error that
// occurs later, inside the goroutine, panics because the channel cannot carry
// an error. The goroutine blocks on send once the channel buffer (10) is full,
// so it only finishes if the consumer keeps receiving.
func LoadDocumentStream(path string) (chan *Document, error) {
	abspath, err := filepath.Abs(path)
	if err != nil {
		return nil, err
	}
	f, err := os.Open(abspath)
	if err != nil {
		return nil, err
	}
	gz, err := gzip.NewReader(f)
	if err != nil {
		// The goroutine that normally closes the file never starts on this
		// path, so release the descriptor here.
		f.Close()
		return nil, err
	}

	ch := make(chan *Document, 10)
	dec := xml.NewDecoder(gz)
	go func() {
		defer f.Close()
		defer gz.Close()

		id := 0
		for {
			tok, err := dec.Token()
			if tok == nil && err == io.EOF {
				// End of input: send the nil marker existing consumers wait
				// for, then close so range-style consumers terminate too.
				ch <- nil
				close(ch)
				log.Println("EOF means we're done.")
				return
			}
			if err != nil {
				panic(err)
			}

			// Only start elements named "doc" are decoded; every other token
			// (the root element, whitespace, end tags) is skipped.
			start, ok := tok.(xml.StartElement)
			if !ok || start.Name.Local != "doc" {
				continue
			}
			doc := Document{}
			if err = dec.DecodeElement(&doc, &start); err != nil {
				panic(err)
			}
			id++
			doc.ID = id
			ch <- &doc
			if id%5000 == 0 {
				fmt.Printf("load %d docs\n", id)
			}
		}
	}()
	return ch, nil
}