/
docx.go
127 lines (108 loc) · 3.49 KB
/
docx.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
package main
import "github.com/beevik/etree"
// removeAttributes deletes each of the named attributes from every element
// selected by the path "//"+tag, evaluated against someParent.
//
// someParent is passed by value, so the search starts from a copy of the
// element struct; the elements returned by FindElements are pointers, so the
// attribute removal is applied to those elements directly.
func removeAttributes(someParent etree.Element, tag string, attributes ...string) {
	matches := someParent.FindElements("//" + tag)
	for i := range matches {
		for j := range attributes {
			matches[i].RemoveAttr(attributes[j])
		}
	}
}
// removeAttributesForEmptyTags deletes the named attributes from every element
// selected by the path "//"+tag whose Text() is the empty string. Elements
// with non-empty text are left untouched.
func removeAttributesForEmptyTags(someParent etree.Element, tag string, attributes ...string) {
	for _, candidate := range someParent.FindElements("//" + tag) {
		// Only elements without text are stripped.
		if candidate.Text() != "" {
			continue
		}
		for _, name := range attributes {
			candidate.RemoveAttr(name)
		}
	}
}
// removeElement detaches every element selected by the path "//"+tag from its
// parent. Matches that have no parent are skipped, since there is nothing to
// detach them from (previously this dereferenced a nil Parent and panicked).
func removeElement(someParent etree.Element, tag string) {
	for _, element := range someParent.FindElements("//" + tag) {
		parent := element.Parent
		if parent == nil {
			continue
		}
		parent.RemoveElement(element)
	}
}
// collapseElement unwraps every element selected by the path "//"+tag: the
// element's child tokens take the element's place inside its parent, and the
// element itself is then removed.
//
// Compared with a plain append-and-remove this
//   - keeps document order, by splicing the children in at the element's own
//     position instead of at the end of the parent;
//   - re-points the Parent field of hoisted child elements at their new
//     parent, so nested matches (and later parent-based edits) operate on the
//     live tree rather than on the detached wrapper;
//   - skips matches that have no parent or are not found among their parent's
//     children.
func collapseElement(someParent etree.Element, tag string) {
	for _, element := range someParent.FindElements("//" + tag) {
		parent := element.Parent
		if parent == nil {
			// Nothing to hoist the children into.
			continue
		}

		// Find where the element sits among its parent's tokens.
		index := -1
		for i, token := range parent.Child {
			if el, ok := token.(*etree.Element); ok && el == element {
				index = i
				break
			}
		}
		if index < 0 {
			continue
		}

		spliced := make([]etree.Token, 0, len(parent.Child)+len(element.Child))
		spliced = append(spliced, parent.Child[:index]...)
		for _, child := range element.Child {
			// Only element tokens are re-parented here; other token kinds are
			// moved as-is.
			if el, ok := child.(*etree.Element); ok {
				el.Parent = parent
			}
			spliced = append(spliced, child)
		}
		// The tail still starts with the collapsed element; RemoveElement
		// below takes it out.
		spliced = append(spliced, parent.Child[index:]...)
		parent.Child = spliced

		parent.RemoveElement(element)
	}
}
// mergeText walks every paragraph selected by "//w:p" and folds consecutive
// "w:r" child runs that share a style and contain text into a single run.
//
// The first run of a paragraph has no predecessor and is never compared
// (previously a nil run was passed to the comparison). After a merge the
// absorbed run is removed from the paragraph so its text is not emitted
// twice, and the surviving run stays the merge target so that three or more
// compatible runs in a row collapse into one.
//
// NOTE(review): runs are taken from SelectElements("w:r"), so two runs
// separated by a non-run sibling are still treated as neighbours — confirm
// that this is intended.
func mergeText(someParent etree.Element) {
	for _, paragraph := range someParent.FindElements("//w:p") {
		var target *etree.Element // run that currently absorbs text
		for _, run := range paragraph.SelectElements("w:r") {
			if target != nil && runsHaveSameStyleAndContainText(target, run) {
				mergeRuns(target, run)
				paragraph.RemoveElement(run)
				continue
			}
			target = run
		}
	}
}
// mergeRuns moves the content of secondRun into firstRun:
//   - the text of each <w:t> in secondRun is appended to firstRun's <w:t>, and
//     attributes firstRun's <w:t> does not already carry are copied over;
//   - <w:rPr> is skipped, because firstRun already has its own run properties;
//   - every other child element is appended to firstRun.
//
// secondRun itself is left where it is; detaching it from its parent is up to
// the caller. Nil runs are ignored.
func mergeRuns(firstRun, secondRun *etree.Element) {
	if firstRun == nil || secondRun == nil {
		return
	}

	// etree keeps the namespace prefix in Space and the local name in Tag, so
	// the prefixed name has to be rebuilt before comparing with "w:t".
	qualified := func(e *etree.Element) string {
		if e.Space == "" {
			return e.Tag
		}
		return e.Space + ":" + e.Tag
	}

	// adopt appends child to firstRun and re-points its Parent accordingly.
	adopt := func(child *etree.Element) {
		child.Parent = firstRun
		firstRun.Child = append(firstRun.Child, child)
	}

	for _, child := range secondRun.ChildElements() {
		switch qualified(child) {
		case "w:rPr":
			// Run properties are already present on firstRun.
		case "w:t":
			firstText := firstRun.SelectElement("w:t")
			if firstText == nil {
				// No text element to append to: move the whole <w:t> across
				// instead of silently dropping its text.
				adopt(child)
				continue
			}
			firstText.SetText(firstText.Text() + child.Text())
			for _, attr := range child.Attr {
				duplicate := false
				for _, existing := range firstText.Attr {
					if existing.Space == attr.Space && existing.Key == attr.Key {
						duplicate = true
						break
					}
				}
				// Copy only attributes that are not there yet; repeating an
				// attribute would produce malformed XML.
				if !duplicate {
					firstText.Attr = append(firstText.Attr, attr)
				}
			}
		default:
			adopt(child)
		}
	}
}
// runsHaveSameStyleAndContainText reports whether both runs contain a <w:t>
// child and have equal flattened styles (see getFlatRunStyle).
//
// It returns false when either run is nil. It panics through Error.Panicf
// when a non-nil argument is not a <w:r> element.
func runsHaveSameStyleAndContainText(firstRun, secondRun *etree.Element) bool {
	if firstRun == nil || secondRun == nil {
		return false
	}

	// etree keeps the namespace prefix in Space and the local name in Tag, so
	// the prefixed name has to be rebuilt before comparing with "w:r".
	qualified := func(e *etree.Element) string {
		if e.Space == "" {
			return e.Tag
		}
		return e.Space + ":" + e.Tag
	}

	firstName, secondName := qualified(firstRun), qualified(secondRun)
	if firstName != "w:r" || secondName != "w:r" {
		Error.Panicf("Expected nodes of type <w:r>, instead got %s and %s", firstName, secondName)
	}

	if firstRun.SelectElement("w:t") == nil || secondRun.SelectElement("w:t") == nil {
		return false
	}
	return getFlatRunStyle(firstRun) == getFlatRunStyle(secondRun)
}
// getFlatRunStyle returns the serialized form of the run's <w:rPr> child, or
// the empty string when the run has no <w:rPr>.
//
// Only the <w:rPr> element is flattened. Flattening the whole run, as was done
// before, mixed the run's text into the result, so two runs with the same
// style but different text never compared equal.
func getFlatRunStyle(run *etree.Element) string {
	style := run.SelectElement("w:rPr")
	if style == nil {
		return ""
	}
	return flattenElement(style)
}
// flattenElement serializes element (and everything below it) to a string
// after re-indenting it with Indent(0), and returns that string.
//
// The work is done on a copy of the element: Indent rewrites the whitespace
// tokens of the tree it is applied to, and wrapping the live element in a new
// document would let that rewrite leak into the caller's tree.
//
// It panics through Error.Panicf when serialization fails.
func flattenElement(element *etree.Element) string {
	xml := etree.CreateDocument(element.Copy())
	xml.Indent(0)
	flatTags, err := xml.WriteToString()
	if err != nil {
		// Include the underlying error so the cause is not lost.
		Error.Panicf("Error when trying to flatten %s: %v", element.Tag, err)
	}
	return flatTags
}
// cleanWordXML parses wordXML, strips proofing/bookmark elements and rsid
// attributes, and returns the re-serialized document.
//
// Steps, in order:
//  1. parse the input (panics if it cannot be read);
//  2. indent the tree with two spaces and dump it to "wordxml-before.txt";
//  3. remove every proofErr, bookmarkStart and bookmarkEnd element;
//  4. remove the w:rsidR, w:rsidRDefault and w:rsidRPr attributes from every
//     w:p and w:r element;
//  5. dump the result to "wordxml-after.txt" and return it as a string
//     (panics if serialization fails).
func cleanWordXML(wordXML string) string {
	docx := etree.NewDocument()
	if parseErr := docx.ReadFromString(wordXML); parseErr != nil {
		panic(parseErr)
	}

	docx.Indent(2)
	// Snapshot of the input; a failure to write the file is ignored.
	_ = docx.WriteToFile("wordxml-before.txt")

	for _, noise := range []string{"proofErr", "bookmarkStart", "bookmarkEnd"} {
		removeElement(docx.Element, noise)
	}

	rsidAttrs := []string{"w:rsidR", "w:rsidRDefault", "w:rsidRPr"}
	for _, tag := range []string{"w:p", "w:r"} {
		removeAttributes(docx.Element, tag, rsidAttrs...)
	}

	//removeAttributesForEmptyTags(docx.Element, "w:t", "xml:space")
	//for _, element := range docx.FindElements("//w:t") {
	// fmt.Printf("Attributes: %v, Text: [%s]\n", element.Attr, element.Text())
	//}
	//docx.Indent(2)

	// Snapshot of the cleaned tree; a failure to write the file is ignored.
	_ = docx.WriteToFile("wordxml-after.txt")

	cleaned, writeErr := docx.WriteToString()
	if writeErr != nil {
		panic(writeErr)
	}
	return cleaned
}