Skip to content

Commit

Permalink
Reads now preserve CDATA sections
Browse files Browse the repository at this point in the history
  • Loading branch information
beevik committed May 2, 2023
1 parent 1288176 commit 5a0225a
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 28 deletions.
26 changes: 15 additions & 11 deletions etree.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,12 @@ func (d *Document) ReadFromString(s string) error {
// WriteTo serializes the document out to the writer 'w'. The function returns
// the number of bytes written and any error encountered.
func (d *Document) WriteTo(w io.Writer) (n int64, err error) {
cw := newCountWriter(w)
b := bufio.NewWriter(cw)
xw := newXmlWriter(w)
b := bufio.NewWriter(xw)
for _, c := range d.Child {
c.writeTo(b, &d.WriteSettings)
}
err, n = b.Flush(), cw.bytes
err, n = b.Flush(), xw.bytes
return
}

Expand Down Expand Up @@ -688,25 +688,26 @@ func (e *Element) RemoveChildAt(index int) Token {
// readFrom reads XML from the reader 'ri' and stores the result as a new
// child of this element.
func (e *Element) readFrom(ri io.Reader, settings ReadSettings) (n int64, err error) {
r := newCountReader(ri)
dec := xml.NewDecoder(r)
xr := newXmlReader(ri)
dec := xml.NewDecoder(xr)
dec.CharsetReader = settings.CharsetReader
dec.Strict = !settings.Permissive
dec.Entity = settings.Entity
var stack stack
stack.push(e)
for {
xr.ResetPeek(dec.InputOffset())
t, err := dec.RawToken()
switch {
case err == io.EOF:
if len(stack.data) != 1 {
return r.bytes, ErrXML
return xr.bytes, ErrXML
}
return r.bytes, nil
return xr.bytes, nil
case err != nil:
return r.bytes, err
return xr.bytes, err
case stack.empty():
return r.bytes, ErrXML
return xr.bytes, ErrXML
}

top := stack.peek().(*Element)
Expand All @@ -720,14 +721,17 @@ func (e *Element) readFrom(ri io.Reader, settings ReadSettings) (n int64, err er
stack.push(e)
case xml.EndElement:
if top.Tag != t.Name.Local || top.Space != t.Name.Space {
return r.bytes, ErrXML
return xr.bytes, ErrXML
}
stack.pop()
case xml.CharData:
data := string(t)
var flags charDataFlags
if isWhitespace(data) {
flags = whitespaceFlag
flags |= whitespaceFlag
}
if xr.PeekContainsCdata() {
flags |= cdataFlag
}
newCharData(data, flags, top)
case xml.Comment:
Expand Down
19 changes: 19 additions & 0 deletions etree_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1149,3 +1149,22 @@ func TestWhitespace(t *testing.T) {
cd.SetData("")
checkBoolEq(t, cd.IsWhitespace(), true)
}

// TestPreserveCDATA parses a document containing CDATA markup and checks
// that serializing it again reproduces the input text exactly.
func TestPreserveCDATA(t *testing.T) {
	const input = `<name><![CDATA[My]] <b>name</b> <![CDATA[is]]></name>`

	doc := NewDocument()
	if err := doc.ReadFromString(input); err != nil {
		t.Fatalf("etree: failed to ReadFromString: %v", err)
	}

	output, err := doc.WriteToString()
	if err != nil {
		t.Fatalf("etree: failed to WriteToString: %v", err)
	}

	// The round trip must be lossless, CDATA delimiters included.
	if output != input {
		t.Errorf("etree: wanted %q, got %q", input, output)
	}
}
72 changes: 55 additions & 17 deletions helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package etree

import (
"bufio"
"bytes"
"io"
"strings"
"unicode/utf8"
Expand Down Expand Up @@ -83,38 +84,75 @@ func (f *fifo) grow() {
f.data, f.head, f.tail = buf, 0, count
}

// countReader implements a proxy reader that counts the number of
// bytes read from its encapsulated reader.
type countReader struct {
r io.Reader
// xmlReader implements a proxy reader that counts the number of
// bytes read from its encapsulated reader and detects when a CDATA
// prefix has been parsed.
type xmlReader struct {
// r is the encapsulated byte source; ReadByte pulls every byte from it.
r io.ByteReader
// bytes is the number of bytes ReadByte has returned successfully.
bytes int64
// peek accumulates the bytes seen since the last ResetPeek call (optionally
// seeded with last), capped at len(cdataPrefix) entries.
peek []byte
// last is the most recent byte ReadByte returned successfully.
last byte
}

func newCountReader(r io.Reader) *countReader {
return &countReader{r: r}
// cdataPrefix is the byte sequence PeekContainsCdata compares the peek
// buffer against; its length also bounds the size of the peek buffer.
var cdataPrefix = []byte("<![CDATA[")

// newXmlReader wraps r in a buffered reader and returns an xmlReader whose
// byte count is zero and whose peek buffer is empty, with capacity for
// len(cdataPrefix) bytes.
func newXmlReader(r io.Reader) *xmlReader {
	xr := new(xmlReader) // bytes and last start at their zero values
	xr.r = bufio.NewReader(r)
	xr.peek = make([]byte, 0, len(cdataPrefix))
	return xr
}

// Read fills p with bytes obtained through ReadByte, so the byte count, the
// last-byte record and the peek buffer stay up to date no matter which
// method a consumer uses. It returns the number of bytes stored in p and the
// first error encountered, if any; bytes read before the error are still
// reported in n.
//
// The XML decoder itself prefers ReadByte because xmlReader implements
// io.ByteReader, but the reader may still be consumed as a plain io.Reader
// (for example when it is handed to a CharsetReader and then wrapped in a
// buffered reader). Returning (0, nil) unconditionally made such consumers
// fail with io.ErrNoProgress or spin forever.
//
// NOTE(review): when bytes are pulled in bulk through Read, the peek buffer
// no longer lines up with decoder token boundaries, so CDATA detection is
// presumably unreliable on that path — confirm against readFrom.
func (xr *xmlReader) Read(p []byte) (n int, err error) {
	for n < len(p) {
		var b byte
		if b, err = xr.ReadByte(); err != nil {
			// Report the bytes already copied together with the error, as
			// the io.Reader contract permits.
			return n, err
		}
		p[n] = b
		n++
	}
	return n, nil
}

func (cr *countReader) Read(p []byte) (n int, err error) {
b, err := cr.r.Read(p)
cr.bytes += int64(b)
// ReadByte returns the next byte from the encapsulated reader. On success it
// records the byte as the last one read, increments the byte count, and
// appends the byte to the peek buffer while that buffer holds fewer than
// len(cdataPrefix) bytes. On failure no state is changed and the underlying
// result is returned as is.
func (xr *xmlReader) ReadByte() (b byte, err error) {
	if b, err = xr.r.ReadByte(); err != nil {
		return b, err
	}

	xr.last = b
	xr.bytes++
	if len(xr.peek) < len(cdataPrefix) {
		xr.peek = append(xr.peek, b)
	}
	return b, nil
}

// countWriter implements a proxy writer that counts the number of
// ResetPeek empties the peek buffer. decoderOffset is the decoder's current
// input offset; when it differs from the number of bytes read so far, the
// decoder has pushed back ("ungot") the last byte it read, so that byte is
// placed at the front of the fresh peek buffer.
func (xr *xmlReader) ResetPeek(decoderOffset int64) {
	xr.peek = xr.peek[:0]

	if decoderOffset == xr.bytes {
		// Decoder and reader agree: nothing was pushed back.
		return
	}
	xr.peek = append(xr.peek, xr.last)
}

// PeekContainsCdata reports whether the peek buffer currently holds exactly
// the bytes of cdataPrefix.
func (xr *xmlReader) PeekContainsCdata() bool {
	isCdata := bytes.Equal(xr.peek, cdataPrefix)
	return isCdata
}

// xmlWriter implements a proxy writer that counts the number of
// bytes written by its encapsulated writer.
type countWriter struct {
type xmlWriter struct {
// w is the encapsulated writer that receives every Write call.
w io.Writer
// bytes is the running total of byte counts reported by w.Write.
bytes int64
}

func newCountWriter(w io.Writer) *countWriter {
return &countWriter{w: w}
// newXmlWriter returns an xmlWriter that forwards writes to w and starts
// with a byte count of zero.
func newXmlWriter(w io.Writer) *xmlWriter {
	xw := new(xmlWriter)
	xw.w = w
	return xw
}

func (cw *countWriter) Write(p []byte) (n int, err error) {
b, err := cw.w.Write(p)
cw.bytes += int64(b)
return b, err
// Write forwards p to the encapsulated writer, adds the number of bytes that
// writer reports as written to the running total, and returns the writer's
// result unchanged.
func (xw *xmlWriter) Write(p []byte) (n int, err error) {
	written, werr := xw.w.Write(p)
	xw.bytes += int64(written)
	return written, werr
}

// isWhitespace returns true if the byte slice contains only
Expand Down

0 comments on commit 5a0225a

Please sign in to comment.