Skip to content

Commit

Permalink
Remove dom tree child node depth count limit from htmlparser. (#37259)
Browse files Browse the repository at this point in the history
Reason:

- The htmlparser::Parser (parser) itself doesn't suffer from any complexity
due to deeply nested nodes. This was done for clients like Validator etc.
The AMP Validator also has it's own max_node_recursion_depth flag and stack
check.

- The parser ability to parse complex document is only limited by heap memory,
there is no pressure on thread stack size.

- Returning null document for deeply nested node is a big divergence from html5
algorithm which lays out rules to return html dom tree in almost every possible
input.

- Originally, the htmlparser_max_nodes_depth_count flag was introduced because
of Node data structure recursive ownership (parent owns first child owns first
sibling), so destructor spent a long time destroying the document and deeply
nested node put a lot of pressure on stack due to recursion. This was fixed
after introducing custom Allocator (allocator.h) which frees blocks of memory
efficiently.

PiperOrigin-RevId: 416939834

Co-authored-by: Amaltas Bohra <amaltas@google.com>
  • Loading branch information
antiphoton and amaltas committed Dec 21, 2021
1 parent bd928e9 commit d0a2f1a
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 59 deletions.
18 changes: 10 additions & 8 deletions validator/cpp/engine/validator-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5692,7 +5692,7 @@ class Validator {
// Currently parser returns nullptr only if document is too complex.
// NOTE: If htmlparser starts returning null document for other reasons, we
// must add new error types here.
if (doc == nullptr) {
if (!doc || !doc->status().ok()) {
context_.AddError(ValidationError::DOCUMENT_TOO_COMPLEX, LineCol(1, 0),
{}, "", &result_);
return result_;
Expand Down Expand Up @@ -5851,14 +5851,16 @@ class Validator {
auto dummy_node = std::make_unique<htmlparser::Node>(
htmlparser::NodeType::ELEMENT_NODE, htmlparser::Atom::BODY);
auto doc = htmlparser::ParseFragment(c->Data(), dummy_node.get());
// Append all the nodes to the original <noscript> parent.
for (htmlparser::Node* cn : doc->FragmentNodes()) {
cn->UpdateChildNodesPositions(node);
UpdateLineColumnIndex(cn);
ValidateNode(cn, ++stack_size);
--stack_size;
if (doc && doc->status().ok()) {
// Append all the nodes to the original <noscript> parent.
for (htmlparser::Node* cn : doc->FragmentNodes()) {
cn->UpdateChildNodesPositions(node);
UpdateLineColumnIndex(cn);
ValidateNode(cn, ++stack_size);
--stack_size;
}
node->RemoveChild(c);
}
node->RemoveChild(c);
}
c = next;
}
Expand Down
13 changes: 13 additions & 0 deletions validator/cpp/htmlparser/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,16 @@ cc_test(
],
)

cc_library(
name = "iterators",
hdrs = [
"iterators.h",
],
deps = [
":node",
],
)

cc_library(
name = "htmlentities",
hdrs = [
Expand Down Expand Up @@ -286,9 +296,11 @@ cc_library(
copts = ["-std=c++17"],
deps = [
":allocator",
":iterators",
":node",
":token",
"@com_google_absl//absl/flags:flag",
"@com_google_absl//absl/status",
],
)

Expand Down Expand Up @@ -339,6 +351,7 @@ cc_library(
":tokenizer",
"@com_google_absl//absl/base",
"@com_google_absl//absl/flags:flag",
"@com_google_absl//absl/status",
],
)

Expand Down
30 changes: 29 additions & 1 deletion validator/cpp/htmlparser/document.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
#include <memory>
#include <vector>

#include "absl/status/status.h"
#include "cpp/htmlparser/allocator.h"
#include "cpp/htmlparser/iterators.h"
#include "cpp/htmlparser/node.h"
#include "cpp/htmlparser/token.h"

Expand Down Expand Up @@ -68,25 +70,49 @@ struct DocumentMetadata {
// The document class is a wrapper for the DOM tree exposed with RootNode().
// All the nodes inside the document are owned by document. The nodes are
// destroyed when Document objects goes out of scope or deleted.
//
// Usage:
// unique_ptr<Document> doc = parser.Parse(html);
// if (!doc->status().ok()) {
// LOG(ERROR) << "Parsing failed. " << doc->status();
// return;
// }
//
// Node* root_node = doc.RootNode();
// ...
//
class Document {
public:
Document();
~Document() = default;


const DocumentMetadata& Metadata() const { return metadata_; }

// Creates a new node. The node is owned by Document and is destroyed when
// document is destructed.
Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);

// Returns OK if Document is result of successful html parsing.
// Accessing any fields/methods when status() != OK is undefined behavior.
absl::Status status() const {
return status_;
}

// Returns the root node of a DOM tree. Node* owned by document.
Node* RootNode() const { return root_node_; }

// Returns list of nodes parsed as a document fragment. All the Nodes are
// owned by the document.
const std::vector<Node*> FragmentNodes() const { return fragment_nodes_; }

using const_iterator = NodeIterator<true>;
using iterator = NodeIterator<false>;

iterator begin() { return iterator{root_node_}; }
iterator end() { return iterator{nullptr}; }
const_iterator cbegin() const { return const_iterator{root_node_}; }
const_iterator cend() const { return const_iterator{nullptr}; }

private:
// Returns a new node with the same type, data and attributes.
// The clone has no parent, no siblings and no children.
Expand All @@ -101,6 +127,8 @@ class Document {
std::vector<Node*> fragment_nodes_{};
std::size_t html_src_bytes_;
DocumentMetadata metadata_;
// Document parsing status.
absl::Status status_ = absl::OkStatus();

friend class Parser;
friend std::unique_ptr<Document> Parse(std::string_view html);
Expand Down
97 changes: 97 additions & 0 deletions validator/cpp/htmlparser/iterators.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#ifndef CPP_HTMLPARSER_ITERATOR_H_
#define CPP_HTMLPARSER_ITERATOR_H_

#include <iterator>
#include <stack>

#include "cpp/htmlparser/node.h"

namespace htmlparser {

class Document;

// A forward iterator that facilitates iterating dom tree (through root node),
// in depth first traversal.
//
// Example usage:
// auto doc = parser.Parse(html);
// for (auto iter = doc.begin(); iter != doc.end(); ++iter) {
// ProcessNode(*iter);
// }
//
// The above dom without NodeIterator require a lot of boiler plate code like
// defining a stack class and data structure, knowledge of Node data structure.
//
// Clients should not access this class directly but get handle from Document
// object.
// auto iter = doc.begin();
// auto const_iter = doc.cbegin();
template <bool Const>
class NodeIterator {
public:
// Member typdefs required by std::iterator_traits
// Not the correct type, and not used anyway.
using difference_type = std::ptrdiff_t;
using value_type = Node;
using pointer = std::conditional_t<Const, const Node*, Node*>;
using reference = std::conditional_t<Const, const Node&, Node&>;
using iterator_category = std::forward_iterator_tag;

reference operator*() const { return *current_node_; }
pointer operator->() const { return current_node_; }

// Prefix increment.
auto& operator++() {
if (current_node_->FirstChild()) {
if (current_node_->NextSibling()) {
stack_.push(current_node_->NextSibling());
}
current_node_ = current_node_->FirstChild();
} else {
current_node_ = current_node_->NextSibling();
}

if (!current_node_) {
if (!stack_.empty()) {
current_node_ = stack_.top();
stack_.pop();
}
}

return *this;
}

// Postfix increment.
auto operator++(int) {
auto result = *this; ++*this; return result;
}

template<bool R>
bool operator==(const NodeIterator<R>& rhs) const {
return current_node_ == rhs.current_node_;
}

template<bool R>
bool operator!=(const NodeIterator<R>& rhs) const {
return current_node_ != rhs.current_node_;
}

operator NodeIterator<true>() const {
return NodeIterator<true>{current_node_};
}

private:
explicit NodeIterator(Node* node) : current_node_(node) {}

friend class Document;
friend class NodeIterator<!Const>;
using node_pointer = std::conditional_t<Const, const Node*, Node*>;
node_pointer current_node_;
// Facilitates depth first traversal.
std::stack<Node*> stack_;
};

} // namespace htmlparser


#endif // CPP_HTMLPARSER_ITERATOR_H_
42 changes: 16 additions & 26 deletions validator/cpp/htmlparser/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#endif // DUMP_NODES

#include "absl/flags/flag.h"
#include "absl/status/status.h"
#include "cpp/htmlparser/atomutil.h"
#include "cpp/htmlparser/comparators.h"
#include "cpp/htmlparser/defer.h"
Expand All @@ -15,12 +16,7 @@
#include "cpp/htmlparser/parser.h"
#include "cpp/htmlparser/strings.h"

ABSL_FLAG(uint32_t, htmlparser_max_nodes_depth_count, 245,
"Maximum depth of open nodes. "
"For: <a><b><c><d></d></c></b/></a>, the stack size is 4 as <a> is "
"kept open until three elements <b>,<c> and <d> are closed. "
"For: <a>a</a><b>b</b><c>c</c><d>d</d> the depth is 1 as they are "
"closed immediately after one child element.");
ABSL_RETIRED_FLAG(uint32_t, htmlparser_max_nodes_depth_count, 245, "retired");

namespace htmlparser {

Expand Down Expand Up @@ -85,16 +81,14 @@ std::unique_ptr<Document> ParseFragmentWithOptions(std::string_view html,

auto doc = parser->Parse();

// doc could be nullptr when, for example, the stack depth >
// htmlparser_max_nodes_depth_count).
if (doc == nullptr) return nullptr;

Node* parent = fragment_parent ? root : doc->root_node_;
for (Node* c = parent->FirstChild(); c;) {
Node* next = c->NextSibling();
doc->fragment_nodes_.push_back(std::move(c));
parent->RemoveChild(c);
c = next;
if (doc->status().ok()) {
Node* parent = fragment_parent ? root : doc->root_node_;
for (Node* c = parent->FirstChild(); c;) {
Node* next = c->NextSibling();
doc->fragment_nodes_.push_back(std::move(c));
parent->RemoveChild(c);
c = next;
}
}

return doc;
Expand Down Expand Up @@ -134,13 +128,6 @@ Parser::Parser(std::string_view html, const ParseOptions& options,
std::unique_ptr<Document> Parser::Parse() {
bool eof = tokenizer_->IsEOF();
while (!eof) {
if (open_elements_stack_.size() >
::absl::GetFlag(FLAGS_htmlparser_max_nodes_depth_count)) {
// Skipping parsing. Document too complex.
delete document_.release();
return nullptr;
}

Node* node = open_elements_stack_.Top();
tokenizer_->SetAllowCDATA(node && !node->name_space_.empty());
// Read and parse the next token.
Expand All @@ -150,7 +137,9 @@ std::unique_ptr<Document> Parser::Parse() {
if (token_type == TokenType::ERROR_TOKEN) {
eof = tokenizer_->IsEOF();
if (!eof && tokenizer_->Error()) {
return nullptr;
document_->status_ = absl::InvalidArgumentError(
"htmlparser::Parser tokenizer error.");
return std::move(document_);
}
}
token_ = tokenizer_->token();
Expand Down Expand Up @@ -1119,7 +1108,7 @@ bool Parser::AfterHeadIM() {
} // Parser::AfterHeadIM.

// Section 12.2.6.4.7.
bool Parser::InBodyIM() {
bool Parser::InBodyIM() { // NOLINT
switch (token_.token_type) {
case TokenType::TEXT_TOKEN: {
std::string d = token_.data;
Expand Down Expand Up @@ -1751,7 +1740,8 @@ bool Parser::InBodyIM() {
}

return true;
} // Parser::InBodyIM.
} // NOLINT(readability/fn_size)
// Parser::InBodyIM end.

void Parser::InBodyEndTagFormatting(Atom tag_atom, std::string_view tag_name) {
// This is the "adoption agency" algorithm, described at
Expand Down
25 changes: 1 addition & 24 deletions validator/cpp/htmlparser/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,8 @@
#include "cpp/htmlparser/renderer.h"
#include "cpp/htmlparser/token.h"

ABSL_DECLARE_FLAG(uint32_t, htmlparser_max_nodes_depth_count);

// For operator""s.
using namespace std::string_literals;
using namespace std::string_literals; // NOLINT

#define EXPECT_NOT_NULL(p) EXPECT_TRUE((p) != nullptr)
#define EXPECT_NULL(p) EXPECT_FALSE((p) != nullptr)
Expand Down Expand Up @@ -769,27 +767,6 @@ TEST(ParserTest, NumTermsInTextNodeCountDisabled) {
EXPECT_EQ(body->FirstChild()->NumTerms(), -1);
}

TEST(ParserTest, DocumentComplexityTest) {
::absl::SetFlag(&FLAGS_htmlparser_max_nodes_depth_count, 4);

// Document parsing failed, body contains 4 deeply nested nodes..
htmlparser::Parser p(
"<html><body><a><b><c><m></m></c></b></a></body></html>");
EXPECT_NULL(p.Parse());

// Document parsed, open elements stack less than 4.
htmlparser::Parser p2("<html><body><a><b>foo</b></a></body></html>");
EXPECT_NOT_NULL(p2.Parse());

// Child nodes closing reduces the stack size. So maximum open nodes in the
// following document is 4.
htmlparser::Parser p3("<html><body><a>"
"<b>foo</b><b>foo</b><b>foo</b><b>foo</b><b>foo</b>"
"<b>foo</b><b>foo</b><b>foo</b><b>foo</b><b>foo</b>"
"</a></body></html>");
EXPECT_NOT_NULL(p3.Parse());
}

TEST(ParserTest, DocumentMetadataTest) {
auto doc = htmlparser::Parse("<html><head><base href=\"www.google.com\""
"target=\"blank\">"
Expand Down

0 comments on commit d0a2f1a

Please sign in to comment.