Remove dom tree child node depth count limit from htmlparser. (#37259)

Reason: - The htmlparser::Parser (parser) itself doesn't suffer from any complexity due to deeply nested nodes. This was done for clients like Validator etc. The AMP Validator also has it's own max_node_recursion_depth flag and stack check. - The parser ability to parse complex document is only limited by heap memory, there is no pressure on thread stack size. - Returning null document for deeply nested node is a big divergence from html5 algorithm which lays out rules to return html dom tree in almost every possible input. - Originally, the htmlparser_max_nodes_depth_count flag was introduced because of Node data structure recursive ownership (parent owns first child owns first sibling), so destructor spent a long time destroying the document and deeply nested node put a lot of pressure on stack due to recursion. This was fixed after introducing custom Allocator (allocator.h) which frees blocks of memory efficiently. PiperOrigin-RevId: 416939834 Co-authored-by: Amaltas Bohra <amaltas@google.com>
ampproject · Dec 21, 2021 · d0a2f1a · d0a2f1a
1 parent bd928e9
commit d0a2f1a
Show file tree

Hide file tree

Showing 6 changed files with 166 additions and 59 deletions.
diff --git a/validator/cpp/engine/validator-internal.cc b/validator/cpp/engine/validator-internal.cc
@@ -5692,7 +5692,7 @@ class Validator {
     // Currently parser returns nullptr only if document is too complex.
     // NOTE: If htmlparser starts returning null document for other reasons, we
     // must add new error types here.
-    if (doc == nullptr) {
+    if (!doc || !doc->status().ok()) {
       context_.AddError(ValidationError::DOCUMENT_TOO_COMPLEX, LineCol(1, 0),
                         {}, "", &result_);
       return result_;
@@ -5851,14 +5851,16 @@ class Validator {
         auto dummy_node = std::make_unique<htmlparser::Node>(
             htmlparser::NodeType::ELEMENT_NODE, htmlparser::Atom::BODY);
         auto doc = htmlparser::ParseFragment(c->Data(), dummy_node.get());
-        // Append all the nodes to the original <noscript> parent.
-        for (htmlparser::Node* cn : doc->FragmentNodes()) {
-          cn->UpdateChildNodesPositions(node);
-          UpdateLineColumnIndex(cn);
-          ValidateNode(cn, ++stack_size);
-          --stack_size;
+        if (doc && doc->status().ok()) {
+          // Append all the nodes to the original <noscript> parent.
+          for (htmlparser::Node* cn : doc->FragmentNodes()) {
+            cn->UpdateChildNodesPositions(node);
+            UpdateLineColumnIndex(cn);
+            ValidateNode(cn, ++stack_size);
+            --stack_size;
+          }
+          node->RemoveChild(c);
         }
-        node->RemoveChild(c);
       }
       c = next;
     }

diff --git a/validator/cpp/htmlparser/BUILD b/validator/cpp/htmlparser/BUILD
@@ -253,6 +253,16 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "iterators",
+    hdrs = [
+        "iterators.h",
+    ],
+    deps = [
+        ":node",
+    ],
+)
+
 cc_library(
     name = "htmlentities",
     hdrs = [
@@ -286,9 +296,11 @@ cc_library(
     copts = ["-std=c++17"],
     deps = [
         ":allocator",
+        ":iterators",
         ":node",
         ":token",
         "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/status",
     ],
 )
 
@@ -339,6 +351,7 @@ cc_library(
         ":tokenizer",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/status",
     ],
 )
 

diff --git a/validator/cpp/htmlparser/document.h b/validator/cpp/htmlparser/document.h
@@ -4,7 +4,9 @@
 #include <memory>
 #include <vector>
 
+#include "absl/status/status.h"
 #include "cpp/htmlparser/allocator.h"
+#include "cpp/htmlparser/iterators.h"
 #include "cpp/htmlparser/node.h"
 #include "cpp/htmlparser/token.h"
 
@@ -68,25 +70,49 @@ struct DocumentMetadata {
 // The document class is a wrapper for the DOM tree exposed with RootNode().
 // All the nodes inside the document are owned by document. The nodes are
 // destroyed when Document objects goes out of scope or deleted.
+//
+// Usage:
+// unique_ptr<Document> doc = parser.Parse(html);
+// if (!doc->status().ok()) {
+//   LOG(ERROR) << "Parsing failed. " << doc->status();
+//   return;
+// }
+//
+// Node* root_node = doc.RootNode();
+// ...
+//
 class Document {
  public:
   Document();
   ~Document() = default;
 
-
   const DocumentMetadata& Metadata() const { return metadata_; }
 
   // Creates a new node. The node is owned by Document and is destroyed when
   // document is destructed.
   Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);
 
+  // Returns OK if Document is result of successful html parsing.
+  // Accessing any fields/methods when status() != OK is undefined behavior.
+  absl::Status status() const {
+    return status_;
+  }
+
   // Returns the root node of a DOM tree. Node* owned by document.
   Node* RootNode() const { return root_node_; }
 
   // Returns list of nodes parsed as a document fragment. All the Nodes are
   // owned by the document.
   const std::vector<Node*> FragmentNodes() const { return fragment_nodes_; }
 
+  using const_iterator = NodeIterator<true>;
+  using iterator = NodeIterator<false>;
+
+  iterator begin() { return iterator{root_node_}; }
+  iterator end() { return iterator{nullptr}; }
+  const_iterator cbegin() const { return const_iterator{root_node_}; }
+  const_iterator cend() const { return const_iterator{nullptr}; }
+
  private:
   // Returns a new node with the same type, data and attributes.
   // The clone has no parent, no siblings and no children.
@@ -101,6 +127,8 @@ class Document {
   std::vector<Node*> fragment_nodes_{};
   std::size_t html_src_bytes_;
   DocumentMetadata metadata_;
+  // Document parsing status.
+  absl::Status status_ = absl::OkStatus();
 
   friend class Parser;
   friend std::unique_ptr<Document> Parse(std::string_view html);

diff --git a/validator/cpp/htmlparser/iterators.h b/validator/cpp/htmlparser/iterators.h
@@ -0,0 +1,97 @@
+#ifndef CPP_HTMLPARSER_ITERATOR_H_
+#define CPP_HTMLPARSER_ITERATOR_H_
+
+#include <iterator>
+#include <stack>
+
+#include "cpp/htmlparser/node.h"
+
+namespace htmlparser {
+
+class Document;
+
+// A forward iterator that facilitates iterating dom tree (through root node),
+// in depth first traversal.
+//
+// Example usage:
+// auto doc = parser.Parse(html);
+// for (auto iter = doc.begin(); iter != doc.end(); ++iter) {
+//   ProcessNode(*iter);
+// }
+//
+// The above dom without NodeIterator require a lot of boiler plate code like
+// defining a stack class and data structure, knowledge of Node data structure.
+//
+// Clients should not access this class directly but get handle from Document
+// object.
+// auto iter = doc.begin();
+// auto const_iter = doc.cbegin();
+template <bool Const>
+class NodeIterator {
+ public:
+  // Member typdefs required by std::iterator_traits
+  // Not the correct type, and not used anyway.
+  using difference_type = std::ptrdiff_t;
+  using value_type = Node;
+  using pointer = std::conditional_t<Const, const Node*, Node*>;
+  using reference = std::conditional_t<Const, const Node&, Node&>;
+  using iterator_category = std::forward_iterator_tag;
+
+  reference operator*() const { return *current_node_; }
+  pointer operator->() const { return current_node_; }
+
+  // Prefix increment.
+  auto& operator++() {
+    if (current_node_->FirstChild()) {
+      if (current_node_->NextSibling()) {
+        stack_.push(current_node_->NextSibling());
+      }
+      current_node_ = current_node_->FirstChild();
+    } else {
+      current_node_ = current_node_->NextSibling();
+    }
+
+    if (!current_node_) {
+      if (!stack_.empty()) {
+        current_node_ = stack_.top();
+        stack_.pop();
+      }
+    }
+
+    return *this;
+  }
+
+  // Postfix increment.
+  auto operator++(int) {
+    auto result = *this; ++*this; return result;
+  }
+
+  template<bool R>
+  bool operator==(const NodeIterator<R>& rhs) const {
+    return current_node_ == rhs.current_node_;
+  }
+
+  template<bool R>
+  bool operator!=(const NodeIterator<R>& rhs) const {
+    return current_node_ != rhs.current_node_;
+  }
+
+  operator NodeIterator<true>() const {
+    return NodeIterator<true>{current_node_};
+  }
+
+ private:
+  explicit NodeIterator(Node* node) : current_node_(node) {}
+
+  friend class Document;
+  friend class NodeIterator<!Const>;
+  using node_pointer = std::conditional_t<Const, const Node*, Node*>;
+  node_pointer current_node_;
+  // Facilitates depth first traversal.
+  std::stack<Node*> stack_;
+};
+
+}  // namespace htmlparser
+
+
+#endif  // CPP_HTMLPARSER_ITERATOR_H_
diff --git a/validator/cpp/htmlparser/parser.cc b/validator/cpp/htmlparser/parser.cc
@@ -6,6 +6,7 @@
 #endif               // DUMP_NODES
 
 #include "absl/flags/flag.h"
+#include "absl/status/status.h"
 #include "cpp/htmlparser/atomutil.h"
 #include "cpp/htmlparser/comparators.h"
 #include "cpp/htmlparser/defer.h"
@@ -15,12 +16,7 @@
 #include "cpp/htmlparser/parser.h"
 #include "cpp/htmlparser/strings.h"
 
-ABSL_FLAG(uint32_t, htmlparser_max_nodes_depth_count, 245,
-          "Maximum depth of open nodes. "
-          "For: <a><b><c><d></d></c></b/></a>, the stack size is 4 as <a> is "
-          "kept open until three elements <b>,<c> and <d> are closed. "
-          "For: <a>a</a><b>b</b><c>c</c><d>d</d> the depth is 1 as they are "
-          "closed immediately after one child element.");
+ABSL_RETIRED_FLAG(uint32_t, htmlparser_max_nodes_depth_count, 245, "retired");
 
 namespace htmlparser {
 
@@ -85,16 +81,14 @@ std::unique_ptr<Document> ParseFragmentWithOptions(std::string_view html,
 
   auto doc = parser->Parse();
 
-  // doc could be nullptr when, for example, the stack depth >
-  // htmlparser_max_nodes_depth_count).
-  if (doc == nullptr) return nullptr;
-
-  Node* parent = fragment_parent ? root : doc->root_node_;
-  for (Node* c = parent->FirstChild(); c;) {
-    Node* next = c->NextSibling();
-    doc->fragment_nodes_.push_back(std::move(c));
-    parent->RemoveChild(c);
-    c = next;
+  if (doc->status().ok()) {
+    Node* parent = fragment_parent ? root : doc->root_node_;
+    for (Node* c = parent->FirstChild(); c;) {
+      Node* next = c->NextSibling();
+      doc->fragment_nodes_.push_back(std::move(c));
+      parent->RemoveChild(c);
+      c = next;
+    }
   }
 
   return doc;
@@ -134,13 +128,6 @@ Parser::Parser(std::string_view html, const ParseOptions& options,
 std::unique_ptr<Document> Parser::Parse() {
   bool eof = tokenizer_->IsEOF();
   while (!eof) {
-    if (open_elements_stack_.size() >
-        ::absl::GetFlag(FLAGS_htmlparser_max_nodes_depth_count)) {
-      // Skipping parsing. Document too complex.
-      delete document_.release();
-      return nullptr;
-    }
-
     Node* node = open_elements_stack_.Top();
     tokenizer_->SetAllowCDATA(node && !node->name_space_.empty());
     // Read and parse the next token.
@@ -150,7 +137,9 @@ std::unique_ptr<Document> Parser::Parse() {
     if (token_type == TokenType::ERROR_TOKEN) {
       eof = tokenizer_->IsEOF();
       if (!eof && tokenizer_->Error()) {
-        return nullptr;
+        document_->status_ = absl::InvalidArgumentError(
+            "htmlparser::Parser tokenizer error.");
+        return std::move(document_);
       }
     }
     token_ = tokenizer_->token();
@@ -1119,7 +1108,7 @@ bool Parser::AfterHeadIM() {
 }  // Parser::AfterHeadIM.
 
 // Section 12.2.6.4.7.
-bool Parser::InBodyIM() {
+bool Parser::InBodyIM() {  // NOLINT
   switch (token_.token_type) {
     case TokenType::TEXT_TOKEN: {
       std::string d = token_.data;
@@ -1751,7 +1740,8 @@ bool Parser::InBodyIM() {
   }
 
   return true;
-}  // Parser::InBodyIM.
+}  // NOLINT(readability/fn_size)
+// Parser::InBodyIM end.
 
 void Parser::InBodyEndTagFormatting(Atom tag_atom, std::string_view tag_name) {
   // This is the "adoption agency" algorithm, described at

diff --git a/validator/cpp/htmlparser/parser_test.cc b/validator/cpp/htmlparser/parser_test.cc
@@ -9,10 +9,8 @@
 #include "cpp/htmlparser/renderer.h"
 #include "cpp/htmlparser/token.h"
 
-ABSL_DECLARE_FLAG(uint32_t, htmlparser_max_nodes_depth_count);
-
 // For operator""s.
-using namespace std::string_literals;
+using namespace std::string_literals;  // NOLINT
 
 #define EXPECT_NOT_NULL(p) EXPECT_TRUE((p) != nullptr)
 #define EXPECT_NULL(p) EXPECT_FALSE((p) != nullptr)
@@ -769,27 +767,6 @@ TEST(ParserTest, NumTermsInTextNodeCountDisabled) {
   EXPECT_EQ(body->FirstChild()->NumTerms(), -1);
 }
 
-TEST(ParserTest, DocumentComplexityTest) {
-  ::absl::SetFlag(&FLAGS_htmlparser_max_nodes_depth_count, 4);
-
-  // Document parsing failed, body contains 4 deeply nested nodes..
-  htmlparser::Parser p(
-      "<html><body><a><b><c><m></m></c></b></a></body></html>");
-  EXPECT_NULL(p.Parse());
-
-  // Document parsed, open elements stack less than 4.
-  htmlparser::Parser p2("<html><body><a><b>foo</b></a></body></html>");
-  EXPECT_NOT_NULL(p2.Parse());
-
-  // Child nodes closing reduces the stack size. So maximum open nodes in the
-  // following document is 4.
-  htmlparser::Parser p3("<html><body><a>"
-                        "<b>foo</b><b>foo</b><b>foo</b><b>foo</b><b>foo</b>"
-                        "<b>foo</b><b>foo</b><b>foo</b><b>foo</b><b>foo</b>"
-                        "</a></body></html>");
-  EXPECT_NOT_NULL(p3.Parse());
-}
-
 TEST(ParserTest, DocumentMetadataTest) {
   auto doc = htmlparser::Parse("<html><head><base href=\"www.google.com\""
                                "target=\"blank\">"