-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Remove dom tree child node depth count limit from htmlparser. (#37259)
Reason: - The htmlparser::Parser (parser) itself doesn't suffer from any complexity due to deeply nested nodes. This was done for clients like the Validator. The AMP Validator also has its own max_node_recursion_depth flag and stack check. - The parser's ability to parse complex documents is limited only by heap memory; there is no pressure on thread stack size. - Returning a null document for deeply nested nodes is a big divergence from the HTML5 algorithm, which lays out rules to return an HTML DOM tree for almost every possible input. - Originally, the htmlparser_max_nodes_depth_count flag was introduced because of the Node data structure's recursive ownership (parent owns first child, which owns first sibling), so the destructor spent a long time destroying the document, and deeply nested nodes put a lot of pressure on the stack due to recursion. This was fixed by introducing a custom Allocator (allocator.h), which frees blocks of memory efficiently. PiperOrigin-RevId: 416939834 Co-authored-by: Amaltas Bohra <amaltas@google.com>
- Loading branch information
1 parent
bd928e9
commit d0a2f1a
Showing
6 changed files
with
166 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#ifndef CPP_HTMLPARSER_ITERATOR_H_ | ||
#define CPP_HTMLPARSER_ITERATOR_H_ | ||
|
||
#include <iterator> | ||
#include <stack> | ||
|
||
#include "cpp/htmlparser/node.h" | ||
|
||
namespace htmlparser { | ||
|
||
class Document; | ||
|
||
// A forward iterator that facilitates iterating dom tree (through root node), | ||
// in depth first traversal. | ||
// | ||
// Example usage: | ||
// auto doc = parser.Parse(html); | ||
// for (auto iter = doc.begin(); iter != doc.end(); ++iter) { | ||
// ProcessNode(*iter); | ||
// } | ||
// | ||
// The above dom without NodeIterator require a lot of boiler plate code like | ||
// defining a stack class and data structure, knowledge of Node data structure. | ||
// | ||
// Clients should not access this class directly but get handle from Document | ||
// object. | ||
// auto iter = doc.begin(); | ||
// auto const_iter = doc.cbegin(); | ||
template <bool Const> | ||
class NodeIterator { | ||
public: | ||
// Member typdefs required by std::iterator_traits | ||
// Not the correct type, and not used anyway. | ||
using difference_type = std::ptrdiff_t; | ||
using value_type = Node; | ||
using pointer = std::conditional_t<Const, const Node*, Node*>; | ||
using reference = std::conditional_t<Const, const Node&, Node&>; | ||
using iterator_category = std::forward_iterator_tag; | ||
|
||
reference operator*() const { return *current_node_; } | ||
pointer operator->() const { return current_node_; } | ||
|
||
// Prefix increment. | ||
auto& operator++() { | ||
if (current_node_->FirstChild()) { | ||
if (current_node_->NextSibling()) { | ||
stack_.push(current_node_->NextSibling()); | ||
} | ||
current_node_ = current_node_->FirstChild(); | ||
} else { | ||
current_node_ = current_node_->NextSibling(); | ||
} | ||
|
||
if (!current_node_) { | ||
if (!stack_.empty()) { | ||
current_node_ = stack_.top(); | ||
stack_.pop(); | ||
} | ||
} | ||
|
||
return *this; | ||
} | ||
|
||
// Postfix increment. | ||
auto operator++(int) { | ||
auto result = *this; ++*this; return result; | ||
} | ||
|
||
template<bool R> | ||
bool operator==(const NodeIterator<R>& rhs) const { | ||
return current_node_ == rhs.current_node_; | ||
} | ||
|
||
template<bool R> | ||
bool operator!=(const NodeIterator<R>& rhs) const { | ||
return current_node_ != rhs.current_node_; | ||
} | ||
|
||
operator NodeIterator<true>() const { | ||
return NodeIterator<true>{current_node_}; | ||
} | ||
|
||
private: | ||
explicit NodeIterator(Node* node) : current_node_(node) {} | ||
|
||
friend class Document; | ||
friend class NodeIterator<!Const>; | ||
using node_pointer = std::conditional_t<Const, const Node*, Node*>; | ||
node_pointer current_node_; | ||
// Facilitates depth first traversal. | ||
std::stack<Node*> stack_; | ||
}; | ||
|
||
} // namespace htmlparser | ||
|
||
|
||
#endif // CPP_HTMLPARSER_ITERATOR_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters