This repository has been archived by the owner on Apr 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 158
/
collapse_whitespace_filter.cc
123 lines (108 loc) · 4.28 KB
/
collapse_whitespace_filter.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "pagespeed/kernel/html/collapse_whitespace_filter.h"
#include <algorithm>
#include <cstddef>
#include <vector>
#include "base/logging.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/html/html_node.h"
namespace net_instaweb {
class HtmlParse;
namespace {
// Tags within which we should never try to collapse whitespace (note that this
// is not _quite_ the same thing as kLiteralTags in html_lexer.cc).
//
// The entries must be listed in strictly increasing order of their
// HtmlName::Keyword value: IsSensitiveKeyword() looks them up with
// std::binary_search, and the CollapseWhitespaceFilter constructor DCHECKs
// that each entry is less than the one that follows it.
const HtmlName::Keyword kSensitiveTags[] = {HtmlName::kCode, HtmlName::kPre,
HtmlName::kScript, HtmlName::kStyle,
HtmlName::kTextarea};
// Returns true if 'keyword' is one of the entries of kSensitiveTags.
// Relies on kSensitiveTags being sorted in increasing order.
bool IsSensitiveKeyword(HtmlName::Keyword keyword) {
  const HtmlName::Keyword* const first = kSensitiveTags;
  const HtmlName::Keyword* const last = first + arraysize(kSensitiveTags);
  // Locate the first entry that is not less than 'keyword'; it is a match
  // exactly when 'keyword' is not less than that entry either.
  const HtmlName::Keyword* const pos = std::lower_bound(first, last, keyword);
  return (pos != last) && !(keyword < *pos);
}
} // namespace
// Stores the parser pointer and, via DCHECK, verifies that kSensitiveTags is
// strictly increasing, which the binary search in IsSensitiveKeyword() needs.
CollapseWhitespaceFilter::CollapseWhitespaceFilter(HtmlParse* html_parse)
    : html_parse_(html_parse) {
  const size_t num_tags = arraysize(kSensitiveTags);
  size_t i = 1;
  while (i < num_tags) {
    // Each entry must compare greater than its predecessor.
    DCHECK(kSensitiveTags[i - 1] < kSensitiveTags[i]);
    ++i;
  }
}
// Nothing to release explicitly; the destructor body is intentionally empty.
CollapseWhitespaceFilter::~CollapseWhitespaceFilter() {
}
// Discards any sensitive-tag nesting recorded so far, so that each document
// starts with an empty keyword stack.
void CollapseWhitespaceFilter::StartDocument() {
  keyword_stack_.clear();
}
// Records the opening of a whitespace-sensitive element by pushing its
// keyword onto keyword_stack_.  Other elements are ignored.
void CollapseWhitespaceFilter::StartElement(HtmlElement* element) {
  const HtmlName::Keyword tag = element->keyword();
  if (!IsSensitiveKeyword(tag)) {
    return;
  }
  keyword_stack_.push_back(tag);
}
// Pops keyword_stack_ when the element being closed matches the keyword on
// top of the stack.  Any other element is expected (DCHECK) not to be a
// whitespace-sensitive one.
void CollapseWhitespaceFilter::EndElement(HtmlElement* element) {
  const HtmlName::Keyword keyword = element->keyword();
  const bool closes_top_of_stack =
      !keyword_stack_.empty() && (keyword_stack_.back() == keyword);
  if (!closes_top_of_stack) {
    DCHECK(!IsSensitiveKeyword(keyword));
    return;
  }
  keyword_stack_.pop_back();
}
// Collapses each run of whitespace in a characters node down to a single
// whitespace character.  A run that contains at least one newline collapses
// to '\n'; otherwise the first whitespace character of the run is kept.
// Nothing is changed while keyword_stack_ is non-empty, i.e. while inside an
// element whose keyword StartElement() recorded as whitespace-sensitive.
void CollapseWhitespaceFilter::Characters(HtmlCharactersNode* characters) {
  if (!keyword_stack_.empty()) {
    return;
  }

  // The string is compacted in place.  That is safe because the output never
  // gets ahead of the input: bytes are only dropped, never inserted.
  GoogleString* text = characters->mutable_contents();
  char* const begin = &(*text)[0];
  char* const limit = begin + text->size();
  char* out = begin;

  // 1 while the most recently emitted byte is whitespace, 0 otherwise.  Kept
  // as an int so that it can be subtracted from 'out' without a branch.
  int after_space = 0;

  for (const char* in = begin; in != limit; ++in) {
    const char c = *in;
    if (c == '\n') {
      // A newline takes precedence over other whitespace: if the previous
      // emitted byte was whitespace, step back one so that it is overwritten.
      out -= after_space;
      *out++ = c;
      after_space = 1;
    } else if (c == ' ' || c == '\t' || c == '\r' || c == '\f') {
      // See http://www.w3.org/TR/html401/struct/text.html#h-9.1
      // Only the first whitespace character of a run is emitted; a later
      // newline in the same run may still replace it.
      if (after_space == 0) {
        *out++ = c;
        after_space = 1;
      }
    } else {
      // Ordinary character: copy it through and end any whitespace run.
      *out++ = c;
      after_space = 0;
    }
  }

  // Drop the now-unused tail of the string.
  text->resize(out - text->data());
}
} // namespace net_instaweb