Skip to content

Commit

Permalink
UTF8 encoding/decoding library to deprecated utf8 functions in string…
Browse files Browse the repository at this point in the history
…s.h|cc (#36184)

library.

PiperOrigin-RevId: 398102411

Co-authored-by: Amaltas Bohra <amaltas@google.com>
  • Loading branch information
banaag and amaltas committed Sep 28, 2021
1 parent 800509b commit a66a623
Show file tree
Hide file tree
Showing 2 changed files with 337 additions and 0 deletions.
134 changes: 134 additions & 0 deletions validator/cpp/htmlparser/utf8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#ifndef HTMLPARSER__UTF8_H_
#define HTMLPARSER__UTF8_H_

#include <array>
#include <cstdint>

namespace htmlparser {

// True when byte `c` is ASCII (0x00..0x7f): its high bit is clear, so the byte
// is a complete code point by itself.
// Both the argument and the whole expansion are parenthesized so that the
// macro composes correctly with neighbouring operators, e.g. IS_ASCII(a | b).
// Note: `c` is evaluated exactly once.
#define IS_ASCII(c) (((c) & 0x80) == 0)

// True when byte `c` is an ASCII decimal digit ('0'..'9').
// Subtracting 0x30 and truncating to uint8_t folds the two range checks into a
// single unsigned comparison: values below '0' wrap around to large numbers.
// Only the low 8 bits of the difference are inspected, so pass byte values.
// The argument is parenthesized so that expressions (e.g. a ternary) are
// subtracted from as a whole.
#define IS_DIGIT(c) (static_cast<uint8_t>((c) - 0x30) < 0xa)

// True when byte `c` is an ASCII letter ('A'..'Z' or 'a'..'z').
// Each range is tested with the subtract-and-truncate trick: values below the
// range start wrap around to large uint8_t numbers and fail the comparison.
// Only the low 8 bits of each difference are inspected, so pass byte values.
// The argument is parenthesized so that compound expressions are handled as a
// whole; note that `c` is evaluated up to twice.
#define IS_ALPHABET(c) \
  ((static_cast<uint8_t>((c) - 0x41) < 0x1a) || \
   (static_cast<uint8_t>((c) - 0x61) < 0x1a))

// True when code point `c` is a Unicode noncharacter: U+FDD0..U+FDEF, or any
// code point up to U+10FFFF whose low 16 bits are FFFE or FFFF.
// `c` is evaluated several times; do not pass expressions with side effects.
#define IS_CODEPOINT_NONCHAR(c) \
  ((c) >= 0xfdd0 && \
   ((c) <= 0xfdef || ((c) & 0xfffe) == 0xfffe) && (c) <= 0x10ffff)

// True when code point `c` is an assignable Unicode character: within
// 0..U+10FFFF, not a surrogate (U+D800..U+DFFF) and not a noncharacter.
// The first test casts to uint32_t so that negative inputs are rejected
// instead of comparing as "less than 0xd800".
// `c` is evaluated several times; do not pass expressions with side effects.
#define IS_CODEPOINT_CHAR(c) \
  (static_cast<uint32_t>(c) < 0xd800 || \
   (0xdfff < (c) && (c) <= 0x10ffff && !IS_CODEPOINT_NONCHAR(c)))

// Number of continuation (trail) bytes that follow lead byte `c`:
//   0xc2..0xdf -> 1, 0xe0..0xef -> 2, 0xf0..0xf4 -> 3.
// Any byte that is not a UTF-8 lead byte (ASCII, trail bytes, 0xc0, 0xc1,
// 0xf5..0xff) yields 0.
// Relies on IS_UTF8_LEAD_BYTE from this header; the argument is forwarded in
// parentheses so compound expressions are classified as a whole.
// `c` is evaluated several times; do not pass expressions with side effects.
#define NUM_TRAIL_BYTES(c) \
  (IS_UTF8_LEAD_BYTE((c)) ? \
       ((static_cast<uint8_t>(c) >= 0xe0) + \
        (static_cast<uint8_t>(c) >= 0xf0) + 1) \
       : 0)

// Trail-byte count implied by byte `c`, computed without branching by adding
// up three threshold comparisons (each contributes 0 or 1):
//   below 0xc2 -> 0, 0xc2..0xdf -> 1, 0xe0..0xef -> 2, 0xf0 and above -> 3.
// No upper bound is applied, so bytes 0xf5..0xff also yield 3.
#define LEAD_BYTE_TRAIL_COUNT(c) \
  ((0xf0 <= static_cast<uint8_t>(c)) + \
   (0xe0 <= static_cast<uint8_t>(c)) + \
   (0xc2 <= static_cast<uint8_t>(c)))

// Number of bytes needed to encode code point `c` in UTF-8 (1..4), or 0 when
// `c` cannot be encoded: a surrogate (U+D800..U+DFFF) or a value above
// U+10FFFF. The value is compared as uint32_t, so negative inputs yield 0.
#define CODE_POINT_NUM_BYTES(c) \
  (static_cast<uint32_t>(c) < 0x80 ? 1 \
   : static_cast<uint32_t>(c) < 0x800 ? 2 \
   : static_cast<uint32_t>(c) < 0xd800 ? 3 \
   : (static_cast<uint32_t>(c) < 0xe000 || \
      static_cast<uint32_t>(c) >= 0x110000) ? 0 \
   : static_cast<uint32_t>(c) < 0x10000 ? 3 \
   : 4)

// Payload of continuation byte `c`: its low six bits when the top two bits are
// `10`, otherwise 0 (the byte is not a continuation byte).
#define READ_TRAIL_BYTE(c) \
  ((0x80 != (static_cast<uint8_t>(c) & 0xc0)) ? 0 : ((c) & 0x3f))

// Well-formed UTF-8 byte sequences and the macros that validate them.
// Ref: Table 3-7 in https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
// +-------------------+------------+-------------+------------+-------------+
// | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
// +-------------------+------------+-------------+------------+-------------+
// | U+0000..U+007F | 00..7F | | | |
// +-------------------+------------+-------------+------------+-------------+
// | U+0080..U+07FF | C2..DF | 80..BF | | |
// +-------------------+------------+-------------+------------+-------------+
// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
// +-------------------+------------+-------------+------------+-------------+
// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
// +-------------------+------------+-------------+------------+-------------+
// | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
// +-------------------+------------+-------------+------------+-------------+
// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
// +-------------------+------------+-------------+------------+-------------+
// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
// +-------------------+------------+-------------+------------+-------------+
// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
// +-------------------+------------+-------------+------------+-------------+
// | U+100000..U+10FFFF| F4 | 80..8F | 80..BF | 80..BF |
// +-------------------+------------+-------------+------------+-------------+

// Allowed second bytes for lead bytes 0xe0..0xef, as consulted by
// IS_UTF8_TRAIL_2ND_BYTE_VALID. The table is indexed by the low nibble of the
// lead byte; bit n of an entry is set when a second byte whose top three bits
// (byte >> 5) equal n is acceptable. Bit 4 (0x10) therefore covers 0x80..0x9f
// and bit 5 (0x20) covers 0xa0..0xbf.
static constexpr std::array<uint8_t, 16> k3ByteTrailByteValidity {
  0x20,                    // e0: a0..bf only
  0x30, 0x30, 0x30, 0x30,  // e1..e4: 80..bf
  0x30, 0x30, 0x30, 0x30,  // e5..e8: 80..bf
  0x30, 0x30, 0x30, 0x30,  // e9..ec: 80..bf
  0x10,                    // ed: 80..9f only
  0x30, 0x30               // ee..ef: 80..bf
};

// Allowed second bytes for lead bytes 0xf0 and above, as consulted by
// IS_UTF8_TRAIL_3RD_BYTE_VALID. The table is indexed by the high nibble of the
// byte that follows the lead; bit n of an entry is set when that byte is
// acceptable after a lead byte whose low three bits (lead & 7) equal n.
static constexpr std::array<uint8_t, 16> k4ByteTrailByteValidity {
  0x00, 0x00, 0x00, 0x00,  // 00..3f: never valid
  0x00, 0x00, 0x00, 0x00,  // 40..7f: never valid
  0x1E,                    // 80..8f: valid after f1..f4
  0x0F, 0x0F, 0x0F,        // 90..bf: valid after f0..f3
  0x00, 0x00, 0x00, 0x00   // c0..ff: never valid
};

// True when byte `c` is a UTF-8 lead byte (0xc2..0xf4).
// Subtracting 0xc2 and truncating to uint8_t folds both bounds into a single
// unsigned comparison: bytes below 0xc2 wrap around to values above 0x32.
// Only the low 8 bits of the difference are inspected, so pass byte values.
// The argument is parenthesized so that compound expressions (e.g. a ternary)
// are subtracted from as a whole.
#define IS_UTF8_LEAD_BYTE(c) (static_cast<uint8_t>((c) - 0xc2) <= 0x32)

// True when byte `c` is a UTF-8 continuation byte (0x80..0xbf).
// Reinterpreted as a signed 8-bit value, exactly those bytes fall below -0x40,
// so a single signed comparison covers the whole range.
#define IS_UTF8_TRAIL_BYTE(c) (-0x40 > static_cast<int8_t>(c))

// True when `trail_byte` is an acceptable second byte after `lead_byte` for
// two- and three-byte sequences.
//  - Leads below 0xe0: any continuation byte (0x80..0xbf) is accepted. The
//    lead byte itself is not validated here.
//  - Leads 0xe0 and above: the low nibble of the lead selects an entry of
//    htmlparser::k3ByteTrailByteValidity and the top three bits of the trail
//    byte select the bit to test. This rejects the overlong forms after 0xe0
//    (second byte 80..9f) and the surrogate range after 0xed (a0..bf).
// The table only describes three-byte leads, so results for leads 0xf0 and
// above are not meaningful; use IS_UTF8_TRAIL_3RD_BYTE_VALID for those.
// The lead is compared as uint8_t so that a plain (possibly signed) `char`
// lead is classified by its byte value. The expansion is fully parenthesized
// and always yields a bool.
#define IS_UTF8_TRAIL_2ND_BYTE_VALID(lead_byte, trail_byte) \
  (static_cast<uint8_t>(lead_byte) < 0xe0 ? \
       IS_UTF8_TRAIL_BYTE(trail_byte) : \
       ((htmlparser::k3ByteTrailByteValidity[ \
             static_cast<uint8_t>(lead_byte) & 0xf] & \
         (1 << (static_cast<uint8_t>(trail_byte) >> 5))) != 0))

// True when `trail_byte` is an acceptable byte directly after `lead_byte` for
// lead bytes 0xf0 and above; always false for any smaller lead.
// The high nibble of the trail byte selects an entry of
// htmlparser::k4ByteTrailByteValidity and the low three bits of the lead byte
// select the bit to test. Per that table: 0xf0 accepts 90..bf, 0xf1..0xf3
// accept 80..bf, 0xf4 accepts 80..8f, and 0xf5..0xff accept nothing.
// The lead is compared as uint8_t so that a plain (possibly signed) `char`
// lead is classified by its byte value. The expansion is fully parenthesized
// and always yields a bool.
#define IS_UTF8_TRAIL_3RD_BYTE_VALID(lead_byte, trail_byte) \
  (static_cast<uint8_t>(lead_byte) >= 0xf0 && \
   (htmlparser::k4ByteTrailByteValidity[ \
        static_cast<uint8_t>(trail_byte) >> 4] & \
    (1 << (static_cast<uint8_t>(lead_byte) & 7))) != 0)

// Decoders for two-, three- and four-byte UTF-8 sequences. Each one strips the
// length marker bits from the lead byte and the `10` marker bits from every
// continuation byte (via READ_TRAIL_BYTE, which yields 0 for a byte that is
// not a continuation byte), then packs the payload bits into one code point.
// No well-formedness checks (overlong forms, surrogates, range) are done.
// Every expansion is wrapped in parentheses so that the result can be used
// inside larger expressions such as TO_CODEPOINT(a, b) == x.
#define _DECODE_UTF8_2(c1, c2) \
  ((((c1) & 0b11111) << 6) | READ_TRAIL_BYTE(c2))

// The continuation bytes must be masked here as well: e.g. E2 8C 98 is U+2318,
// whereas OR-ing in the raw bytes would give 0x2398.
#define _DECODE_UTF8_3(c1, c2, c3) \
  ((((c1) & 0b1111) << 12) | \
   (READ_TRAIL_BYTE(c2) << 6) | \
   READ_TRAIL_BYTE(c3))

#define _DECODE_UTF8_4(c1, c2, c3, c4) \
  ((((c1) & 0b111) << 18) | \
   (READ_TRAIL_BYTE(c2) << 12) | \
   (READ_TRAIL_BYTE(c3) << 6) | \
   READ_TRAIL_BYTE(c4))

// Argument-counting selector: expands to its sixth argument. TO_CODEPOINT
// places the caller's bytes in front of the three decoder invocations, so the
// number of bytes shifts the matching decoder into the FUNC position.
#define _DECODE_UTF8_X(x, A, B, C, D, FUNC, ...) FUNC

// Decodes the 2, 3 or 4 bytes given as arguments (lead byte first) into a code
// point, dispatching on the number of arguments.
#define TO_CODEPOINT(...) \
  _DECODE_UTF8_X(, ##__VA_ARGS__, \
                 _DECODE_UTF8_4(__VA_ARGS__), \
                 _DECODE_UTF8_3(__VA_ARGS__), \
                 _DECODE_UTF8_2(__VA_ARGS__))

// True when code point `c` is a UTF-16 surrogate (U+D800..U+DFFF): clearing
// the low eleven bits of such a value leaves exactly 0xd800.
#define IS_SURROGATE(c) (0xd800 == (0xfffff800 & (c)))

} // namespace htmlparser


#endif // HTMLPARSER__UTF8_H_
203 changes: 203 additions & 0 deletions validator/cpp/htmlparser/utf8_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#include "utf8.h"

#include "gtest/gtest.h"

// Classifies every ASCII letter and digit with IS_ASCII, IS_DIGIT and
// IS_ALPHABET.
TEST(UTF8Test, AsciiCharsTest) {
  // Letters: walk both cases in lock-step through the 26 alphabet positions.
  for (int offset = 0; offset < 26; ++offset) {
    const uint8_t lower = static_cast<uint8_t>('a' + offset);
    EXPECT_TRUE(IS_ASCII(lower));
    EXPECT_FALSE(IS_DIGIT(lower));
    EXPECT_TRUE(IS_ALPHABET(lower));

    const uint8_t upper = static_cast<uint8_t>('A' + offset);
    EXPECT_TRUE(IS_ASCII(upper));
    EXPECT_FALSE(IS_DIGIT(upper));
    EXPECT_TRUE(IS_ALPHABET(upper));
  }

  // Decimal digits are ASCII but not alphabetic.
  for (int offset = 0; offset < 10; ++offset) {
    const uint8_t digit = static_cast<uint8_t>('0' + offset);
    EXPECT_TRUE(IS_ASCII(digit));
    EXPECT_TRUE(IS_DIGIT(digit));
    EXPECT_FALSE(IS_ALPHABET(digit));
  }
}

// Decodes one four-, one three- and one two-byte sequence and checks the
// resulting code points.
TEST(UTF8Test, DecodeUtf8SymbolTest) {
  // F0 9D 8C 86 encodes U+1D306.
  EXPECT_EQ(TO_CODEPOINT(0xf0, 0x9d, 0x8c, 0x86), 119558);
  // E2 8C 98 encodes U+2318, i.e. 8984. The value 9112 (0x2398) is what comes
  // out when the `10` marker bits of the continuation bytes are not masked
  // off, so it must not be used as the expectation.
  EXPECT_EQ(TO_CODEPOINT(0xe2, 0x8c, 0x98), 8984);
  // C5 9A encodes U+015A.
  EXPECT_EQ(TO_CODEPOINT(0xc5, 0x9a), 346);
}

// READ_TRAIL_BYTE yields the six payload bits of a continuation byte and 0 for
// any other byte.
TEST(UTF8Test, ReadContinuationByteTest) {
  // Top bits are 110 (a lead-byte pattern), so this is not a continuation
  // byte and nothing is extracted.
  constexpr int kNotContinuation = 0b11000001;
  EXPECT_EQ(0, READ_TRAIL_BYTE(kNotContinuation));

  // Top bits are 10: the marker is stripped and the low six bits remain.
  constexpr int kContinuation = 0b10111111;
  EXPECT_EQ(0b00111111, READ_TRAIL_BYTE(kContinuation));
}

// IS_UTF8_TRAIL_BYTE accepts continuation bytes and rejects lead bytes.
TEST(UTF8Test, IsTrailByteTest) {
  // Continuation bytes taken from sample multi-byte sequences.
  constexpr int kTrailBytes[] = {0x9d, 0x8c, 0x86, 0x98, 0x9a};
  for (const int byte : kTrailBytes) {
    EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(byte));
  }

  // Lead bytes of four-, three- and two-byte sequences.
  constexpr int kLeadBytes[] = {0xf0, 0xe2, 0xc5};
  for (const int byte : kLeadBytes) {
    EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(byte));
  }
}

// IS_UTF8_LEAD_BYTE must hold exactly for bytes 0xc2..0xf4; every one of the
// 256 byte values is checked.
TEST(UTF8Test, IsLeadingByteTest) {
  // An int counter avoids the wrap-around a uint8_t counter would hit at 0xff.
  for (int value = 0; value <= 0xff; ++value) {
    const uint8_t byte = static_cast<uint8_t>(value);
    if (0xc2 <= value && value <= 0xf4) {
      EXPECT_TRUE(IS_UTF8_LEAD_BYTE(byte));
    } else {
      EXPECT_FALSE(IS_UTF8_LEAD_BYTE(byte));
    }
  }
}

// LEAD_BYTE_TRAIL_COUNT reports how many continuation bytes follow a lead byte.
TEST(UTF8Test, CodePointByteSequenceCountTest) {
  // Spot checks: one ASCII byte and one lead byte per sequence length.
  EXPECT_EQ(0, LEAD_BYTE_TRAIL_COUNT('a'));
  EXPECT_EQ(1, LEAD_BYTE_TRAIL_COUNT(0xc5));
  EXPECT_EQ(2, LEAD_BYTE_TRAIL_COUNT(0xe2));
  EXPECT_EQ(3, LEAD_BYTE_TRAIL_COUNT(0xf0));

  // Exhaustive sweep of the lead-byte range 0xc2..0xf4:
  //   c2..df -> 1, e0..ef -> 2, f0..f4 -> 3 subsequent bytes.
  for (int value = 0xc2; value <= 0xf4; ++value) {
    const uint8_t lead = static_cast<uint8_t>(value);
    if (value <= 0xdf) {
      EXPECT_EQ(1, LEAD_BYTE_TRAIL_COUNT(lead));
    } else if (value <= 0xef) {
      EXPECT_EQ(2, LEAD_BYTE_TRAIL_COUNT(lead));
    } else {
      EXPECT_EQ(3, LEAD_BYTE_TRAIL_COUNT(lead));
    }
  }
}

// CODE_POINT_NUM_BYTES returns the UTF-8 encoded length of a code point; one
// sample per length is checked.
TEST(UTF8Test, CodePointNumBytesTest) {
  // One byte: ASCII.
  EXPECT_EQ(1, CODE_POINT_NUM_BYTES('a'));
  // Two bytes: U+015A.
  EXPECT_EQ(2, CODE_POINT_NUM_BYTES(346 /*"Ś"*/));
  // Three bytes: U+2398.
  EXPECT_EQ(3, CODE_POINT_NUM_BYTES(9112));
  // Four bytes: U+1D306.
  EXPECT_EQ(4, CODE_POINT_NUM_BYTES(119558 /*"𝌆"*/));
}

// Exhaustively checks IS_UTF8_TRAIL_2ND_BYTE_VALID for every lead byte in
// 0xc2..0xef against every possible second byte.
TEST(UTF8Test, 3ByteSequenceValidityTest) {
  // Probes all 256 candidate second bytes after `lead`: bytes inside
  // [first_ok, last_ok] must be accepted and every other byte rejected.
  // An int counter is used so the loop can include 0xff without wrapping.
  const auto expect_accepts_only = [](uint8_t lead, uint8_t first_ok,
                                      uint8_t last_ok) {
    for (int value = 0; value <= 0xff; ++value) {
      const uint8_t second = static_cast<uint8_t>(value);
      if (first_ok <= second && second <= last_ok) {
        EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, second));
      } else {
        EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, second));
      }
    }
  };

  // Lead bytes 0xc2..0xdf: any continuation byte 0x80..0xbf.
  for (int lead = 0xc2; lead <= 0xdf; ++lead) {
    expect_accepts_only(static_cast<uint8_t>(lead), 0x80, 0xbf);
  }

  // Lead byte 0xe0: only 0xa0..0xbf.
  expect_accepts_only(0xe0, 0xa0, 0xbf);

  // Lead bytes 0xe1..0xec: 0x80..0xbf.
  for (int lead = 0xe1; lead <= 0xec; ++lead) {
    expect_accepts_only(static_cast<uint8_t>(lead), 0x80, 0xbf);
  }

  // Lead byte 0xed: only 0x80..0x9f; 0xa0..0xbf is rejected.
  expect_accepts_only(0xed, 0x80, 0x9f);

  // Lead bytes 0xee..0xef: 0x80..0xbf.
  for (int lead = 0xee; lead <= 0xef; ++lead) {
    expect_accepts_only(static_cast<uint8_t>(lead), 0x80, 0xbf);
  }
}

// Checks IS_UTF8_TRAIL_3RD_BYTE_VALID: it must reject everything for leads
// below 0xf0 and accept exactly 0x80..0xbf (among bytes 0x00..0xbf) for leads
// 0xf1..0xf3.
TEST(UTF8Test, 4ByteSequenceValidityTest) {
  // Leads below 0xf0 never start a four-byte sequence, whatever byte follows
  // (candidates 0x00..0xfe are probed).
  for (int lead_value = 0; lead_value < 0xf0; ++lead_value) {
    const uint8_t lead = static_cast<uint8_t>(lead_value);
    for (int trail_value = 0; trail_value < 0xff; ++trail_value) {
      const uint8_t trail = static_cast<uint8_t>(trail_value);
      EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(lead, trail));
    }
  }

  // Four-byte leads 0xf1..0xf3: bytes below 0x80 are rejected, 0x80..0xbf
  // are accepted.
  for (int lead_value = 0xf1; lead_value <= 0xf3; ++lead_value) {
    const uint8_t lead = static_cast<uint8_t>(lead_value);
    for (int trail_value = 0; trail_value <= 0xbf; ++trail_value) {
      const uint8_t trail = static_cast<uint8_t>(trail_value);
      if (trail_value < 0x80) {
        EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(lead, trail));
      } else {
        EXPECT_TRUE(IS_UTF8_TRAIL_3RD_BYTE_VALID(lead, trail));
      }
    }
  }
}

0 comments on commit a66a623

Please sign in to comment.