diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h index 02c4e4611e02..e45057c9b84d 100644 --- a/lldb/include/lldb/ValueObject/DILLexer.h +++ b/lldb/include/lldb/ValueObject/DILLexer.h @@ -227,8 +227,8 @@ class DILLexer { m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)), m_tokens_idx(0) {} - static llvm::Expected Lex(llvm::StringRef expr, - llvm::StringRef &remainder); + static llvm::Expected + Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position); bool IsStringLiteral(Token::Kind kind) { return (kind == Token::string_literal || diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index f41889fef921..88ef57cc42f6 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -12,39 +12,14 @@ //===----------------------------------------------------------------------===// #include "lldb/ValueObject/DILLexer.h" -//#include "llvm/ADT/StringMap.h" +#include "clang/Basic/CharInfo.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Unicode.h" +#include namespace lldb_private::dil { -/* -const llvm::StringMap Keywords = { - {"bool", Token::kw_bool}, - {"char", Token::kw_char}, - {"char16_t", Token::kw_char16_t}, - {"char32_t", Token::kw_char32_t}, - {"const", Token::kw_const}, - {"double", Token::kw_double}, - {"dynamic_cast", Token::kw_dynamic_cast}, - {"false", Token::kw_false}, - {"float", Token::kw_float}, - {"int", Token::kw_int}, - {"long", Token::kw_long}, - {"namespace", Token::kw_namespace}, - {"nullptr", Token::kw_nullptr}, - {"reinterpret_cast", Token::kw_reinterpret_cast}, - {"short", Token::kw_short}, - {"signed", Token::kw_signed}, - {"sizeof", Token::kw_sizeof}, - {"static_cast", Token::kw_static_cast}, - {"this", Token::kw_this}, - {"true", Token::kw_true}, - {"unsigned", Token::kw_unsigned}, - {"void", Token::kw_void}, - {"volatile", Token::kw_volatile}, - {"wchar_t", Token::kw_wchar_t}}; -*/ - llvm::StringRef Token::GetTokenName(Kind kind) { switch (kind){ case Token::amp: return "amp"; @@ -153,22 +128,53 @@ llvm::StringRef Token::GetTokenName(Kind kind) { } } -static bool IsLetter (char c) { +static bool IsLetter(char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); } -static bool IsDigit (char c) { return ('0' <= c && c <= '9'); } +static bool IsDigit(char c) { return ('0' <= c && c <= '9'); } + +inline bool IsOperator(unsigned char c) { + using namespace clang::charinfo; + return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0; +} + +static bool IsValidIdentifierContinuation(char c) { + if (c == '$') + return true; + return !IsOperator(c) && !clang::isWhitespace(c); +} + +static std::optional IsWord(llvm::StringRef &remainder) { + llvm::StringRef::iterator cur_pos = remainder.begin(); + llvm::StringRef::iterator start = cur_pos; + + if (IsDigit(*cur_pos)) + return std::nullopt; + + while (cur_pos < remainder.end()) { + uint8_t c = *cur_pos; + if (c < 0x80) { + if (IsValidIdentifierContinuation(c)) { + cur_pos++; + continue; + } else + break; + } + if (llvm::isLegalUTF8Sequence((const llvm::UTF8 *)cur_pos, + (const llvm::UTF8 *)remainder.end())) { + cur_pos += llvm::getNumBytesForUTF8(*cur_pos); + continue; + } + break; + } -static std::optional IsWord(llvm::StringRef expr, - llvm::StringRef &remainder) { - // Find the longest prefix consisting of letters, digits, underscors and - // '$'. If it doesn't start with a digit, then it's a word. - llvm::StringRef candidate = remainder.take_while( - [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; }); - if (candidate.empty() || IsDigit(candidate[0])) + if (cur_pos == start) return std::nullopt; - remainder = remainder.drop_front(candidate.size()); - return candidate; + + auto length = cur_pos - start; + remainder = remainder.drop_front(length); + return llvm::StringRef(start, length); } static void ConsumeNumberBody(uint32_t &length, char &prev_ch, @@ -224,35 +230,37 @@ static std::optional IsNumber(llvm::StringRef expr, llvm::Expected DILLexer::Create(llvm::StringRef expr) { std::vector tokens; llvm::StringRef remainder = expr; + uint32_t position = 0; do { - if (llvm::Expected t = Lex(expr, remainder)) { + if (llvm::Expected t = Lex(expr, remainder, position)) { tokens.push_back(std::move(*t)); } else { return t.takeError(); } } while (tokens.back().GetKind() != Token::eof); + return DILLexer(expr, std::move(tokens)); } - llvm::Expected DILLexer::Lex(llvm::StringRef expr, - llvm::StringRef &remainder) { - // Skip over whitespace (spaces). + llvm::StringRef &remainder, + uint32_t &position) { + llvm::StringRef::iterator start = remainder.begin(); remainder = remainder.ltrim(); - llvm::StringRef::iterator cur_pos = remainder.begin(); + position += remainder.begin() - start; // Check to see if we've reached the end of our input string. if (remainder.empty()) - return Token(Token::eof, "", (uint32_t)expr.size()); + return Token(Token::eof, "", position); - uint32_t position = cur_pos - expr.begin();; - llvm::StringRef::iterator start = cur_pos; std::optional maybe_number = IsNumber(expr, remainder); if (maybe_number) { std::string number = (*maybe_number).str(); - return Token(Token::numeric_constant, number, position); + auto token = Token(Token::numeric_constant, number, position); + position += number.size(); + return token; } else { - std::optional maybe_word = IsWord(expr, remainder); + std::optional maybe_word = IsWord(remainder); if (maybe_word) { llvm::StringRef word = *maybe_word; Token::Kind kind = llvm::StringSwitch(word) @@ -281,11 +289,12 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, .Case("volatile", Token::kw_volatile) .Case("wchar_t", Token::kw_wchar_t) .Default(Token::identifier); - return Token(kind, word.str(), (uint32_t)position); + auto token = Token(kind, word.str(), position); + position += llvm::sys::unicode::columnWidthUTF8(word.str()); + return token; } } - cur_pos = start; constexpr std::pair operators[] = { {Token::l_square, "["}, {Token::r_square, "]"}, @@ -332,8 +341,11 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, {Token::tilde, "~"}, }; for (auto [kind, str] : operators) { - if (remainder.consume_front(str)) - return Token(kind, str, position); + if (remainder.consume_front(str)) { + auto token = Token(kind, str, position); + position += strlen(str); + return token; + } } // Unrecognized character(s) in string; unable to lex it. diff --git a/lldb/unittests/DIL/DILLexerTests.cpp b/lldb/unittests/DIL/DILLexerTests.cpp index 026a092aa9e8..c6abc8674de6 100644 --- a/lldb/unittests/DIL/DILLexerTests.cpp +++ b/lldb/unittests/DIL/DILLexerTests.cpp @@ -121,8 +121,9 @@ TEST(DILLexerTests, MultiTokenLexTest) { TEST(DILLexerTests, IdentifiersTest) { // These strings should lex into identifier tokens. std::vector valid_identifiers = { - "$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$", - "a_b", "kw_this", "self", "a", "MyName", "kw_namespace"}; + "$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$", + "a_b", "kw_this", "self", "a", "MyName", "kw_namespace", "föo", + "🍫", "שלום"}; // The lexer can lex these strings, but they should not be identifiers. std::vector invalid_identifiers = {"", "::", "(", ")", "234", "2"}; diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index 8cbdbb1fdc3b..08dfd985a08b 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3680,3 +3680,20 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) { EXPECT_THAT(Eval("*\"abc\""), IsError("string literals are not supported")); } #endif + +TEST_F(EvalTest, TestUnicodeInput) { + EXPECT_THAT(Eval("フー + 1"), IsEqual("2")); + EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); + EXPECT_THAT(Eval("föo + 1"), IsEqual("4")); + EXPECT_THAT(Eval("שלום + 1"), IsEqual("5")); + + // Check diagnostic pointer location + EXPECT_THAT(Eval("фу + бар"), + IsError(": use of undeclared identifier 'бар'\n" + "фу + бар\n" + " ^")); + EXPECT_THAT(Eval("フー + бар"), + IsError(": use of undeclared identifier 'бар'\n" + "フー + бар\n" + " ^")); +} \ No newline at end of file diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc index 64ef8dc217c3..c289f93a1202 100644 --- a/lldb/unittests/DIL/Inputs/test_binary.cc +++ b/lldb/unittests/DIL/Inputs/test_binary.cc @@ -1196,6 +1196,14 @@ static void TestStringParsing() { // BREAK(TestStringParsing) } +static void TestUnicodeInput() { + int フー = 1; + int фу = 2; + int föo = 3; + int שלום = 4; + // BREAK(TestUnicodeInput) +} + namespace test_binary { void main() { @@ -1250,6 +1258,7 @@ void main() { TestCharParsing(); TestStringParsing(); + TestUnicodeInput(); // BREAK HERE }