From affd92eea6feee38b121f78f7bc53086682f4749 Mon Sep 17 00:00:00 2001 From: zzzxl1993 <474696115@qq.com> Date: Mon, 29 Apr 2024 16:45:14 +0800 Subject: [PATCH] [fix](inverted index) special characters cause buffer overflow in Unicode tokenization. --- .../standard95/StandardTokenizerImpl.cpp | 52 ++++--------------- src/core/CLucene/util/stringUtil.h | 4 +- 2 files changed, 12 insertions(+), 44 deletions(-) diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp b/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp index e8ce8f77687..0dfb116557f 100644 --- a/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp +++ b/src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp @@ -58,7 +58,7 @@ const std::vector StandardTokenizerImpl::ZZ_ERROR_MSG = { "Error: pushback value was too large"}; StandardTokenizerImpl::StandardTokenizerImpl(lucene::util::Reader* reader) - : zzBuffer(ZZ_BUFFERSIZE), zzReader(reader) {} + : zzReader(reader), zzBuffer((reader == nullptr) ? 0 : reader->size()) {} std::string_view StandardTokenizerImpl::getText() { return std::string_view(zzBuffer.data() + zzStartRead, @@ -67,53 +67,20 @@ std::string_view StandardTokenizerImpl::getText() { bool StandardTokenizerImpl::zzRefill() { if (zzStartRead > 0) { - zzEndRead += zzFinalHighSurrogate; - zzFinalHighSurrogate = 0; - std::copy_n(zzBuffer.begin() + zzStartRead, zzEndRead - zzStartRead, - zzBuffer.begin()); - - zzEndRead -= zzStartRead; - zzCurrentPos -= zzStartRead; - zzMarkedPos -= zzStartRead; - zzStartRead = 0; - } - - int32_t requested = zzBuffer.size() - zzEndRead - zzFinalHighSurrogate; - if (requested == 0) { - return true; + return true; } - int32_t numRead = zzReader->readCopy(zzBuffer.data(), zzEndRead, requested); - if (numRead == 0) { - _CLTHROWA(CL_ERR_Runtime, - "Reader returned 0 characters. See JFlex examples/zero-reader " - "for a workaround."); - } + int32_t numRead = zzReader->readCopy(zzBuffer.data(), 0, zzBuffer.size()); if (numRead > 0) { - zzEndRead += numRead; - - int32_t n = - StringUtil::validate_utf8(std::string_view(zzBuffer.data(), zzEndRead)); - if (n == -1) { - yyResetPosition(); - return true; - } + assert(zzBuffer.size() == numRead); + zzEndRead += numRead; - if (n != 0) { - if (numRead == requested) { - zzEndRead -= n; - zzFinalHighSurrogate = n; - } else { - int32_t c = zzReader->read(); - if (c == -1) { + int32_t n = StringUtil::validate_utf8(std::string_view(zzBuffer.data(), zzBuffer.size())); + if (n != 0) { return true; - } else { - _CLTHROWA(CL_ERR_Runtime, "Why did you come here"); - } } - } - return false; + return false; } return true; @@ -126,6 +93,7 @@ void StandardTokenizerImpl::yyclose() { void StandardTokenizerImpl::yyreset(lucene::util::Reader* reader) { zzReader = reader; + zzBuffer.resize(reader->size()); yyResetPosition(); zzLexicalState = YYINITIAL; } @@ -181,7 +149,7 @@ int32_t StandardTokenizerImpl::getNextToken() { { while (true) { - if (zzCurrentPosL < zzEndReadL) { + if (zzCurrentPosL < zzEndReadL && (zzCurrentPosL - zzStartRead) < ZZ_BUFFERSIZE) { size_t len = 0; zzInput = decodeUtf8ToCodepoint( std::string_view(zzBufferL.data() + zzCurrentPosL, zzEndReadL), diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h index e7d41e1d832..278c2c48328 100644 --- a/src/core/CLucene/util/stringUtil.h +++ b/src/core/CLucene/util/stringUtil.h @@ -297,10 +297,10 @@ class StringUtil { } else { if ((c & 0xC0) != 0x80) return -1; codepoint = (codepoint << 6) | (c & 0x3F); - if (!is_valid_codepoint(codepoint)) { + bytes_in_char--; + if (bytes_in_char == 0 && !is_valid_codepoint(codepoint)) { return -1; } - bytes_in_char--; surplus_bytes++; } }