Skip to content

Commit

Permalink
[fix](inverted index) special characters cause buffer overflow in Unicode tokenization.
Browse files Browse the repository at this point in the history
  • Loading branch information
zzzxl1993 committed Apr 29, 2024
1 parent 9f849a4 commit 82e7b69
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 43 deletions.
50 changes: 9 additions & 41 deletions src/core/CLucene/analysis/standard95/StandardTokenizerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ const std::vector<std::string> StandardTokenizerImpl::ZZ_ERROR_MSG = {
"Error: pushback value was too large"};

StandardTokenizerImpl::StandardTokenizerImpl(lucene::util::Reader* reader)
: zzBuffer(ZZ_BUFFERSIZE), zzReader(reader) {}
: zzReader(reader), zzBuffer((reader == nullptr) ? 0 : reader->size()) {}

std::string_view StandardTokenizerImpl::getText() {
return std::string_view(zzBuffer.data() + zzStartRead,
Expand All @@ -67,53 +67,20 @@ std::string_view StandardTokenizerImpl::getText() {

bool StandardTokenizerImpl::zzRefill() {
if (zzStartRead > 0) {
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
std::copy_n(zzBuffer.begin() + zzStartRead, zzEndRead - zzStartRead,
zzBuffer.begin());

zzEndRead -= zzStartRead;
zzCurrentPos -= zzStartRead;
zzMarkedPos -= zzStartRead;
zzStartRead = 0;
}

int32_t requested = zzBuffer.size() - zzEndRead - zzFinalHighSurrogate;
if (requested == 0) {
return true;
return true;
}

int32_t numRead = zzReader->readCopy(zzBuffer.data(), zzEndRead, requested);
if (numRead == 0) {
_CLTHROWA(CL_ERR_Runtime,
"Reader returned 0 characters. See JFlex examples/zero-reader "
"for a workaround.");
}
int32_t numRead = zzReader->readCopy(zzBuffer.data(), 0, zzBuffer.size());
if (numRead > 0) {
zzEndRead += numRead;

int32_t n =
StringUtil::validate_utf8(std::string_view(zzBuffer.data(), zzEndRead));
if (n == -1) {
yyResetPosition();
return true;
}
assert(zzBuffer.size() == numRead);
zzEndRead += numRead;

if (n != 0) {
if (numRead == requested) {
zzEndRead -= n;
zzFinalHighSurrogate = n;
} else {
int32_t c = zzReader->read();
if (c == -1) {
int32_t n = StringUtil::validate_utf8(std::string_view(zzBuffer.data(), zzBuffer.size()));
if (n != 0) {
return true;
} else {
_CLTHROWA(CL_ERR_Runtime, "Why did you come here");
}
}
}

return false;
return false;
}

return true;
Expand All @@ -126,6 +93,7 @@ void StandardTokenizerImpl::yyclose() {

/**
 * Rebinds the scanner to a new input reader and restarts scanning.
 *
 * The internal buffer is resized to the size reported by the reader. A null
 * reader produces an empty buffer instead of being dereferenced, which is the
 * same null handling the constructor applies.
 *
 * @param reader new input source; may be nullptr.
 */
void StandardTokenizerImpl::yyreset(lucene::util::Reader* reader) {
    zzReader = reader;
    // Guard the dereference: the constructor accepts a null reader, so accept it here too.
    zzBuffer.resize((reader == nullptr) ? 0 : reader->size());
    yyResetPosition();
    zzLexicalState = YYINITIAL;
}
Expand Down
4 changes: 2 additions & 2 deletions src/core/CLucene/util/stringUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,10 +297,10 @@ class StringUtil {
} else {
if ((c & 0xC0) != 0x80) return -1;
codepoint = (codepoint << 6) | (c & 0x3F);
if (!is_valid_codepoint(codepoint)) {
bytes_in_char--;
if (bytes_in_char == 0 && !is_valid_codepoint(codepoint)) {
return -1;
}
bytes_in_char--;
surplus_bytes++;
}
}
Expand Down

0 comments on commit 82e7b69

Please sign in to comment.