From bed25b317c7ace86dc32589acdeea7495174c9d8 Mon Sep 17 00:00:00 2001 From: Anton Larin Date: Sat, 5 Aug 2023 13:17:37 +0200 Subject: [PATCH] Fix min_tokens logic for grouping documents: documents with (length >= min_tokens) should not be grouped into one document for indexing --- application/parser/token_func.py | 2 +- scripts/parser/token_func.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/parser/token_func.py b/application/parser/token_func.py index 4980de673..aada673fa 100644 --- a/application/parser/token_func.py +++ b/application/parser/token_func.py @@ -25,7 +25,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, extra_info=doc.extra_info) elif len(tiktoken.get_encoding("cl100k_base").encode( - current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens: + current_group.text)) + doc_len < max_tokens and doc_len < min_tokens: current_group.text += " " + doc.text else: docs.append(current_group) diff --git a/scripts/parser/token_func.py b/scripts/parser/token_func.py index e946f9f33..98ab5470a 100644 --- a/scripts/parser/token_func.py +++ b/scripts/parser/token_func.py @@ -24,7 +24,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, extra_info=doc.extra_info) elif len(tiktoken.get_encoding("cl100k_base").encode( - current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens: + current_group.text)) + doc_len < max_tokens and doc_len < min_tokens: current_group.text += " " + doc.text else: docs.append(current_group)