Skip to content

Commit

Permalink
Partition pdf (#301)
Browse files Browse the repository at this point in the history
* マルチプロセスでPDFの解析, unstructured.partition.pdfによる詳細解析

* CIのエラー対応

* チャットボットごとにpartition_pdfの設定を切り替える

* mypyとblackの実行

* CIエラー修正

* レビューコメント反映

* cdk testを更新

* インデント誤りを訂正

* s3 loaderのログ修正

* enablePartitionPdfのメッセージ内容に、時間がかかりコストが増える旨を追記

* conflict解消時のケアレスミスを修正

* CIエラー修正

* コメント反映

EMBEDDING_CONTAINER_VCPU, EMBEDDING_CONTAINER_MEMORYからデフォルト値の設定をはずす

retryのパラメータをpostgres用とupdate_sync_status用で別々にする
  • Loading branch information
fsatsuki committed May 18, 2024
1 parent de4f042 commit 4fae0bb
Show file tree
Hide file tree
Showing 23 changed files with 223 additions and 52 deletions.
2 changes: 2 additions & 0 deletions backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class EmbeddingConfig(TypedDict):
model_id: str
chunk_size: int
chunk_overlap: int
enable_partition_pdf: bool


# Configure generation parameter for Claude chat response.
Expand Down Expand Up @@ -42,6 +43,7 @@ class EmbeddingConfig(TypedDict):
# NOTE: consider that cohere allows up to 2048 tokens per request
"chunk_size": 1000,
"chunk_overlap": 200,
"enable_partition_pdf": False,
}

# Configure search parameter to fetch relevant documents from vector store.
Expand Down
12 changes: 12 additions & 0 deletions backend/app/repositories/custom_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,12 @@ def find_private_bot_by_id(user_id: str, bot_id: str) -> BotModel:
and "chunk_overlap" in item["EmbeddingParams"]
else 200
),
enable_partition_pdf=(
item["EmbeddingParams"]["enable_partition_pdf"]
if "EmbeddingParams" in item
and "enable_partition_pdf" in item["EmbeddingParams"]
else False
),
),
generation_params=GenerationParamsModel(
**(
Expand Down Expand Up @@ -407,6 +413,12 @@ def find_public_bot_by_id(bot_id: str) -> BotModel:
and "chunk_overlap" in item["EmbeddingParams"]
else 200
),
enable_partition_pdf=(
item["EmbeddingParams"]["enable_partition_pdf"]
if "EmbeddingParams" in item
and "enable_partition_pdf" in item["EmbeddingParams"]
else False
),
),
generation_params=GenerationParamsModel(
**(
Expand Down
1 change: 1 addition & 0 deletions backend/app/repositories/models/custom_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
class EmbeddingParamsModel(BaseModel):
    # Embedding-related settings kept on a bot model.
    # All three fields are required here; callers supply fallback values
    # when the stored item lacks them.

    # Size of each chunk produced when splitting documents.
    # NOTE(review): unit (tokens vs. characters) is not visible here — confirm.
    chunk_size: int
    # Overlap between consecutive chunks, same unit as chunk_size.
    chunk_overlap: int
    # Per-bot switch for the PDF partitioning mode.
    # Presumably forwarded to the embedding loader to select the
    # `partition_pdf` path for PDF files — verify against the caller.
    enable_partition_pdf: bool


class KnowledgeModel(BaseModel):
Expand Down
1 change: 1 addition & 0 deletions backend/app/routes/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def get_private_bot(request: Request, bot_id: str):
embedding_params=EmbeddingParams(
chunk_size=bot.embedding_params.chunk_size,
chunk_overlap=bot.embedding_params.chunk_overlap,
enable_partition_pdf=bot.embedding_params.enable_partition_pdf,
),
knowledge=Knowledge(
source_urls=bot.knowledge.source_urls,
Expand Down
3 changes: 3 additions & 0 deletions backend/app/routes/schemas/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
class EmbeddingParams(BaseSchema):
    # Embedding settings as exposed through the bot API schema.
    # Field names mirror the embedding params held on the bot model.

    # Size of each chunk produced when splitting documents.
    # NOTE(review): unit (tokens vs. characters) is not visible here — confirm.
    chunk_size: int
    # Overlap between consecutive chunks, same unit as chunk_size.
    chunk_overlap: int
    # Whether PDFs should be parsed with the detailed partition_pdf mode.
    # Presumably slower and more costly than the default — TODO confirm.
    enable_partition_pdf: bool


class GenerationParams(BaseSchema):
Expand Down Expand Up @@ -94,6 +95,8 @@ def is_embedding_required(self, current_bot_model: BotModel) -> bool:
== current_bot_model.embedding_params.chunk_size
and self.embedding_params.chunk_overlap
== current_bot_model.embedding_params.chunk_overlap
and self.embedding_params.enable_partition_pdf
== current_bot_model.embedding_params.enable_partition_pdf
):
pass
else:
Expand Down
16 changes: 16 additions & 0 deletions backend/app/usecases/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ def create_new_bot(user_id: str, bot_input: BotInput) -> BotOutput:
else DEFAULT_EMBEDDING_CONFIG["chunk_overlap"]
)

enable_partition_pdf = (
bot_input.embedding_params.enable_partition_pdf
if bot_input.embedding_params
else DEFAULT_EMBEDDING_CONFIG["enable_partition_pdf"]
)

generation_params = (
bot_input.generation_params.model_dump()
if bot_input.generation_params
Expand Down Expand Up @@ -158,6 +164,7 @@ def create_new_bot(user_id: str, bot_input: BotInput) -> BotOutput:
embedding_params=EmbeddingParamsModel(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParamsModel(**generation_params),
search_params=SearchParamsModel(**search_params),
Expand Down Expand Up @@ -185,6 +192,7 @@ def create_new_bot(user_id: str, bot_input: BotInput) -> BotOutput:
embedding_params=EmbeddingParams(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParams(**generation_params),
search_params=SearchParams(**search_params),
Expand Down Expand Up @@ -239,6 +247,12 @@ def modify_owned_bot(
else DEFAULT_EMBEDDING_CONFIG["chunk_overlap"]
)

enable_partition_pdf = (
modify_input.embedding_params.enable_partition_pdf
if modify_input.embedding_params
else DEFAULT_EMBEDDING_CONFIG["enable_partition_pdf"]
)

generation_params = (
modify_input.generation_params.model_dump()
if modify_input.generation_params
Expand Down Expand Up @@ -266,6 +280,7 @@ def modify_owned_bot(
embedding_params=EmbeddingParamsModel(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParamsModel(**generation_params),
search_params=SearchParamsModel(**search_params),
Expand All @@ -286,6 +301,7 @@ def modify_owned_bot(
embedding_params=EmbeddingParams(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParams(**generation_params),
search_params=SearchParams(**search_params),
Expand Down
2 changes: 2 additions & 0 deletions backend/embedding.requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ unstructured[docx]==0.12.6
unstructured[xlsx]==0.12.6
unstructured[pptx]==0.12.6
unstructured[md]==0.12.6
retry==0.9.2
types-retry==0.9.9.4
30 changes: 27 additions & 3 deletions backend/embedding/loaders/s3.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,32 @@
import os
import tempfile

import logging
import boto3
from distutils.util import strtobool
from embedding.loaders.base import BaseLoader, Document
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class S3FileLoader(BaseLoader):
"""Loads a document from a file in S3.
Reference: `langchain_community.document_loaders.S3FileLoader` class
"""

def __init__(self, bucket: str, key: str, mode: str = "single"):
def __init__(
self,
bucket: str,
key: str,
mode: str = "single",
enable_partition_pdf: bool = False,
):
self.bucket = bucket
self.key = key
self.mode = mode
self.enable_partition_pdf = enable_partition_pdf

def _get_elements(self) -> list:
"""Get elements."""
Expand All @@ -23,7 +35,19 @@ def _get_elements(self) -> list:
file_path = f"{temp_dir}/{self.key}"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
s3.download_file(self.bucket, self.key, file_path)
return partition(filename=file_path)
extension = os.path.splitext(file_path)[1]

if extension == ".pdf" and self.enable_partition_pdf == True:
logger.info(f"Start partitioning using hi-resolution mode: {file_path}")
return partition_pdf(
filename=file_path,
strategy="hi_res",
infer_table_structure=True,
extract_images_in_pdf=False,
)
else:
logger.info(f"Start partitioning using auto mode: {file_path}")
return partition(filename=file_path)

def _get_metadata(self) -> dict:
return {"source": f"s3://{self.bucket}/{self.key}"}
Expand Down

0 comments on commit 4fae0bb

Please sign in to comment.