Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Partition pdf #301

Merged
merged 15 commits into from
May 18, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class EmbeddingConfig(TypedDict):
model_id: str
chunk_size: int
chunk_overlap: int
enable_partition_pdf: bool


# Configure generation parameters for Claude chat responses.
Expand Down Expand Up @@ -42,6 +43,7 @@ class EmbeddingConfig(TypedDict):
# NOTE: Keep in mind that Cohere allows at most 2048 tokens per request.
"chunk_size": 1000,
"chunk_overlap": 200,
"enable_partition_pdf": False,
}

# Configure search parameters used to fetch relevant documents from the vector store.
Expand Down
12 changes: 12 additions & 0 deletions backend/app/repositories/custom_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,12 @@ def find_private_bot_by_id(user_id: str, bot_id: str) -> BotModel:
and "chunk_overlap" in item["EmbeddingParams"]
else 200
),
enable_partition_pdf=(
statefb marked this conversation as resolved.
Show resolved Hide resolved
item["EmbeddingParams"]["enable_partition_pdf"]
if "EmbeddingParams" in item
and "enable_partition_pdf" in item["EmbeddingParams"]
else False
),
),
generation_params=GenerationParamsModel(
**(
Expand Down Expand Up @@ -407,6 +413,12 @@ def find_public_bot_by_id(bot_id: str) -> BotModel:
and "chunk_overlap" in item["EmbeddingParams"]
else 200
),
enable_partition_pdf=(
item["EmbeddingParams"]["enable_partition_pdf"]
if "EmbeddingParams" in item
and "enable_partition_pdf" in item["EmbeddingParams"]
else False
),
),
generation_params=GenerationParamsModel(
**(
Expand Down
1 change: 1 addition & 0 deletions backend/app/repositories/models/custom_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
class EmbeddingParamsModel(BaseModel):
    """Embedding settings stored with a bot.

    Holds the chunking parameters and the PDF-partitioning flag that the
    repository layer reads from a bot item's ``EmbeddingParams`` entry.
    """

    # Chunking parameters; the repository falls back to 200 for the overlap
    # when the stored item has no value.
    chunk_size: int
    chunk_overlap: int
    # Defaults to False, the same fallback the repository readers apply when
    # a stored item predates this field, so existing constructors that omit
    # it keep working.
    # NOTE(review): presumably selects hi-res PDF partitioning in the
    # embedding loader -- confirm against the caller.
    enable_partition_pdf: bool = False


class KnowledgeModel(BaseModel):
Expand Down
1 change: 1 addition & 0 deletions backend/app/routes/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def get_private_bot(request: Request, bot_id: str):
embedding_params=EmbeddingParams(
chunk_size=bot.embedding_params.chunk_size,
chunk_overlap=bot.embedding_params.chunk_overlap,
enable_partition_pdf=bot.embedding_params.enable_partition_pdf,
),
knowledge=Knowledge(
source_urls=bot.knowledge.source_urls,
Expand Down
3 changes: 3 additions & 0 deletions backend/app/routes/schemas/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
class EmbeddingParams(BaseSchema):
    """Embedding settings exchanged through the bot API.

    Mirrors the embedding parameters kept on the bot model: the chunking
    configuration plus the flag that opts in to PDF partitioning.
    """

    # Chunking parameters supplied by the client.
    chunk_size: int
    chunk_overlap: int
    # Optional with a False default (matching the default embedding config)
    # so that payloads written before this field existed, which carry only
    # chunk_size and chunk_overlap, still validate.
    enable_partition_pdf: bool = False


class GenerationParams(BaseSchema):
Expand Down Expand Up @@ -94,6 +95,8 @@ def is_embedding_required(self, current_bot_model: BotModel) -> bool:
== current_bot_model.embedding_params.chunk_size
and self.embedding_params.chunk_overlap
== current_bot_model.embedding_params.chunk_overlap
and self.embedding_params.enable_partition_pdf
== current_bot_model.embedding_params.enable_partition_pdf
):
pass
else:
Expand Down
16 changes: 16 additions & 0 deletions backend/app/usecases/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ def create_new_bot(user_id: str, bot_input: BotInput) -> BotOutput:
else DEFAULT_EMBEDDING_CONFIG["chunk_overlap"]
)

enable_partition_pdf = (
bot_input.embedding_params.enable_partition_pdf
if bot_input.embedding_params
else DEFAULT_EMBEDDING_CONFIG["enable_partition_pdf"]
)

generation_params = (
bot_input.generation_params.model_dump()
if bot_input.generation_params
Expand Down Expand Up @@ -158,6 +164,7 @@ def create_new_bot(user_id: str, bot_input: BotInput) -> BotOutput:
embedding_params=EmbeddingParamsModel(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParamsModel(**generation_params),
search_params=SearchParamsModel(**search_params),
Expand Down Expand Up @@ -185,6 +192,7 @@ def create_new_bot(user_id: str, bot_input: BotInput) -> BotOutput:
embedding_params=EmbeddingParams(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParams(**generation_params),
search_params=SearchParams(**search_params),
Expand Down Expand Up @@ -239,6 +247,12 @@ def modify_owned_bot(
else DEFAULT_EMBEDDING_CONFIG["chunk_overlap"]
)

enable_partition_pdf = (
modify_input.embedding_params.enable_partition_pdf
if modify_input.embedding_params
else DEFAULT_EMBEDDING_CONFIG["enable_partition_pdf"]
)

generation_params = (
modify_input.generation_params.model_dump()
if modify_input.generation_params
Expand Down Expand Up @@ -266,6 +280,7 @@ def modify_owned_bot(
embedding_params=EmbeddingParamsModel(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParamsModel(**generation_params),
search_params=SearchParamsModel(**search_params),
Expand All @@ -286,6 +301,7 @@ def modify_owned_bot(
embedding_params=EmbeddingParams(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_partition_pdf=enable_partition_pdf,
),
generation_params=GenerationParams(**generation_params),
search_params=SearchParams(**search_params),
Expand Down
2 changes: 2 additions & 0 deletions backend/embedding.requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ unstructured[docx]==0.12.6
unstructured[xlsx]==0.12.6
unstructured[pptx]==0.12.6
unstructured[md]==0.12.6
retry==0.9.2
types-retry==0.9.9.4
30 changes: 27 additions & 3 deletions backend/embedding/loaders/s3.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,32 @@
import os
import tempfile

import logging
statefb marked this conversation as resolved.
Show resolved Hide resolved
import boto3
from distutils.util import strtobool
from embedding.loaders.base import BaseLoader, Document
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class S3FileLoader(BaseLoader):
"""Loads a document from a file in S3.
Reference: `langchain_community.document_loaders.S3FileLoader` class
"""

def __init__(self, bucket: str, key: str, mode: str = "single"):
def __init__(
self,
bucket: str,
key: str,
mode: str = "single",
enable_partition_pdf: bool = False,
):
self.bucket = bucket
self.key = key
self.mode = mode
self.enable_partition_pdf = enable_partition_pdf

def _get_elements(self) -> list:
"""Get elements."""
Expand All @@ -23,7 +35,19 @@ def _get_elements(self) -> list:
file_path = f"{temp_dir}/{self.key}"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
s3.download_file(self.bucket, self.key, file_path)
return partition(filename=file_path)
extension = os.path.splitext(file_path)[1]

if extension == ".pdf" and self.enable_partition_pdf == True:
logger.info(f"Start partitioning using hi-resolution mode: {file_path}")
return partition_pdf(
filename=file_path,
strategy="hi_res",
infer_table_structure=True,
extract_images_in_pdf=False,
)
else:
logger.info(f"Start partitioning using auto mode: {file_path}")
return partition(filename=file_path)

def _get_metadata(self) -> dict:
return {"source": f"s3://{self.bucket}/{self.key}"}
Expand Down
Loading
Loading