# Preprocessing for complex PDF

## Setting
 - Auto Reload
 - path for utils

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys, os

def add_python_path(module_path):
    if os.path.abspath(module_path) not in sys.path:
        sys.path.append(os.path.abspath(module_path))
        print(f"python path: {os.path.abspath(module_path)} is added")
    else:
        print(f"python path: {os.path.abspath(module_path)} already exists")
    print("sys.path: ", sys.path)

module_path = ".."
add_python_path(module_path)
module_path = "../../.."
add_python_path(module_path)

python path: /home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot is added
sys.path:  ['/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs', '/home/ec2-user/anaconda3/envs/python3/lib/python310.zip', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10/lib-dynload', '', '/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages', '/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr', '/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot']
python path: /home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr already exists
sys.path:  ['/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs', '/home/ec2-user/anaconda3/envs/python3/lib/python310.zip', '/home/ec2-user/anaconda3/envs/python3

## 1. Bedrock Client 생성

In [4]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

### ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----
- os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
- os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
- os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
- os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."

In [5]:
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
)

print (colored("\n== FM lists ==", "green"))
pprint (bedrock_info.get_list_fm_models(verbose=False))

Create new client
  Using region: None
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)
[32m
== FM lists ==[0m
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Claude-V2-1': 'anthropic.claude-v2:1',
 'Claude-V3-Sonnet': 'anthropic.claude-3-sonnet-20240229-v1:0',
 'Cohere-Embeddings-En': 'cohere.embed-english-v3',
 'Cohere-Embeddings-Multilingual': 'cohere.embed-multilingual-v3',
 'Command': 'cohere.command-text-v14',
 'Command-Light': 'cohere.command-light-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'amazon.titan-text-express-v1',
 'Titan-Text-G1-Light': 'amazon.titan-text-lite-v1'}


## 2. Titan Embedding 및 LLM 인 Claude-v3-sonnet 모델 로딩

### LLM 로딩 (Claude-v3-sonnet)

In [7]:
from langchain_community.chat_models import BedrockChat
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [8]:
llm_text = BedrockChat(
    model_id=bedrock_info.get_model_id(model_name="Claude-V3-Sonnet"),
    client=boto3_bedrock,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    model_kwargs={
        "max_tokens": 1024,
        "stop_sequences": ["\n\nHuman"],
        # "temperature": 0,
        # "top_k": 350,
        # "top_p": 0.999
    }
)
llm_text

BedrockChat(client=<botocore.client.BedrockRuntime object at 0x7f2b6357c880>, model_id='anthropic.claude-3-sonnet-20240229-v1:0', model_kwargs={'max_tokens': 1024, 'stop_sequences': ['\n\nHuman']}, streaming=True, callbacks=[<langchain_core.callbacks.streaming_stdout.StreamingStdOutCallbackHandler object at 0x7f2b9ad07b50>])

### Embedding 모델 선택

In [12]:
from langchain.embeddings import BedrockEmbeddings

In [187]:
llm_emb = BedrockEmbeddings(
    client=boto3_bedrock,
    model_id=bedrock_info.get_model_id(model_name="Titan-Embeddings-G1")
)
dimension = 1536
print("Bedrock Embeddings Model Loaded")

Bedrock Embeddings Model Loaded


## 3. 데이터 준비 
- https://python.langchain.com/docs/integrations/document_loaders/unstructured_file

### Extract Text, Table and Image from documents

In [36]:
from unstructured.cleaners.core import clean_extra_whitespace
from langchain_community.document_loaders import UnstructuredFileLoader

Parameters
----------
- filename
    - A string defining the target filename path.
- content_type
    - A string defining the file content in MIME type
- file
    - A file-like object using "rb" mode --> open(filename, "rb").
- metadata_filename
    - When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
- url
    - The url for a remote document. Pass in content_type if you want partition to treat the document as a specific content_type.
- include_page_breaks
    - If True, the output will include page breaks if the filetype supports it
- strategy
    - The strategy to use for partitioning PDF/image. Uses a layout detection model if set to 'hi_res', otherwise partition simply extracts the text from the document and processes it.
- encoding
    - The encoding method used to decode the text input. If None, utf-8 will be used.
- headers
    - The headers to be used in conjunction with the HTTP request if URL is set.
- skip_infer_table_types
    - The document types that you want to skip table extraction with.
- ssl_verify
    - If the URL parameter is set, determines whether or not partition uses SSL verification in the HTTP request.
- languages
    - The languages present in the document, for use in partitioning and/or OCR. For partitioning <BR>
    image or pdf documents with Tesseract, you'll first need to install the appropriate <BR>
    Tesseract language pack. For other partitions, language is detected using naive Bayesian <BR>
    filter via `langdetect`. Multiple languages indicates text could be in either language. <BR>
    Additional Parameters: <BR>
        - detect_language_per_element <BR>
            - Detect language per element instead of at the document level.
    
- pdf_infer_table_structure
    - If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
    additional metadata field, "text_as_html," where the value (string) is a just a
    transformation of the data into an HTML <table>.
    The "text" field for a partitioned Table Element is always present, whether True or False.
- extract_images_in_pdf
    - Only applicable if `strategy=hi_res`.
    If True, any detected images will be saved in the path specified by
    'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
    Deprecation Note: This parameter is marked for deprecation. Future versions will use
    'extract_image_block_types' for broader extraction capabilities.
- extract_image_block_types
    - Only applicable if `strategy=hi_res`.
    Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
    saved in the path specified by 'extract_image_block_output_dir' or stored as base64
    encoded data within metadata fields.
- extract_image_block_to_payload
    - Only applicable if `strategy=hi_res`.
    If True, images of the element type(s) defined in 'extract_image_block_types' will be
    encoded as base64 data and stored in two metadata fields: 'image_base64' and
    'image_mime_type'.
    This parameter facilitates the inclusion of element data directly within the payload,
    especially for web-based applications or APIs.
- extract_image_block_output_dir
    - Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
    The filesystem path for saving images of the element type(s)
    specified in 'extract_image_block_types'.
- xml_keep_tags
    - If True, will retain the XML tags in the output. Otherwise it will simply extract
    the text from within the tags. Only applies to partition_xml.
- request_timeout
    - The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
    requests will block indefinitely.
- hi_res_model_name
    - The layout detection model used when partitioning strategy is set to `hi_res`.
- model_name
    - The layout detection model used when partitioning strategy is set to `hi_res`. To be
    deprecated in favor of `hi_res_model_name`.

In [133]:
loader = UnstructuredFileLoader(
    file_path="./data/complex_pdf/sample.pdf",
    mode="elements",

    strategy="hi_res",
    hi_res_model_name="yolox",

    extract_images_in_pdf=True,
    pdf_infer_table_structure=False,

    extract_image_block_output_dir="./fig",
    extract_image_block_to_payload=True,

    post_processors=[clean_extra_whitespace]
)

In [134]:
%%time
docs = loader.load()

In [135]:
tables, images, texts = [], [], []

for doc in docs:

    category = doc.metadata["category"]

    if category == "Table": tables.append(doc)
    elif category == "Image": images.append(doc)
    else: texts.append(doc)
    
print (f' # texts: {len(texts)} \n # tables: {len(tables)} \n # images: {len(images)}')


 # texts: 56 
 # tables: 2 
 # images: 3


### Summarization of table and image
- BedrockChat with claude3: https://medium.com/@dminhk/building-with-anthropics-claude-3-on-amazon-bedrock-and-langchain-%EF%B8%8F-2b842f9c0ca8

In [136]:
from langchain.schema import Document
from langchain_core.messages import HumanMessage
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

In [137]:
system_prompt = "You are an assistant tasked with summarizing table and image."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)

### For images

In [138]:
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            "url": "data:image/png;base64," + "{image_base64}",
        },
    },
    {
        "type": "text",
        "text": '''
                 Here is the text: <text>{text}</text>
                 Given image and text, give a concise summary.
                 Don't insert any XML tag such as <text> and </text> when answering.
                 Write in Korean.
        '''
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

In [139]:
prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template
    ]
)

#summarize_chain = prompt | llm_text | StrOutputParser()
summarize_chain = {"image_base64": lambda x:x[0], "text": lambda x:x[1]} | prompt | llm_text | StrOutputParser()

In [140]:
img_info = [(i.metadata["image_base64"], i.page_content) for i in images]
image_summaries = summarize_chain.batch(img_info, config={"max_concurrency": 1})

제공된 이미지는 책 제목인 "핵심설명서"라는 한글 텍스트를 보여주고 있습니다. 검정색 바탕에 하얀색 한글 글씨체로 제목이 쓰여 있어 명확하게 보입니다. 이 이미지는 아마도 어떤 서적의 표지나 목차에 해당하는 내용일 것으로 추측됩니다.이 그래프는 수익률에 대한 그래프입니다. x축은 기준 가격 대비 기초자산가격의 백분율을 나타내고, y축은 세전수익률을 나타냅니다. 기준 가격인 100%에서 수익률은 0%이며, 기초자산가격이 130%일 때 수익률은 약 70%까지 상승합니다. 기초자산가격이 100% 원금보장 수준 이하일 경우 수익률은 마이너스가 됩니다.이미지는 세전수익률(%)과 직접기준가격대비 기준자산가액(%)의 관계를 보여주는 그래프입니다. 100% 원금보전 지점에서 세전수익률이 21%일 때 상승렬여율이 70%인 것으로 나타납니다. 또한 직접기준가격대비 기준자산가액이 130%일 때까지 그래프가 이어집니다. 이 그래프를 통해 투자 상품의 수익률과 위험수준의 상관관계를 파악할 수 있습니다.

In [142]:
images_preprocessed = []

for origin, summary in zip(images, image_summaries):
    metadata = origin.metadata
    metadata["origin_image"] = [origin.page_content]
    doc = Document(
        page_content=summary,
        metadata=metadata
    )
    images_preprocessed.append(doc)

### For tables

In [144]:
human_prompt = [
    {
        "type": "text",
        "text": '''
                 Here is the table: <table>{table}</table>
                 Given table, give a concise summary.
                 Don't insert any XML tag such as <table> and </table> when answering.
                 Write in Korean.
        '''
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

In [145]:
prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template
    ]
)

#summarize_chain = prompt | llm_text | StrOutputParser()
summarize_chain = {"table": lambda x:x} | prompt | llm_text | StrOutputParser()

In [146]:
table_info = [t.page_content for t in tables]
table_summaries = summarize_chain.batch(table_info, config={"max_concurrency": 1})

이 표는 만기상환금액을 결정하는 방식을 설명하고 있습니다. 만기평가가격이 최초기준가격의 100% 이상일 경우에는 {(만기평가가격/최초기준가격) × 70%}를 상환받게 되며, 만기평가가격이 최초기준가격의 100% 미만일 경우에는 총액면금액의 100%를 상환받게 됩니다.주어진 표는 KOSPI200 지수와 금 가격의 수익률 및 손익을 보여주고 있습니다. 최초 기준가격 결정일에는 KOSPI200 지수가 250pt, 금 가격이 1,500달러였습니다. 만기평가일 1에서는 KOSPI200 지수가 20% 상승하여 300pt, 금 가격이 10% 상승하여 1,650달러가 되었고, 이에 따라 7.0%의 수익률과 700만원의 세전 이익이 발생했습니다. 반면 만기평가일 2에서는 KOSPI200 지수는 20% 상승한 300pt이었지만, 금 가격은 10% 하락하여 1,350달러가 되어 손익은 0이었습니다.

In [147]:
tables_preprocessed = []

for origin, summary in zip(tables, table_summaries):
    metadata = origin.metadata
    metadata["origin_table"] = [origin.page_content]
    doc = Document(
        page_content=summary,
        metadata=metadata
    )
    tables_preprocessed.append(doc)

### Text Spliter 로 청킹
참고: 검색된 문서/텍스트는 질문에 대답하기에 충분한 정보를 포함할 만큼 커야 합니다. 하지만 LLM 프롬프트에 들어갈 만큼 충분히 작습니다. <BR>
또한 임베딩 모델에는 입력 토큰 길이는 KoSimCSERobert는 512개, titanEmbedding(8,912개) 토큰으로 제한되어 있습니다. <BR>
이 사용 사례를 위해 [RecursiveCharacterTextSplitter](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/recursive_text_splitter.html)를 사용하여 500자가 겹치는 약 92자의 청크를 생성합니다.

In [155]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [160]:
chunk_size = 1024
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ".", " ", ""],
    length_function=len,
)

texts_preprocessed = text_splitter.split_documents(texts)
print(f"Number of documents after split and chunking={len(texts_preprocessed)}")

Number of documents after split and chunking=56


### Merge text, table and image

In [162]:
from itertools import chain

In [179]:
docs_preprocessed = list(chain(texts_preprocessed, tables_preprocessed, images_preprocessed))

In [180]:
docs_preprocessed

[Document(page_content='이 설명서는 금융소비자의 권익 보호 및 금융상품에 대한 이해 증진을 위하여 금융상품의 핵심내용을 쉽게 이해하실 수 있도록 작성한 것입니다. 상품내용을 충분히 이해하신 다음에 청약여부를 결정하시기 바랍니다.', metadata={'source': './data/complex_pdf/sample.pdf', 'detection_class_prob': 0.3390708267688751, 'coordinates': {'points': ((112.57894134521484, 41.28585433959961), (112.57894134521484, 164.21746826171875), (1547.0444444444445, 164.21746826171875), (1547.0444444444445, 41.28585433959961)), 'system': 'PixelSpace', 'layout_width': 1654, 'layout_height': 2339}, 'last_modified': '2024-03-13T13:49:55', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': './data/complex_pdf', 'filename': 'sample.pdf', 'category': 'Title'}),
 Document(page_content='[미래에셋증권 제 253 회 파생결합증권(DLS)(원금보장형)]', metadata={'source': './data/complex_pdf/sample.pdf', 'detection_class_prob': 0.5520003437995911, 'coordinates': {'points': ((309.12457275390625, 357.5935555555554), (309.12457275390625, 398.43634033203125), (1348.2647777777777, 398.4363

## 4. Index 생성

### Index 이름 결정

In [181]:
index_name = "kb_complex_pdf"

In [182]:
import boto3
from utils.ssm import parameter_store

In [183]:
region=boto3.Session().region_name
pm = parameter_store(region)

In [184]:
pm.put_params(
    key="opensearch_index_name",
    value=f'{index_name}',
    overwrite=True,
    enc=False
)

Parameter stored successfully.


### Index 스키마 정의

In [199]:
index_body = {
    'settings': {
        'analysis': {
            'analyzer': {
                'my_analyzer': {
                         'char_filter':['html_strip'],
                    'tokenizer': 'nori',
                    'filter': [
                        #'nori_number',
                        #'lowercase',
                        #'trim',
                        'my_nori_part_of_speech'
                    ],
                    'type': 'custom'
                }
            },
            'tokenizer': {
                'nori': {
                    'decompound_mode': 'mixed',
                    'discard_punctuation': 'true',
                    'type': 'nori_tokenizer'
                }
            },
            "filter": {
                "my_nori_part_of_speech": {
                    "type": "nori_part_of_speech",
                    "stoptags": [
                        "J", "XSV", "E", "IC","MAJ","NNB",
                        "SP", "SSC", "SSO",
                        "SC","SE","XSN","XSV",
                        "UNA","NA","VCP","VSV",
                        "VX"
                    ]
                }
            }
        },
        'index': {
            'knn': True,
            'knn.space_type': 'cosinesimil'  # Example space type
        }
    },
    'mappings': {
        'properties': {
            'metadata': {
                'properties': {
                    'source': {'type': 'keyword'},
                    'page_number': {'type':'long'},
                    'category': {'type':'text'},
                    'file_directory': {'type':'text'},
                    'last_modified': {'type': 'text'},
                    'type': {'type': 'keyword'},
                    'image_base64': {'type':'text'},
                    'origin_image': {'type':'text'},
                    'origin_table': {'type':'text'},
                }
            },
            'text': {
                'analyzer': 'my_analyzer',
                'search_analyzer': 'my_analyzer',
                'type': 'text'
            },
            'vector_field': {
                'type': 'knn_vector',
                'dimension': f"{dimension}" # Replace with your vector dimension
            }
        }
    }
}


## 5. LangChain OpenSearch VectorStore 생성 
### 선수 조건

#### [중요] 아래에 aws parameter store 에 아래 인증정보가 먼저 입력되어 있어야 합니다.

In [200]:
opensearch_domain_endpoint = pm.get_params(
    key="opensearch_domain_endpoint",
    enc=False
)

opensearch_user_id = pm.get_params(
    key="opensearch_user_id",
    enc=False
)

opensearch_user_password = pm.get_params(
    key="opensearch_user_password",
    enc=True
)

In [201]:
opensearch_domain_endpoint = opensearch_domain_endpoint
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

http_auth = (rag_user_name, rag_user_password) # Master username, Master password

### OpenSearch Client 생성

In [202]:
from local_utils.opensearch import opensearch_utils

In [203]:
aws_region = os.environ.get("AWS_DEFAULT_REGION", None)

os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region,
    opensearch_domain_endpoint,
    http_auth
)

### 오픈 서치 인덱스 생성 
- 오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [204]:
index_exists = opensearch_utils.check_if_index_exists(
    os_client,
    index_name
)

if index_exists:
    opensearch_utils.delete_index(
        os_client,
        index_name
    )

opensearch_utils.create_index(os_client, index_name, index_body)
index_info = os_client.indices.get(index=index_name)
print("Index is created")
pprint(index_info)

index_name=kb_complex_pdf, exists=False

Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'kb_complex_pdf'}
Index is created
{'kb_complex_pdf': {'aliases': {},
                    'mappings': {'properties': {'metadata': {'properties': {'category': {'type': 'text'},
                                                                            'file_directory': {'type': 'text'},
                                                                            'image_base64': {'type': 'text'},
                                                                            'last_modified': {'type': 'text'},
                                                                            'origin_image': {'type': 'text'},
                                                                            'origin_table': {'type': 'text'},
                                                                            'page_number': {'type': 'long'},
                                            

### 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [205]:
from langchain.vectorstores import OpenSearchVectorSearch

In [206]:
vector_db = OpenSearchVectorSearch(
    index_name=index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss=False,
    engine="faiss",
    space_type="l2",
    bulk_size=100000,
    timeout=60
)
vector_db

<langchain_community.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f29dd5a5ba0>

### OpenSearch 에 문서 삽입

In [208]:
%%time

vector_db.add_documents(
    documents = docs_preprocessed, 
    vector_field = "vector_field",
    bulk_size = 1000000
)

CPU times: user 221 ms, sys: 4.89 ms, total: 226 ms
Wall time: 18.7 s


['df5ca164-dc0c-4372-bbbf-69009062cefc',
 '2d937f4a-80c7-4cd7-b593-fc15d2cfa7ca',
 '91b4195f-698e-4d65-99e1-afabe4aae732',
 'c706e167-6c4e-4ed1-b996-13c9fa5ef206',
 'e04cd18e-c186-4500-a9e4-7fd773b54d48',
 '0c257944-f23d-4240-90f3-1046f0a64c21',
 'fa8d31c7-056b-4033-b6fa-a59f5ad5421a',
 'e74bf8a2-f108-445b-9473-bd42a8130e5b',
 'dafd5c5d-b321-4c12-b10e-023480f94e18',
 'c2f3f44e-5bdf-4f36-b6f6-46b2fa9664b6',
 '8784f0ab-f69f-497e-b07a-77021a5b5f34',
 'baf1397c-aca7-4a88-8f20-d01a7c7f255f',
 'b310e3e4-f0be-4396-9c52-f1eb18d2fb8b',
 'e3114220-37e8-40e2-92d1-3ef20bdffe4f',
 'de058151-c8f7-487f-92e6-1dead5f05220',
 'fc640b62-fb7b-4147-baf6-00780153ebaa',
 '86e27b6d-ddbe-436b-aa5c-d5db48685889',
 'fdf3164b-d5cc-4acf-8636-386d14f8c252',
 'eb27c2ea-6fd9-435d-a244-f6c85133506b',
 'e7cce0da-9d3a-4745-8663-f4e4ad1791a1',
 '152ae929-8d62-43fe-a8bf-0335534203b1',
 '8ea08465-9daa-46ed-b8bb-8349bcf1cd85',
 '44ffe1b8-4f00-409b-99ae-df086e1d2160',
 '1a5e70a5-4902-4a43-892b-077290d9c6d2',
 '1e28dec3-e994-

In [220]:
vector_db.similarity_search("상품")

[Document(page_content='상품 개요', metadata={'source': './data/complex_pdf/sample.pdf', 'coordinates': {'points': [[139.33333333333331, 487.45555555555535], [139.33333333333331, 520.7888888888887], [324.15555555555557, 520.7888888888887], [324.15555555555557, 487.45555555555535]], 'system': 'PixelSpace', 'layout_width': 1654, 'layout_height': 2339}, 'last_modified': '2024-03-13T13:49:55', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': './data/complex_pdf', 'filename': 'sample.pdf', 'category': 'Title'}),
 Document(page_content='○ 상품위험등급: 4 등급', metadata={'source': './data/complex_pdf/sample.pdf', 'detection_class_prob': 0.6981909275054932, 'coordinates': {'points': [[82.37476348876953, 871.400888888889], [82.37476348876953, 900.8624267578125], [405.93522222222214, 900.8624267578125], [405.93522222222214, 871.400888888889]], 'system': 'PixelSpace', 'layout_width': 1654, 'layout_height': 2339}, 'last_modified': '2024-03-13T13:49:55', 'filetype': 'ap

## 6. 검색 및 질의 응답 테스트

In [214]:
from utils.rag import retriever_utils
from local_utils.rag import show_context_used

### 하이브리드 검색

In [250]:
query = "청약 금액에 맞춰서 안분배정하는 경우는?"

search_filter=[
    #{"term": {"metadata.source": "신한은행"}},
    #{"term": {"metadata.type": "인터넷뱅킹"}},
]

In [253]:
%%time
similar_docs_hybrid = retriever_utils.search_hybrid(
    query=query,
    k=7,
    index_name=index_name,
    os_client=os_client,
    filter=search_filter,
    fusion_algorithm="RRF", # ["RRF", "simple_weighted"]
    ensemble_weights=[0.49, 0.51], # semantic, lexical
    async_mode=True,
    llm_emb=llm_emb,
    verbose=False
)
#show_context_used(similar_docs_hybrid)

CPU times: user 12.1 ms, sys: 314 µs, total: 12.4 ms
Wall time: 259 ms


In [254]:
system_prompt = "You are a master answer bot designed to answer investor's questions."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)

In [255]:
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            "url": "data:image/png;base64," + "{image_base64}",
        },
    },
    {
        "type": "text",
        "text": '''
                I'm going to give you a context and images. Read the context carefully, because I'm going to ask you a question about it.
                
                Here is the context: <context>{context}</context>
                
                First, find a few paragraphs or sentences from the context that are most relevant to answering the question.
                Then, answer the question as much as you can.
                
                Skip the preamble and go straight into the answer.
                Don't insert any XML tag such as <context> and </context> when answering.
                Answer in Korean.
                
                Here is the question: <question>{question}</question>
                
                If the question cannot be answered by the context, say "No relevant context".
        '''
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

In [256]:
human_prompt = [
    {
        "type": "text",
        "text": '''
                I'm going to give you a context and tables. Read the context carefully, because I'm going to ask you a question about it.
                
                Here is the context: <context>{context}</context>
                Here is the context: <table>{table}</table>
                
                First, find a few paragraphs or sentences from the context that are most relevant to answering the question.
                Then, answer the question as much as you can.
                
                Skip the preamble and go straight into the answer.
                Don't insert any XML tag such as <context> and </context> when answering.
                Answer in Korean.
                
                Here is the question: <question>{question}</question>
                
                If the question cannot be answered by the context, say "No relevant context".
        '''
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

In [261]:
prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template
    ]
)

chain = prompt | llm_text | StrOutputParser()

response = chain.invoke(
    {
        "context": similar_docs_hybrid,
        #"table": similar_docs_hybrid[4].metadata["origin_table"],
        "table": "None",
        "question": query
    }
)

공모결과 총 청약금액이 모집금액의 100% 이상인 경우 청약금액에 비례하여 안분배정합니다.

In [242]:
response = chain.invoke(
    {
        "context": similar_docs_hybrid,
        "table": similar_docs_hybrid[4].metadata["origin_table"],
        "question": question
        '''
    }
)

SyntaxError: incomplete input (1740779636.py, line 6)

In [69]:
docs[-4].page_content#.metadata["]

'HE=HE 21.00% 4STEHE 70% 130% m=oz=spzgpy| IR AT (%) (B 22 7| =X 100% HSEF'

In [241]:
show_context_used(similar_docs_hybrid)

-----------------------------------------------
1. Chunk: 84 Characters
-----------------------------------------------
○ 만기상환 ② : 두 기초자산의 만기평가가격이 KOSPI200 은 300pt, 금가격지수는 $1,350 인 경우 ⇒ 1 억원 지급(100% 원금보장)
metadata:
 {'source': './data/complex_pdf/sample.pdf', 'detection_class_prob': 0.8142080307006836,
'coordinates': {'points': [[91.33333333333334, 1936.5397777777778], [91.33333333333334,
2006.0692138671875], [1398.1574444444443, 2006.0692138671875], [1398.1574444444443,
1936.5397777777778]], 'system': 'PixelSpace', 'layout_width': 1654, 'layout_height': 2339},
'last_modified': '2024-03-13T13:49:55', 'filetype': 'application/pdf', 'languages': ['eng'],
'page_number': 1, 'parent_id': '93fdaec2fc7e323322bec792d16587e2', 'file_directory':
'./data/complex_pdf', 'filename': 'sample.pdf', 'category': 'NarrativeText', 'id':
'05e76b92-033a-4e26-b7d0-a177d84f150e'}
-----------------------------------------------
2. Chunk: 30 Characters
-----------------------------------------------
※ 만기수익률=(만기