# 03 - Layout-aware text extraction with Amazon Textract

In [None]:
# Quietly install the Python packages this notebook relies on; %pip targets the kernel's environment.
%pip install -q amazon-textract-textractor[pdf] pdf2image pydantic "anthropic[bedrock]"

In [None]:
# Refresh the apt package index and install the poppler-utils system package.
# stderr of both commands is discarded, so failures here are not shown.
!sudo apt-get update -y 2> /dev/null && sudo apt install poppler-utils -y 2> /dev/null

In [None]:
# List the contents of the raw documents folder.
!ls raw_documents/

In [None]:
# List the prepared documents folder (it holds metadata.json and per-company subfolders such as Amazon/).
!ls raw_documents/prepared/

In [None]:
# List the prepared Amazon documents; the sample PDF analysed below lives here.
!ls raw_documents/prepared/Amazon/

In [None]:
# Pretty-print the metadata file that describes the prepared documents.
!python -m json.tool raw_documents/prepared/metadata.json

## Extraction with textractor

In [None]:
import sagemaker

# Default S3 bucket of the current SageMaker session; used below as the
# upload/output location for the Textract analysis jobs.
default_sagemaker_bucket = sagemaker.Session().default_bucket()
# Execution role of this notebook environment.
sagemaker_execution_role = sagemaker.get_execution_role()

In [11]:
import boto3
from textractor import Textractor
from textractor.data.constants import TextractFeatures

# Build the Textractor client in the region of the default boto3 session.
region = boto3.session.Session().region_name
# Alternative kept for reference: create the client from a named AWS profile instead.
# extractor = Textractor(profile_name="default")
extractor = Textractor(region_name=region)

# Single sample document used to try out the extraction before the batch run further down.
input_document = "raw_documents/prepared/Amazon/annual_report_2022.pdf"

# Start a document analysis with only the LAYOUT feature enabled. The local PDF
# is passed as `file_source`, with upload and output prefixes in the SageMaker
# default bucket; page images are not saved (save_image=False).
document = extractor.start_document_analysis(
    file_source=input_document,
    s3_upload_path=f"s3://{default_sagemaker_bucket}/input_documents/",
    s3_output_path=f"s3://{default_sagemaker_bucket}/output_documents/",
    features=[TextractFeatures.LAYOUT],
    save_image=False
)

In [12]:
# Display the document object wrapped by the analysis result.
document.document

In [None]:
# Display the first page of the analysed document.
document.pages[0]

In [None]:
# Print the markdown rendering of the page at index 4 (the fifth page).
print(document.pages[4].to_markdown())

## Use LLM to review and improve the extracted document

Here we use Anthropic Claude 3 models through Amazon Bedrock to further improve the markdown extracted by Amazon Textract, so that it is ready for the LLM to answer questions properly later on.

In [19]:
import boto3
import json
import logging
from botocore.exceptions import ClientError

# Both Bedrock clients are pinned to us-west-2 rather than following the session region.
# `bedrock` is only referenced by the commented-out model listing below;
# `bedrock_runtime` is the client handed to call_llm for model invocation.
bedrock = boto3.client("bedrock", region_name="us-west-2")
bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-west-2")

In [20]:
# bedrock.list_foundation_models()

In [21]:
# Model id used for every LLM call in this notebook. The Haiku id is kept as a
# commented-out alternative; swap the two lines to switch models.
# llm_model_id = "anthropic.claude-3-haiku-20240307-v1:0"
llm_model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

In [22]:
# Module-level logger used by call_llm to report client errors.
logger = logging.getLogger(__name__)
# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)


def generate_message(bedrock_runtime, model_id, system_prompt, messages, max_tokens):
    """Invoke a Bedrock model with an Anthropic Messages payload and decode the reply.

    Args:
        bedrock_runtime: Client object exposing `invoke_model(body=..., modelId=...)`.
        model_id: Identifier of the model to invoke.
        system_prompt: Value sent under the "system" key of the request.
        messages: List of message dicts sent under the "messages" key.
        max_tokens: Value sent under the "max_tokens" key.

    Returns:
        The JSON-decoded response body.
    """
    request_payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "system": system_prompt,
        "messages": messages,
    }
    raw_response = bedrock_runtime.invoke_model(
        body=json.dumps(request_payload),
        modelId=model_id,
    )
    # The response's 'body' entry exposes read(); consume it fully, then parse the JSON.
    payload_bytes = raw_response.get('body').read()
    return json.loads(payload_bytes)


def call_llm(user_input, model_id, system_prompt, bedrock_runtime, max_tokens=1000):
    """Send a single user turn to an Anthropic Claude model via the Messages API.

    Args:
        user_input: Content of the user message.
        model_id: Model id forwarded to `generate_message`.
        system_prompt: System prompt sent alongside the message.
        bedrock_runtime: Client exposing `invoke_model`, forwarded to `generate_message`.
        max_tokens: Value for the request's "max_tokens" field (default 1000).

    Returns:
        The decoded response body from `generate_message`, or None when the call
        fails with a botocore ClientError. The error is logged and printed, not
        raised, so callers must check for None before indexing the result.
    """
    # Prompt with user turn only.
    messages = [{"role": "user", "content": user_input}]
    try:
        return generate_message(bedrock_runtime, model_id, system_prompt, messages, max_tokens)
    except ClientError as err:
        message = err.response["Error"]["Message"]
        logger.error("A client error occurred: %s", message)
        print("A client error occurred: " + format(message))
        # Make the best-effort failure value explicit instead of falling off the end.
        return None



Below we test the helper functions by calling the LLM.

In [None]:
%%time
user_input = "hello"
system_prompt = "reply in a friendly manner"

call_llm(user_input, llm_model_id, system_prompt, bedrock_runtime, max_tokens=1000)

In [24]:
# Per-page instruction sent as the user turn. `{markdown_doc}` is filled in with
# str.format using one page of Textract markdown. The model is asked to wrap its
# answer in <results> tags so the improved markdown can be cut out of the reply.
user_prompt = """
Improve the markdown while keeping all original information. Put the improved markdown inside a <results> xml tags with no explanation:
\n{markdown_doc}
""".strip()

# System prompt used for every page-improvement call.
system_prompt = "Your task is to review and improve the results of Amazon textract in markdown."


def improve_textract_markdown_output(document, llm_model_id):
    """Run every page of a Textract document through the LLM clean-up prompt.

    Args:
        document: Analysed document exposing `pages`, each with `to_markdown()`.
        llm_model_id: Model id forwarded to `call_llm`.

    Returns:
        A list with one markdown string per page, in page order. When the LLM
        call yields no usable content for a page (for example `call_llm`
        returned None after a client error), the raw Textract markdown of that
        page is kept, so list positions always line up with page indices.
    """
    improved_markdown = []
    for page_number, page in enumerate(document.pages):
        page_markdown = page.to_markdown()
        user_input = user_prompt.format(markdown_doc=page_markdown)
        result = call_llm(user_input, llm_model_id, system_prompt, bedrock_runtime, max_tokens=3000)

        if not result or not result.get("content"):
            # Previously this raised TypeError on a None result and aborted the whole run.
            logger.warning(
                "No LLM output for page %d; keeping the raw Textract markdown.", page_number
            )
            improved_markdown.append(page_markdown)
            continue

        if result.get("stop_reason") == "max_tokens":
            # The reply was cut off, so the closing </results> tag is likely missing
            # and the improved page may be incomplete.
            logger.warning(
                "LLM output for page %d hit the max_tokens limit and may be truncated.",
                page_number,
            )

        llm_text = result["content"][0]["text"]
        # Extract the text between the <results> XML tags only.
        improved_markdown.append(llm_text.split("<results>")[-1].split("</results>")[0].strip())
    return improved_markdown

In [None]:
import os
# Folder holding the raw documents, and its "prepared/" subfolder that contains metadata.json.
raw_base_directory = "raw_documents"
prepared_base_directory = os.path.join(raw_base_directory, "prepared/")
# Display the resulting path.
prepared_base_directory

In [27]:
import json

# Load metadata.json from the prepared folder. The batch extraction below
# iterates over it and reads "doc_url" and "local_pdf_path" from each entry.
with open(
    os.path.join(prepared_base_directory, "metadata.json"), "r"
) as prepared_pdfs_metadata_obj:
    prepared_pdfs_metadata = json.load(prepared_pdfs_metadata_obj)


In [None]:
# Display the loaded metadata entries.
prepared_pdfs_metadata

In [29]:
def extract_pages_as_markdown(input_document):
    """Analyse one document with Textract and return its LLM-improved pages.

    Args:
        input_document: Document passed to Textractor as `file_source`.

    Returns:
        A list of dicts, one per page, each with "page" (zero-based position in
        the improved output) and "page_text" (the improved markdown).
    """
    # Layout-only analysis; upload and output prefixes live in the SageMaker default bucket.
    analysed_document = extractor.start_document_analysis(
        file_source=input_document,
        s3_upload_path=f"s3://{default_sagemaker_bucket}/input_documents/",
        s3_output_path=f"s3://{default_sagemaker_bucket}/output_documents/",
        features=[TextractFeatures.LAYOUT],
        save_image=False,
    )

    improved_pages = improve_textract_markdown_output(analysed_document, llm_model_id)

    pages = []
    for page_index, page_markdown in enumerate(improved_pages):
        pages.append({"page": page_index, "page_text": page_markdown})
    return pages


def extract_docs_into_markdown(docs_metadata):
    """Extract every document listed in the metadata into per-page markdown.

    Args:
        docs_metadata: Iterable of metadata dicts; each must provide "doc_url"
            and "local_pdf_path".

    Returns:
        A list with one dict per document holding "metadata" (the input entry),
        "name" (last path segment of the URL), "source_location" (the URL) and
        "pages" (output of `extract_pages_as_markdown`).
    """
    return [
        {
            "metadata": entry,
            "name": entry["doc_url"].rsplit("/", 1)[-1],
            "source_location": entry["doc_url"],
            # Evaluated last, so a missing "doc_url" fails before any extraction starts.
            "pages": extract_pages_as_markdown(entry["local_pdf_path"]),
        }
        for entry in docs_metadata
    ]

In [None]:
%%time
# Run Textract analysis plus LLM clean-up for every document in the metadata.
results = extract_docs_into_markdown(prepared_pdfs_metadata)

In [None]:
# Inspect the extraction result for the first document.
results[0]

In [32]:
from utils.helpers import store_list_to_s3
# SSM client used below to look up the name of the target S3 bucket.
ssm = boto3.client("ssm")

In [34]:
# SSM parameter that stores the bucket name, and the key the results are written under.
s3_bucket_name_parameter = "/AgenticLLMAssistantWorkshop/AgentDataBucketParameter"
processed_documents_s3_key = "documents_processed.json"

# Fetch the parameter and keep only its string value.
s3_bucket_name = ssm.get_parameter(Name=s3_bucket_name_parameter)["Parameter"]["Value"]

In [37]:
# Store the extracted documents in the bucket under the processed-documents key.
store_list_to_s3(s3_bucket_name, processed_documents_s3_key, results)