# Process Documents Containing Scientific Formulas and Charts using Anthropic Claude on Amazon Bedrock
---
<div class="alert alert-block alert-info"> 
    <b>NOTE:</b> You will need to use a Jupyter Kernel with Python 3.11 or above to use this notebook. If you are using an Amazon SageMaker Notebook Instance, use the <code>conda_python3</code> kernel. If you are using SageMaker Studio, you can use the `Data Science 3.0` image.
</div>

<div class="alert alert-block alert-warning"> 
    <b>NOTE:</b> You will need 3rd party model access to the Anthropic Claude 3.7 Sonnet model (this notebook invokes it through the <code>us.anthropic.claude-3-7-sonnet-20250219-v1:0</code> inference profile) to be able to run this notebook. Verify that you have access to the model by going to <a href="https://console.aws.amazon.com/bedrock" target="_blank">Amazon Bedrock console</a> > left menu "Model access". The "Access status" for Anthropic Claude must be in "Access granted" status in green. If you do not have access, then click "Edit" button on the top right > select the model checkbox > click "Save changes" button at the bottom. You should have access to the model within a few moments.
</div>


## Contents

Prerequisites and Environment Setup
1. Data Preparation
1. Formula Extraction
1. Chart and Graph Analysis
1. Metadata Generation
1. Comprehensive Document Processing 
1. Knowledge Base Integration
1. Query Capabilities
1. Cleanup


### Prerequisites and Environment Setup
---
This section sets up the required environment and dependencies for processing scientific documents. It includes installing essential packages like poppler for PDF processing and defining helper functions for interacting with Amazon Bedrock. The setup is crucial for enabling the document processing pipeline.

In [None]:
!conda install -c conda-forge poppler -y
!python -m pip install sagemaker
!python -m pip install filetype
!python -m pip install pdf2image
!python -m pip install retrying
!python -m pip install opensearch-py
!python -m pip install python-dotenv

In [None]:
import json
import sagemaker
import requests
import os
import boto3
from typing import Dict, Any, List
from datetime import datetime
from IPython.display import display, Markdown, Latex, Image
import filetype
from pdf2image import convert_from_path
from pdf2image import pdfinfo_from_path
from pathlib import Path
from utils.knowledge_base import BedrockKnowledgeBase
import time
from dotenv import load_dotenv

# Load variables from a local .env file (if present) so that os.getenv()
# below can pick up an optional BUCKET_NAME override.
load_dotenv()

# AWS clients and session-level settings shared by the rest of the notebook.
s3 = boto3.client("s3")
bedrock = boto3.client(service_name='bedrock-runtime')
session = boto3.Session()
role = sagemaker.get_execution_role()
region = sagemaker.Session().boto_region_name
# Model identifier used for every Converse call; it is also embedded in an
# "inference-profile" ARN by the query cell later in the notebook.
foundation_model = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"


# Get the current timestamp
current_time = time.time()

# Get current accountid
sts_client = boto3.client("sts")
account_id = sts_client.get_caller_identity()["Account"]

# Format the timestamp as YYYYmmddHHMMSS and keep only the last 7 characters
# (last digit of the day followed by HHMMSS) to get a short suffix.
timestamp_str = time.strftime("%Y%m%d%H%M%S", time.localtime(current_time))[-7:]

# Create the suffix using the timestamp
suffix = f"{timestamp_str}"
knowledge_base_name_hierarchical = 'hierarchical-kb'
# BUCKET_NAME from the environment wins; otherwise derive a name from the
# account id and the timestamp suffix.
knowledge_base_bucket_name = os.getenv("BUCKET_NAME", f"{account_id}-bedrock-kb-{suffix}")
knowledge_base_description = "Knowledge Base containing research PDF."


print(f"SageMaker Execution Role is {role}. Current region is {region}")

In [None]:
def get_file_type(file_path):
    """Return the short format name of a file, or ``None`` if unsupported.

    The content type is detected with ``filetype.guess`` and mapped to the
    format names used when building Bedrock Converse content blocks.

    Args:
        file_path: Anything ``filetype.guess`` accepts. The notebook passes
            both file paths and raw ``bytes`` (the downloaded PDF body).

    Returns:
        One of ``'pdf'``, ``'jpeg'``, ``'png'``, ``'gif'``, ``'webp'``,
        ``'txt'``, ``'csv'``, or ``None`` when the type cannot be determined
        or is not in the supported set.
    """
    file_types = {
        'application/pdf': 'pdf',
        'image/jpeg': 'jpeg',
        'image/png': 'png',
        # stream_conversation has a branch for gif images, so gif must be
        # detectable here for that branch to ever be reached.
        'image/gif': 'gif',
        'image/webp': 'webp',
        # NOTE(review): filetype detects by magic numbers, so plain-text
        # types are presumably never reported; kept for completeness.
        'text/plain': 'txt',
        'text/csv': 'csv'
    }

    kind = filetype.guess(file_path)
    if kind is None:
        return None

    # dict.get yields None for MIME types outside the supported set.
    return file_types.get(kind.mime)

# Module-level list kept for notebook compatibility; stream_conversation does
# not read or modify it.
messages = []
def stream_conversation(message, modelId=foundation_model, file_paths=None, temp=0.2):
    """
    Send one user message, with optional file attachments, to a Bedrock model
    through ``converse_stream`` and collect the streamed reply.

    Args:
        message: The text message to send to the model.
        modelId: The ID of the model to use for the conversation.
        file_paths: Optional list of file paths to attach. Images (png, jpeg,
            gif, webp) are sent as image blocks and PDFs as document blocks.
            Empty entries and files of any other type are skipped.
        temp: The temperature for the model inference.

    Returns:
        A tuple ``(output, message)``. ``output`` is a diagnostic string
        (role, stop reason, token usage, latency). ``message`` is
        ``{"role": "assistant", "content": <full reply text>}`` once the
        stream reports ``messageStop``; it stays ``{}`` if no stream or no
        stop event was received.
    """
    # Formats that are sent as image blocks; a tuple gives exact membership
    # (a substring test against "png jpeg gif webp" would also match
    # fragments such as "p" or "png jpeg").
    image_formats = ("png", "jpeg", "gif", "webp")

    inference_config = {"temperature": temp}
    additional_model_fields = {"top_k": 200}
    system_prompts = [{"text": "You are a helpful assistant."}]

    content = []
    # `file_paths or []` replaces the former mutable default argument.
    for file_path in file_paths or []:
        if not file_path:
            continue

        with open(file_path, "rb") as open_file:
            file_bytes = open_file.read()

        file_type = get_file_type(file_path)
        if file_type in image_formats:
            content.append({"image": {"format": file_type, "source": {"bytes": file_bytes}}})
        elif file_type == "pdf":
            # Reduce the file name to alphanumeric characters for use as the
            # document name; fall back to a fixed name if nothing is left.
            file_name = ''.join(ch for ch in os.path.basename(file_path) if ch.isalnum())
            content.append({"document": {"format": file_type, "name": file_name or "document", "source": {"bytes": file_bytes}}})

    # The text prompt goes after any attachments.
    content.append({"text": message})

    response = bedrock.converse_stream(
        modelId=modelId,
        messages=[{"role": "user", "content": content}],
        system=system_prompts,
        inferenceConfig=inference_config,
        additionalModelRequestFields=additional_model_fields
    )

    stream = response.get('stream')

    output = "inspecting stream ...\n"
    message = {}
    if stream:
        streaming_text = ""

        for event in stream:
            if 'messageStart' in event:
                output += f"\nRole: {event['messageStart']['role']}"

            if 'contentBlockDelta' in event:
                # Only text deltas contribute to the reply; deltas without a
                # "text" key are ignored instead of raising KeyError.
                streaming_text += event['contentBlockDelta']['delta'].get('text', '')

            if 'messageStop' in event:
                output += f"\nStop reason: {event['messageStop']['stopReason']}"

                message = {
                    "role": "assistant",
                    "content": streaming_text
                }

            if 'metadata' in event:
                metadata = event['metadata']
                if 'usage' in metadata:
                    output += "\nToken usage"
                    output += f"Input tokens: {metadata['usage']['inputTokens']}"
                    output += f":Output tokens: {metadata['usage']['outputTokens']}"
                    output += f":Total tokens: {metadata['usage']['totalTokens']}"
                if 'metrics' in metadata:
                    output += f"Latency: {metadata['metrics']['latencyMs']} milliseconds"

    return output, message

                    

In [None]:
def pdf_to_images(file_path, dpi=150):
    """Render every page of a PDF to a PNG file and return the PNG paths.

    The images are written to a directory named after the PDF (file name
    without its extension) located next to the PDF, as ``page_<n>.png`` with
    ``n`` starting at 0.

    Args:
        file_path: Path to the PDF file.
        dpi: Rendering resolution passed to ``convert_from_path``
            (defaults to 150, the value previously hard-coded).

    Returns:
        List of absolute paths of the saved PNG files, in page order.
    """
    base_name, _ = os.path.splitext(os.path.basename(file_path))

    # os.path.join keeps the result relative when dirname() is empty (a bare
    # file name); formatting "{dir}/{base_name}" would have produced a path
    # under the filesystem root in that case.
    out_dir = os.path.abspath(os.path.join(os.path.dirname(file_path), base_name))
    os.makedirs(out_dir, exist_ok=True)

    pages = convert_from_path(file_path, dpi)

    saved_files = []
    for index, page in enumerate(pages):
        out_file = os.path.join(out_dir, f"page_{index}.png")
        page.save(out_file, 'PNG')
        saved_files.append(out_file)

    return saved_files

def save_output(source_file_path, output_ext, content):
    """Write ``content`` to ``temp/<source base name>.<output_ext>``.

    The ``temp`` directory is created next to the source file if needed.

    Args:
        source_file_path: Path of the file the content was derived from; only
            its directory and base name (without extension) are used.
        output_ext: Extension for the output file, without a leading dot
            (for example ``"md"`` or ``"metadata.json"``).
        content: Text to write.

    Returns:
        Absolute path of the written file.
    """
    base_name, _ = os.path.splitext(os.path.basename(source_file_path))

    # os.path.join keeps the path relative when dirname() is empty (a bare
    # file name); formatting "{dir}/temp" would have resolved to "/temp".
    out_dir = os.path.abspath(os.path.join(os.path.dirname(source_file_path), "temp"))
    os.makedirs(out_dir, exist_ok=True)

    output_file_path = os.path.join(out_dir, f"{base_name}.{output_ext}")

    # Write UTF-8 explicitly: the content can contain non-ASCII symbols and
    # merge_markdown_files reads these files back as UTF-8.
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        out_file.write(content)
    return output_file_path

def parse_response(text):
    """Pull the ``<markdown>`` and ``<metadata>`` payloads out of a model reply.

    For each tag pair, the text between the first opening tag and the first
    closing tag is returned with surrounding whitespace stripped. A payload is
    the empty string when either of its tags is missing.

    Args:
        text: Raw response text.

    Returns:
        Tuple ``(markdown_content, metadata_content)``.
    """

    def _payload(opening, closing):
        # Both lookups scan from the start of the text.
        begin = text.find(opening)
        finish = text.find(closing)
        if begin == -1 or finish == -1:
            return ""
        return text[begin + len(opening):finish].strip()

    return _payload("<markdown>", "</markdown>"), _payload("<metadata>", "</metadata>")

### 1. Data Preparation
---
This step handles the acquisition and initial processing of scientific documents from arXiv. It downloads sample documents and converts PDFs into individual PNG images per page, making them suitable for processing by Claude's multi-modal capabilities. This conversion is essential because Claude works better with image formats when analyzing visual content.

In [None]:
sample_file = 'https://arxiv.org/pdf/2003.10304'

# Start with an empty list so this cell's final expression and later cells do
# not fail with NameError when the download does not succeed.
sample_file_pages = []

# A timeout keeps the cell from hanging indefinitely on network problems.
response = requests.get(sample_file, timeout=60)

if response.status_code == 200:
    # The URL has no extension, so detect the type from the downloaded bytes.
    file_type = get_file_type(response.content)
    if file_type == 'pdf':
        #get the filename from the url
        filename = os.path.basename(sample_file)
        file_path = f"""./samples/{filename}.{file_type}"""
        # Write the content to a local file
        abs_path = os.path.abspath(os.path.dirname(file_path))
        os.makedirs(abs_path, exist_ok=True)
        with open(file_path, 'wb') as file:
            file.write(response.content)
        # One PNG per page, used as model input by the following sections.
        sample_file_pages = pdf_to_images(file_path)
    else:
        print(f"Downloaded content from {sample_file} is not a PDF (detected type: {file_type})")
else:
    print(f"Failed to download {sample_file}: HTTP {response.status_code}")

sample_file_pages

### 2. Formula Extraction
---
This section demonstrates how to leverage Claude's ability to recognize and extract mathematical formulas from document images. It converts formulas into LaTeX format and provides plain language descriptions, making complex mathematical content more accessible and machine-readable. This capability is particularly valuable for scientific document processing where accurate formula representation is critical.

In [None]:
# Prompt asking the model to convert each formula on the page to LaTeX and
# describe it in plain language.
sample_prompt = """
Evaluate this page line by line. 
For each line, if it is a formula, convert this math expression to latex format. 
Next describe the formula in plain language Be sure to enclose Latex formulas in double dollar sign for example: $$ <math expression> $$ Use markdown syntax to format your output
"""

# Page image produced by pdf_to_images for the sample paper.
file = "./samples/2003.10304/page_2.png"

# Show the input page, then the model's markdown answer, then the
# diagnostic string (role, stop reason, token usage, latency).
display(Image(filename=file, width=600))
output, result = stream_conversation(message=sample_prompt, file_paths=[file])
response_text = result["content"]
display(Markdown(response_text))
print(output)

### 3. Chart and Graph Analysis
---
Shows how Claude can interpret visual data from charts and graphs within scientific documents. The model provides detailed analysis of graphical elements, extracting trends, data points, and relationships. This feature is crucial for maintaining the complete semantic meaning of research papers where visual data representation is common.

In [None]:
# Prompt asking the model to interpret the graph on the page. It has no
# placeholders, so it is a plain string rather than an f-string.
sample_prompt = """
You are a data scientist expert who has perfect vision and pay a lot of attention to details. 
interpret the graph on this page
provide the answer in markdown format """


# Page image produced by pdf_to_images for the sample paper.
file = "./samples/2003.10304/page_5.png"

# Show the input page, then the model's markdown answer, then the
# diagnostic string (role, stop reason, token usage, latency).
display(Image(filename=file, width=600))
output, result = stream_conversation(message=sample_prompt, file_paths=[file])
response_text = result["content"]
display(Markdown(response_text))
print(output)

### 4. Metadata Generation
---
Implements automated metadata extraction from scientific documents. The system generates structured metadata including title, authors, institutions, topics, and other relevant information. This metadata is essential for organizing and making documents searchable within a knowledge base.

In [None]:
# Prompt asking for a metadata JSON object. This is an f-string, so the
# doubled braces below are escapes that render as single { and }.
sample_prompt = f"""
Generate a metadata json object for this research paper. 

{{
"title": "",
"authors":  [],
"institutions": [],
"topics": [],
"funding-sources": [],
"algorithms":[],
"data_sets":[]
}}
"""

# First page image of the sample paper.
file = './samples/2003.10304/page_0.png'

# Print the raw model answer followed by the diagnostic string.
output, result = stream_conversation(message=sample_prompt, file_paths=[file])
response_text = result["content"]
print(response_text)
print(output)


### 5. Comprehensive Document Processing
---
This section combines all previous capabilities into a single workflow. It processes each page of a document to extract text, formulas, charts, and metadata in a structured format. The unified approach ensures consistent processing across all document elements while maintaining their relationships and context.


In [None]:
# Prompt for full-page extraction: markdown inside <markdown> tags and a
# metadata JSON object inside <metadata> tags (parsed by parse_response).
# This is a plain string, not an f-string, so braces are written singly;
# the JSON shown to the model must itself be valid JSON because the model's
# metadata output is later loaded with json.load.
sample_prompt = """
Extract the content from an image page and output in Markdown syntax. Enclose the content in the <markdown></markdown> tag and do not use code blocks. If the image is empty then output a <markdown></markdown> without anything in it.

Follow these steps:

1. Examine the provided page carefully.

2. Identify all elements present in the page, including headers, body text, footnotes, tables, images, captions, and page numbers, etc.

3. Use markdown syntax to format your output:
    - Headings: # for main, ## for sections, ### for subsections, etc.
    - Lists: * or - for bulleted, 1. 2. 3. for numbered
    - Do not repeat yourself

4. If the element is an image (not table)
    - If the information in the image can be represented by a table, generate the table containing the information of the image
    - If the image is a graph or chart, interpret the information in the graph or chart
    - Otherwise provide a detailed description about the information in image
    - Classify the element as one of: Chart, Diagram, Logo, Icon, Natural Image, Screenshot, Other. Enclose the class in <figure_type></figure_type>
    - Enclose <figure_type></figure_type>, the table or description, and the figure title or caption (if available), in <figure></figure> tags
    - Do not transcribe text in the image after providing the table or description

5. If the element is a table
    - Create a markdown table, ensuring every row has the same number of columns
    - Maintain cell alignment as closely as possible
    - Do not split a table into multiple tables
    - If a merged cell spans multiple rows or columns, place the text in the top-left cell and output ' ' for other
    - Use | for column separators, |-|-| for header row separators
    - If a cell has multiple items, list them in separate rows
    - If the table contains sub-headers, separate the sub-headers from the headers in another row

6. If the element is a paragraph
    - Transcribe each text element precisely as it appears

7. If it is a formula
    - Convert this math expression to latex format

8. If it is code or pseudo code
    - Format the section as code in your output

9. If the element is a header, footer, footnote, page number
    - Transcribe each text element precisely as it appears


Next, Generate a metadata json object that adheres to the following schema. If the page does not contain relevant metadata information for a given key, leave the value for that key empty. Enclose the metadata in <metadata></metadata> tags.

{
"title": "",
"authors":  [],
"institutions": [],
"topics": [],
"funding-sources": [],
"algorithms":[],
"data_sets":[]
}

Output Example:
<markdown>
<figure>
<figure_type>Chart</figure_type>
Figure 3: This chart shows annual sales in millions. The year 2020 was significantly down due to the COVID-19 pandemic.
A bar chart showing annual sales figures, with the y-axis labeled "Sales ($Million)" and the x-axis labeled "Year". The chart has bars for 2018 ($12M), 2019 ($18M), 2020 ($8M), and 2021 ($22M).
</figure>

<figure>
<figure_type>Chart</figure_type>
Figure 3: This chart shows annual sales in millions. The year 2020 was significantly down due to the COVID-19 pandemic.
| Year | Sales ($Million) |
|-|-|
| 2018 | $12M |
| 2019 | $18M |
| 2020 | $8M |
| 2021 | $22M |
</figure>

# Annual Report

## Financial Highlights

<figure>
<figure_type>Logo</figure_type>
The logo of Apple Inc.
</figure>

* Revenue: $40M
* Profit: $12M
* EPS: $1.25

| | Year Ended December 31, | |
| | 2021 | 2022 |
|-|-|-|
| Cash provided by (used in): | | |
| Operating activities | $ 46,327 | $ 46,752 |
| Investing activities | (58,154) | (37,601) |
| Financing activities | 6,291 | 9,718 |

</markdown>
<metadata>
{
"algorithms":["fast fourier transform", "linear regression"],
"data_sets":["ImageNet", "MNIST dataset"]
}
</metadata>
"""

# Paths of the per-page output files, in page order.
markdown_file_paths = []
metadata_file_paths = []

# Run the full extraction prompt on every page image, split the reply into
# its markdown and metadata parts, and save each part under temp/ next to
# the page image.
for file in sample_file_pages:
    print(f"processing {file}")
    output, result = stream_conversation(message=sample_prompt, file_paths=[file])
    response_text = result["content"]
    markdown, metadata = parse_response(response_text)
    
    # save_output names the files after the page image:
    # <page>.md and <page>.metadata.json
    md_file_path = save_output(file, "md", markdown)
    metadata_file_path = save_output(file, "metadata.json", metadata)
    
    markdown_file_paths.append(md_file_path)
    metadata_file_paths.append(metadata_file_path)

print("Markdown file paths:", markdown_file_paths)
print("Metadata file paths:", metadata_file_paths)
    

### 6. Knowledge Base Integration
---
Demonstrates how to prepare processed content for integration with Amazon Bedrock Knowledge Base. It includes:
- Merging markdown files and metadata
- Converting metadata to Bedrock's required format
- Uploading processed content to S3
- Creating and configuring the knowledge base

In [None]:
def merge_markdown_files(input_files: list, output_file: str, add_newlines: bool = True) -> Path:
    """
    Concatenate several markdown files, in the given order, into one file.

    Args:
        input_files (list): Paths (str or Path) of the markdown files to merge
        output_file (str): Destination path; its parent directory is created if missing
        add_newlines (bool): Insert a blank line between consecutive files (default: True)

    Returns:
        Path: The path of the merged file, or None when the list is empty or
        any error occurs (the error is printed, not raised)
    """
    try:
        sources = input_files
        # Normalise everything to Path as soon as one entry is not a Path.
        if any(not isinstance(entry, Path) for entry in sources):
            sources = [Path(entry) for entry in sources]

        # Every input must exist before anything is written.
        if any(not entry.is_file() for entry in sources):
            raise FileNotFoundError("One or more files in the list do not exist.")

        if len(sources) == 0:
            print("No markdown files provided.")
            return None

        output_path = Path(output_file)
        parent_dir = output_path.parent
        if not parent_dir.exists():
            parent_dir.mkdir(parents=True)

        last_position = len(sources) - 1
        with open(output_path, 'w', encoding='utf-8') as merged:
            for position, source in enumerate(sources):
                with open(source, 'r', encoding='utf-8') as page:
                    text = page.read()
                merged.write(text)

                # Separator goes between files only, never after the last one.
                if add_newlines and position != last_position:
                    merged.write('\n\n')

        print(f"\nSuccessfully merged {len(sources)} files into '{output_path}'")
        return output_path

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Merge all per-page markdown files into the single document that will be
# uploaded for the knowledge base. The result is a Path, or None if merging failed.
markdown_file = merge_markdown_files(markdown_file_paths, "./samples/2003.10304/kb/2003.10304.md")
markdown_file

In [None]:
def convert_to_bedrock_metadata(file_paths: List[str], save_path: str) -> str:
    """
    Combine metadata JSON files into one Amazon Bedrock knowledge base
    metadata file and save it.

    Only paths ending in ``.json`` are read. Files that are not valid JSON, or
    whose top-level value is not an object, are reported and skipped. List
    fields (authors, institutions, topics, funding-sources, algorithms,
    data_sets) are merged across all files, keeping first-seen order and
    dropping duplicates, so values found on one page are not lost when a later
    page reports different ones. For ``title`` the last non-empty value wins.
    A ``created_date`` attribute (today's date as the number YYYYMMDD) is
    added when at least one file was read successfully.

    Args:
        file_paths (List[str]): List of paths to the JSON files
        save_path (str): Path to save the resulting Bedrock metadata JSON file;
            its parent directory is created if missing

    Returns:
        str: File path of the saved result
    """

    def create_string_attribute(value: str, include_embedding: bool = True) -> Dict[str, Any]:
        return {
            "value": {
                "type": "STRING",
                "stringValue": value
            },
            "includeForEmbedding": include_embedding
        }

    def create_string_list_attribute(values: list, include_embedding: bool = True) -> Dict[str, Any]:
        return {
            "value": {
                "type": "STRING_LIST",
                "stringListValue": values
            },
            "includeForEmbedding": include_embedding
        }

    # Source JSON key -> Bedrock attribute name, in output order.
    list_fields = {
        "authors": "authors",
        "institutions": "institutions",
        "topics": "topics",
        "funding-sources": "funding_sources",
        "algorithms": "algorithms",
        "data_sets": "data_sets",
    }

    title = None
    merged_lists: Dict[str, list] = {key: [] for key in list_fields}
    loaded_any = False

    # Process each JSON file
    for file_path in file_paths:
        file_path = str(file_path)
        if not file_path.endswith('.json'):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error reading {file_path}: {str(e)}")
            continue

        if not isinstance(data, dict):
            print(f"Error reading {file_path}: expected a JSON object")
            continue

        loaded_any = True

        if data.get("title"):
            title = data["title"]

        for key, collected in merged_lists.items():
            values = data.get(key)
            if not values:
                continue
            # Tolerate a single value where a list was requested.
            if not isinstance(values, (list, tuple)):
                values = [values]
            for item in values:
                if item and item not in collected:
                    collected.append(item)

    # Build the metadata structure
    bedrock_metadata = {
        "metadataAttributes": {}
    }
    attributes = bedrock_metadata["metadataAttributes"]

    if title:
        attributes["title"] = create_string_attribute(title)

    for key, attribute_name in list_fields.items():
        if merged_lists[key]:
            attributes[attribute_name] = create_string_list_attribute(merged_lists[key])

    if loaded_any:
        # Add creation date
        attributes["created_date"] = {
            "value": {
                "type": "NUMBER",
                "numberValue": int(datetime.now().strftime("%Y%m%d"))
            },
            "includeForEmbedding": True
        }

    # Save the resulting metadata to a file
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(bedrock_metadata, f, indent=4)

    return save_path


# Feed the per-page metadata JSON files to the converter. It only reads paths
# ending in ".json", so passing the markdown paths (as before) skipped every
# file and produced an empty metadata document.
metadata_file = convert_to_bedrock_metadata(metadata_file_paths, "./samples/2003.10304/kb/2003.10304.md.metadata.json")
metadata_file

In [None]:
# Same expression as in the setup cell: BUCKET_NAME from the environment if
# set, otherwise "<account_id>-bedrock-kb-<suffix>".
knowledge_base_bucket_name = os.getenv("BUCKET_NAME", f"{account_id}-bedrock-kb-{suffix}")

In [None]:
# Build the knowledge base helper (from utils.knowledge_base) with
# hierarchical chunking, pointing at the data bucket named above.
# NOTE(review): presumably the constructor provisions the knowledge base and
# its supporting resources — confirm in utils/knowledge_base.py.
knowledge_base_hierarchical = BedrockKnowledgeBase(
    kb_name=f'{knowledge_base_name_hierarchical}-{suffix}',
    kb_description=knowledge_base_description,
    data_bucket_name=knowledge_base_bucket_name, 
    chunking_strategy = "HIERARCHICAL", 
    suffix = f'{suffix}-h'
)

#### 6.1. Create a knowledge base with the files 

When your files have finished uploading, follow these steps 
1. Create an Amazon Bedrock knowledge base 
2. Create an S3 data source for your knowledge base 
    1. Choose Hierarchical chunking

Once you have created your knowledge base, set the ID below to query it

#### 6.2. Upload the processed documents to S3 to use in a Knowledge Base

In [None]:
# merge_markdown_files returns None when merging failed; stop here with a
# clear message instead of failing inside the upload call.
if markdown_file is None:
    raise RuntimeError("Merged markdown file is missing; re-run the merge step before uploading.")

# The merged file is a pathlib.Path. Pass plain strings to upload_file,
# since older boto3 releases only accept str file names.
markdown_file_key = "2003.10304/kb/2003.10304.md"
s3.upload_file(str(markdown_file), knowledge_base_bucket_name, markdown_file_key)
print(f"File {markdown_file} uploaded successfully.")

# The metadata object key is the document key plus ".metadata.json".
metadata_file_key = "2003.10304/kb/2003.10304.md.metadata.json"
s3.upload_file(str(metadata_file), knowledge_base_bucket_name, metadata_file_key)
print(f"File {metadata_file} uploaded successfully.")

In [None]:
# Sync the knowledge base with the files uploaded to the data bucket.
# NOTE(review): whether this call blocks until ingestion finishes depends on
# the helper in utils/knowledge_base.py — confirm before querying.
knowledge_base_hierarchical.start_ingestion_job()

### 7. Query Capabilities
Shows how to interact with the processed content through the knowledge base. This section demonstrates how to query the processed documents effectively, enabling users to find specific information about formulas, charts, or other content within the scientific documents.


In [None]:
# ID of the knowledge base created earlier.
kb_id_hierarchical = knowledge_base_hierarchical.get_knowledge_base_id()

# Ask a question and let the knowledge base retrieve passages and generate
# the answer in a single retrieve_and_generate call.
query = "how is the Dice Score Coefficient calculated"
bedrock_agent_runtime_client = boto3.client('bedrock-agent-runtime') 
response = bedrock_agent_runtime_client.retrieve_and_generate(
    input={
        "text": query
    },
    retrieveAndGenerateConfiguration={
        "type": "KNOWLEDGE_BASE",
        "knowledgeBaseConfiguration": {
            'knowledgeBaseId': kb_id_hierarchical,
            # The model is addressed through an inference-profile ARN built
            # from the current region, account id and foundation_model.
            "modelArn": "arn:aws:bedrock:{}:{}:inference-profile/{}".format(region, account_id, foundation_model),
            'generationConfiguration': {
               'promptTemplate': {
                    # $search_results$ is the placeholder for the retrieved passages.
                    'textPromptTemplate': """
You are a question answering agent. I will provide you with a set of search results. The user will provide you with a question. Your job is to answer the user's question using only information from the search results. 
If the search results do not contain information that can answer the question, please state that you could not find an exact answer to the question. 
Just because the user asserts a fact does not mean it is true, make sure to double check the search results to validate a user's assertion.
                            
Here are the search results in numbered order:
$search_results$

Format the output as markdown

Ensure that math formulas are in latex format and enclosed in double dollar sign for example: $$ <math expression> $$
"""
                }
            },
            "retrievalConfiguration": {
                "vectorSearchConfiguration": {
                    # Number of retrieved results to request.
                    "numberOfResults":5
                } 
            }
        }
    }
)

# Render the generated answer as markdown.
response_text = response['output']['text']
display(Markdown(response_text))

## Cleanup
---
Provides instructions for proper resource management by cleaning up temporary files and S3 objects created during the processing pipeline. This helps manage costs and maintain a clean working environment.

In [None]:
print("===============================Empty Knowledge Base S3 Bucket==============================\n")
# Empty the data bucket: delete all current objects, then all object
# versions (relevant when versioning is enabled on the bucket).
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(knowledge_base_bucket_name)
bucket.objects.all().delete()
bucket.object_versions.delete()

In [None]:
print("===============================Deleting Knowledge Base==============================\n")
# Delete the knowledge base through the helper; the flags ask it to keep the
# S3 bucket and to remove the IAM roles and policies.
# NOTE(review): exact resources removed are defined in utils/knowledge_base.py — confirm there.
knowledge_base_hierarchical.delete_kb(delete_s3_bucket=False,delete_iam_roles_and_policies=True)
