In [None]:
# Install the Poppler binaries from conda-forge (-y: no prompt, -q: quiet).
# NOTE(review): presumably needed by pdf2image, which is installed and imported below — confirm.
!conda install -c conda-forge poppler -y -q

In [None]:
!pip install pdf2image boto3 pillow PyMuPDF PyPDF2 pdfminer.six

In [None]:
import fitz  # PyMuPDF
import boto3
import io
import base64
import json
from pdf2image import convert_from_path
from PIL import Image
import time
import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from botocore.config import Config
import re
import csv
import os
import glob

In [None]:
# Path of the PDF to process. Later cells open it with PyMuPDF (fitz.open)
# and parse it with pdfminer (extract_pages).
pdf_path = 'bankstatement.pdf'

In [None]:
# Accumulators that are filled later in the notebook:
#   base64_images     - base64-encoded JPEG string of each rendered page
#   image_sizes       - value returned by calculate_file_size for each image
#   pages_text        - text collected per page by the pdfminer cell
#   bedrock_responses - text returned by the model for each page image
base64_images, image_sizes, pages_text, bedrock_responses = [], [], [], []

# Open the source PDF with PyMuPDF.
pdf_document = fitz.open(pdf_path)

def calculate_file_size(base64_image):
    """Return the decoded size of a base64 payload in kilobytes.

    Args:
        base64_image: Base64-encoded data (str or bytes).

    Returns:
        Number of decoded bytes divided by 1024, as a float.
    """
    bytes_per_kb = 1024
    decoded = base64.b64decode(base64_image)
    return len(decoded) / bytes_per_kb

def get_page_range(start_page, end_page, total_pages):
    """
    Translate a 1-based, inclusive page selection into a range of 0-based indices.

    Passing 0 for both start_page and end_page selects the whole document.
    Otherwise the selection is clamped to [1, total_pages]; if nothing is left
    after clamping, a warning is printed and the whole document is selected.

    Args:
        start_page: First page to include (1-based).
        end_page: Last page to include (1-based, inclusive).
        total_pages: Number of pages in the document.

    Returns:
        A range of 0-based page indices.
    """
    whole_document = range(total_pages)

    if (start_page, end_page) == (0, 0):
        return whole_document

    # Clamp the 1-based inclusive bounds into a 0-based half-open window.
    first_idx = start_page - 1 if start_page > 1 else 0
    stop_idx = end_page if end_page < total_pages else total_pages

    if first_idx < stop_idx:
        return range(first_idx, stop_idx)

    print(f"Warning: Invalid page range ({start_page}-{end_page}). Using full document.")
    return whole_document

# Page selection is 1-based and inclusive; leaving both at 0 selects every page
# (see get_page_range).
start_page = 0
end_page = 0

# Zoom factor applied while rasterising. Rendering at 2x directly yields the
# same pixel dimensions as rendering at 1x and upsampling afterwards, but the
# text is drawn at full resolution instead of being interpolated.
render_zoom = 2

target_pages = get_page_range(start_page, end_page, len(pdf_document))

try:
    # A range supports len() and indexing, so no list copy is needed; guard the
    # empty case so an empty selection does not raise IndexError.
    if len(target_pages) > 0:
        print(f"Processing pages {target_pages[0] + 1} to {target_pages[-1] + 1}")
    else:
        print("No pages to process.")

    for page_num in target_pages:
        page = pdf_document.load_page(page_num)
        # alpha=False keeps the samples at 3 bytes per pixel, matching "RGB" below.
        pix = page.get_pixmap(matrix=fitz.Matrix(render_zoom, render_zoom), alpha=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        buffered = io.BytesIO()
        img.save(buffered, format="JPEG", quality=100)
        img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
        base64_images.append(img_str)
        image_sizes.append(calculate_file_size(img_str))
finally:
    # Release the document even if rendering a page fails.
    pdf_document.close()

# Sizes are in KB (calculate_file_size divides the byte count by 1024).
for page_num, size in zip(target_pages, image_sizes):
    print(f"Page {page_num + 1} image size: {size}")

In [None]:
# Collect the text of every page with pdfminer, one string per page.
# Reset the accumulator here so that re-running this cell replaces the previous
# result instead of appending a second copy of every page.
pages_text = []

try:
    for page_layout in extract_pages(pdf_path):
        # Keep only layout elements that carry text and concatenate them in
        # the order pdfminer yields them.
        text_chunks = [
            element.get_text()
            for element in page_layout
            if isinstance(element, LTTextContainer)
        ]
        pages_text.append("".join(text_chunks).strip())
except Exception as e:
    # Best effort: pages extracted before the failure stay in pages_text.
    print(f"An error occurred: {e}")

In [None]:
# Initialize Bedrock client
# - read_timeout: botocore's default read timeout is 60 seconds, which a long
#   generation (max_tokens is 5000 in the cells below) can exceed; allow more.
# - retries: "adaptive" mode retries throttled requests with client-side rate
#   limiting and backoff, instead of relying on manual sleeps between pages.
my_config = Config(
    region_name='us-west-2',
    read_timeout=300,
    retries={'max_attempts': 5, 'mode': 'adaptive'},
)
session = boto3.Session()
bedrock = session.client('bedrock-runtime', config=my_config)

In [None]:
import glob
# Stage 1: send every rendered page image to the model and store the markdown
# transcription it returns in bedrock_responses (one entry per image, in order).

# Start from an empty list so that re-running this cell does not stack a second
# set of responses on top of the previous run.
bedrock_responses = []

model_id = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
max_output_tokens = 5000

# The instruction text is the same for every page, so it is built once.
# The indentation inside the literal is part of the prompt that is sent.
transcription_prompt = """You are an expert financial document analyst with exceptional attention to detail and proficiency in multiple languages. I will present you with a document from a financial institution, which may be one page of a multi-page statement. This document may contain text in multiple languages. Your task is to meticulously examine every single element of this document and then reproduce its content in markdown format. Absolute precision and completeness are crucial. No detail is too small to reproduce, regardless of the language it's presented in. Approach this task as if overlooking even the tiniest piece of information could have significant consequences. Pay attention to:

                            1. The main body of text
                            2. Any headers or footers
                            3. All corners of the document, especially the top right corner
                            4. Any tables, charts, or graphs
                            5. Footnotes or fine print
                            6. Logos, watermarks, or other branding elements
                            7. Date stamps or page numbers
                            8. Account numbers or other identifying information
                            
                            As you analyze the document:
                            
                            - Create a mental map of its layout and content.
                            - Identify all languages present in the document.
                            - Translate any non-English text you encounter, providing both the original text and its English translation.
                            - Note any discrepancies or differences in information presented in different languages.
                            
                            When reproducing the document content:
                            
                            1. Output the document in markdown format.
                            2. Keep rows and columns aligned for all tables, even if they break across pages.
                            3. To retain all headers and table content, they cannot be omitted
                            4. Do not include any preface or explanation; output the document content directly.
                            
                            When processing tables:

                            1. Visual Parsing Framework:
                            - First pass: Create a mental grid with clear column boundaries
                            - Second pass: Map numerical values to their spatial positions
                            - Third pass: Verify column alignment of all numbers

                            2. Mathematical Balance Validation:
                            - For each row, validate the mathematical relationship:
                                Previous Balance ± Transaction Amount = Current Balance
                            - Use this relationship to determine correct column placement:
                                * If Current Balance > Previous Balance: Amount goes to increase column
                                * If Current Balance < Previous Balance: Amount goes to decrease column

                            3. Spatial Recognition Priority:
                            - Prioritize physical column position over content interpretation
                            - Respect original document's column structure strictly
                            - Maintain consistent alignment for numerical values within columns

                            4. Number Processing:
                            - Process all numerical values as discrete units
                            - Maintain original decimal and thousand separators
                            - Preserve exact spatial relationship between numbers and their columns

                            5. Column Integrity:
                            - Each numerical value belongs to exactly one column
                            - When in doubt, use balance mathematics to verify placement

                            6. If the cell is blank, put a specific text ^blank^ to indicate the cell is blank.
                            
                            For multi-page documents:
                            
                            - Note any page numbers or indications of continuity.
                            - Ensure tables that span multiple pages are reproduced in full, maintaining their structure.
                            """

# Process each image
for i, image in enumerate(base64_images):

    print(f"Processing page {i + 1}")
    try:
        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": max_output_tokens,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": transcription_prompt,
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": image,
                            },
                        },
                    ],
                }
            ],
        }
        print(f"Bedrock Fire page {i + 1}")
        response = bedrock.invoke_model(
            modelId=model_id,
            body=json.dumps(request_body)
        )

        result = json.loads(response.get("body").read())
        input_tokens = result["usage"]["input_tokens"]
        output_tokens = result["usage"]["output_tokens"]
        response_text = result["content"][0]["text"]
        bedrock_responses.append(response_text)
        print(response_text)
        print(f"Processed page {i + 1}")

        print(input_tokens)
        print(output_tokens)

        # A response cut off by the token budget would otherwise go unnoticed
        # and silently drop the tail of the page.
        if result.get("stop_reason") == "max_tokens":
            print(
                f"Warning: page {i + 1} reached the {max_output_tokens}-token "
                "limit; the transcription is probably truncated."
            )

    except Exception as e:
        # Keep one entry per page so indices stay aligned with base64_images;
        # failures are recorded as a string starting with "Error: ".
        print(f"Error processing page {i + 1}: {str(e)}")
        bedrock_responses.append(f"Error: {str(e)}")
    finally:
        print("\n\n")

print("All pages processed. Bedrock responses stored.")


In [None]:
# Stage 2: ask the model to turn each page's markdown transcription into CSV
# transaction rows and append them to output.csv.
# NOTE: output.csv is opened in append mode, so running this cell again adds the
# same rows a second time; delete the file first for a clean run.
model_id = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"

# bedrock_responses is only read here. It must not be modified inside the loop:
# appending to the list being iterated would make the loop process its own
# error entries and never finish while requests keep failing.
for i, page_markdown in enumerate(bedrock_responses):
    print(f"Processing page {i + 1}")
    try:
        # The transcription cell stores failures as "Error: ..." strings; there
        # is no document to analyse for those pages.
        if page_markdown.startswith("Error: "):
            print(f"Skipping page {i + 1}: no transcription available ({page_markdown})")
            continue

        # The indentation inside the literal is part of the prompt that is sent.
        # The label "CREDIT/POSITIVE" is spelled the same way in both places it
        # is mentioned so the model is given a single value to emit.
        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 5000,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"""You are an expert financial data analyst with a keen eye for detail. I will provide you with a markdown-formatted representation of a financial document in the <document> tag. Your task is to carefully analyze this content and answer the following questions:
                            Think out loud, and output your thinking process in a <thinking> tag. 
                            

                            <instructions>
                            Task 2: Extract all transaction activities per bank account. 
                            1. currency
                            2. date (MMM-YY)
                            3. Transaction TYPE 1
                            4. Transaction TYPE 2
                            5. Amount
                            6. Transaction information (particulars)
                            7. Transaction date
                            8. Page Number
                        
                            
                            For transaction type 1, it should be "CREDIT/POSITIVE" or "DEBIT/NEGATIVE". If the transaction is deducting amount from the bank account, then it is "DEBIT/NEGATIVE". Otherwise, it is "CREDIT/POSITIVE"
                            For trasnaction type 2, you should first understand the details of the transaction, and map it to one of the category below.
                            For amount, you should include negative sign (such as -50) if it is a deduction. Make sure all of the amount values are formatted in 0,000.00
                            For (7) Transaction date, make sure the date is formatted in compliant to ISO8601 (YYYY-MM-DD), for example (2023-12-30)

                            Your output should be csv format in a <output> tag. You can skip the header row.
                            If there is no transaction, you should output an empty content withing the <output> tag.

                            </instructions>

                            <currency information>
                            HKD\tHong Kong Dollar
                            USD\tUnited States Dollar
                            CNY\tChinese Yuan
                            </currency information>
                            
                            <transaction type 2 category>
                            Bank Opening
                            Bank Closing
                            Deposits
                            Witdrawal
                            Bank Fee & Other Charges
                            Revenues
                            Expenses
                            Dividend Received
                            Interest Received
                            </transaction type 2 category>

                            <document>
                            {page_markdown}
                            </document>"""
                        }
                    ]
                }
            ],
        }

        # Named api_response so it does not overwrite the loop variable.
        api_response = bedrock.invoke_model(
            modelId=model_id,
            body=json.dumps(request_body)
        )

        result = json.loads(api_response.get("body").read())
        input_tokens = result["usage"]["input_tokens"]
        output_tokens = result["usage"]["output_tokens"]
        response_text = result["content"][0]["text"]
        print(response_text)
        print(input_tokens)
        print(output_tokens)

        # A truncated answer may be missing its closing </output> tag, in which
        # case the search below finds nothing.
        if result.get("stop_reason") == "max_tokens":
            print(f"Warning: page {i + 1} reached the token limit; rows may be missing.")

        output_content = re.search(r'<output>(.*?)</output>', response_text, re.DOTALL)
        if output_content:
            extracted_text = output_content.group(1)

            # Write UTF-8 explicitly: the cleanup below reads the file back as
            # UTF-8, and the platform default encoding may differ.
            with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
                csvfile.write(extracted_text + '\n')

            print("Content has been appended to output.csv")
        else:
            print(f"Warning: no <output> block found for page {i + 1}; nothing was written.")

    except Exception as e:
        print(f"Error processing page {i + 1}: {str(e)}")
    finally:
        print("\n\n\n\n\n")

# Remove blank lines from every output*.csv once, after all pages are done,
# instead of rewriting the files after each page.
csv_files = glob.glob('output*.csv')

print(f"Found {len(csv_files)} CSV files to clean")

for file in csv_files:
    try:
        print(f"Processing {file}...")

        with open(file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Keep only lines that contain something other than whitespace.
        cleaned_lines = [line for line in lines if line.strip()]

        with open(file, 'w', encoding='utf-8') as f:
            f.writelines(cleaned_lines)

        print(f"Successfully cleaned {file}")
        print(f"Removed {len(lines) - len(cleaned_lines)} blank lines")

    except Exception as e:
        # Best effort per file: report the failure and move on to the next one.
        print(f"Error processing {file}: {str(e)}")