# Step 1: Install dependencies

In [None]:
!pip install pdf2image tqdm boto3
!pip install numpy Pillow

# Step 2: Convert PDFs to PNGs (one image per page)

In [None]:
import os
from pdf2image import convert_from_path
from tqdm import tqdm

# List all PDF files in current directory. The extension check is
# case-insensitive so names such as 'BILL.PDF' are picked up too, and the
# list is sorted so every run processes the files in the same order
# (os.listdir returns entries in arbitrary order).
pdf_files = sorted(f for f in os.listdir('.') if f.lower().endswith('.pdf'))

# Process each PDF file
for pdf_file in tqdm(pdf_files, desc='Converting PDFs to PNGs'):
    base_name = os.path.splitext(pdf_file)[0]

    # Convert PDF to images
    images = convert_from_path(pdf_file)

    # Zero-pad page numbers to at least two digits, widening the padding for
    # documents with 100+ pages so that sorting the PNG names alphabetically
    # still yields page order. Names for documents under 100 pages are
    # unchanged ('<base>_01.png', '<base>_02.png', ...).
    pad_width = max(2, len(str(len(images))))

    # Save each page as PNG
    for i, image in enumerate(images, start=1):
        output_file = f'{base_name}_{i:0{pad_width}d}.png'
        image.save(output_file, 'PNG')

# Step 3: Extract content from all PNG files

In [None]:
import boto3
from tqdm import tqdm
import re

# Initialize AWS Textract client (region fixed to us-east-1); it is used
# further down in this cell for the detect_document_text calls.
textract = boto3.client('textract', region_name='us-east-1')

def get_base_filename(filename):
    """Return the document stem of *filename*.

    The file extension is dropped first; then a trailing '_<digits>' page
    suffix (e.g. '_01', '_02') is removed if one is present.
    """
    stem, _extension = os.path.splitext(filename)
    return re.sub(r'_\d+$', '', stem)

# Get all PNG files
png_files = [f for f in os.listdir('.') if f.endswith('.png')]

# Group files by base name (the document they belong to)
file_groups = {}
for png_file in png_files:
    base_name = get_base_filename(png_file)
    file_groups.setdefault(base_name, []).append(png_file)


def _page_sort_key(png_name):
    """Return a sort key that orders page images by their numeric suffix.

    A plain alphabetical sort puts '_100' before '_11', which scrambles the
    page order of documents with 100 or more pages. Files without a numeric
    suffix sort first; the name itself breaks ties.
    """
    match = re.search(r'_(\d+)$', os.path.splitext(png_name)[0])
    page_number = int(match.group(1)) if match else 0
    return (page_number, png_name)


# Process each group of files, writing one combined TXT file per document
for base_name, files in tqdm(file_groups.items(), desc='Processing PNG files'):
    output_txt = f'{base_name}.txt'

    # Skip if TXT file already exists
    if os.path.exists(output_txt):
        continue

    all_text = []

    # Sort files numerically by page suffix to ensure correct page order
    files.sort(key=_page_sort_key)

    for png_file in files:
        # Read the image fully and close the file before the network call
        with open(png_file, 'rb') as image:
            image_bytes = image.read()

        # Call Textract
        response = textract.detect_document_text(
            Document={'Bytes': image_bytes}
        )

        # Extract text from response: keep only LINE blocks, one per line
        page_text = '\n'.join(
            item['Text'] for item in response['Blocks'] if item['BlockType'] == 'LINE'
        )
        all_text.append(page_text)

    # Write combined text to file (only reached if every page succeeded)
    with open(output_txt, 'w') as f:
        f.write('\n'.join(all_text))

# Step 4: Extract structured billing data from the text files with Bedrock

In [None]:
import boto3
import json
from datetime import datetime
from tqdm import tqdm

# Initialize Bedrock client (region fixed to us-east-1)
client = boto3.client("bedrock-runtime", region_name="us-east-1")
# Model identifier passed as modelId when the model is invoked
MODEL_ID = "us.amazon.nova-pro-v1:0"

def process_text_with_bedrock(text_content):
    """Ask the Bedrock model to turn a bill transcript into structured data.

    Embeds *text_content* in a fixed extraction prompt, concatenates the text
    deltas streamed back by the model, and parses the span from the first '{'
    to the last '}' of the reply as JSON.

    Returns the parsed JSON value, or None (after printing the error) when
    that span cannot be parsed.
    """
    prompt = f"Go through this transcript of an electric bill and parse all the relevant information (name, address, phone, usage, cost, etc) into a JSON in the following format:\n\n{{\
  \"name\": \"<customer name>\",\
  \"account\": \"<account number>\",\
  \"address\": \"<address broken down into sub-properties for address, city, zip, etc>\",\
  \"phone\": \"<phone number>\",\
  \"email\": \"<customer email>\",\
  \"dueDate\": \"<Due date in YYYY-MM-DD format>\",\
  \"amount\": <amount in number format>,\
  \"usage\": <total kWh usage in number format>\
}}\n\nDon't include anything on the response other than the JSON.\n\nHere's the content:\n{text_content}"

    payload = json.dumps({
        "schemaVersion": "messages-v1",
        "messages": [{"role": "user", "content": [{"text": prompt}]}],
        "inferenceConfig": {"maxTokens": 2048, "topP": 0.9, "topK": 20, "temperature": 0.7},
    })
    response = client.invoke_model_with_response_stream(modelId=MODEL_ID, body=payload)

    # Collect the text of every contentBlockDelta event; events without a
    # chunk or without a contentBlockDelta are skipped.
    pieces = []
    for event in response.get("body") or ():
        chunk = event.get("chunk")
        if not chunk:
            continue
        chunk_json = json.loads(chunk.get("bytes").decode())
        content_block_delta = chunk_json.get("contentBlockDelta")
        if not content_block_delta:
            continue
        pieces.append(content_block_delta.get("delta", {}).get("text", ""))
    full_response = "".join(pieces)

    # Keep only the outermost {...} span so surrounding chatter is dropped.
    # With no braces in the reply the slice is empty and parsing fails below.
    json_str = full_response[full_response.find('{'):full_response.rfind('}') + 1]
    try:
        return json.loads(json_str)
    except Exception as e:
        print(f"Error parsing JSON: {e}")
        return None

# Collect every .txt file in the current directory
txt_files = [entry for entry in os.listdir('.') if entry.endswith('.txt')]

# Turn each text file into a JSON file with the same stem
for txt_file in tqdm(txt_files, desc='Processing text files'):
    json_file = os.path.splitext(txt_file)[0] + ".json"

    # Only files that do not have a JSON output yet are sent to Bedrock
    if not os.path.exists(json_file):
        # Load the transcript
        with open(txt_file, 'r') as f:
            text_content = f.read()

        # Run the extraction
        result = process_text_with_bedrock(text_content)

        # A falsy result (None on a parse failure, or an empty value) is
        # not written, so the file is retried on the next run
        if result:
            with open(json_file, 'w') as f:
                json.dump(result, f, indent=2)