# 1. PACKAGE INSTALLATION

In [None]:
# Install required packages
!pip install boto3
!pip install pillow
!pip install ipywidgets

# 2. IMPORTS AND CONFIGURATIONS

In [None]:
# Import necessary libraries
import os
import json
import boto3
import base64
from PIL import Image
from collections import defaultdict
from io import BytesIO

# --- Input / output locations ---
ROOT_FOLDER = 'images'  # directory tree walked for image files
OUTPUT_FILE = 'image_sonnet.json'  # JSON file that accumulates the descriptions

# --- File filtering ---
SUPPORTED_FORMATS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
IGNORE_PATTERNS = ('.ipynb_checkpoints', '-checkpoint')

# --- AWS / Bedrock settings ---
AWS_REGION = "us-east-1"
MODEL_ID = "anthropic.claude-3-5-sonnet-20240620-v1:0"
# Credential placeholders handed to boto3.Session; intentionally blank here
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""


# Seed the output file with an empty JSON object the first time through
if not os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'w') as f:
        f.write(json.dumps({}))
    print(f"Created empty {OUTPUT_FILE}")

# 3. MODEL INITIALIZATION

In [None]:
# Build a boto3 session from the configured credentials, then derive the
# Bedrock runtime client for the configured region from that session
session = boto3.Session(
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
)
bedrock_runtime_client = session.client(
    service_name="bedrock-runtime",
    region_name=AWS_REGION,
)

# 4. TEST CONNECTION

In [None]:
# Smoke test: send a minimal text-only message to the configured model.
# invoke_model raises if the call fails, so the confirmation line below is
# only reached when the request went through.
_smoke_test_body = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 200,
    "messages": [
        {
            "role": "user",
            "content": [{"type": "text", "text": "hello world"}],
        },
    ],
}
test_invoke = bedrock_runtime_client.invoke_model(
    modelId=MODEL_ID,
    body=json.dumps(_smoke_test_body),
)
print("Sonnet Model access confirmed")

# 5. HELPER FUNCTIONS

In [None]:
def nested_dict():
    """Return an auto-vivifying mapping.

    Accessing a missing key creates another mapping of the same kind, so
    arbitrarily deep paths can be assigned without creating the
    intermediate levels first.
    """
    tree = defaultdict(nested_dict)
    return tree

def convert_defaultdict_to_dict(d):
    """Recursively convert defaultdicts to plain dicts for JSON serialization.

    The whole structure is traversed: dicts (including defaultdict and other
    dict subclasses) become plain ``dict`` objects and lists are rebuilt with
    their items converted. This also reaches a defaultdict that sits below a
    plain dict or inside a list, which a check on ``defaultdict`` alone
    would miss.

    Args:
        d: Any value; typically the nested results tree.

    Returns:
        A structure equal to *d* in which every mapping is a plain ``dict``.
        Values that are neither dicts nor lists are returned unchanged.
    """
    if isinstance(d, dict):
        return {key: convert_defaultdict_to_dict(value) for key, value in d.items()}
    if isinstance(d, list):
        return [convert_defaultdict_to_dict(item) for item in d]
    return d

def encode_image(image_path):
    """Return the image at *image_path* as a base64-encoded JPEG string.

    The image is re-encoded as JPEG regardless of its original format.
    JPEG has no alpha channel, so images that carry transparency are first
    flattened onto a white background; a plain ``convert('RGB')`` would
    discard the alpha channel and typically leave transparent regions black,
    hiding dark content such as text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    with Image.open(image_path) as img:
        has_alpha = img.mode in ('RGBA', 'LA') or (
            img.mode == 'P' and 'transparency' in img.info
        )
        if has_alpha:
            # Composite onto white using the alpha channel as the paste mask
            rgba = img.convert('RGBA')
            flattened = Image.new('RGB', rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.split()[3])
            img = flattened
        elif img.mode != 'RGB':
            img = img.convert('RGB')

        # Serialize to an in-memory JPEG and base64-encode the bytes
        buffer = BytesIO()
        img.save(buffer, format='JPEG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')


def process_image(image_path):
    """Describe one image with the Bedrock model selected by MODEL_ID.

    The image is base64-encoded via encode_image, sent together with a fixed
    indexing-oriented prompt in a single user message, and the text of the
    first content item of the model's reply is returned.
    """
    # Instruction text sent alongside the image
    instructions = """Analyze and comprehensively describe the following image in a manner optimized for legal and regulatory indexing and retrieval, ensuring all details are factual and explicitly supported by visible content. Your description will be used for identifying this image in a graph database to support a Retrieval-Augmented Generation (RAG) pipeline for British Columbia (BC) laws. Structure your description according to the following format:

1. Image Type and Category:
- Specify the primary type of image (e.g., diagram, chart, seal, form, table, map, figure, etc.).
- If applicable, identify subcategories, such as "organizational chart," "geographical map," "tax form," or "compliance table."

2. Identifier Information:
- Extract and list any visible document numbers, legal references, or codes.
- Include dates, version numbers, or other temporal markers.
- Note any page numbers or section markers, as well as location indicators (e.g., “Section 5.2” or “Appendix B”).

3. Content Description:
- Summarize the main subject or topic reflected in the image (e.g., “Building Code Regulation Exemptions” or “District Zoning Compliance Map”).
- Extract key terms and specific language visible in the image, especially technical or legal terminology.
- Include all measurements, quantities, percentages, or numerical data.
- Explicitly list proper nouns, regulatory bodies, names of laws, acts, or agencies.

4. Visual Structure and Layout:
- Describe the image's overall organization and structure (e.g., hierarchical elements, visually grouped sections, or thematic divisions).
- Specify relationships between elements (e.g., arrows representing steps in a process, lines indicating relationships, or columns and rows in a table).
- Note any use of color, bolding, or other visual emphasis that enhances meaning or denotes priority.

5. Distinctive Features:
- Identify any unique or notable elements, such as seals, emblems, watermarks, or jurisdiction-specific markings.
- Include symbols, special characters, or formatting that stand out (e.g., "red warning labels," "italicized legal clauses").
- Describe any unusual visual arrangements or stylistic choices.

Guidelines for Description:
- Use precise, searchable language that prioritizes accuracy and completeness.
- DO NOT USE speculative language such as “it appears,” “it might,” or “it seems.”
- Responses should be formulated in a confident and precise tone, without subjective interpretation.
- Include as much specificity as possible, as these descriptions will assist in indexing the image for efficient retrieval.
- Use clear, searchable legal and regulatory terminology wherever applicable.

YOU MUST focus on delivering a carefully considered response with the aim of maximizing retrieval accuracy and relevance."""

    # The image part comes first in the message, followed by the instructions
    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/jpeg",
            "data": encode_image(image_path),
        },
    }
    text_part = {"type": "text", "text": instructions}

    payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2000,
        "messages": [
            {"role": "user", "content": [image_part, text_part]},
        ],
    }

    # Call the model and pull the text out of the first content item
    raw_response = bedrock_runtime_client.invoke_model(
        modelId=MODEL_ID,
        body=json.dumps(payload),
    )
    parsed = json.loads(raw_response['body'].read())
    first_item = parsed['content'][0]
    return first_item['text']

# 6. MAIN PROCESSING LOGIC

In [None]:
def _to_nested(value):
    """Recursively rebuild JSON-loaded dicts as nested defaultdicts."""
    if not isinstance(value, dict):
        return value
    node = nested_dict()
    for key, child in value.items():
        node[key] = _to_nested(child)
    return node


def _already_described(results, rel_path, filename):
    """Return True if results holds an entry for filename under rel_path.

    Uses membership tests rather than indexing so that looking up a folder
    never auto-creates an empty entry in the defaultdict tree.
    """
    node = results
    if rel_path != '.':
        for part in rel_path.split(os.sep):
            if not isinstance(node, dict) or part not in node:
                return False
            node = node[part]
    return isinstance(node, dict) and filename in node


def _save_results(results):
    """Write results to OUTPUT_FILE through a temporary file.

    The data is written to a sibling ``.tmp`` file and then moved over
    OUTPUT_FILE, so an interrupted write cannot leave OUTPUT_FILE truncated.
    """
    tmp_path = OUTPUT_FILE + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(convert_defaultdict_to_dict(results), f, indent=4)
    os.replace(tmp_path, OUTPUT_FILE)


def main():
    """Describe every not-yet-described image under ROOT_FOLDER.

    Steps:
      1. Load earlier results from OUTPUT_FILE (a missing, empty or invalid
         file means starting from scratch).
      2. Walk ROOT_FOLDER, skipping checkpoint folders/files, and work out
         which supported images have no entry yet. Images located directly
         in ROOT_FOLDER are checked as well as those in subfolders.
      3. Ask for confirmation, then call process_image for each remaining
         image and save progress after every success.

    Results are stored as a tree that mirrors the folder structure, with
    file names as the leaf keys. A failure on one image is reported and the
    run continues with the next image.
    """
    # Load existing descriptions, if any, as a fully nested defaultdict tree
    # so that any depth of new subfolder can be added later.
    results = nested_dict()
    try:
        with open(OUTPUT_FILE, 'r') as f:
            existing_results = json.load(f)
    except FileNotFoundError:
        print(f"Starting with empty results as {OUTPUT_FILE} does not exist")
    except json.JSONDecodeError:
        print(f"Starting with empty results as {OUTPUT_FILE} is empty or invalid")
    else:
        if isinstance(existing_results, dict):
            results = _to_nested(existing_results)
            print(f"Loaded existing results from {OUTPUT_FILE}")
        else:
            # A non-object JSON root cannot hold the folder tree
            print(f"Starting with empty results as {OUTPUT_FILE} is empty or invalid")

    # First pass: collect all image paths and those already described
    all_image_paths = set()
    processed_images = set()

    for dirpath, dirnames, filenames in os.walk(ROOT_FOLDER):
        # Prune checkpoint directories in place so os.walk does not descend
        dirnames[:] = [
            d for d in dirnames
            if not any(pattern in d for pattern in IGNORE_PATTERNS)
        ]

        rel_path = os.path.relpath(dirpath, ROOT_FOLDER)

        for filename in filenames:
            if not filename.lower().endswith(SUPPORTED_FORMATS):
                continue
            if any(pattern in filename for pattern in IGNORE_PATTERNS):
                continue

            full_path = os.path.join(dirpath, filename)
            all_image_paths.add(full_path)
            if _already_described(results, rel_path, filename):
                processed_images.add(full_path)

    images_to_process = all_image_paths - processed_images

    print("\nProcessing Summary:")
    print(f"Total images found: {len(all_image_paths)}")
    print(f"Already processed: {len(processed_images)}")
    print(f"Remaining to process: {len(images_to_process)}")

    if not images_to_process:
        print("\nNo new images to process. Exiting...")
        return

    # Ask for confirmation before making any model calls
    proceed = input(f"\nProceed with processing {len(images_to_process)} images? (y/n): ")
    if proceed.strip().lower() != 'y':
        print("Processing cancelled by user.")
        return

    # Second pass: process only new images, in sorted order for consistency
    total = len(images_to_process)
    succeeded = 0
    failed = 0

    for index, image_path in enumerate(sorted(images_to_process), start=1):
        rel_path = os.path.relpath(os.path.dirname(image_path), ROOT_FOLDER)
        filename = os.path.basename(image_path)

        print(f"\nProcessing image {index}/{total}: {image_path}")

        try:
            description = process_image(image_path)

            # Only touch the results tree once a description exists, so a
            # failed image does not leave an empty folder entry behind.
            target = results
            if rel_path != '.':
                for path_part in rel_path.split(os.sep):
                    target = target[path_part]
            target[filename] = description
            print(f"✓ Successfully processed: {image_path}")

            # Save after each successful processing
            _save_results(results)
            print(f"✓ Progress saved to {OUTPUT_FILE}")
            succeeded += 1

        except Exception as e:
            # Best effort: report the failure and move on to the next image
            failed += 1
            print(f"✕ Error processing {image_path}: {str(e)}")
            continue

    print("\nProcessing complete!")
    print(f"Total images processed in this run: {succeeded}")
    if failed:
        print(f"Images that failed in this run: {failed}")
    print(f"Results saved to: {OUTPUT_FILE}")

# 7. EXECUTION

In [None]:
# Entry point: start the processing run only when this code is executed
# directly (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()