# 1. PACKAGE INSTALLATION

In [None]:
# Install required packages (IPython shell escapes; run inside a notebook).
# boto3 and pillow back the `boto3` and `PIL` imports used by this notebook.
# NOTE(review): ipywidgets is not imported by any cell shown here - confirm
# it is still needed before keeping this install line.
!pip install boto3
!pip install pillow
!pip install ipywidgets

# 2. IMPORTS AND CONFIGURATIONS

In [2]:
# Import necessary libraries
import os
import json
import boto3
import base64
from PIL import Image
from collections import defaultdict
from io import BytesIO

# --- Input / output locations -------------------------------------------
ROOT_FOLDER = "images-2"          # directory tree that is walked for images
OUTPUT_FILE = "image_nova.json"   # JSON file holding the descriptions

# --- File filtering -----------------------------------------------------
# Extensions are matched against the lower-cased file name.
SUPPORTED_FORMATS = (".jpg", ".jpeg", ".png", ".gif", ".bmp")
# Any file or directory whose name contains one of these is skipped.
IGNORE_PATTERNS = (".ipynb_checkpoints", "-checkpoint")

# --- AWS / Bedrock settings ---------------------------------------------
# NOTE(review): static keys are left blank here; avoid committing real
# credentials to this file.
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION = "us-east-1"
SONNET_MODEL_ID = "anthropic.claude-3-5-sonnet-20240620-v1:0"
NOVA_MODEL_ID = "amazon.nova-pro-v1:0"


# Seed the output file with an empty JSON object the first time through so
# later cells can always open and parse it.
_output_missing = not os.path.exists(OUTPUT_FILE)
if _output_missing:
    with open(OUTPUT_FILE, "w") as _handle:
        _handle.write(json.dumps({}))
    print(f"Created empty {OUTPUT_FILE}")

# 3. MODEL INITIALIZATION

In [3]:
# Build a boto3 session from the configured access keys, then open a
# Bedrock runtime client in the configured region. Both names stay at
# module level because later cells call `bedrock_runtime_client`.
_static_keys = {
    "aws_access_key_id": AWS_ACCESS_KEY_ID,
    "aws_secret_access_key": AWS_SECRET_ACCESS_KEY,
}
session = boto3.Session(**_static_keys)
bedrock_runtime_client = session.client(
    "bedrock-runtime",
    region_name=AWS_REGION,
)

# 4. TEST CONNECTION

In [None]:
# Smoke-test both models with a tiny text-only request. Each confirmation
# line is printed only if the corresponding call returns without raising.

# Claude 3.5 Sonnet: Anthropic messages payload.
_sonnet_ping = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 100,
    "messages": [
        {
            "role": "user",
            "content": [{"type": "text", "text": "hello world"}],
        }
    ],
}
test_invoke_sonnet = bedrock_runtime_client.invoke_model(
    modelId=SONNET_MODEL_ID,
    body=json.dumps(_sonnet_ping),
)
print("Sonnet Model access confirmed")

# Nova-Pro: "messages-v1" schema payload.
_nova_ping = {
    "schemaVersion": "messages-v1",
    "inferenceConfig": {"max_new_tokens": 100},
    "messages": [
        {
            "role": "user",
            "content": [{"text": "hello world"}],
        }
    ],
}
test_invoke_nova = bedrock_runtime_client.invoke_model(
    modelId=NOVA_MODEL_ID,
    body=json.dumps(_nova_ping),
)
print("Nova Model access confirmed")

# 5. HELPER FUNCTIONS

In [None]:
def nested_dict():
    """Return an empty defaultdict whose missing keys are nested_dicts too.

    Looking up an absent key creates another level of the same kind, so a
    folder hierarchy can be filled in without creating parents first.
    """
    tree = defaultdict(nested_dict)
    return tree

def convert_defaultdict_to_dict(d):
    """Return a copy of ``d`` in which every dict level is a plain ``dict``.

    Recurses through any dict (plain or ``defaultdict``), so a
    ``defaultdict`` stored underneath a plain dict is converted as well.
    Previously recursion stopped at the first level that was not a
    ``defaultdict``.

    Args:
        d: A (possibly nested) dict/defaultdict, or any other value.

    Returns:
        A new plain ``dict`` tree for dict inputs; non-dict values are
        returned unchanged.
    """
    if isinstance(d, dict):
        return {key: convert_defaultdict_to_dict(value) for key, value in d.items()}
    return d

def encode_image(image_path):
    """Re-encode an image as JPEG and return it as a base64 string.

    JPEG has no alpha channel. Images that carry transparency (RGBA, LA,
    or palette images with a transparency entry) are flattened onto a white
    background first; a bare ``convert('RGB')`` just drops the alpha
    channel, which usually turns transparent regions black and can hide
    dark text. All other non-RGB modes are converted to RGB directly.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The JPEG bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    with Image.open(image_path) as img:
        has_alpha = img.mode in ('RGBA', 'LA') or (
            img.mode == 'P' and 'transparency' in img.info
        )
        if has_alpha:
            rgba = img.convert('RGBA')
            # Use the alpha band as the paste mask so transparent pixels
            # keep the white canvas colour.
            rgb = Image.new('RGB', rgba.size, (255, 255, 255))
            rgb.paste(rgba, mask=rgba.getchannel('A'))
        elif img.mode != 'RGB':
            rgb = img.convert('RGB')
        else:
            rgb = img

        # Encode while the source file is still open (pixel data may be
        # loaded lazily).
        buffer = BytesIO()
        rgb.save(buffer, format='JPEG')

    return base64.b64encode(buffer.getvalue()).decode('utf-8')


def process_image(image_path, model_type="sonnet"):
    """Describe one image with Claude 3.5 Sonnet or Nova-Pro via Bedrock.

    Args:
        image_path: Path of the image file to describe.
        model_type: "sonnet" selects the Sonnet request and response
            format; any other value selects the Nova-Pro format.

    Returns:
        The text of the first content item in the model's reply.
    """
    # The image is always sent as base64-encoded JPEG data.
    encoded = encode_image(image_path)
    use_sonnet = model_type == "sonnet"

    prompt = """
    Analyze this image and provide a standardized description using the following structure. Your description will be used for legal and regulatory information retrieval in British Columbia. If you see a mathematical expression, treat it as a legitimate image type requiring the same detailed analysis.

1. Image Type and Category:
- Primary classification (form, map, mathematical expression, table, diagram, etc.)
- Specific subcategory with domain context (e.g., "legal application form," "territorial boundary map," "regulatory compliance table")

2. Document Context:
- All visible identifiers (form numbers, file references, document titles)
- Temporal information (dates, versions)
- Location references (appendix numbers, section markers)
- Jurisdictional markers (e.g., "British Columbia," "Provincial Court")

3. Content Elements:
- Primary purpose or subject matter
- Key terms, legal references, and technical language
- All numerical data, coordinates, or mathematical notation
- Geographic references, boundary descriptions, or spatial information
- Form fields and their purposes (if applicable)
- Mathematical expressions and their components (if applicable)

4. Structural Components:
- Organization pattern (columns, sections, hierarchical levels)
- Relationships between elements
- Visual navigation aids (headers, borders, groupings)
- Any coordinate systems or scaling (for maps)
- Formula composition (for mathematical expressions)

5. Technical Specifications:
- Official markings or seals
- Required fields or sections
- Color coding or visual emphasis
- Special characters or notation
- Legend information (if present)

Rules for Description:

1. Treat all image types (including mathematical expressions) as valid subjects for full analysis
2. Use definitive language without qualifiers like "appears to" or "seems to"
3. Include all visible text verbatim where relevant
4. Maintain professional, technical terminology
5. Focus on elements that support document retrieval and identification
6. Describe mathematical expressions using proper mathematical terminology
7. Include all geographic or spatial references precisely

Your description must support accurate retrieval in a legal knowledge base and clearly distinguish this document from similar items in the collection.
    """

    if use_sonnet:
        # Anthropic messages payload: image part first, then the prompt.
        model_id = SONNET_MODEL_ID
        image_part = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": encoded,
            },
        }
        text_part = {"type": "text", "text": prompt}
        payload = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 5000,
            "messages": [{"role": "user", "content": [image_part, text_part]}],
        }
    else:
        # Nova-Pro "messages-v1" payload: image part first, then the prompt.
        model_id = NOVA_MODEL_ID
        image_part = {"image": {"format": "jpeg", "source": {"bytes": encoded}}}
        text_part = {"text": prompt}
        payload = {
            "schemaVersion": "messages-v1",
            "messages": [{"role": "user", "content": [image_part, text_part]}],
            "inferenceConfig": {"max_new_tokens": 5000},
        }

    # Invoke the selected model and decode the JSON reply.
    raw_response = bedrock_runtime_client.invoke_model(
        modelId=model_id,
        body=json.dumps(payload),
    )
    reply = json.loads(raw_response['body'].read())

    # The two model families nest the generated text differently.
    if not use_sonnet:
        return reply['output']['message']['content'][0]['text']
    return reply['content'][0]['text']

# 6. MAIN PROCESSING LOGIC

In [None]:
def _to_nested(value):
    """Rebuild a loaded JSON value so every dict level is a nested_dict."""
    if not isinstance(value, dict):
        return value
    tree = nested_dict()
    for key, child in value.items():
        tree[key] = _to_nested(child)
    return tree


def _folder_parts(rel_path):
    """Split a path relative to ROOT_FOLDER into folder names ('.' -> [])."""
    return [] if rel_path == '.' else rel_path.split(os.sep)


def _already_described(tree, folder_parts, filename):
    """Return True if filename has an entry under folder_parts; never mutates tree."""
    node = tree
    for part in folder_parts:
        # `in` does not trigger defaultdict's auto-creation, unlike indexing.
        if not isinstance(node, dict) or part not in node:
            return False
        node = node[part]
    return isinstance(node, dict) and filename in node


def _save_results(results):
    """Write results to OUTPUT_FILE via a temporary file and os.replace.

    Writing to a sibling file first means an interrupted write cannot leave
    OUTPUT_FILE truncated or half-written.
    """
    tmp_path = f"{OUTPUT_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(convert_defaultdict_to_dict(results), f, indent=4)
    os.replace(tmp_path, OUTPUT_FILE)


def main():
    """Interactively describe every not-yet-described image under ROOT_FOLDER.

    Steps:
      1. Ask which model to use ("1" -> Sonnet, anything else -> Nova-Pro).
      2. Load existing descriptions from OUTPUT_FILE.
      3. Walk ROOT_FOLDER and work out which images have no entry yet,
         including images stored directly in ROOT_FOLDER.
      4. After confirmation, describe each remaining image and save
         OUTPUT_FILE after every success.

    A failure on one image is reported and skipped so the rest of the batch
    still runs. Returns None.
    """
    # Model selection
    print("\nAvailable models:")
    print("1. Claude 3.5 Sonnet V1")
    print("2. Nova-Pro")
    model_choice = input("Select model (1 or 2): ")
    model_type = "sonnet" if model_choice.strip() == "1" else "nova"

    # Load existing descriptions. Every level is rebuilt as a nested_dict so
    # new sub-folders can later be added at any depth.
    results = nested_dict()
    try:
        with open(OUTPUT_FILE, 'r') as f:
            existing_results = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        existing_results = None
    if isinstance(existing_results, dict):
        results = _to_nested(existing_results)
        print(f"Loaded existing results from {OUTPUT_FILE}")
    else:
        print(f"Starting with empty results as {OUTPUT_FILE} is missing, empty or invalid")

    all_image_paths = set()
    processed_images = set()

    # First pass: collect all image paths and those already described.
    for dirpath, dirnames, filenames in os.walk(ROOT_FOLDER):
        # Prune checkpoint directories in place so os.walk skips them.
        dirnames[:] = [
            d for d in dirnames
            if not any(pattern in d for pattern in IGNORE_PATTERNS)
        ]

        image_files = [
            f for f in filenames
            if f.lower().endswith(SUPPORTED_FORMATS)
            and not any(pattern in f for pattern in IGNORE_PATTERNS)
        ]
        if not image_files:
            continue

        parts = _folder_parts(os.path.relpath(dirpath, ROOT_FOLDER))
        for filename in image_files:
            full_path = os.path.join(dirpath, filename)
            all_image_paths.add(full_path)
            # Read-only lookup: also covers files directly in ROOT_FOLDER
            # and does not create empty folder entries in `results`.
            if _already_described(results, parts, filename):
                processed_images.add(full_path)

    images_to_process = all_image_paths - processed_images

    print("\nProcessing Summary:")
    print(f"Total images found: {len(all_image_paths)}")
    print(f"Already processed: {len(processed_images)}")
    print(f"Remaining to process: {len(images_to_process)}")

    if not images_to_process:
        print("\nNo new images to process. Exiting...")
        return

    proceed = input(f"\nProceed with processing {len(images_to_process)} images? (y/n): ")
    if proceed.strip().lower() != 'y':
        print("Processing cancelled by user.")
        return

    # Second pass: process only new images, in sorted order for repeatability.
    total = len(images_to_process)
    succeeded = 0

    for count, image_path in enumerate(sorted(images_to_process), start=1):
        rel_path = os.path.relpath(os.path.dirname(image_path), ROOT_FOLDER)
        filename = os.path.basename(image_path)

        print(f"\nProcessing image {count}/{total}: {image_path}")

        # Deliberately broad: one bad image (encoding error, API error, a
        # folder/file name clash in `results`, a failed save) must not
        # abort the rest of the batch.
        try:
            current_dict = results
            for path_part in _folder_parts(rel_path):
                current_dict = current_dict[path_part]

            current_dict[filename] = process_image(image_path, model_type=model_type)
            print(f"✓ Successfully processed: {image_path}")

            # Save after each success so progress survives a later failure.
            _save_results(results)
            print(f"✓ Progress saved to {OUTPUT_FILE}")
            succeeded += 1
        except Exception as e:
            print(f"✕ Error processing {image_path}: {str(e)}")
            continue

    print("\nProcessing complete!")
    print(f"Total images processed in this run: {succeeded}")
    if succeeded != total:
        print(f"Images that failed in this run: {total - succeeded}")
    print(f"Results saved to: {OUTPUT_FILE}")

# 7. EXECUTION

In [None]:
# Run the interactive workflow only when this code is the top-level program
# (__name__ is "__main__"); importing the module does not call main().
if __name__ == "__main__":
    main()