In [None]:
import os
import sys
import pathlib
import base64
import pandas as pd
from io import BytesIO

# Try to import a PDF library. PyPDF2 is preferred; pypdf is accepted as a
# fallback and is also bound to the name `PyPDF2`, so code written against
# `PyPDF2.PdfReader` / `PyPDF2.PdfWriter` runs with either package installed.
try:
    import PyPDF2
    PDF_LIBRARY = "PyPDF2"
except ImportError:
    try:
        import pypdf
        # Alias so later references to `PyPDF2` do not raise NameError.
        PyPDF2 = pypdf
        PDF_LIBRARY = "pypdf"
    except ImportError:
        # Neither package is installed; PDF_LIBRARY records that fact.
        PDF_LIBRARY = None

In [None]:
# Add the packages directory to the Python path.
# In Jupyter notebooks, __file__ is not available, so the location is derived
# from the current working directory (four levels up) instead.
TOP_DIR = pathlib.Path(os.getcwd()).parent.parent.parent.parent
PACKAGES_DIR = TOP_DIR / "packages"
# Guard the append so re-running this cell does not add duplicate entries.
if str(PACKAGES_DIR) not in sys.path:
    sys.path.append(str(PACKAGES_DIR))

In [None]:
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        ValueError: if the variable is unset or set to an empty string.
    """
    # .get() avoids a KeyError on a missing variable, so unset and empty
    # values are both reported through the same ValueError.
    value = os.environ.get(name)
    if not value:
        raise ValueError(f"{name} is not set")
    return value


BMDS_BASE_URL = _require_env("BMDS_BASE_URL")
BMDS_WORKSPACE_TOKEN = _require_env("BMDS_WORKSPACE_TOKEN")

# Get organization ID from environment
BMDS_ORG_ID = _require_env("BMDS_ORG_ID")

BMDS_AAI_AIS_BATCH_TAG_ID = _require_env("BMDS_AAI_AIS_BATCH_TAG_ID")

In [None]:
from docrouter_sdk import DocRouterClient
from docrouter_sdk.models.document import ListDocumentsResponse


In [None]:
# DocRouterClient configured with the BMDS base URL and workspace token.
client = DocRouterClient(base_url=BMDS_BASE_URL, api_token=BMDS_WORKSPACE_TOKEN)

In [None]:
# Source PDF to split. Set the BMDS_AAIAIS_PDF environment variable to point at
# a different copy; when it is unset or empty the original local path is used.
AAIAIS_PDF = (
    os.environ.get("BMDS_AAIAIS_PDF")
    or "/Users/andrei/Documents/Analytiq/Customers/BMDS/data/AAIAIS20210512.pdf"
)

In [None]:
# Read the PDF file and split into pages
def split_pdf_into_pages(pdf_path, output_dir, filename_prefix="AAIAIS_page_"):
    """Split a PDF file into single-page PDFs and return the page data.

    Each page is written to `output_dir` as
    `<filename_prefix><page number, zero-padded to 3 digits>.pdf`.

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory for the page files (str or pathlib.Path).
            It is created if it does not exist.
        filename_prefix: Prefix of every generated page filename.

    Returns:
        A list with one dict per page, in page order, with the keys
        'filename', 'path' (pathlib.Path), 'data' (bytes of the one-page PDF)
        and 'page_number' (1-based).

    Raises:
        FileNotFoundError: if `pdf_path` does not exist.
        PermissionError: if `pdf_path` is not readable.
        ImportError: if neither PyPDF2 nor pypdf can be imported.
        Exception: any error raised while reading or writing is printed
            and re-raised unchanged.
    """
    # Check if file exists and is readable
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    if not os.access(pdf_path, os.R_OK):
        raise PermissionError(f"Permission denied: Cannot read {pdf_path}")

    # Resolve the PDF library locally so the function works with either
    # package instead of failing with NameError when only pypdf is installed.
    try:
        import PyPDF2 as pdf_lib
    except ImportError:
        try:
            import pypdf as pdf_lib
        except ImportError as exc:
            raise ImportError(
                "No PDF library available: install PyPDF2 or pypdf"
            ) from exc

    # Accept plain strings as well as Path objects for the output location.
    output_dir = pathlib.Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Reading PDF file: {pdf_path}")
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    pages_data = []

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = pdf_lib.PdfReader(file)
            num_pages = len(pdf_reader.pages)

            print(f"Found {num_pages} pages in PDF")

            for page_num in range(num_pages):
                # Create a new PDF writer holding only this page
                pdf_writer = pdf_lib.PdfWriter()
                pdf_writer.add_page(pdf_reader.pages[page_num])

                # Serialise once in memory; the same bytes go to disk and
                # into the result, so the file need not be read back.
                buffer = BytesIO()
                pdf_writer.write(buffer)
                page_data = buffer.getvalue()

                # Create page filename
                page_filename = f"{filename_prefix}{page_num + 1:03d}.pdf"
                page_path = output_dir / page_filename

                # Write the page to a file
                with open(page_path, 'wb') as output_file:
                    output_file.write(page_data)

                pages_data.append({
                    'filename': page_filename,
                    'path': page_path,
                    'data': page_data,
                    'page_number': page_num + 1
                })

                print(f"  Created page {page_num + 1}: {page_filename}")

    except Exception as e:
        print(f"Error reading PDF file: {e}")
        raise

    return pages_data

In [None]:
# Define output directory: an "output" folder next to the source PDF, so the
# location follows AAIAIS_PDF instead of repeating a second absolute path.
output_dir = pathlib.Path(AAIAIS_PDF).parent / "output"
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Reading PDF file: {AAIAIS_PDF}")
print(f"Output directory: {output_dir}")

# Split the PDF into pages
pages_data = split_pdf_into_pages(AAIAIS_PDF, output_dir)

print(f"\nCompleted! Created {len(pages_data)} PDF pages in {output_dir}")
print(f"Created page files: {[page['filename'] for page in pages_data]}")


In [None]:
# Upload PDF pages to workspace (only missing ones)

print("Checking which PDF pages are already in the workspace...")

# Get current documents in workspace
# NOTE(review): if documents.list() returns only one page of results, this set
# is incomplete and pages beyond it would be uploaded again — confirm against
# the SDK before relying on the "only missing ones" behaviour.
existing_documents = client.documents.list(BMDS_ORG_ID)
existing_filenames = {doc.document_name for doc in existing_documents.documents}

print(f"Found {len(existing_filenames)} existing documents in workspace")
print(f"Existing filenames: {sorted(existing_filenames)}")

# Report the pages that are already present, then keep the rest for upload
for page_data in pages_data:
    if page_data['filename'] in existing_filenames:
        print(f"  ✓ {page_data['filename']} already exists in workspace")

pdf_pages_to_upload = [
    page_data for page_data in pages_data
    if page_data['filename'] not in existing_filenames
]

print(f"\nPDF pages to upload: {[page['filename'] for page in pdf_pages_to_upload]}")

# Upload missing PDF pages
if pdf_pages_to_upload:
    print(f"\nUploading {len(pdf_pages_to_upload)} PDF pages...")

    # Derive the source document name from AAIAIS_PDF so the metadata cannot
    # drift from the file that was actually split.
    original_document = os.path.basename(AAIAIS_PDF)

    documents_to_upload = []
    for page_data in pdf_pages_to_upload:
        # Encode PDF data as base64
        pdf_base64 = base64.b64encode(page_data['data']).decode()

        # Create document data: the page as a base64 data URL, tagged with the
        # batch tag and annotated with its position in the source document
        documents_to_upload.append({
            "name": page_data['filename'],
            "content": f"data:application/pdf;base64,{pdf_base64}",
            "tag_ids": [BMDS_AAI_AIS_BATCH_TAG_ID],
            "metadata": {
                "source": "aaiais_pdf_split",
                "original_document": original_document,
                "page_number": page_data['page_number'],
                "total_pages": len(pages_data)
            }
        })
        print(f"  Prepared {page_data['filename']} for upload with tag {BMDS_AAI_AIS_BATCH_TAG_ID}")

    # Upload all documents at once
    upload_result = client.documents.upload(BMDS_ORG_ID, documents_to_upload)
    print("\nUpload completed!")
    print(f"Uploaded {len(upload_result['documents'])} documents:")
    for doc in upload_result['documents']:
        print(f"  - {doc['document_name']} (ID: {doc['document_id']})")
else:
    print("\nAll PDF pages are already in the workspace - no upload needed!")


In [None]:
# List documents in the BMDS workspace
print("Listing documents in BMDS workspace...")
# NOTE(review): total_count is printed below, but only the documents returned
# by this single list() call are iterated — confirm whether results are paginated.
documents = client.documents.list(BMDS_ORG_ID)

print(f"Found {documents.total_count} documents")
print("\nDocument details:")
for i, doc in enumerate(documents.documents, 1):
    print(f"{i}. ID: {doc.id}")
    print(f"   Name: {doc.document_name}")
    print(f"   Created: {doc.created_at}")
    print(f"   Size: {doc.size_bytes} bytes")
    if doc.metadata:
        print(f"   Metadata: {doc.metadata}")
    print()

# Show summary of AAIAIS pages
aaiais_pages = [doc for doc in documents.documents if doc.document_name.startswith("AAIAIS_page_")]
if aaiais_pages:
    print("\nAAIAIS PDF Pages Summary:")
    print(f"Total AAIAIS pages in workspace: {len(aaiais_pages)}")
    # The page number is the text between the last "_" and the first "."
    # that follows it, e.g. "AAIAIS_page_007.pdf" -> 7.
    print("Page numbers:", sorted(int(doc.document_name.split('_')[-1].split('.')[0]) for doc in aaiais_pages))
else:
    print("\nNo AAIAIS pages found in workspace.")
