In [None]:
import os
import sys
import pathlib
import base64
import pandas as pd
from io import BytesIO

# Try to import a PDF library: PyPDF2 is preferred, pypdf is the fallback.
# PDF_LIBRARY records which package was found (None when neither is installed).
try:
    import PyPDF2
    PDF_LIBRARY = "PyPDF2"
except ImportError:
    try:
        import pypdf
        # Also expose pypdf under the name PyPDF2 so that code referring to
        # PyPDF2.PdfReader / PyPDF2.PdfWriter keeps working when only pypdf
        # is installed (without this the fallback ends in a NameError).
        PyPDF2 = pypdf
        PDF_LIBRARY = "pypdf"
    except ImportError:
        PDF_LIBRARY = None

In [None]:
# Add the packages directory to the Python path
# In Jupyter notebooks, __file__ is not available, so we use os.getcwd() instead.
# TOP_DIR is the directory four levels above the kernel's current working directory.
TOP_DIR = pathlib.Path(os.getcwd()).parent.parent.parent.parent
PACKAGES_DIR = TOP_DIR / "packages" / "sigagent_sdk" / "src"
# Guard the append so that re-running this cell does not pile up duplicate entries.
if str(PACKAGES_DIR) not in sys.path:
    sys.path.append(str(PACKAGES_DIR))

In [None]:
def _require_env(name):
    """Return the value of the environment variable `name`.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's (non-empty) string value.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    # .get() instead of os.environ[name]: indexing raises KeyError for an unset
    # variable, which would bypass the descriptive ValueError below.
    value = os.environ.get(name)
    if not value:
        raise ValueError(f"{name} is not set")
    return value


# Base URL and workspace token used to build the API client
BMDS_BASE_URL = _require_env("BMDS_BASE_URL")
BMDS_WORKSPACE_TOKEN = _require_env("BMDS_WORKSPACE_TOKEN")

# Get organization ID from environment
BMDS_ORG_ID = _require_env("BMDS_ORG_ID")

# Tag IDs and prompt revision ID for the AAI AIS batch
BMDS_AAI_AIS_BATCH_TAG_ID = _require_env("BMDS_AAI_AIS_BATCH_TAG_ID")
BMDS_AAI_AIS_BATCH_PAGE_TAG_ID = _require_env("BMDS_AAI_AIS_BATCH_PAGE_TAG_ID")
BMDS_AAI_AIS_PROMPT_REVID = _require_env("BMDS_AAI_AIS_PROMPT_REVID")

In [None]:
from sigagent_sdk import SigAgentClient
from sigagent_sdk.models.document import ListDocumentsResponse

def list_all_documents(client, organization_id, tag_ids=None, name_search=None, metadata_search=None, verbose=True):
    """
    Collect every matching document by walking the paginated list endpoint.

    Pages of 100 documents (the API maximum) are requested one after another
    until a page comes back with fewer than 100 entries.

    Args:
        client: SigAgentClient instance
        organization_id: Organization ID to filter documents by
        tag_ids: List of tag IDs to filter by (optional)
        name_search: Search term for document names (optional)
        metadata_search: Metadata search parameters (optional)
        verbose: Whether to print progress information (default: True)

    Returns:
        Instance of an ad-hoc class named 'ListDocumentsResponse' exposing
        `documents` (everything collected), `total_count`, `skip` (always 0)
        and `limit` (equal to the number of documents collected).
    """
    page_limit = 100  # API maximum limit
    collected = []
    offset = 0
    more_pages = True

    while more_pages:
        if verbose:
            print(f"Fetching documents (skip={offset}, limit={page_limit})...")

        # Request the page that starts at the current offset
        page = client.documents.list(
            organization_id=organization_id,
            skip=offset,
            limit=page_limit,
            tag_ids=tag_ids,
            name_search=name_search,
            metadata_search=metadata_search
        )
        batch = page.documents
        collected.extend(batch)

        if verbose:
            print(f"  Retrieved {len(batch)} documents (total so far: {len(collected)})")

        # A short page means there is nothing left to fetch
        more_pages = len(batch) >= page_limit
        offset += page_limit

    if verbose:
        print(f"Completed! Retrieved {len(collected)} documents total")

    # Package the results in a response-like object
    attributes = {
        'documents': collected,
        'total_count': len(collected),
        'skip': 0,
        'limit': len(collected)
    }
    return type('ListDocumentsResponse', (), attributes)()

def list_documents_with_pagination(client, organization_id, tag_ids=None, name_search=None, metadata_search=None, page_size=100, max_pages=None):
    """
    Alternative utility function that yields documents page by page for memory efficiency.
    Useful when dealing with very large document sets.

    Iteration stops when `max_pages` pages have been yielded, when the API
    returns an empty page, or right after a page shorter than `page_size`.
    Progress and a final summary (with the number of pages actually yielded)
    are printed to stdout.

    Args:
        client: SigAgentClient instance
        organization_id: Organization ID to filter documents by
        tag_ids: List of tag IDs to filter by (optional)
        name_search: Search term for document names (optional)
        metadata_search: Metadata search parameters (optional)
        page_size: Number of documents per page (max 100, default: 100)
        max_pages: Maximum number of pages to fetch (optional, for testing).
            None or 0 means no limit.

    Yields:
        List of documents for each page
    """
    skip = 0
    pages_yielded = 0

    while True:
        if max_pages and pages_yielded >= max_pages:
            break

        print(f"Fetching page {pages_yielded + 1} (skip={skip}, limit={page_size})...")

        # Make the API call with current pagination parameters
        response = client.documents.list(
            organization_id=organization_id,
            skip=skip,
            limit=page_size,
            tag_ids=tag_ids,
            name_search=name_search,
            metadata_search=metadata_search
        )
        documents = response.documents

        if not documents:
            break

        yield documents
        # Count the page as soon as it has been handed out, so the summary is
        # right whichever exit path is taken (page cap, empty page, short page).
        pages_yielded += 1

        # If we got fewer documents than the page size, we've reached the end
        if len(documents) < page_size:
            break

        # Move to next page
        skip += page_size

    print(f"Completed pagination! Processed {pages_yielded} pages")


In [None]:
# SDK client built from the configured base URL and workspace token
client = SigAgentClient(base_url=BMDS_BASE_URL, api_token=BMDS_WORKSPACE_TOKEN)

In [None]:
# Source PDF to split. Set the BMDS_AAIAIS_PDF environment variable to point at a
# different file; when it is unset, the original local path below is used.
AAIAIS_PDF = os.environ.get(
    "BMDS_AAIAIS_PDF",
    "/Users/andrei/Documents/Analytiq/Customers/BMDS/data/AAIAIS20210512.pdf",
)

In [None]:
# Read the PDF file and split into pages
def _load_pdf_backend():
    """Return the first importable PDF module, trying PyPDF2 and then pypdf."""
    try:
        import PyPDF2 as backend
    except ImportError:
        try:
            import pypdf as backend
        except ImportError as exc:
            raise ImportError(
                "No PDF library available: install 'PyPDF2' or 'pypdf'"
            ) from exc
    return backend


def split_pdf_into_pages(pdf_path, output_dir):
    """Split a PDF file into individual pages and return list of page data

    Each page is written to `output_dir` as its own single-page PDF named
    AAIAIS_page_NNN.pdf (NNN is the 1-based page number, zero-padded to three
    digits).

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory for the page files (str or pathlib.Path);
            created if it does not exist.

    Returns:
        List of dicts, one per page in page order, with the keys 'filename',
        'path' (pathlib.Path of the written file), 'data' (the page PDF as
        bytes) and 'page_number' (1-based).

    Raises:
        FileNotFoundError: If `pdf_path` does not exist.
        PermissionError: If `pdf_path` is not readable.
        ImportError: If neither PyPDF2 nor pypdf can be imported.
        Exception: Any error raised while reading or writing is printed and
            re-raised unchanged.
    """
    # Check if file exists and is readable
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    if not os.access(pdf_path, os.R_OK):
        raise PermissionError(f"Permission denied: Cannot read {pdf_path}")

    # Resolve the PDF library here so the pypdf fallback is actually usable
    pdf_backend = _load_pdf_backend()

    # Accept plain strings as well as Path objects, and make sure the target exists
    output_dir = pathlib.Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Reading PDF file: {pdf_path}")
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    pages_data = []

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = pdf_backend.PdfReader(file)
            num_pages = len(pdf_reader.pages)

            print(f"Found {num_pages} pages in PDF")

            for page_num in range(num_pages):
                # Create a new PDF writer for this page
                pdf_writer = pdf_backend.PdfWriter()
                pdf_writer.add_page(pdf_reader.pages[page_num])

                # Serialize once in memory; the same bytes go to disk and into
                # the result, so the file does not have to be read back.
                buffer = BytesIO()
                pdf_writer.write(buffer)
                page_data = buffer.getvalue()

                # Create page filename
                page_filename = f"AAIAIS_page_{page_num + 1:03d}.pdf"
                page_path = output_dir / page_filename

                # Write the page to a file
                page_path.write_bytes(page_data)

                pages_data.append({
                    'filename': page_filename,
                    'path': page_path,
                    'data': page_data,
                    'page_number': page_num + 1
                })

                print(f"  Created page {page_num + 1}: {page_filename}")

    except Exception as e:
        print(f"Error reading PDF file: {e}")
        raise

    return pages_data

In [None]:
# Define output directory: an "output" folder next to the source PDF.
# Deriving it from AAIAIS_PDF avoids repeating the absolute path in a second place.
output_dir = pathlib.Path(AAIAIS_PDF).parent / "output"
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Reading PDF file: {AAIAIS_PDF}")
print(f"Output directory: {output_dir}")

# Split the PDF into pages
pages_data = split_pdf_into_pages(AAIAIS_PDF, output_dir)

print(f"\nCompleted! Created {len(pages_data)} PDF pages in {output_dir}")
print(f"Created page files: {[page['filename'] for page in pages_data]}")


In [None]:
# Upload PDF pages to workspace (only missing ones)

print("Checking which PDF pages are already in the workspace...")

# Names of the documents already stored under the page tag.
# list_all_documents pages through the API (limit is 100 per request).
existing_documents = list_all_documents(client, BMDS_ORG_ID, tag_ids=[BMDS_AAI_AIS_BATCH_PAGE_TAG_ID])
existing_filenames = set(entry.document_name for entry in existing_documents.documents)

print(f"Found {len(existing_filenames)} existing documents in workspace")
print(f"Existing filenames: {sorted(existing_filenames)}")

# Pages whose filename is not yet present in the workspace
pdf_pages_to_upload = [
    candidate for candidate in pages_data
    if candidate['filename'] not in existing_filenames
]

# Report the pages that are skipped because they already exist
for page_data in pages_data:
    if page_data['filename'] in existing_filenames:
        print(f"  ✓ {page_data['filename']} already exists in workspace")

print(f"\nPDF pages to upload: {[page['filename'] for page in pdf_pages_to_upload]}")

In [None]:
# Number of pages still to be uploaded (displayed as the cell's output)
len(pdf_pages_to_upload)

In [None]:
# Upload missing PDF pages one by one
if not pdf_pages_to_upload:
    print("\nAll PDF pages are already in the workspace - no upload needed!")
else:
    upload_total = len(pdf_pages_to_upload)
    print(f"\nUploading {upload_total} PDF pages individually...")

    uploaded_documents = []
    failed_uploads = []

    for i, page_data in enumerate(pdf_pages_to_upload, 1):
        page_name = page_data['filename']
        try:
            print(f"\n[{i}/{upload_total}] Uploading {page_name}...")

            # Send the PDF bytes as a base64 data URL
            encoded_pdf = base64.b64encode(page_data['data']).decode()

            # Metadata recording where the page came from
            page_metadata = {
                "source": "aaiais_pdf_split",
                "original_document": os.path.basename(AAIAIS_PDF),
                "page_number": str(page_data['page_number']),
                "total_pages": str(len(pages_data))
            }
            document_data = {
                "name": page_name,
                "content": f"data:application/pdf;base64,{encoded_pdf}",
                "tag_ids": [BMDS_AAI_AIS_BATCH_PAGE_TAG_ID],
                "metadata": page_metadata
            }

            # One document per request
            upload_result = client.documents.upload(BMDS_ORG_ID, [document_data])

            # A missing or empty 'documents' entry counts as a failed upload
            if upload_result and 'documents' in upload_result:
                returned_docs = upload_result['documents']
            else:
                returned_docs = []

            if len(returned_docs) > 0:
                doc = returned_docs[0]
                uploaded_documents.append(doc)
                print(f"  ✓ Successfully uploaded {doc['document_name']} (ID: {doc['document_id']})")
            else:
                failed_uploads.append({
                    'filename': page_name,
                    'error': 'No document returned from upload'
                })
                print(f"  ✗ Failed to upload {page_name}: No document returned")

        except Exception as e:
            # Record the failure and carry on with the remaining pages
            error_text = str(e)
            failed_uploads.append({
                'filename': page_name,
                'error': error_text
            })
            print(f"  ✗ Failed to upload {page_name}: {error_text}")

    # Print summary
    separator = "=" * 50
    print("\n" + separator)
    print("UPLOAD SUMMARY")
    print(separator)
    print(f"Total pages to upload: {upload_total}")
    print(f"Successfully uploaded: {len(uploaded_documents)}")
    print(f"Failed uploads: {len(failed_uploads)}")

    if uploaded_documents:
        print("\nSuccessfully uploaded documents:")
        for doc in uploaded_documents:
            print(f"  - {doc['document_name']} (ID: {doc['document_id']})")

    if failed_uploads:
        print("\nFailed uploads:")
        for failed in failed_uploads:
            print(f"  - {failed['filename']}: {failed['error']}")


In [None]:
# List documents in the BMDS workspace (filtered by the same tag used for uploads)
print("Listing documents in BMDS workspace...")
# list_all_documents pages through the API (limit is 100 per request)
documents = list_all_documents(client, BMDS_ORG_ID, tag_ids=[BMDS_AAI_AIS_BATCH_PAGE_TAG_ID])

print(f"Found {documents.total_count} documents")
print("\nDocument details:")

# (label, attribute) pairs printed for every document, in this order
detail_fields = (
    ("Name", "document_name"),
    ("Uploaded", "upload_date"),
    ("Uploaded by", "uploaded_by"),
    ("State", "state"),
    ("Tag IDs", "tag_ids"),
)

for i, doc in enumerate(documents.documents, 1):
    print(f"{i}. ID: {doc.id}")
    for label, attribute in detail_fields:
        print(f"   {label}: {getattr(doc, attribute)}")
    # Metadata is only shown when it is non-empty
    if doc.metadata:
        print(f"   Metadata: {doc.metadata}")
    print()


In [None]:
# Poll until every AAIAIS page document (matched by tag and original_document metadata)
# reports the 'llm_completed' state, or until the retry budget is used up
import time

TARGET_TAG_ID = BMDS_AAI_AIS_BATCH_PAGE_TAG_ID
ORIGINAL_DOC = os.path.basename(AAIAIS_PDF)
max_retries = 600  # up to ~50 minutes at 5s intervals
sleep_seconds = 5


def _is_llm_completed(document):
    """Return True when the document's state (case-insensitive) is 'llm_completed'."""
    return (document.state or "").lower() == "llm_completed"


print("Waiting for all AAIAIS page documents to reach 'llm_completed'...")

for attempt in range(1, max_retries + 1):
    docs_resp = list_all_documents(
        client,
        BMDS_ORG_ID,
        tag_ids=[TARGET_TAG_ID],
        metadata_search={"original_document": ORIGINAL_DOC},
        verbose=False,
    )
    total = len(docs_resp.documents)
    completed = len([d for d in docs_resp.documents if _is_llm_completed(d)])

    print(f"Attempt {attempt}: {completed}/{total} completed (original_document={ORIGINAL_DOC})")

    # An empty listing never counts as done
    if total > 0 and completed == total:
        print("All documents are in 'llm_completed' state.")
        break

    time.sleep(sleep_seconds)
else:
    # Reached only when the loop ran out of attempts without a break: timed out
    pending = [d for d in docs_resp.documents if not _is_llm_completed(d)]
    print(f"Timed out waiting for completion. Pending: {len(pending)} documents")
    # Show at most the first ten stragglers
    for d in pending[:10]:
        print(f" - {d.document_name} (ID: {d.id}) state={d.state}")


In [None]:
# res = client.prompts.list(BMDS_ORG_ID)
# for p in res.prompts:
#     print(f"name={p.name}")
#     print(f"prompt_revid={p.prompt_revid}")
#     print(f"prompt_version={p.prompt_version}")
#     print(f"prompt_id={p.prompt_id}")
#     print("")


In [None]:
# Fetch and display LLM results for each AAIAIS page document (filtered by original_document)
from sigagent_sdk.models.llm import LLMResult

ORIGINAL_DOC = os.path.basename(AAIAIS_PDF)

# Re-list the page documents using the page tag plus the original_document metadata filter
page_filters = {
    "tag_ids": [BMDS_AAI_AIS_BATCH_PAGE_TAG_ID],
    "metadata_search": {"original_document": ORIGINAL_DOC},
    "verbose": False,
}
resp = list_all_documents(client, BMDS_ORG_ID, **page_filters)

def page_num_from_name(name: str):
    """Extract the page number encoded at the end of a document name.

    Takes the text after the last underscore, cuts it at the first dot and
    converts what remains to int (e.g. "AAIAIS_page_007.pdf" -> 7).
    Returns 0 whenever that fails for any reason.
    """
    try:
        tail = name.rsplit('_', 1)[-1]
        number_text, _, _ = tail.partition('.')
        return int(number_text)
    except Exception:
        return 0

# Order the documents by the page number embedded in their names (ascending)
sorted_docs = list(resp.documents)
sorted_docs.sort(key=lambda item: page_num_from_name(item.document_name))

# Collect one entry per document whose LLM result could be fetched
results = []
for position, document in enumerate(sorted_docs, 1):
    try:
        llm_res: LLMResult = client.llm.get_result(
            BMDS_ORG_ID,
            document.id,
            prompt_revid=BMDS_AAI_AIS_PROMPT_REVID,
            fallback=True,
        )
        record = {
            "document_id": document.id,
            "document_name": document.document_name,
            "page_number": page_num_from_name(document.document_name),
            "llm_result": llm_res.llm_result
        }
        results.append(record)
        print(f"{position:03d}. {document.document_name} → OK")
    except Exception as e:
        # Failures are reported and skipped; they do not appear in `results`
        print(f"{position:03d}. {document.document_name} → ERROR: {e}")

# Optional: display first few results
print("\nSample results (first 3):")
for entry in results[:3]:
    payload = entry['llm_result']
    # Show the keys for dict results, otherwise the type of the result
    if isinstance(payload, dict):
        summary = list(payload.keys())
    else:
        summary = type(payload)
    print(f"- {entry['document_name']}: keys={summary}")


In [None]:
# Display the collected per-page LLM results as the cell's output
results

In [None]:
# Split the PDF into pages
# Upload each page with the classification tag
# Wait for LLM to complete
# Get classification dictionary of all pages
# Say you want to extract schedule. Tag each page that is a schedule with the schedule tag
# Run the LLM prompt for that tag on all those pages. Wait a different way for LLM completion. Get result. Reassemble all pages.
# From the reassembled schedule, get the list of patients. 
# For each patient, run the page-specific prompt for the pages associated to the patient.