# Single PDF Parsing with Llama 4 Maverick and ThreadPool

This notebook processes a single PDF stored in a Unity Catalog volume and transcribes its pages using Llama 4 Maverick. It calls the model serving endpoint from a thread pool and automatically adjusts the number of workers.

For examples, speed, and cost estimates, see Notebook #2, which covers multi-PDF parsing.

**Author: Erica Yuen**

**Date: 07/03/2025**


# Installations, Imports, and preliminary setup

In [0]:
%pip install --quiet databricks-sdk httpx PyMuPDF openai
# Restart the Python process so the freshly installed packages are picked up by the cells below.
dbutils.library.restartPython()

In [0]:
import base64
import fitz 
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from pyspark.sql.functions import col, concat, lit, regexp_replace, split



# Set the UC paths

### TODO: update the paths below

In [0]:
# Source PDF inside a Unity Catalog volume.
pdf_path = "/Volumes/erica/parsing/pdfs/Form Example.pdf"

# Target tables: the page images are written to the intermediate table, and the
# images together with their transcriptions are written to the final table.
intermediate_table_path = "erica.parsing.form_parsed_intermediate"
final_table_path = "erica.parsing.form_parsed"

# API token read from the current notebook context; it is passed as the API key when calling the endpoint.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()


# Base URL of the serving endpoints. The host is workspace-specific: update it for your workspace.
DATABRICKS_BASE_URL = 'https://e2-demo-field-eng.cloud.databricks.com/serving-endpoints/'


In [0]:
from dbruntime.databricks_repl_context import get_context

# Show the current workspace ID as the cell output; the value is not stored in a variable.
get_context().workspaceId

# Parsing the documents and figure captions

## Convert PDF to Base64 Image

This takes about **2 min for 440 pages**

Update `save_to_unity_catalog` if you don't want overwrites:

        .mode("overwrite") \
        .option("overwriteSchema", "true") \

In [0]:
import base64
import fitz 
import pandas as pd

def convert_pdf_to_base64(pdf_path, dpi=300):
    """
    Render every page of a PDF to a PNG image and base64-encode it.

    Pages are rendered one after another on the calling thread.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Rendering resolution in dots per inch.

    Returns:
        pandas DataFrame with one row per page and the columns
        page_num (1-based), base64_img (base64 text of the PNG bytes) and
        doc_id (the value of pdf_path). A PDF without pages yields an empty
        frame that still carries these three columns.
    """
    # Scale factor relative to the 72 DPI base resolution used for rendering.
    zoom = dpi / 72
    zoom_matrix = fitz.Matrix(zoom, zoom)

    df_data = []

    # The context manager closes the document even when rendering a page raises,
    # so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)

        print(f"Processing {num_pages} pages at {dpi} DPI...")

        for page_num in range(num_pages):
            if page_num % 25 == 0:  # Progress update every 25 pages
                print(f"Processing page {page_num + 1}/{num_pages}")

            page = doc.load_page(page_num)

            # alpha=False renders without a transparency channel.
            pix = page.get_pixmap(matrix=zoom_matrix, alpha=False)
            img_bytes = pix.tobytes("png")
            img_base64 = base64.b64encode(img_bytes).decode('utf-8')

            df_data.append({
                'page_num': page_num + 1,
                'base64_img': img_base64,
                'doc_id': pdf_path
            })

    print(f"Conversion complete. Generated {len(df_data)} base64 images.")

    # Explicit columns keep the schema stable when the PDF has no pages.
    return pd.DataFrame(df_data, columns=['page_num', 'base64_img', 'doc_id'])

def save_to_unity_catalog(df, table_path = intermediate_table_path, mode = "overwrite"):
    """
    Save a pandas DataFrame to a Delta table in Unity Catalog.

    Args:
        df: pandas DataFrame to persist.
        table_path: Fully qualified name of the target table.
        mode: Spark save mode. The default "overwrite" replaces the table and
            its schema, exactly as before; pass e.g. "append" to keep the
            existing rows.

    Returns:
        The Spark DataFrame that was written.
    """
    spark_df = spark.createDataFrame(df)

    writer = spark_df.write.format("delta").mode(mode)
    if mode == "overwrite":
        # Replacing the schema is only requested when the table itself is replaced.
        writer = writer.option("overwriteSchema", "true")
    writer.saveAsTable(table_path)

    print(f"Saved to Unity Catalog: {table_path}")
    return spark_df



In [0]:

# Render every page of the configured PDF into a DataFrame of base64 page images.
df = convert_pdf_to_base64(pdf_path)


In [0]:
# Preview the first rows and print the number of converted pages.
display(df.head())
print(len(df))

# Persist the page images to the intermediate table.
save_to_unity_catalog(df, table_path = intermediate_table_path)

# View images - Update PAGE_NUM to browse

In [0]:
import base64
from IPython.display import Image as IPImage


# 1-based number of the page to display; it is converted to a 0-based position when the row is looked up.
PAGE_NUM = 1

def show_image(base64_str):
    """Decode a base64-encoded image and wrap it in an IPython image object for display."""
    raw_bytes = base64.b64decode(base64_str)
    return IPImage(data=raw_bytes)

# PAGE_NUM is 1-based, so subtract one for positional row lookup.
show_image(df.iloc[PAGE_NUM - 1]['base64_img'])

In [0]:
# Instruction text that is sent to the model together with every page image.
# NOTE(review): "follow" looks like a typo for "following" — confirm before editing, because this text reaches the model verbatim.
prompt = """
Transcribe the follow form into markdown. 
Please bold all keys in key value pairs, and output sections with section headers. 
    """

In [0]:
import pandas as pd
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from openai import OpenAI
from tqdm import tqdm
import threading
from collections import deque
from datetime import datetime, timedelta

# Lower-case substrings that mark an API error as retryable; they are matched against the lower-cased exception text.
# NOTE(review): the bare "rate" entry also matches unrelated words that contain it (e.g. "generate") — confirm this breadth is intended.
RETRYABLE_ERROR_SUBSTRINGS = ["retry", "got empty embedding result", "request_limit_exceeded", "rate limit", "insufficient_quota", "expecting value", "rate", "overloaded", "429", "bad gateway", "502"]

class RateLimitTracker:
    """Track API rate limits and adjust concurrency dynamically.

    The tracker is shared between worker threads; every update of its counters
    happens while holding ``self.lock``.

    Attributes:
        current_workers: Worker count currently recommended to the caller.
        min_workers / max_workers: Bounds that current_workers never leaves.
        rate_limit_events: Timestamps of the most recent rate limit events
            (bounded to the last 20).
        total_rate_limits: Number of rate limit events recorded overall; unlike
            rate_limit_events it is not capped.
        success_count: Number of successful calls recorded overall.
    """

    def __init__(self, initial_workers=5, min_workers=1, max_workers=10):
        self.min_workers = min_workers
        self.max_workers = max_workers
        # Clamp the starting value so it always respects the configured bounds.
        self.current_workers = max(min_workers, min(initial_workers, max_workers))
        self.rate_limit_events = deque(maxlen=20)  # Track recent rate limits
        self.total_rate_limits = 0
        self.success_count = 0
        self.lock = threading.Lock()

    def _count_recent(self, now, window):
        """Count stored rate limit events that are younger than ``window`` at time ``now``."""
        return sum(1 for event in self.rate_limit_events if now - event < window)

    def record_rate_limit(self):
        """Record a rate limit event and potentially reduce workers."""
        with self.lock:
            # Take the timestamp once so the new event and the window check agree.
            now = datetime.now()
            self.rate_limit_events.append(now)
            self.total_rate_limits += 1

            # If we've had multiple rate limits recently, reduce workers
            recent_limits = self._count_recent(now, timedelta(minutes=2))

            if recent_limits >= 3 and self.current_workers > self.min_workers:
                old_workers = self.current_workers
                self.current_workers = max(self.min_workers, self.current_workers - 1)
                print(f"🔽 Rate limits detected! Reducing workers: {old_workers} → {self.current_workers}")

    def record_success(self):
        """Record successful processing and potentially increase workers."""
        with self.lock:
            self.success_count += 1

            # If no recent rate limits and we've had some successes, gradually increase workers
            recent_limits = self._count_recent(datetime.now(), timedelta(minutes=5))

            # Increase workers every 20 successes if no recent rate limits
            if (recent_limits == 0 and
                self.current_workers < self.max_workers and
                self.success_count % 20 == 0):
                old_workers = self.current_workers
                self.current_workers = min(self.max_workers, self.current_workers + 1)
                print(f"🔼 Performance good! Increasing workers: {old_workers} → {self.current_workers}")

# Leading base64 characters produced by common image file signatures.
_BASE64_IMAGE_SIGNATURES = (
    ("iVBORw0KGgo", "image/png"),
    ("/9j/", "image/jpeg"),
    ("R0lGOD", "image/gif"),
)


def _guess_image_mime_type(image_data, default="image/jpeg"):
    """Return the MIME type implied by the first base64 characters, or ``default`` if unknown."""
    if isinstance(image_data, str):
        for prefix, mime_type in _BASE64_IMAGE_SIGNATURES:
            if image_data.startswith(prefix):
                return mime_type
    return default


def process_single_image(prompt, image_data, image_index, databricks_token, databricks_url, model, rate_tracker):
    """Process a single image with adaptive rate limiting.

    Args:
        prompt: Instruction text sent alongside the image.
        image_data: Base64-encoded image string.
        image_index: Identifier of the image; returned unchanged so the caller
            can match the result to its input.
        databricks_token: API key for the endpoint.
        databricks_url: Base URL of the endpoint.
        model: Name of the model to call.
        rate_tracker: Object exposing record_success() and record_rate_limit().

    Returns:
        Tuple (image_index, text). The text is the model output on success and
        a string starting with "ERROR:" on failure; errors are reported this
        way instead of being raised.
    """

    # Skip empty images before any client is created.
    if pd.isna(image_data) or image_data == "":
        return (image_index, "ERROR: Empty image")

    client = OpenAI(api_key=databricks_token, base_url=databricks_url)

    # Label the payload with its actual format (falls back to image/jpeg when the
    # format is not recognised) instead of always claiming JPEG.
    image_url = f"data:{_guess_image_mime_type(image_data)};base64,{image_data}"

    # Retry logic with exponential backoff
    for attempt in range(3):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": image_url}
                        }
                    ]
                }]
            )

            result = response.choices[0].message.content.strip()
            rate_tracker.record_success()

            # Print success message if this was a retry attempt
            if attempt > 0:
                print(f"✅ SUCCESS: Image {image_index} processed successfully after {attempt + 1} attempts")

            return (image_index, result)

        except Exception as e:
            # Retryability is decided by substring match on the lower-cased message.
            error_str = str(e).lower()
            is_retryable = any(substring in error_str for substring in RETRYABLE_ERROR_SUBSTRINGS)

            if is_retryable:
                rate_tracker.record_rate_limit()

                if attempt < 2:  # Only retry if we have attempts left
                    # Exponential backoff with jitter
                    wait_time = (2 ** attempt) + random.uniform(1, 3)
                    print(f"⚠️  RATE LIMIT: Image {image_index}, attempt {attempt + 1}/3. Retrying in {wait_time:.1f}s...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"❌ FAILED: Image {image_index} failed after 3 attempts due to rate limiting")
                    return (image_index, f"ERROR: Rate limited after 3 attempts - {str(e)}")
            else:
                print(f"❌ ERROR: Image {image_index} failed with non-retryable error: {str(e)}")
                return (image_index, f"ERROR: {str(e)}")

    # Defensive fallback; every loop iteration above either returns or retries.
    return (image_index, "ERROR: Max retries exceeded")

def process_images_adaptive(prompt, images, databricks_token, databricks_url, 
                           model="databricks-llama-4-maverick", 
                           initial_workers=5, min_workers=1, max_workers=10):
    """
    Adaptive processing that adjusts concurrency based on rate limits.

    Images are submitted in batches whose size follows the tracker's current
    worker count; each batch is fully collected before the next one starts.

    Args:
        prompt: Instruction text sent with every image
        images: pandas Series of base64 encoded image strings
        databricks_token: Token for Databricks API  
        databricks_url: Base URL for Databricks API
        model: Model name to use
        initial_workers: Starting number of concurrent workers
        min_workers: Minimum workers (fallback during heavy rate limiting)
        max_workers: Maximum workers (cap for scaling up)

    Returns:
        pandas Series: Results with same index as input. Failed items hold a
        string starting with "ERROR:". An empty input returns an empty Series.
    """

    # Convert to pandas Series if needed
    if not isinstance(images, pd.Series):
        images = pd.Series(images)

    results = pd.Series(index=images.index, dtype='object')
    rate_tracker = RateLimitTracker(
        initial_workers=initial_workers, 
        min_workers=min_workers, 
        max_workers=max_workers
    )

    print(f"🚀 Starting adaptive processing of {len(images)} images...")
    print(f"📊 Model: {model}")
    print(f"⚙️  Workers: {initial_workers} (range: {min_workers}-{max_workers})")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        with tqdm(total=len(images), desc="Processing images", unit="img") as pbar:

            remaining_items = list(images.items())

            while remaining_items:
                # Submit batch based on current worker count
                batch_size = min(rate_tracker.current_workers, len(remaining_items))
                current_batch = remaining_items[:batch_size]
                remaining_items = remaining_items[batch_size:]

                # Submit current batch; the mapping lets a failed future be traced back to its image.
                futures = {
                    executor.submit(process_single_image, prompt, img_data, idx, 
                                  databricks_token, databricks_url, model, rate_tracker): idx
                    for idx, img_data in current_batch
                }

                # Process batch results
                for future in as_completed(futures):
                    try:
                        image_index, result = future.result()
                        results[image_index] = result

                        # Update progress bar with status and current worker count
                        status_icon = "❌" if result.startswith("ERROR:") else "✅"
                        pbar.set_postfix({
                            "Last": f"{status_icon} {image_index}", 
                            "Workers": rate_tracker.current_workers,
                            "Rate Limits": len(rate_tracker.rate_limit_events)
                        })

                    except Exception as e:
                        idx = futures[future]
                        results[idx] = f"ERROR: Exception - {str(e)}"
                        pbar.set_postfix({
                            "Last": f"❌ {idx} (Exception)", 
                            "Workers": rate_tracker.current_workers
                        })
                        print(f"❌ EXCEPTION: Image {idx} failed with exception: {str(e)}")

                    pbar.update(1)

                # Small delay between batches if we have more to process
                if remaining_items:
                    time.sleep(0.2)  # Small delay to prevent overwhelming

    # Final summary statistics
    total_count = len(results)
    error_count = sum(1 for result in results if str(result).startswith("ERROR:"))
    success_count = total_count - error_count
    # An empty input would otherwise divide by zero when computing the percentage.
    success_rate = (success_count / total_count * 100) if total_count else 0.0

    print(f"\n📈 ADAPTIVE PROCESSING SUMMARY:")
    print(f"   ✅ Successful: {success_count}/{total_count}")
    print(f"   ❌ Failed: {error_count}/{total_count}")
    print(f"   📊 Success rate: {success_rate:.1f}%")
    print(f"   🔧 Final worker count: {rate_tracker.current_workers}")
    print(f"   ⚠️  Total rate limit events: {len(rate_tracker.rate_limit_events)}")

    return results




# 🚀 Execute PDF Parsing

Update the initial workers if you are using an endpoint other than the default `model="databricks-llama-4-maverick"`

In [0]:
# Transcribe every page image; the returned Series carries the same index as df['base64_img'].
results_series = process_images_adaptive(
    prompt = prompt,
    images=df['base64_img'],
    databricks_token=DATABRICKS_TOKEN,
    databricks_url=DATABRICKS_BASE_URL,
    model="databricks-llama-4-maverick",
    initial_workers=5,  # Start with 5 workers for Pay-Per-Token, change to 30 if Provisioned Throughput with 200 model units
    min_workers=1,      # Fall back to 1 if heavy rate limiting, change to 10 if Provisioned Throughput with 200 model units
    max_workers=10      # Scale up to 10 if performance is good, change to 40 if Provisioned Throughput with 200 model units
)

In [0]:
# Attach the transcriptions to the page DataFrame; the result Series shares df's index, so rows line up.
# Note that this adds a column to df in place.
df['transcription'] = results_series
display(df)

In [0]:
# Write the pages together with their transcriptions to the final table.
save_to_unity_catalog(df, table_path = final_table_path)