In [1]:
# Cell 1: Core imports and path setup
import os
import sys
import json
import logging
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Tuple

# Add src to path for imports
sys.path.insert(0, os.path.abspath('.'))

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

print("✅ Basic imports successful")
print(f"Python version: {sys.version}")
print(f"Current directory: {os.getcwd()}")

✅ Basic imports successful
Python version: 3.12.11 (main, Aug  8 2025, 17:06:48) [Clang 20.1.4 ]
Current directory: /home/jroberts/fon_proposal_writer


In [2]:
# Cell 2: Load and verify environment variables
from dotenv import load_dotenv

# Load .env file
load_dotenv()

# Check critical environment variables
env_vars = {
    "AZURE_API_KEY": os.getenv("AZURE_API_KEY"),
    "AZURE_API_BASE": os.getenv("AZURE_API_BASE"),
    "AZURE_OPENAI_DEPLOYMENT": os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    "LANGFUSE_PUBLIC_KEY": os.getenv("LANGFUSE_PUBLIC_KEY"),
    "LANGFUSE_SECRET_KEY": os.getenv("LANGFUSE_SECRET_KEY"),
    "AZURE_STORAGE_CONNECTION_STRING": os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
}

# Display status (hide actual values)
for key, value in env_vars.items():
    status = "✅ Set" if value else "❌ Missing"
    print(f"{key}: {status}")
    
missing = [k for k, v in env_vars.items() if not v]
if missing:
    print(f"\n⚠️ Missing environment variables: {', '.join(missing)}")
else:
    print("\n✅ All required environment variables are set")

AZURE_API_KEY: ✅ Set
AZURE_API_BASE: ✅ Set
AZURE_OPENAI_DEPLOYMENT: ✅ Set
LANGFUSE_PUBLIC_KEY: ✅ Set
LANGFUSE_SECRET_KEY: ✅ Set
AZURE_STORAGE_CONNECTION_STRING: ✅ Set

✅ All required environment variables are set


In [3]:
# Cell 3: Test configuration module
from src.config import settings

print("Configuration loaded:")
print(f"  Azure Deployment: {settings.azure_openai_deployment}")
print(f"  Langfuse Host: {settings.langfuse_host}")
print(f"  Blob Container: {settings.azure_blob_container}")
print(f"  Debug Mode: {settings.debug}")

# Verify Azure OpenAI endpoint construction
endpoint = f"{settings.azure_api_base}/openai/v1/"
print(f"\n  Constructed endpoint: {endpoint}")

Configuration loaded:
  Azure Deployment: gpt-4.1
  Langfuse Host: https://us.cloud.langfuse.com
  Blob Container: rfp-poc
  Debug Mode: False

  Constructed endpoint: https://proposal-openai-model.openai.azure.com//openai/v1/


In [None]:
import os
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def download_zip_file():
    """Download DHS_CBP.zip from Azure Blob Storage"""
    
    # Initialize the BlobServiceClient using connection string
    blob_service_client = BlobServiceClient.from_connection_string(
        os.getenv("AZURE_STORAGE_CONNECTION_STRING")
    )
    
    # You need to know the container name - common names are:
    # "data", "files", "uploads", "rfp-poc", etc.
    container_name = "rfp-poc"  # Replace with your actual container name
    blob_name = "DHS_CBP.zip"
    local_file_path = "./DHS_CBP.zip"  # Where to save locally
    
    try:
        # Get blob client
        blob_client = blob_service_client.get_blob_client(
            container=container_name, 
            blob=blob_name
        )
        
        # Download the blob
        print(f"Downloading {blob_name}...")
        with open(local_file_path, "wb") as download_file:
            download_data = blob_client.download_blob()
            download_file.write(download_data.readall())
        
        print(f"✅ Successfully downloaded to {local_file_path}")
        
    except Exception as e:
        print(f"❌ Error downloading file: {e}")
        
        # If container name is wrong, list available containers
        if "ContainerNotFound" in str(e):
            print("\nAvailable containers:")
            for container in blob_service_client.list_containers():
                print(f"  - {container.name}")

# Run the download
download_zip_file()

In [None]:
import zipfile

# Unzip the file to current directory
with zipfile.ZipFile('./data/inputs/DHS_CBP.zip', 'r') as zip_ref:
    zip_ref.extractall('./data/inputs/')  # '.' means current directory

print("✅ Unzipped all files to current directory")

In [13]:
# Cell 4: Test PDF loader
from src.io.loaders import pdf_to_pages

# Create a test PDF or use an existing one
test_pdf_path = "data/inputs/DHS_CBP/RFP-RFQ/06C25R0054_RFP_RABAS_HCaTS_RFP_Letter.pdf"

if Path(test_pdf_path).exists():
    pages = pdf_to_pages(test_pdf_path)
    print(f"✅ Successfully loaded PDF: {test_pdf_path}")
    print(f"  Number of pages: {len(pages)}")
    # print(f"  First page preview (first 500 chars):")
    # if pages:
    #     print(f"  {pages[0][1][:500]}...")
else:
    print(f"❌ Test PDF not found at: {test_pdf_path}")
    print("  Create a test PDF or update the path")

✅ Successfully loaded PDF: data/inputs/DHS_CBP/RFP-RFQ/06C25R0054_RFP_RABAS_HCaTS_RFP_Letter.pdf
  Number of pages: 57


In [24]:
# Cell 5: Test heading-aware chunking
from src.preprocessing.segmenter import heading_aware_chunks

# Use the pages from previous cell or create test data
if 'pages' not in locals():
    # Create dummy test data
    pages = [
        (1, "Section 1. Introduction\n\nThe contractor SHALL deliver all documentation in PDF format. Font size MUST be 12pt Times New Roman.\n\n"),
        (2, "Section 2. Requirements\n\nThe system SHOULD support 1000 concurrent users. Response time MUST NOT exceed 2 seconds.\n\n")
    ]

chunks = list(heading_aware_chunks(pages, max_chars=1000, overlap=100))

print(f"✅ Created {len(chunks)} chunks")
for i, chunk in enumerate(chunks):  # Show first 3 chunks
    print(f"\nChunk {i+1}:")
    print(f"  Pages: {chunk['start_page']}-{chunk['end_page']}")
    print(f"  Section: {chunk['section']}")
    # print(f"  Text preview: {chunk['text'][:200]}...")

✅ Created 57 chunks

Chunk 1:
  Pages: 1-2
  Section: Owned Small Business Program

Chunk 2:
  Pages: 2-3
  Section: B. INSTRUCTIONS TO QUOTERS

Chunk 3:
  Pages: 3-4
  Section: Six pages

Chunk 4:
  Pages: 4-5
  Section: Prior experience with government entities is preferred.

Chunk 5:
  Pages: 5-6
  Section: SOW.

Chunk 6:
  Pages: 6-7
  Section: Failure to comply with this requirement may result in the quote being deemed non-

Chunk 7:
  Pages: 7-8
  Section: Failure to comply with this requirement may result in the quote being deemed non-

Chunk 8:
  Pages: 8-9
  Section: C. EVALUATION FACTORS AND BASIS FOR AWARD

Chunk 9:
  Pages: 9-10
  Section: Low Confidence The Government has low confidence that the Offeror

Chunk 10:
  Pages: 10-11
  Section: See  clauses below.

Chunk 11:
  Pages: 11-12
  Section: SECTION B DELIVERIES OR PERFORMANCE

Chunk 12:
  Pages: 12-13
  Section: C.3 ELECTRONIC INVOICING AND PAYMENT REQUIREMENTS - INVOICE PROCESSING PLATFORM

Chunk 13:
  Pages: 13-14
 

In [17]:
# Cell 6: Test regex pre-pass
from src.preprocessing.regex_pass import fast_hits

# Test with sample chunks
test_chunk = {
    "text": """The contractor SHALL submit monthly reports by the 15th of each month.
    All deliverables MUST be submitted by 12/31/2024.
    System SHALL be ISO 27001 certified.
    Technical evaluation worth 60 points.""",
    "section": "Submission Requirements",
    "start_page": 1,
    "end_page": 1
}

all_matches = []
for chunk in chunks:
    matches = fast_hits(chunk)
    print(f"✅ Found {len(matches)} regex matches:")
    all_matches.extend(matches)
print(f"✅ Found {len(all_matches)} regex matches:")
for match in all_matches:
    print(f"  - Type: {match['kind']}")
    print(f"    Match: {match['match'][:100]}")
    print()

✅ Found 2 regex matches:
✅ Found 0 regex matches:
✅ Found 1 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 1 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 1 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 1 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 3 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:
✅ Found 1 regex matches:
✅ Found 1 regex matches:
✅ Found 0 regex matches:
✅ Found 0 regex matches:


In [53]:
# Cell 7: Initialize DSPy with Azure OpenAI
import dspy
from src.config import settings

# Configure DSPy
lm = dspy.LM(
    model=settings.azure_openai_deployment,
    api_key=settings.azure_api_key,
    api_base=f"{settings.azure_api_base}/openai/v1/",
    temperature=0.1,
    max_tokens=32700
)

# Set up with JSON adapter for structured output
dspy.configure(lm=lm, adapter=dspy.JSONAdapter(), track_usage=True, cache=True)
# dspy.configure_cache(
#     enable_disk_cache=False,
#     enable_memory_cache=False,
# )

print(f"✅ DSPy configured with model: azure/{settings.azure_openai_deployment}")

# Test with a simple prompt
test_lm = dspy.Predict("question -> answer")
try:
    result = test_lm(question="What is 5+3?")
    print(f"✅ DSPy test successful: {result.answer}")
except Exception as e:
    print(f"❌ DSPy test failed: {e}")
    print("API Base:", settings.azure_api_base)
    print("Deployment:", settings.azure_openai_deployment)
    print("API Key set:", bool(settings.azure_api_key))

✅ DSPy configured with model: azure/gpt-4.1
✅ DSPy test successful: 5 + 3 equals 8.


In [22]:
# Cell 8: Initialize Langfuse tracing
from src.observability.tracing import initialize_tracing, get_langfuse_client

try:
    langfuse_client = initialize_tracing()
    print("✅ Langfuse tracing initialized")
    print(f"  Host: {settings.langfuse_host}")
    
    # Create a test trace
    from langfuse import observe
    
    @observe(name="test_function")
    def test_trace():
        return "Test successful"
    
    result = test_trace()
    langfuse_client.flush()
    print(f"✅ Test trace created: {result}")
    
except Exception as e:
    print(f"⚠️ Langfuse initialization warning: {e}")
    print("  Continuing without tracing...")

2025-08-31 22:17:32,318 - src.observability.tracing - INFO - Tracing initialized with Langfuse host: https://us.cloud.langfuse.com


✅ Langfuse tracing initialized
  Host: https://us.cloud.langfuse.com
✅ Test trace created: Test successful


In [54]:
# Cell 9: Extract requirements from the last 10 chunks of your RFP
from src.extraction.modules import Extractor
import json

# Initialize extractor
extractor = Extractor()

# Take the last 10 chunks (likely containing submission requirements, terms, etc.)
last_chunks = chunks[-12:-11]
print(f"Processing last {len(last_chunks)} chunks (pages {last_chunks[0]['start_page']}-{last_chunks[-1]['end_page']})")
print(f"Sections covered: {[c['section'] for c in last_chunks]}\n")

# Extract requirements from each chunk
all_extracted = []
for i, chunk in enumerate(last_chunks):
    print(f"Processing chunk {i+1}/10 (pages {chunk['start_page']}-{chunk['end_page']}, section: {chunk['section'][:50]}...)")
    try:
        requirements = extractor(chunk)
        all_extracted.extend(requirements)
        print(f"  → Found {len(requirements)} requirements")
        for req in requirements[:2]:  # Show first 2 from each chunk
            print(f"    • {req.get('label', 'No label')[:80]}...")
    except Exception as e:
        print(f"  ⚠️ Extraction failed: {e}")

print(f"\n✅ Total extracted: {len(all_extracted)} requirements from last 10 chunks")

Processing last 1 chunks (pages 46-47)
Sections covered: ['HSAR 3052.204-72 Safeguarding of controlled unclassified information.']

Processing chunk 1/10 (pages 46-47, section: HSAR 3052.204-72 Safeguarding of controlled unclas...)
  ⚠️ Extraction failed: 'list' object has no attribute 'set_lm_usage'

✅ Total extracted: 0 requirements from last 10 chunks


In [33]:
import dspy

# See what methods the cache has
print("Cache methods and attributes:")
for attr in dir(dspy.cache):
    if not attr.startswith('_'):
        print(f"  {attr}")

# Check for common cache patterns
if hasattr(dspy.cache, 'cache'):
    print("\nFound cache.cache - type:", type(dspy.cache.cache))
    if hasattr(dspy.cache.cache, 'clear'):
        dspy.cache.cache.clear()
        print("✓ Cleared dspy.cache.cache")

if hasattr(dspy.cache, '_cache'):
    print("\nFound cache._cache - type:", type(dspy.cache._cache))
    if hasattr(dspy.cache._cache, 'clear'):
        dspy.cache._cache.clear()
        print("✓ Cleared dspy.cache._cache")

if hasattr(dspy.cache, 'data'):
    print("\nFound cache.data - type:", type(dspy.cache.data))
    if hasattr(dspy.cache.data, 'clear'):
        dspy.cache.data.clear()
        print("✓ Cleared dspy.cache.data")

# Try to see the internal structure
print("\nAll cache attributes (including private):")
for attr in dir(dspy.cache):
    try:
        value = getattr(dspy.cache, attr)
        if not callable(value) and not attr.startswith('__'):
            print(f"  {attr}: {type(value)}")
    except:
        pass

Cache methods and attributes:
  cache_key
  disk_cache
  enable_disk_cache
  enable_memory_cache
  get
  load_memory_cache
  memory_cache
  put
  reset_memory_cache
  save_memory_cache

All cache attributes (including private):
  _lock: <class '_thread.RLock'>
  disk_cache: <class 'diskcache.fanout.FanoutCache'>
  enable_disk_cache: <class 'bool'>
  enable_memory_cache: <class 'bool'>
  memory_cache: <class 'cachetools.LRUCache'>


In [43]:
print(f"Memory cache size: {len(dspy.cache.memory_cache)}")
print(f"Disk cache size: {len(dspy.cache.disk_cache)}")


# 1. Clear the memory cache
dspy.cache.reset_memory_cache()
print("✓ Memory cache reset")

# 2. Clear the disk cache
dspy.cache.disk_cache.clear()
print("✓ Disk cache cleared")

Memory cache size: 0
Disk cache size: 0
✓ Memory cache reset
✓ Disk cache cleared


In [47]:
test_type = "extractor"
test_number = 2
with open(f"./test-results/{test_type}_test_{str(test_number)}.json", "x") as f:
    json.dump(all_extracted, f, indent=2)

print(f"Extracted requirements saved to ./test-results/{test_type}_{str(test_number)}.json")

Extracted requirements saved to ./test-results/extractor_2.json
