In [10]:
# Cell 1: Find where DSPy stores its cache
import dspy
import os
from pathlib import Path
import pickle
import json

# Check for cache directory
possible_paths = [
    './cachedir',
    './.dspy_cache', 
    '~/.dspy_cache',
    './dspy_cache',
    './__pycache__/dspy_cache'
]

cache_dir = None
for path in possible_paths:
    p = Path(path).expanduser()
    if p.exists():
        print(f"✅ Found cache directory: {p.absolute()}")
        cache_dir = p
        break

if not cache_dir:
    # Try to get it from the disk cache object
    if hasattr(dspy.cache.disk_cache, 'directory'):
        cache_dir = Path(dspy.cache.disk_cache.directory)
        print(f"✅ Cache directory from DSPy: {cache_dir}")
    elif hasattr(dspy.cache.disk_cache, '_directory'):
        cache_dir = Path(dspy.cache.disk_cache._directory)
        print(f"✅ Cache directory from DSPy: {cache_dir}")

if cache_dir and cache_dir.exists():
    files = list(cache_dir.rglob('*'))
    print(f"\n📁 Cache contains {len([f for f in files if f.is_file()])} files")
else:
    print("❌ No cache directory found")

✅ Found cache directory: /home/jroberts/.dspy_cache

📁 Cache contains 48 files


In [11]:
# Cell 2: Explore cache structure and list entries
if cache_dir and cache_dir.exists():
    cache_files = []
    
    for file in cache_dir.rglob('*'):
        if file.is_file():
            # Get file info
            size_kb = file.stat().st_size / 1024
            rel_path = file.relative_to(cache_dir)
            cache_files.append({
                'path': file,
                'rel_path': str(rel_path),
                'size_kb': size_kb,
                'name': file.name
            })
    
    # Sort by size to find the interesting ones
    cache_files.sort(key=lambda x: x['size_kb'], reverse=True)
    
    print(f"Top 10 largest cache files:")
    for i, cf in enumerate(cache_files[:10], 1):
        print(f"{i}. {cf['rel_path']}")
        print(f"   Size: {cf['size_kb']:.2f} KB")
        print()

Top 10 largest cache files:
1. 002/cache.db-wal
   Size: 193.16 KB

2. 000/cache.db-wal
   Size: 193.16 KB

3. 009/cache.db-wal
   Size: 193.16 KB

4. 007/cache.db-wal
   Size: 181.09 KB

5. 008/cache.db-wal
   Size: 160.97 KB

6. 012/cache.db-wal
   Size: 160.97 KB

7. 006/cache.db-wal
   Size: 156.95 KB

8. 013/cache.db-wal
   Size: 156.95 KB

9. 001/cache.db-wal
   Size: 156.95 KB

10. 010/cache.db-wal
   Size: 136.83 KB



In [12]:
# Cell 3: Try to load and examine cache entries
import shelve
import dbm

def try_load_cache_file(file_path):
    """Try different methods to load a cache file"""
    
    # Method 1: Try as pickle
    try:
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        return 'pickle', data
    except:
        pass
    
    # Method 2: Try as JSON
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        return 'json', data
    except:
        pass
    
    # Method 3: Try as shelve/dbm
    try:
        # Remove extension for shelve
        base_path = str(file_path).rsplit('.', 1)[0] if '.' in str(file_path) else str(file_path)
        db = shelve.open(base_path, 'r')
        data = dict(db)
        db.close()
        return 'shelve', data
    except:
        pass
    
    # Method 4: Try raw text
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = f.read()
        return 'text', data
    except:
        pass
    
    return None, None

# Try to load the first cache file
if cache_files:
    test_file = cache_files[0]['path']
    print(f"Attempting to load: {test_file.name}")
    
    format_type, data = try_load_cache_file(test_file)
    
    if format_type:
        print(f"✅ Successfully loaded as: {format_type}")
        print(f"Data type: {type(data)}")
        
        if isinstance(data, dict):
            print(f"Keys: {list(data.keys())[:5]}")  # Show first 5 keys
        elif isinstance(data, (list, tuple)):
            print(f"Length: {len(data)}")
        elif isinstance(data, str):
            print(f"String length: {len(data)} characters")
            print(f"Preview: {data[:200]}...")

Attempting to load: cache.db-wal


In [13]:
# Cell 4: Look at the in-memory cache (current session)
print("🧠 Examining DSPy Memory Cache")
print("-" * 40)

# Access memory cache
memory_cache = dspy.cache.memory_cache

print(f"Memory cache type: {type(memory_cache)}")
print(f"Number of entries: {len(memory_cache)}")

# Show first few entries
if memory_cache:
    for i, (key, value) in enumerate(list(memory_cache.items())[:3]):
        print(f"\n📌 Entry {i+1}:")
        print(f"Key type: {type(key)}")
        print(f"Key preview: {str(key)[:100]}...")
        print(f"Value type: {type(value)}")
        
        # If value is a dict with prompt/response
        if isinstance(value, dict):
            for k, v in value.items():
                if isinstance(v, str):
                    print(f"  {k}: {v[:100]}...")
                else:
                    print(f"  {k}: {type(v)}")

🧠 Examining DSPy Memory Cache
----------------------------------------
Memory cache type: <class 'cachetools.LRUCache'>
Number of entries: 0


In [14]:
# Cell 5: Extract actual prompts and responses from cache
def examine_cache_entry(entry):
    """Extract and format cache entry for examination"""
    
    if isinstance(entry, dict):
        # Look for common keys
        prompt_keys = ['prompt', 'messages', 'input', 'query', 'instruction']
        response_keys = ['response', 'output', 'completion', 'choices', 'text']
        
        found_prompt = None
        found_response = None
        
        for key in prompt_keys:
            if key in entry:
                found_prompt = entry[key]
                break
        
        for key in response_keys:
            if key in entry:
                found_response = entry[key]
                break
        
        return found_prompt, found_response
    
    return None, None

# Examine entries in memory cache
if memory_cache:
    print("🔍 Detailed Cache Analysis\n")
    
    for i, (cache_key, cache_value) in enumerate(list(memory_cache.items())[:5]):
        print(f"=" * 60)
        print(f"CACHE ENTRY {i+1}")
        print(f"=" * 60)
        
        # Try to extract prompt and response
        prompt, response = examine_cache_entry(cache_value)
        
        if prompt:
            print("\n📝 PROMPT/INPUT:")
            if isinstance(prompt, str):
                print(prompt[:500])
            elif isinstance(prompt, list):  # Messages format
                for msg in prompt[:3]:
                    print(f"  {msg}")
        
        if response:
            print("\n💬 RESPONSE/OUTPUT:")
            if isinstance(response, str):
                print(response[:500])
            elif isinstance(response, dict):
                print(json.dumps(response, indent=2)[:500])
        
        print("\n" + "-" * 60)

In [15]:
# Cell 5: Extract actual prompts and responses from cache
def examine_cache_entry(entry):
    """Extract and format cache entry for examination"""
    
    if isinstance(entry, dict):
        # Look for common keys
        prompt_keys = ['prompt', 'messages', 'input', 'query', 'instruction']
        response_keys = ['response', 'output', 'completion', 'choices', 'text']
        
        found_prompt = None
        found_response = None
        
        for key in prompt_keys:
            if key in entry:
                found_prompt = entry[key]
                break
        
        for key in response_keys:
            if key in entry:
                found_response = entry[key]
                break
        
        return found_prompt, found_response
    
    return None, None

# Examine entries in memory cache
if memory_cache:
    print("🔍 Detailed Cache Analysis\n")
    
    for i, (cache_key, cache_value) in enumerate(list(memory_cache.items())[:5]):
        print(f"=" * 60)
        print(f"CACHE ENTRY {i+1}")
        print(f"=" * 60)
        
        # Try to extract prompt and response
        prompt, response = examine_cache_entry(cache_value)
        
        if prompt:
            print("\n📝 PROMPT/INPUT:")
            if isinstance(prompt, str):
                print(prompt[:500])
            elif isinstance(prompt, list):  # Messages format
                for msg in prompt[:3]:
                    print(f"  {msg}")
        
        if response:
            print("\n💬 RESPONSE/OUTPUT:")
            if isinstance(response, str):
                print(response[:500])
            elif isinstance(response, dict):
                print(json.dumps(response, indent=2)[:500])
        
        print("\n" + "-" * 60)

In [16]:
# Cell 6: Search cache for specific patterns or requirements
search_term = "SHALL"  # Change this to search for different patterns

found_entries = []

print(f"🔎 Searching cache for '{search_term}'...\n")

for key, value in memory_cache.items():
    # Convert to string for searching
    value_str = str(value)
    
    if search_term.lower() in value_str.lower():
        found_entries.append((key, value))

print(f"Found {len(found_entries)} cache entries containing '{search_term}'")

# Show first match in detail
if found_entries:
    key, value = found_entries[0]
    print(f"\nFirst matching entry:")
    print("=" * 60)
    
    # Pretty print if it's JSON-like
    if isinstance(value, dict):
        print(json.dumps(value, indent=2)[:1000])
    else:
        print(str(value)[:1000])

🔎 Searching cache for 'SHALL'...

Found 0 cache entries containing 'SHALL'


In [17]:
# Cell 7: See how DSPy constructs prompts from your signatures
# This shows what DSPy actually sends vs what you defined

from src.extraction.signatures import ExtractReqs

print("📋 Your Signature Definition:")
print(f"Class: {ExtractReqs.__name__}")
print(f"Docstring: {ExtractReqs.__doc__}")
print(f"Fields: {ExtractReqs.__fields__}")

print("\n🤖 What DSPy Actually Sends:")

# Look for entries that match your extraction calls
extraction_entries = []

for key, value in memory_cache.items():
    value_str = str(value)
    # Look for signature markers
    if "requirement" in value_str.lower() and "json" in value_str.lower():
        extraction_entries.append(value)

if extraction_entries:
    # Show the full prompt DSPy constructed
    example = extraction_entries[0]
    
    if isinstance(example, dict) and 'messages' in example:
        for msg in example['messages']:
            print(f"\n[{msg.get('role', 'unknown').upper()}]:")
            print(msg.get('content', '')[:800])
    else:
        print(json.dumps(example, indent=2)[:1500])
else:
    print("No extraction prompts found in cache yet. Run an extraction first.")

📋 Your Signature Definition:
Class: ExtractReqs
Docstring: You are a Federal proposal compliance analyst. Read all attached solicitation files (RFP Sections A–M, SOW/PWS, CDRLs, amendments, attachments). Prioritize Section L (Instructions) and Section M (Evaluation).
Deliver a BD-style “Submission Compliance Checklist” workbook with the following tabs and schemas. Cite the exact source for every row as: DocName, Section/Para, Page. If anything is not stated, write “MISSING” (don’t guess).

Tab 1 - Volumes & Tabs (authoritative spine)
Columns: Volume, Tab, Title/Topic, What to Provide (verbatim/summary), Page/Slide Limit, Format (PDF/Word/PPT/Excel), Font/Margins/Spacing, Required File Naming, Submission Destination (Portal/Email), Due (Date, Time, Time Zone), Owner, Source Citation, Notes.
- Capture exact volume/tab structure, mandatory contents per tab, and constraints.
- Include file-type rules (allowed/prohibited), email size limits, labeling conventions (“1 of 3”), ZIP allowed?

Ta

/tmp/ipykernel_192783/525708050.py:9: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print(f"Fields: {ExtractReqs.__fields__}")


In [18]:
# Cell 8: Export interesting cache entries to a file for deeper analysis
import pandas as pd

cache_export = []

for i, (key, value) in enumerate(memory_cache.items()):
    entry = {
        'index': i,
        'key_hash': str(key)[:50],
        'type': type(value).__name__,
        'size_bytes': len(str(value))
    }
    
    # Try to extract meaningful content
    if isinstance(value, dict):
        entry['keys'] = list(value.keys())
        if 'messages' in value:
            entry['message_count'] = len(value['messages'])
        if 'response' in value:
            entry['has_response'] = True
    
    cache_export.append(entry)

# Save to CSV for analysis
df = pd.DataFrame(cache_export)
output_file = 'dspy_cache_analysis.csv'
df.to_csv(output_file, index=False)
print(f"✅ Exported {len(cache_export)} cache entries to {output_file}")

# Show summary statistics
print(f"\nCache Summary:")
print(f"  Total entries: {len(cache_export)}")
print(f"  Total size: {df['size_bytes'].sum() / 1024:.2f} KB")
print(f"  Average entry size: {df['size_bytes'].mean():.2f} bytes")

✅ Exported 0 cache entries to dspy_cache_analysis.csv

Cache Summary:
  Total entries: 0


KeyError: 'size_bytes'