## Section 0: Installs

In [1]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones this notebook was developed against -- consider pinning.
%pip install langchain-huggingface
%pip install langchain-neo4j

%pip install langchain
%pip install langchain-community
%pip install langchain-text-splitters
%pip install neo4j
%pip install sentence-transformers
%pip install python-dotenv
%pip install pydantic
%pip install pydantic_core
%pip install numpy



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Section 1: Imports

In [2]:
# Import necessary libraries
import json
import os
from typing import Dict, List, Any
import time

# LangChain imports
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_neo4j import Neo4jGraph
from dotenv import load_dotenv

## Section 2: Configure Environment and Connections

In [3]:
# Load environment variables (python-dotenv) before any of them are read below.
load_dotenv()

# Neo4j connection settings; each one falls back to the default shown when the
# corresponding environment variable is not set.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

# Single source of truth for the sentence-transformers model. It is shared by
# the embedder AND the token splitter so that chunk sizes are measured with the
# same tokenizer that later embeds the chunk. Without an explicit model_name the
# splitter falls back to its own library default model, which loads a second
# model and counts tokens with a tokenizer the embedder does not use.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Initialize Neo4j connection
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE
)

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Initialize text splitter (token budget is counted with EMBEDDING_MODEL_NAME's tokenizer)
text_splitter = SentenceTransformersTokenTextSplitter(
    model_name=EMBEDDING_MODEL_NAME,
    chunk_overlap=20,
    tokens_per_chunk=256
)

  from .autonotebook import tqdm as notebook_tqdm


## Section 3: Create Index in Neo4j

In [4]:
def setup_neo4j_indexes():
    """Ensure the Neo4j schema objects used by this notebook exist.

    Sends two ``IF NOT EXISTS`` statements through the module-level ``graph``
    connection, in this order:

    1. a uniqueness constraint on ``Chunk.id``;
    2. a vector index named ``image_chunk_embeddings`` on ``Chunk.embedding``
       (384 dimensions, cosine similarity).

    Prints a confirmation line once both statements have been sent.
    """
    # Uniqueness constraint for chunk IDs
    unique_id_constraint = """
    CREATE CONSTRAINT IF NOT EXISTS FOR (c:Chunk) REQUIRE c.id IS UNIQUE
    """

    # Vector index over the stored embeddings
    embedding_vector_index = """
    CREATE VECTOR INDEX image_chunk_embeddings IF NOT EXISTS
    FOR (c:Chunk) 
    ON c.embedding
    OPTIONS {
        indexConfig: {
            `vector.dimensions`: 384,
            `vector.similarity_function`: 'cosine'
        }
    }
    """

    for schema_statement in (unique_id_constraint, embedding_vector_index):
        graph.query(schema_statement)

    print("Neo4j indexes created successfully.")

## Section 4: Helper Functions

In [5]:
def extract_path_components(path: str) -> Dict[str, str]:
    """Split a '/'-separated path into folder, subfolder and filename.

    Args:
        path: Path string using '/' as the separator.

    Returns:
        A dict with the keys ``"folder"``, ``"subfolder"`` and ``"filename"``:

        * ``filename`` is always the last component.
        * ``folder`` is the first component when the path has at least two
          components, otherwise ``None``.
        * ``subfolder`` is the second component when the path has at least
          three components, otherwise ``None``.
    """
    components = path.split('/')
    depth = len(components)

    filename = components[-1]
    folder = components[0] if depth >= 2 else None
    # A two-component path ("folder/file") has no subfolder level. Requiring
    # three components keeps the filename from being reported as the subfolder.
    subfolder = components[1] if depth >= 3 else None

    return {
        "folder": folder,
        "subfolder": subfolder,
        "filename": filename
    }

def extract_sections(text: str, max_section_number: int = 6) -> Dict[str, str]:
    """Extract sections from OCR text based on numbered headers.

    A line (after stripping surrounding whitespace) that starts with
    ``"<n>. "`` for ``n`` in ``1..max_section_number`` is treated as a section
    header, e.g. ``"1. Image Type and Category:"``. The whole stripped header
    line becomes the section key; the stripped lines that follow, up to the
    next header, are joined with newlines to form its content. Lines that
    appear before the first header are ignored.

    Every header that is found is recorded, including a header with no content
    (its value is ``""``), whether it sits in the middle of the text or on the
    very last line.

    Additionally, if any line contains ``"Detailed Description:"``, everything
    after the first such line (to the end of the input, using the unstripped
    lines) is stored under the key ``"Detailed Description:"``.
    NOTE(review): this text is also part of whichever numbered section
    surrounds it, so the two entries can overlap -- confirm that is intended.

    Args:
        text: Raw OCR text.
        max_section_number: Highest header number recognised (default 6).

    Returns:
        Mapping of section header -> section content.
    """
    # Build the recognised header prefixes once; str.startswith accepts a tuple.
    header_prefixes = tuple(f"{n}. " for n in range(1, max_section_number + 1))

    sections = {}
    lines = text.split('\n')

    current_section = None
    current_content = []

    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith(header_prefixes):
            # Close the section that was open before this header.
            if current_section:
                sections[current_section] = '\n'.join(current_content).strip()
            current_section = line
            current_content = []
        elif current_section:
            current_content.append(line)

    # Close the final section. It is recorded even when it has no content
    # lines, matching how empty sections in the middle of the text are kept.
    if current_section:
        sections[current_section] = '\n'.join(current_content).strip()

    # Special case: text following the first "Detailed Description:" marker.
    detailed_idx = next(
        (i for i, raw_line in enumerate(lines) if "Detailed Description:" in raw_line),
        -1,
    )
    if detailed_idx >= 0:
        sections["Detailed Description:"] = '\n'.join(lines[detailed_idx + 1:]).strip()

    return sections

## Section 5: Document Processing Functions

In [6]:
def process_json_file(file_path: str) -> Dict:
    """Load and parse the JSON file containing OCR data.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def create_metadata(
    folder: str,
    subfolder: str,
    filename: str,
    section_name: str
) -> Dict[str, Any]:
    """Assemble the metadata dict attached to a chunk.

    Returns a dict with the keys ``folder``, ``subfolder``, ``file_name``,
    ``file_type`` (always ``"image"``) and ``section``, in that order.
    """
    # Location of the source file
    metadata = dict(folder=folder, subfolder=subfolder, file_name=filename)
    # Every document handled by this notebook is tagged as an image
    metadata["file_type"] = "image"
    metadata["section"] = section_name
    return metadata

def create_chunk_embedding(text: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Embed ``text`` and package it with an ID derived from ``metadata``.

    The embedding comes from the module-level ``embedding_model``. The ID is
    the ``folder``, ``subfolder``, ``file_name`` and ``section`` metadata
    values joined with underscores, with spaces turned into underscores and
    colons removed.

    Returns:
        A dict with the keys ``id``, ``text``, ``metadata`` (the same object
        that was passed in) and ``embedding``.
    """
    vector = embedding_model.embed_query(text)

    # Derive the ID from the fields that locate this piece of text
    id_fields = ("folder", "subfolder", "file_name", "section")
    joined_fields = "_".join(format(metadata[field]) for field in id_fields)
    # space -> underscore, colon -> dropped
    chunk_id = joined_fields.translate(str.maketrans({" ": "_", ":": None}))

    return dict(id=chunk_id, text=text, metadata=metadata, embedding=vector)

def save_chunk_to_neo4j(chunk_data: Dict[str, Any]):
    """Upsert one chunk into Neo4j as a ``Chunk`` node.

    The node is matched (or created) by ``id`` via ``MERGE``; its text,
    embedding and the individual metadata fields are then set as node
    properties. The statement is run through the module-level ``graph``.

    Args:
        chunk_data: Dict providing ``id``, ``text``, ``embedding`` and
            ``metadata`` (the query reads ``folder``, ``subfolder``,
            ``file_name``, ``file_type`` and ``section`` from it).

    Returns:
        Whatever ``graph.query`` returns for the statement.
    """
    upsert_statement = """
    MERGE (c:Chunk {id: $id})
    SET c.text = $text,
        c.embedding = $embedding,
        c.folder = $metadata.folder,
        c.subfolder = $metadata.subfolder,
        c.file_name = $metadata.file_name,
        c.file_type = $metadata.file_type,
        c.section = $metadata.section
    RETURN c
    """

    # Forward exactly the four fields the statement binds
    bound_params = {
        key: chunk_data[key] for key in ("id", "text", "embedding", "metadata")
    }

    return graph.query(query=upsert_statement, params=bound_params)

## Section 6: Main Processing Logic

In [7]:
def process_ocr_data(data: Dict):
    """Process all OCR data from the loaded JSON and store it in Neo4j.

    ``data`` is walked as a three-level mapping:
    folder -> subfolder -> filename -> OCR text. For every file the OCR text
    is split into sections (``extract_sections``), each section is split into
    token-bounded chunks (``text_splitter``), and every chunk is embedded
    (``create_chunk_embedding``) and saved (``save_chunk_to_neo4j``).

    Chunk IDs: ``create_chunk_embedding`` derives the ID from the metadata
    only, so every chunk of one section would otherwise get the same ID and
    the ``MERGE`` on save would overwrite the earlier chunks. To keep all of
    them, the first chunk of a section keeps the metadata-derived ID and each
    later chunk gets ``_chunk<N>`` appended (N = 1, 2, ...).

    Progress and the final chunk count are printed.

    Args:
        data: Parsed JSON content in the nested shape described above.
    """
    total_chunks = 0

    # Process the nested structure
    for folder, subfolders in data.items():
        print(f"Processing folder: {folder}")

        for subfolder, files in subfolders.items():
            print(f"  Processing subfolder: {subfolder}")

            for filename, ocr_text in files.items():
                print(f"    Processing file: {filename}")

                # Extract sections from the OCR text
                sections = extract_sections(ocr_text)

                # Process each section
                for section_name, section_content in sections.items():
                    # Create chunks from the section text
                    chunks = text_splitter.split_text(section_content)

                    for chunk_index, chunk in enumerate(chunks):
                        # Create metadata
                        metadata = create_metadata(
                            folder=folder,
                            subfolder=subfolder,
                            filename=filename,
                            section_name=section_name
                        )

                        # Create embedding
                        chunk_data = create_chunk_embedding(
                            text=chunk,
                            metadata=metadata
                        )

                        # Disambiguate the 2nd, 3rd, ... chunk of a section so
                        # they are stored as separate nodes instead of
                        # overwriting the first one.
                        if chunk_index > 0:
                            chunk_data["id"] = f"{chunk_data['id']}_chunk{chunk_index}"

                        # Save to Neo4j
                        save_chunk_to_neo4j(chunk_data)
                        total_chunks += 1

    print(f"Completed processing. Total chunks created: {total_chunks}")

## Section 7: Query Functions

In [8]:
def search_similar_chunks(query_text: str, top_k: int = 5):
    """Return the stored chunks most similar to ``query_text``.

    The query text is embedded with the module-level ``embedding_model`` and
    the resulting vector is looked up in the ``image_chunk_embeddings`` vector
    index through the module-level ``graph``.

    Args:
        query_text: Free-text query.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        Whatever ``graph.query`` returns; the statement yields ``id``,
        ``text``, ``folder``, ``subfolder``, ``file_name``, ``section`` and
        ``score`` per match, ordered by descending score.
    """
    # Vector-index lookup; both values are passed as query parameters
    similarity_statement = """
    CALL db.index.vector.queryNodes('image_chunk_embeddings', $top_k, $embedding)
    YIELD node, score
    RETURN 
        node.id as id,
        node.text as text,
        node.folder as folder,
        node.subfolder as subfolder,
        node.file_name as file_name,
        node.section as section,
        score
    ORDER BY score DESC
    """

    bindings = {
        "embedding": embedding_model.embed_query(query_text),
        "top_k": top_k,
    }

    return graph.query(query=similarity_statement, params=bindings)

## Section 8: Main Execution

In [9]:
def main(json_file_path: str = "./final_image_sonnet.json"):
    """Run the full pipeline: schema setup, ingestion, then a sample search.

    Steps, in order:

    1. create the Neo4j constraint and vector index (``setup_neo4j_indexes``);
    2. load the OCR JSON from ``json_file_path`` (``process_json_file``);
    3. chunk, embed and store it (``process_ocr_data``), printing the elapsed
       time;
    4. run one example similarity search and print the top three matches.

    Args:
        json_file_path: Path to the OCR JSON file. Defaults to the file this
            notebook was written against.
    """
    # Setup Neo4j indexes
    setup_neo4j_indexes()

    # Load the OCR data
    print(f"Loading data from {json_file_path}...")
    data = process_json_file(json_file_path)

    # Process the data. perf_counter is monotonic, so the measured duration
    # cannot be skewed by system clock adjustments during a long run.
    start_time = time.perf_counter()
    process_ocr_data(data)
    elapsed = time.perf_counter() - start_time

    print(f"Processing completed in {elapsed:.2f} seconds.")

    # Example search
    print("\nExample search:")
    search_results = search_similar_chunks("What does the official seal look like?", top_k=3)
    for result in search_results:
        print(f"Score: {result['score']:.4f}")
        print(f"Document: {result['folder']}/{result['subfolder']}/{result['file_name']}")
        print(f"Section: {result['section']}")
        # Only a 100-character preview of the chunk text is printed
        print(f"Text: {result['text'][:100]}...\n")

# Run the main function when executing the notebook
if __name__ == "__main__":
    main()

Neo4j indexes created successfully.
Loading data from ./final_image_sonnet.json...
Processing folder: Appendix
  Processing subfolder: Appendix___Appendix_D
    Processing file: 13002_lg_D_Part_2_Map_1_Other_Lands_Wharf_Street.gif
    Processing file: 13002_lg_D_Part_2_Map_2_Other_Lands_Savary_Island_v2.gif
  Processing subfolder: Appendix___Appendix_C
    Processing file: 13002_lg_C_1_Part_2_Map_1_Sliammon_IR_1_v9.gif
    Processing file: 13002_lg_C_1_Part_2_Map_2_Harwood_Island_IR_2_v8.gif
    Processing file: 13002_lg_C_1_Part_2_Map_3_Paukeanum_IR_3_v9.gif
    Processing file: 13002_lg_C_1_Part_2_Map_4_Toquana_IR_4_v8.gif
    Processing file: 13002_lg_C_1_Part_2_Map_5_Tokenatch_IR_5_v9.gif
    Processing file: 13002_lg_C_1_Part_2_Map_6_Kahkaykay_IR_6_v8.gif
    Processing file: 13002_lg_C_2_Part_1_Index_Former_Crown_v10.gif
    Processing file: 13002_lg_C_2_Part_2_Map_01_Theodosia_Inlet_Thor_Hill_v10.gif
    Processing file: 13002_lg_C_2_Part_2_Map_02_Okeover_Inlet_North_v11.gif
   