# Neo4j Document Processing Components

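Core components for splitting documents into chunks, generating embeddings, tagging each chunk with LLM-extracted metadata, and ingesting the results into Neo4j.
Use `process_single_file` or `process_folder` (sections 8 and 9) to ingest the files in a folder.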

## 1. Install Dependencies

In [1]:
!pip install neo4j sentence-transformers pandas numpy scikit-learn python-dotenv



## 2. Import Libraries

In [2]:
import os
import uuid
import numpy as np
import json
import time
from datetime import datetime
from typing import List, Dict, Any, Optional
import pandas as pd
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# LLM imports
try:
    import openai
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    print("⚠ OpenAI not installed. Install with: pip install openai")

# Load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("✓ Environment variables loaded from .env file")
except ImportError:
    print("⚠ python-dotenv not installed. Install with: pip install python-dotenv")
    print("Using environment variables directly from system")

  from .autonotebook import tqdm as notebook_tqdm


✓ Environment variables loaded from .env file


## 3. Configuration

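Create a `.env` file in your project directory with the following keys (the `LLM_PROVIDER` and `OPENAI_*` entries are only needed for LLM-based section extraction; without an API key that step is skipped):
```
NEO4J_URI=bolt://localhost:7687
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=your_actual_password
EMBEDDING_MODEL=paraphrase-multilingual-MiniLM-L12-v2
CHUNK_DELIMITER=---
LLM_PROVIDER=openai
OPENAI_MODEL=gpt-3.5-turbo
OPENAI_API_KEY=your_openai_api_key
```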

In [3]:
# Load environment variables from an explicit .env file.
# Set DOTENV_PATH to point at your own file; the fallback is the path this
# notebook was originally written against.
try:
    from dotenv import load_dotenv
    env_path = os.getenv('DOTENV_PATH', r'D:\Users\Dell\Documents\GeoSpark\rag-app\backend\.env')
    # load_dotenv() returns False when it read no variables (e.g. missing file),
    # so only report success when something was actually loaded.
    if load_dotenv(env_path):
        print(f"✓ Environment variables loaded from: {env_path}")
    else:
        print(f"⚠ No variables loaded from: {env_path} - falling back to system environment")
except ImportError:
    print("⚠ python-dotenv not installed. Install with: pip install python-dotenv")
    print("Using environment variables directly from system")

# Neo4j Configuration from environment variables
NEO4J_URI = os.getenv('NEO4J_URI', 'bolt://localhost:7687')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME', 'neo4j')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# Document configuration
CHUNK_DELIMITER = os.getenv('CHUNK_DELIMITER', '---')
EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'paraphrase-multilingual-MiniLM-L12-v2')
# LLM configuration
LLM_PROVIDER = os.getenv('LLM_PROVIDER', 'openai')  # only 'openai' is accepted by LLMSectionExtractor
LLM_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Validate required environment variables (the password has no usable default)
if not NEO4J_PASSWORD:
    raise ValueError("NEO4J_PASSWORD not found in environment variables. Please check your .env file.")
# Secrets (password, API key) are deliberately not echoed.
print(f"✅ Configuration loaded:")
print(f"  Neo4j URI: {NEO4J_URI}")
print(f"  Neo4j Username: {NEO4J_USERNAME}")
print(f"  Chunk Delimiter: '{CHUNK_DELIMITER}'")
print(f"  Embedding Model: {EMBEDDING_MODEL}")
print(f"  LLM Provider: {LLM_PROVIDER}")
print(f"  LLM Model: {LLM_MODEL}")

✓ Environment variables loaded from: D:\Users\Dell\Documents\GeoSpark\rag-app\backend\.env
✅ Configuration loaded:
  Neo4j URI: neo4j+s://f9af1b11.databases.neo4j.io
  Neo4j Username: neo4j
  Chunk Delimiter: '---'
  Embedding Model: paraphrase-multilingual-MiniLM-L12-v2
  LLM Provider: openai
  LLM Model: gpt-5-nano


In [4]:
# Section labels mapped to Thai/English keywords. The whole mapping is
# interpolated into the LLM prompt as the list of allowed `section_type` values.
# Each key appears exactly once; the original literal listed 'forecast' twice
# with identical values, so the resulting dict is unchanged.
section_patterns = {
    'title': ['factors influencing', 'executive summary', 'แนวโน้ม', 'สถานการณ์'],
    'abstract': ['บทคัดย่อ', 'abstract', 'บทสรุป'],
    'introduction': ['บทนำ', 'introduction'],
    'background': ['ข้อมูลพื้นฐาน', 'background', 'ที่มาและความสำคัญ'],
    'methodology': ['วิธีการวิจัย', 'methodology', 'วิธีดำเนินการ'],
    'results': ['ผลการวิจัย', 'results', 'ผลการศึกษา'],
    'discussion': ['อภิปรายผล', 'discussion'],
    'conclusion': ['สรุป', 'conclusion', 'บทสรุป'],
    'recommendations': ['ข้อเสนอแนะ', 'recommendations', 'ข้อเสนอ'],
    'measures': ['มาตรการ', 'measures', 'แนวทางแก้ไข', 'โครงการ'],
    'policy': ['นโยบาย', 'policy', 'มติ', 'คณะรัฐมนตรี'],
    'forecast': ['พยากรณ์', 'forecast', 'แนวโน้ม', 'คาดการณ์'],
    'situation': ['สถานการณ์', 'situation', 'ภาวะ'],
    'market': ['ตลาด', 'market', 'ราคา', 'การส่งออก'],
    'climate': ['สภาพภูมิอากาศ', 'climate', 'อุณหภูมิ', 'ฝน', 'เอลนีโญ', 'ลานีญา'],
    'agriculture': ['เกษตรกร', 'agriculture', 'การเพาะปลูก', 'ผลผลิต'],
    'appendix': ['ภาคผนวก', 'appendix']
}

# Crop labels mapped to Thai/English keywords; interpolated into the LLM prompt
# as the list of allowed `crop_type` values.
crop_patterns = {
    'rubber': ['ยางพารา', 'rubber', 'ยาง', 'สวนยาง','ยางพาราแผ่นดิบ', 'ยางแผ่นดิบ'],
    'cassava': ['มันสำปะหลัง', 'cassava', 'หัวมัน', 'แป้งมัน', 'มันเส้น']
}

In [5]:
class LLMSectionExtractor:
    """Tag text chunks with structural metadata by prompting a chat-completion LLM.

    Only the 'openai' provider is implemented; any other provider name is
    rejected in the constructor. The prompt embeds the module-level
    `section_patterns` and `crop_patterns` mappings as the allowed labels.
    """

    # Fields every LLM response must contain. They are also the ONLY fields
    # merged into a chunk, so a response carrying extra keys (e.g. "id" or
    # "content") cannot overwrite existing chunk data.
    REQUIRED_FIELDS = ('section_type', 'crop_type', 'key_topics', 'organization')

    def __init__(self, provider: str = 'openai', model: str = 'gpt-3.5-turbo'):
        """
        Create the API client for the given provider.

        Args:
            provider (str): LLM provider name (case-insensitive); only 'openai' is supported
            model (str): Model name sent with every completion request

        Raises:
            ValueError: If the provider is unsupported, the openai package is
                not importable, or no OpenAI API key is configured
        """
        self.provider = provider.lower()
        self.model = model

        if self.provider == 'openai':
            # OPENAI_AVAILABLE is False when `import openai` failed; checking it
            # here makes the error message below true instead of surfacing a
            # NameError on the undefined `openai` module.
            if not OPENAI_AVAILABLE or not OPENAI_API_KEY:
                raise ValueError("OpenAI API key not found or package not installed")
            openai.api_key = OPENAI_API_KEY
            self.client = openai.OpenAI(api_key=OPENAI_API_KEY)
        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")

        print(f"✅ LLM Section Extractor initialized with {provider} ({model})")

    @classmethod
    def _empty_section_info(cls) -> Dict[str, Any]:
        """Return a fresh fallback result: every required field mapped to an empty list."""
        return {field: [] for field in cls.REQUIRED_FIELDS}

    def extract_section_info(self, content: str, max_retries: int = 3) -> Dict[str, Any]:
        """
        Extract section information from content using LLM

        Each attempt sends the prompt, strips an optional Markdown code fence
        from the reply and parses it as JSON. API errors are retried with
        exponential backoff; malformed or incomplete replies are retried
        immediately. This method does not raise on LLM failures.

        Args:
            content (str): Text of the chunk to analyse
            max_retries (int): Maximum number of attempts

        Returns:
            Dict containing at least section_type, crop_type, key_topics and
            organization. When every attempt fails, all four are empty lists.
        """
        prompt = f"""
    Analyze the following text chunk and extract structural information. Return a JSON object with these fields:
    - "section_type": The list of type of section, which in {section_patterns}, if none match, return []
    - "crop_type": The crop type, which in {crop_patterns}, if there are no crop type mentioned, return []
    - "key_topics": An array of 3-5 main topics or keywords mentioned in this chunk
    - "organization": Any organizations mentioned, if none, return []

    
    Text chunk:
    {content}
    
    Return only valid JSON:
    """

        for attempt in range(max_retries):
            try:
                if self.provider == 'openai':
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=[
                            {"role": "system", "content": "You are an expert at analyzing document structure and extracting section information. Always return valid JSON."},
                            {"role": "user", "content": prompt}
                        ]
                    )
                    response_text = response.choices[0].message.content.strip()

                # Clean and parse JSON response (models often wrap it in a code fence)
                if response_text.startswith('```json'):
                    response_text = response_text.split('```json')[1].split('```')[0].strip()
                elif response_text.startswith('```'):
                    response_text = response_text.split('```')[1].strip()

                result = json.loads(response_text)

                # Validate required fields. The isinstance check rejects valid
                # JSON that is not an object (e.g. a bare list of field names).
                if isinstance(result, dict) and all(field in result for field in self.REQUIRED_FIELDS):
                    return result
                print(f"⚠ Missing required fields in LLM response, attempt {attempt + 1}")

            except json.JSONDecodeError as e:
                print(f"⚠ JSON parsing error on attempt {attempt + 1}: {e}")
            except Exception as e:
                print(f"⚠ LLM API error on attempt {attempt + 1}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff

        # Fallback if all attempts fail
        print("⚠ All LLM attempts failed, using fallback section info")
        return self._empty_section_info()

    def extract_batch_section_info(self, chunks: List[Dict[str, Any]], batch_size: int = 5) -> List[Dict[str, Any]]:
        """
        Extract section info for multiple chunks with rate limiting

        Chunks are processed one request at a time; `batch_size` only controls
        how often progress is printed and a one-second pause is inserted.

        Args:
            chunks (List[Dict[str, Any]]): Chunk dicts with a 'content' key; updated in place
            batch_size (int): Number of chunks between progress messages / pauses

        Returns:
            The same list, each chunk extended with the four section fields.
        """
        print(f"Extracting section information for {len(chunks)} chunks...")
        total = len(chunks)

        for i, chunk in enumerate(chunks):
            try:
                section_info = self.extract_section_info(chunk['content'])

                # Merge only the known fields so unexpected keys in the LLM
                # response cannot clobber chunk data such as id/content/embedding.
                chunk.update({field: section_info[field] for field in self.REQUIRED_FIELDS})

                if (i + 1) % batch_size == 0:
                    print(f"  Processed {i + 1}/{total} chunks")
                    if i + 1 < total:
                        time.sleep(1)  # Rate limiting (pointless after the last chunk)

            except Exception as e:
                print(f"⚠ Error processing chunk {i + 1}: {e}")
                # Add fallback section info
                chunk.update(self._empty_section_info())

        print("✅ Section extraction completed")
        return chunks
    
    # Initialize LLM extractor if keys are available
# Build the extractor when the provider/key configuration allows it; on any
# failure the name is bound to None so later cells can skip section extraction.
try:
    llm_extractor = LLMSectionExtractor(LLM_PROVIDER, LLM_MODEL)
except Exception as init_error:
    llm_extractor = None
    print(f"⚠ Could not initialize LLM extractor: {init_error}")
    print("Section extraction will be skipped")

✅ LLM Section Extractor initialized with openai (gpt-5-nano)


## 4. Embedding Generator

In [6]:
class EmbeddingGenerator:
    """Thin wrapper around a SentenceTransformer model for producing embeddings."""

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """Load the named SentenceTransformer model and record its embedding dimension."""
        print(f"Loading SentenceTransformer model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        print(f"✓ Model loaded successfully. Embedding dimension: {self.embedding_dim}")

    def generate_embedding(self, text: str) -> List[float]:
        """Encode one text (normalization requested) and return it as a plain list."""
        return self.model.encode(text, normalize_embeddings=True).tolist()

    def generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """Encode several texts in one call, showing a progress bar, and return nested lists."""
        encoded = self.model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=True,
        )
        return encoded.tolist()

    def compute_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
        """Return the cosine similarity of two embedding vectors."""
        # cosine_similarity works on 2-D inputs, so each vector becomes a single row.
        row_a = np.array(embedding1).reshape(1, -1)
        row_b = np.array(embedding2).reshape(1, -1)
        similarity_matrix = cosine_similarity(row_a, row_b)
        return similarity_matrix[0][0]

# Initialize embedding generator with the constructor's default model
# (EMBEDDING_MODEL); this loads the SentenceTransformer model.
embedding_generator = EmbeddingGenerator()

Loading SentenceTransformer model: paraphrase-multilingual-MiniLM-L12-v2
✓ Model loaded successfully. Embedding dimension: 384


## 5. Document Processor

In [7]:
class DocumentProcessor:
    """Read text documents, split them into chunks and enrich the chunks.

    Enrichment is optional: embeddings come from `embedding_generator` and
    section metadata from `llm_extractor`; either may be None.
    """

    def __init__(self, delimiter: str = '---', embedding_generator=None, llm_extractor=None):
        """
        Args:
            delimiter (str): Literal string that separates chunks in a document
            embedding_generator: Object providing generate_embeddings_batch(texts), or None
            llm_extractor: Object providing extract_batch_section_info(chunks), or None
        """
        self.delimiter = delimiter
        self.embedding_generator = embedding_generator
        self.llm_extractor = llm_extractor

    def read_document(self, file_path: str) -> str:
        """Read a UTF-8 document from file.

        Returns:
            The file content, or "" when the file is missing or cannot be read
            (the problem is printed instead of raised).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            return ""
        except Exception as e:
            print(f"Error reading file: {e}")
            return ""

    def chunk_document(self, text: str, generate_embeddings: bool = True, extract_sections: bool = True) -> List[Dict[str, Any]]:
        """Split document into chunks using delimiter and optionally enrich them.

        Args:
            text (str): Full document text
            generate_embeddings (bool): Add 'embedding' / 'embedding_dim' when a generator is configured
            extract_sections (bool): Add LLM section metadata when an extractor is configured

        Returns:
            List of chunk dicts (id, sequence, content, word_count, char_count,
            created_at, plus optional enrichment). Whitespace-only chunks are
            dropped and `sequence` numbers the remaining chunks 1..N without
            gaps, so "previous chunk" is always `sequence - 1`.
        """
        processed_chunks = []

        # First pass: create chunk metadata
        for raw_chunk in text.split(self.delimiter):
            cleaned_chunk = raw_chunk.strip()
            if not cleaned_chunk:  # Only process non-empty chunks
                continue

            processed_chunks.append({
                'id': str(uuid.uuid4()),
                # Count only kept chunks: numbering by raw split index would
                # leave gaps wherever an empty chunk was skipped.
                'sequence': len(processed_chunks) + 1,
                'content': cleaned_chunk,
                'word_count': len(cleaned_chunk.split()),
                'char_count': len(cleaned_chunk),
                'created_at': datetime.now().isoformat()
            })

        chunk_texts = [chunk_data['content'] for chunk_data in processed_chunks]

        # Second pass: generate embeddings in batch if requested
        if generate_embeddings and self.embedding_generator and chunk_texts:
            print(f"Generating embeddings for {len(chunk_texts)} chunks...")
            embeddings = self.embedding_generator.generate_embeddings_batch(chunk_texts)

            # Add embeddings to chunk data
            for chunk_data, embedding in zip(processed_chunks, embeddings):
                chunk_data['embedding'] = embedding
                chunk_data['embedding_dim'] = len(embedding)

            print(f"✓ Embeddings generated successfully")

        # Third pass: extract section information using LLM
        if extract_sections and self.llm_extractor and processed_chunks:
            processed_chunks = self.llm_extractor.extract_batch_section_info(processed_chunks)
        elif extract_sections and not self.llm_extractor:
            print("⚠ LLM extractor not available, skipping section extraction")

        return processed_chunks


# Wire the processor to the configured delimiter, the embedding generator and
# the (possibly None) LLM extractor created in earlier cells.
processor = DocumentProcessor(
    delimiter=CHUNK_DELIMITER,
    embedding_generator=embedding_generator,
    llm_extractor=llm_extractor,
)
print("✅ Enhanced document processor initialized")

✅ Enhanced document processor initialized


## 6. Neo4j Handler

In [8]:
def ensure_list(value):
    """Coerce a value to a list.

    None and "" become []; an existing list is returned unchanged (same
    object); anything else is wrapped in a one-element list.
    """
    if value is None or value == "":
        return []
    return value if isinstance(value, list) else [value]


In [9]:
class Neo4jHandler:
    """Neo4j access layer for Document and Chunk nodes.

    Graph shape written by this class:
    (Document)-[:HAS_CHUNK {sequence}]->(Chunk) and (Chunk)-[:NEXT_CHUNK]->(Chunk).
    """

    def __init__(self, uri: str, username: str, password: str):
        """Open a driver and verify the connection with a trivial query.

        Raises:
            Exception: Whatever the driver raised while connecting; the failure
                is printed and any driver that was created is closed first.
        """
        try:
            self.driver = GraphDatabase.driver(uri, auth=(username, password))
            # Test connection
            with self.driver.session() as session:
                session.run("RETURN 1")
            print("✓ Connected to Neo4j successfully")
        except Exception as e:
            print(f"❌ Failed to connect to Neo4j: {e}")
            # Do not leak a half-initialised driver. close() tolerates the
            # driver never having been created.
            try:
                self.close()
            except Exception:
                pass  # best-effort cleanup; the connection error is what matters
            raise

    def close(self):
        """Close the driver if one was created."""
        if hasattr(self, 'driver'):
            self.driver.close()

    def create_schema(self):
        """Create indexes and constraints for better performance

        Every statement uses IF NOT EXISTS. A failing statement is reported as
        a warning (unless the error says it already exists) and the remaining
        statements still run.
        """
        with self.driver.session() as session:
            queries = [
                "CREATE CONSTRAINT document_id IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE",
                "CREATE CONSTRAINT chunk_id IF NOT EXISTS FOR (c:Chunk) REQUIRE c.id IS UNIQUE",
                "CREATE INDEX chunk_sequence IF NOT EXISTS FOR (c:Chunk) ON (c.sequence)",
                "CREATE INDEX document_title IF NOT EXISTS FOR (d:Document) ON (d.title)",
                "CREATE INDEX document_filename IF NOT EXISTS FOR (d:Document) ON (d.filename)",
                "CREATE INDEX chunk_section_type IF NOT EXISTS FOR (c:Chunk) ON (c.section_type)",
                "CREATE INDEX chunk_crop_type IF NOT EXISTS FOR (c:Chunk) ON (c.crop_type)",
                "CREATE INDEX chunk_organization IF NOT EXISTS FOR (c:Chunk) ON (c.organization)",
                "CREATE INDEX chunk_key_topics IF NOT EXISTS FOR (c:Chunk) ON (c.key_topics)",
                # Text search index for content
                "CREATE FULLTEXT INDEX chunk_content IF NOT EXISTS FOR (c:Chunk) ON EACH [c.content]"
            ]

            for query in queries:
                try:
                    session.run(query)
                    # Words 2 and 3 of the statement, e.g. "CONSTRAINT document_id"
                    words = query.split()
                    print(f"✓ Executed: {words[1]} {words[2]}")
                except Exception as e:
                    if "already exists" not in str(e).lower():
                        print(f"⚠ Warning: {e}")

    def create_document_node(self, document_data: Dict[str, Any]):
        """Create a document node

        Args:
            document_data: Dict with id, title, filename, file_path,
                total_chunks and created_at (the query parameters)

        Returns:
            The single record returned by the CREATE query.
        """
        with self.driver.session() as session:
            query = """
            CREATE (d:Document {
                id: $id,
                title: $title,
                filename: $filename,
                file_path: $file_path,
                total_chunks: $total_chunks,
                created_at: $created_at
            })
            RETURN d
            """
            # Parameters are passed as a dict, like every other query in this class.
            result = session.run(query, document_data)
            return result.single()

    def create_chunk_nodes(self, chunks: List[Dict[str, Any]], document_id: str, batch_size: int = 5, sleep_time: int = 10):
        """
        Create chunk nodes with embeddings and link them to document, in batches.

        Args:
            chunks: Chunk dicts in document order. 'embedding' and
                'embedding_dim' are optional (sent as null when absent); the
                section fields are optional and normalised to lists.
            document_id: id of the already-created Document node
            batch_size: Number of chunks written per session
            sleep_time: Seconds to pause between batches (not after the last)
        """
        chunk_query = """
        CREATE (c:Chunk {
            id: $id,
            sequence: $sequence,
            content: $content,
            word_count: $word_count,
            char_count: $char_count,
            created_at: $created_at,
            embedding: $embedding,
            embedding_dim: $embedding_dim,
            section_type: $section_type,
            crop_type: $crop_type,
            key_topics: $key_topics,
            organization: $organization
        })
        """
        link_query = """
        MATCH (d:Document {id: $doc_id})
        MATCH (c:Chunk {id: $chunk_id})
        CREATE (d)-[:HAS_CHUNK {sequence: $sequence}]->(c)
        """
        # Link consecutive chunks by id rather than by `sequence - 1`: sequence
        # numbers may have gaps, which would silently break the chain.
        next_query = """
        MATCH (c1:Chunk {id: $prev_id})
        MATCH (c2:Chunk {id: $curr_id})
        CREATE (c1)-[:NEXT_CHUNK]->(c2)
        """

        total = len(chunks)
        previous_chunk_id = None  # carried across batches

        for i in range(0, total, batch_size):
            batch = chunks[i:i + batch_size]

            with self.driver.session() as session:
                for chunk in batch:
                    # Prepare chunk data
                    chunk_data = {
                        'id': chunk['id'],
                        'sequence': chunk['sequence'],
                        'content': chunk['content'],
                        'word_count': chunk['word_count'],
                        'char_count': chunk['char_count'],
                        'created_at': chunk['created_at'],
                        # Chunks produced without embeddings carry no such keys.
                        'embedding': chunk.get('embedding'),
                        'embedding_dim': chunk.get('embedding_dim'),
                        'section_type': ensure_list(chunk.get('section_type', [])),
                        'crop_type': ensure_list(chunk.get('crop_type', [])),
                        'key_topics': ensure_list(chunk.get('key_topics', [])),
                        'organization': ensure_list(chunk.get('organization', []))
                    }

                    # Create chunk node
                    session.run(chunk_query, chunk_data)

                    # Link to document
                    session.run(link_query, {
                        'doc_id': document_id,
                        'chunk_id': chunk['id'],
                        'sequence': chunk['sequence']
                    })

                    # NEXT relationship from the chunk written just before this one
                    if previous_chunk_id is not None:
                        session.run(next_query, {
                            'prev_id': previous_chunk_id,
                            'curr_id': chunk['id']
                        })
                    previous_chunk_id = chunk['id']

                print(f"  ✅ Batch {i//batch_size + 1}: processed {i + len(batch)}/{total} chunks")

            # sleep between batches (except after last one)
            if i + batch_size < total:
                time.sleep(sleep_time)

    def semantic_search(self, query_embedding: List[float], limit: int = 5, similarity_threshold: float = 0.7):
        """Perform semantic search using embeddings

        Fetches every chunk that has an embedding and scores it in Python.

        Args:
            query_embedding: Embedding of the search query
            limit: Maximum number of results returned
            similarity_threshold: Minimum cosine similarity for a chunk to qualify

        Returns:
            Up to `limit` dicts (id, content, sequence, similarity), best first.
        """
        # Loop-invariant: reshape the query vector once.
        query_vector = np.array(query_embedding).reshape(1, -1)

        with self.driver.session() as session:
            query = """
            MATCH (c:Chunk)
            WHERE c.embedding IS NOT NULL
            RETURN c.id, c.content, c.sequence, c.embedding
            """

            result = session.run(query)
            chunks_with_similarity = []

            for record in result:
                chunk_vector = np.array(record['c.embedding']).reshape(1, -1)
                similarity = cosine_similarity(query_vector, chunk_vector)[0][0]

                if similarity >= similarity_threshold:
                    chunks_with_similarity.append({
                        'id': record['c.id'],
                        'content': record['c.content'],
                        'sequence': record['c.sequence'],
                        'similarity': float(similarity)
                    })

            # Sort by similarity and return top results
            chunks_with_similarity.sort(key=lambda x: x['similarity'], reverse=True)
            return chunks_with_similarity[:limit]

    def get_document_statistics(self):
        """Get statistics about the ingested data

        Returns:
            A record with total_documents, total_chunks and chunks_with_embeddings.
        """
        with self.driver.session() as session:
            stats_query = """
            MATCH (d:Document)
            OPTIONAL MATCH (d)-[:HAS_CHUNK]->(c:Chunk)
            RETURN 
                count(DISTINCT d) as total_documents,
                count(c) as total_chunks,
                count(CASE WHEN c.embedding IS NOT NULL THEN 1 END) as chunks_with_embeddings
            """

            result = session.run(stats_query)
            return result.single()

# Connect to Neo4j with the credentials read in the configuration cell;
# the constructor raises if the connection test fails.
neo4j_handler = Neo4jHandler(
    uri=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

✓ Connected to Neo4j successfully


## 7. Setup Schema

In [10]:
# Create schema once. Every statement in create_schema() uses IF NOT EXISTS,
# so re-running this cell is safe. create_schema() reports failures as
# warnings instead of raising, so the final message prints either way.
print("Setting up Neo4j schema...")
neo4j_handler.create_schema()
print("✓ Schema setup completed")

Setting up Neo4j schema...
✓ Executed: CONSTRAINT document_id
✓ Executed: CONSTRAINT chunk_id
✓ Executed: INDEX chunk_sequence
✓ Executed: INDEX document_title
✓ Executed: INDEX document_filename
✓ Executed: INDEX chunk_section_type
✓ Executed: INDEX chunk_crop_type
✓ Executed: INDEX chunk_organization
✓ Executed: INDEX chunk_key_topics
✓ Executed: FULLTEXT INDEX
✓ Schema setup completed


## 8. File Processing Function

Use this function to process individual files in your loop:

In [11]:
def process_single_file(file_path: str, title: str = None) -> bool:
    """
    Ingest one document: read it, chunk it, then write a Document node and
    its Chunk nodes to Neo4j.

    Uses the module-level `processor` and `neo4j_handler`. Any exception is
    printed and reported as failure rather than propagated.

    Args:
        file_path (str): Path to the document file
        title (str): Optional document title; the file name is used when omitted

    Returns:
        bool: True when the document and all chunks were written, False for an
        empty document, no chunks, or any error
    """
    try:
        print(f"\n📄 Processing: {file_path}")

        # Step 1: load the raw text (read_document returns "" on failure)
        raw_text = processor.read_document(file_path)
        if not raw_text.strip():
            print(f"⚠ Warning: Empty document - {file_path}")
            return False

        # Step 2: split into chunks and attach embeddings
        chunk_list = processor.chunk_document(raw_text, generate_embeddings=True)
        if not chunk_list:
            print(f"⚠ Warning: No chunks generated - {file_path}")
            return False

        chunk_total = len(chunk_list)
        print(f"✓ Generated {chunk_total} chunks")

        # Step 3: write the Document node (a fresh id per call)
        new_document_id = str(uuid.uuid4())
        base_name = os.path.basename(file_path)
        neo4j_handler.create_document_node({
            'id': new_document_id,
            'title': title or base_name,
            'filename': base_name,
            'file_path': file_path,
            'total_chunks': chunk_total,
            'created_at': datetime.now().isoformat()
        })
        print(f"✓ Document node created: {new_document_id}")

        # Step 4: write the Chunk nodes and their relationships
        neo4j_handler.create_chunk_nodes(chunk_list, new_document_id)
        print(f"✓ {chunk_total} chunks ingested successfully")

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        return False

    return True

print("✓ File processing function ready")

✓ File processing function ready


## 9. Batch Processing Function

Use this to process multiple files from a folder:

In [12]:
def process_folder(folder_path: str, file_extensions: Optional[List[str]] = None, max_files: Optional[int] = None):
    """
    Process all files in a folder

    Files are matched by extension directly inside `folder_path` (no
    recursion), de-duplicated and processed in sorted path order, so a
    `max_files` limit always selects the same files.

    Args:
        folder_path (str): Path to the folder containing documents
        file_extensions (List[str]): List of file extensions to process;
            defaults to ['.txt', '.md']
        max_files (int): Optional limit on number of files to process;
            None means no limit, 0 (or a negative value) processes nothing
    """
    import glob  # local import: the notebook's import cell does not provide glob

    # Default built per call instead of a shared mutable default argument.
    if file_extensions is None:
        file_extensions = ['.txt', '.md']

    if not os.path.isdir(folder_path):
        print(f"❌ Folder not found: {folder_path}")
        return

    # Get all matching files. The folder part is escaped so characters such as
    # '[' in the path are not interpreted as glob patterns.
    escaped_folder = glob.escape(folder_path)
    matched = set()
    for ext in file_extensions:
        matched.update(glob.glob(os.path.join(escaped_folder, f"*{ext}")))
    files = sorted(matched)

    # `is not None` so that max_files=0 means "no files", not "no limit".
    if max_files is not None:
        files = files[:max(max_files, 0)]

    print(f"📁 Found {len(files)} files to process")

    successful = 0
    failed = 0

    for i, file_path in enumerate(files, 1):
        print(f"\n[{i}/{len(files)}] Processing file...")

        if process_single_file(file_path):
            successful += 1
        else:
            failed += 1

    print(f"\n📊 Batch Processing Summary:")
    print(f"  Total files: {len(files)}")
    print(f"  Successful: {successful}")
    print(f"  Failed: {failed}")

    # Show final statistics (database-wide, not just this run)
    stats = neo4j_handler.get_document_statistics()
    print(f"\n📈 Database Statistics:")
    print(f"  Total Documents: {stats['total_documents']}")
    print(f"  Total Chunks: {stats['total_chunks']}")
    print(f"  Chunks with Embeddings: {stats['chunks_with_embeddings']}")

print("✓ Batch processing function ready")

✓ Batch processing function ready


## 10. Example Usage

Here's how to use the components:

In [None]:
# Example 1: Process a single file
# success = process_single_file('../../resource/graph_doc\รายงานการเปลี่ยนแปลงสภาพภูมิอากาศ.txt', 'รายงานการเปลี่ยนแปลงสภาพภูมิอากาศ')

# Example 2: Process all files in a folder
# NOTE: this call is live - running the cell ingests up to 12 files.
# process_single_file() creates a new Document node with a fresh UUID on every
# call, so re-running this cell ingests the same files again as duplicates.
process_folder('../../resource/graph_doc', file_extensions=['.txt', '.md'], max_files=12)

# Example 3: Process specific files
# files_to_process = ['/path/to/doc1.txt', '/path/to/doc2.txt']
# for file_path in files_to_process:
#     process_single_file(file_path)

# NOTE(review): these messages are printed after the Example 2 run above has
# already finished; they describe the commented-out examples.
print("Ready to process your files!")
print("Update the file paths above and uncomment to use.")

📁 Found 12 files to process

[1/12] Processing file...

📄 Processing: ../../resource/graph_doc\4มาตรการช่วยเหลือมัน.txt
Generating embeddings for 1 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.59it/s]

✓ Embeddings generated successfully
Extracting section information for 1 chunks...





✅ Section extraction completed
✓ Generated 1 chunks
✓ Document node created: 42bd1a09-7b29-4377-80c8-f6b7ba24109b
  ✅ Batch 1: processed 1/1 chunks
✓ 1 chunks ingested successfully

[2/12] Processing file...

📄 Processing: ../../resource/graph_doc\กระทรวงเกษตรเร่งแก้ปัญหาลักลอบนำเข้ายางพาราเถื่อนและราคายางพาราตกต่ำ.txt
Generating embeddings for 1 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.66it/s]

✓ Embeddings generated successfully
Extracting section information for 1 chunks...





✅ Section extraction completed
✓ Generated 1 chunks
✓ Document node created: 9e26b62f-faf5-451c-ab51-ea3d4fb59256
  ✅ Batch 1: processed 1/1 chunks
✓ 1 chunks ingested successfully

[3/12] Processing file...

📄 Processing: ../../resource/graph_doc\การปรับตัวรองรับการเปลี่ยนแปลงสภาพภูมิอากาศของเกษตรกรชาวสวนยางพาราจังหวัดระยอง.txt
Generating embeddings for 11 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.02it/s]


✓ Embeddings generated successfully
Extracting section information for 11 chunks...
  Processed 5/11 chunks
  Processed 10/11 chunks
✅ Section extraction completed
✓ Generated 11 chunks
✓ Document node created: a8cf4c3e-4a1e-4d0b-af2d-0e95c0bcea39
  ✅ Batch 1: processed 5/11 chunks
  ✅ Batch 2: processed 10/11 chunks
  ✅ Batch 3: processed 11/11 chunks
✓ 11 chunks ingested successfully

[4/12] Processing file...

📄 Processing: ../../resource/graph_doc\ปัจจัยที่มีอิทธิพลต่อการปรับตัวของเกษตรกรชาวสวนยางพาราต่อการเปลี่ยนแปลงสภาพภูมิอากาสในพื้นที่ภาคใต้ตอนล่างของประเทศไทย.txt
Generating embeddings for 44 chunks...


Batches: 100%|██████████| 2/2 [00:00<00:00,  2.73it/s]


✓ Embeddings generated successfully
Extracting section information for 44 chunks...
  Processed 5/44 chunks
  Processed 10/44 chunks
  Processed 15/44 chunks
  Processed 20/44 chunks
  Processed 25/44 chunks
  Processed 30/44 chunks
  Processed 35/44 chunks
  Processed 40/44 chunks
✅ Section extraction completed
✓ Generated 44 chunks
✓ Document node created: ea750f27-b236-45f1-86fb-b1820ffd5cf5
  ✅ Batch 1: processed 5/44 chunks
  ✅ Batch 2: processed 10/44 chunks
  ✅ Batch 3: processed 15/44 chunks
  ✅ Batch 4: processed 20/44 chunks
  ✅ Batch 5: processed 25/44 chunks
  ✅ Batch 6: processed 30/44 chunks
  ✅ Batch 7: processed 35/44 chunks
  ✅ Batch 8: processed 40/44 chunks
  ✅ Batch 9: processed 44/44 chunks
✓ 44 chunks ingested successfully

[5/12] Processing file...

📄 Processing: ../../resource/graph_doc\ผลกระทบของการเปลี่ยนแปลงสภาพภูมิอากาศต่อผลผลิตยางพาราในพื้นที่เขตภาคใต้ตอนล่างของประเทศไทย.txt
Generating embeddings for 24 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


✓ Embeddings generated successfully
Extracting section information for 24 chunks...
  Processed 5/24 chunks
  Processed 10/24 chunks
  Processed 15/24 chunks
  Processed 20/24 chunks
✅ Section extraction completed
✓ Generated 24 chunks
✓ Document node created: a9ef4c13-667b-4662-8a1e-25c85c2a84f2
  ✅ Batch 1: processed 5/24 chunks
  ✅ Batch 2: processed 10/24 chunks
  ✅ Batch 3: processed 15/24 chunks
  ✅ Batch 4: processed 20/24 chunks
  ✅ Batch 5: processed 24/24 chunks
✓ 24 chunks ingested successfully

[6/12] Processing file...

📄 Processing: ../../resource/graph_doc\ผลพยากรณ์ผลผลิตมันสำปะหลังโรงงาน.txt
Generating embeddings for 1 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  9.04it/s]

✓ Embeddings generated successfully
Extracting section information for 1 chunks...





✅ Section extraction completed
✓ Generated 1 chunks
✓ Document node created: 212790eb-b95a-468e-8f5a-efe72f944415
  ✅ Batch 1: processed 1/1 chunks
✓ 1 chunks ingested successfully

[7/12] Processing file...

📄 Processing: ../../resource/graph_doc\มาตรการชะลอการเก็บเกี่ยวมันสำปะหลังปี2566_67.txt
Generating embeddings for 2 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]


✓ Embeddings generated successfully
Extracting section information for 2 chunks...
✅ Section extraction completed
✓ Generated 2 chunks
✓ Document node created: 118e0286-7521-4181-bfef-b8756063a98d
  ✅ Batch 1: processed 2/2 chunks
✓ 2 chunks ingested successfully

[8/12] Processing file...

📄 Processing: ../../resource/graph_doc\รายงานการเปลี่ยนแปลงสภาพภูมิอากาศ.txt
Generating embeddings for 56 chunks...


Batches: 100%|██████████| 2/2 [00:01<00:00,  1.23it/s]


✓ Embeddings generated successfully
Extracting section information for 56 chunks...
  Processed 5/56 chunks
  Processed 10/56 chunks
  Processed 15/56 chunks
  Processed 20/56 chunks
  Processed 25/56 chunks
  Processed 30/56 chunks
  Processed 35/56 chunks
  Processed 40/56 chunks
  Processed 45/56 chunks
  Processed 50/56 chunks
  Processed 55/56 chunks
✅ Section extraction completed
✓ Generated 56 chunks
✓ Document node created: c0a6239e-0e09-4791-af98-be0822b6a69c
  ✅ Batch 1: processed 5/56 chunks
  ✅ Batch 2: processed 10/56 chunks
  ✅ Batch 3: processed 15/56 chunks
  ✅ Batch 4: processed 20/56 chunks
  ✅ Batch 5: processed 25/56 chunks
  ✅ Batch 6: processed 30/56 chunks
  ✅ Batch 7: processed 35/56 chunks
  ✅ Batch 8: processed 40/56 chunks
  ✅ Batch 9: processed 45/56 chunks
  ✅ Batch 10: processed 50/56 chunks
  ✅ Batch 11: processed 55/56 chunks
  ✅ Batch 12: processed 56/56 chunks
✓ 56 chunks ingested successfully

[9/12] Processing file...

📄 Processing: ../../resource/gr

Batches: 100%|██████████| 1/1 [00:00<00:00,  5.25it/s]


✓ Embeddings generated successfully
Extracting section information for 1 chunks...
✅ Section extraction completed
✓ Generated 1 chunks
✓ Document node created: 4604473b-4a84-46f9-84cf-3f1e7821a43b
  ✅ Batch 1: processed 1/1 chunks
✓ 1 chunks ingested successfully

[10/12] Processing file...

📄 Processing: ../../resource/graph_doc\สถานการณ์ยางพฤษภาคม2568.txt
Generating embeddings for 11 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.94it/s]


✓ Embeddings generated successfully
Extracting section information for 11 chunks...
  Processed 5/11 chunks
  Processed 10/11 chunks
✅ Section extraction completed
✓ Generated 11 chunks
✓ Document node created: 51f5fd68-769c-4755-8f12-b4be19e132cd
  ✅ Batch 1: processed 5/11 chunks
  ✅ Batch 2: processed 10/11 chunks
  ✅ Batch 3: processed 11/11 chunks
✓ 11 chunks ingested successfully

[11/12] Processing file...

📄 Processing: ../../resource/graph_doc\แนวโน้มอุตสาหกรรมมันสำปะหลัง2568_2570.txt
Generating embeddings for 5 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.38it/s]


✓ Embeddings generated successfully
Extracting section information for 5 chunks...
  Processed 5/5 chunks
✅ Section extraction completed
✓ Generated 5 chunks
✓ Document node created: 856fc278-9805-49d7-a668-92bff035ca22
  ✅ Batch 1: processed 5/5 chunks
✓ 5 chunks ingested successfully

[12/12] Processing file...

📄 Processing: ../../resource/graph_doc\แนวโน้มอุตสาหกรรมยางพารา2568_2570.txt
Generating embeddings for 5 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.53it/s]


✓ Embeddings generated successfully
Extracting section information for 5 chunks...
  Processed 5/5 chunks
✅ Section extraction completed
✓ Generated 5 chunks
✓ Document node created: 0079b4a4-7aa5-45fb-89fd-1ea231f2487e
  ✅ Batch 1: processed 5/5 chunks
✓ 5 chunks ingested successfully

📊 Batch Processing Summary:
  Total files: 12
  Successful: 12
  Failed: 0

📈 Database Statistics:
  Total Documents: 12
  Total Chunks: 162
  Chunks with Embeddings: 162
Ready to process your files!
Update the file paths above and uncomment to use.


: 