In [1]:
from pathlib import Path
import json
import sys

class VaultConfig:
    """Persist and retrieve the location of the markdown vault.

    The vault path is stored as ``{"vault_path": "<path>"}`` in a small JSON
    file so the user only has to type it once.

    Args:
        config_file: Location of the JSON config file. Defaults to
            ``vault_config.json`` in the current working directory.
    """

    def __init__(self, config_file="vault_config.json"):
        self.config_file = Path(config_file)

    def _read_saved_path(self):
        """Return the saved vault path, or ``None`` if it cannot be read.

        ``None`` is returned when the config file is missing, is not valid
        JSON, lacks the ``vault_path`` key, or stores a blank/non-string value.
        """
        if not self.config_file.exists():
            return None
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                saved = json.load(f)['vault_path']
        except (OSError, ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # TypeError covers a JSON document that is not an object.
            return None
        if isinstance(saved, str) and saved.strip():
            return saved
        return None

    def setup_vault(self):
        """Interactively choose the vault folder and save it.

        If a readable configuration already exists the user is asked whether
        to change it; answering anything that does not start with ``y`` keeps
        the saved path. Otherwise the user is prompted until they enter an
        existing folder that contains at least one ``.md`` file.

        Returns:
            The vault path as a string (either the kept or the newly entered one).
        """
        print("🔧 VAULT SETUP - One-time configuration")
        print("=" * 50)

        saved_path = self._read_saved_path()
        if saved_path is not None:
            print(f"📁 Current vault: {saved_path}")

            change = input("Change vault path? (y/n): ").strip().lower()
            if not change.startswith('y'):
                return saved_path
        elif self.config_file.exists():
            # The file is there but unusable: say so instead of crashing.
            print(f"⚠️  Could not read {self.config_file}; please configure the vault again")

        while True:
            vault_path = input("\nEnter your vault path: ").strip()

            if not vault_path:
                print("❌ Please provide a path")
                continue

            candidate = Path(vault_path)
            if not candidate.exists():
                print(f"❌ Path {vault_path} doesn't exist")
                continue

            if not candidate.is_dir():
                print(f"❌ Path {vault_path} is not a folder")
                continue

            # Require at least one markdown file anywhere below the folder.
            md_count = sum(1 for _ in candidate.rglob("*.md"))
            if md_count == 0:
                print(f"❌ No .md files found in {vault_path}")
                continue

            print(f"✅ Found {md_count} markdown files")
            break

        # Save configuration
        with open(self.config_file, 'w', encoding='utf-8') as f:
            json.dump({"vault_path": vault_path}, f)

        print("✅ Vault configured and saved!")
        return vault_path

    def get_vault_path(self):
        """Return the configured vault path, running setup when none is usable.

        Returns:
            The saved vault path, or the result of :meth:`setup_vault` when the
            config file is missing or unreadable.
        """
        saved_path = self._read_saved_path()
        if saved_path is None:
            return self.setup_vault()
        return saved_path

if __name__ == "__main__":
    # Run the interactive vault setup when this cell/script is executed directly.
    config = VaultConfig()
    # setup_vault() prompts on stdin and returns the path that was kept or newly entered.
    vault_path = config.setup_vault()
    print(f"Vault ready at: {vault_path}")

🔧 VAULT SETUP - One-time configuration
✅ Found 227 markdown files
✅ Vault configured and saved!
Vault ready at: D:\LOST.DIR\Obsidian Vault


In [4]:
from pathlib import Path
import re
import json
import difflib
from typing import List, Dict
import sys

# Try to import sentence-transformers, fallback if not available
try:
    from sentence_transformers import SentenceTransformer
    import numpy as np
    SEMANTIC_SEARCH_AVAILABLE = True
except ImportError:
    SEMANTIC_SEARCH_AVAILABLE = False
    print("⚠️  sentence-transformers not installed. Semantic search disabled.")
    print("   Install with: pip install sentence-transformers")

from vault_config import VaultConfig

class SmartDocumentFinder:
    def __init__(self, vault_path=None):
        """Load the vault into memory and prepare keyword/semantic search.

        Args:
            vault_path: Optional vault folder. When omitted, the path is taken
                from ``VaultConfig.get_vault_path()``, which may prompt the
                user if no configuration has been saved yet.
        """
        # Get vault path from config unless the caller supplied one
        self.config = VaultConfig()
        if vault_path is None:
            vault_path = self.config.get_vault_path()
        self.vault_path = Path(vault_path)

        print(f"📁 Loading vault: {self.vault_path}")
        if not self.vault_path.is_dir():
            # rglob() on a missing folder yields nothing, which would otherwise
            # look like an empty vault with no explanation.
            print(f"⚠️  Vault folder not found: {self.vault_path}")

        # Load all files once
        self.md_files = list(self.vault_path.rglob("*.md"))
        self.file_contents = {}
        self.common_words = set()

        # Define search attributes unconditionally so they exist whether or
        # not sentence-transformers is installed.
        self.model = None
        self.chunks_cache = {}  # Cache chunks to avoid reprocessing

        self._load_all_files()
        self._build_vocabulary()

        # Initialize semantic search if available
        if SEMANTIC_SEARCH_AVAILABLE:
            print("🤖 Loading AI model for semantic search...")
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
    
    def _load_all_files(self):
        """Load all markdown files into memory"""
        print(f"📥 Loading {len(self.md_files)} markdown files...")
        
        processed = 0
        for file_path in self.md_files:
            processed += 1
            if processed % 100 == 0:
                print(f"   Loaded {processed}/{len(self.md_files)} files...")
                
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    if len(content.strip()) > 50:  # Skip very small files
                        self.file_contents[str(file_path)] = content
            except Exception as e:
                print(f"⚠️  Error loading {file_path}: {e}")
        
        print(f"✅ Loaded {len(self.file_contents)} files successfully")
    
    def _build_vocabulary(self):
        """Build vocabulary for spell checking"""
        print("🔧 Building vocabulary for spell checking...")
        
        # Sample files for vocabulary (avoid processing all for large vaults)
        sample_files = list(self.file_contents.items())[:50]
        
        for file_path, content in sample_files:
            words = re.findall(r'\b[a-zA-Z]{3,}\b', content.lower())
            self.common_words.update(words)
        
        print(f"✅ Built vocabulary with {len(self.common_words)} words")
    
    def _chunk_document(self, content: str, max_chunk_size: int = 2000) -> List[str]:
        """Chunk large documents for better semantic search"""
        if len(content) <= max_chunk_size:
            return [content]
        
        # Try to split by paragraphs first
        paragraphs = content.split('\n\n')
        chunks = []
        current_chunk = ""
        
        for paragraph in paragraphs:
            # If adding this paragraph would exceed limit, save current chunk
            if len(current_chunk) + len(paragraph) > max_chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = paragraph
            else:
                current_chunk += "\n\n" + paragraph if current_chunk else paragraph
        
        # Add final chunk
        if current_chunk:
            chunks.append(current_chunk.strip())
        
        return chunks
    
    def spell_check_keyword(self, keyword):
        """Try to correct spelling"""
        if not self.common_words:
            return keyword
            
        suggestions = difflib.get_close_matches(
            keyword.lower(), 
            self.common_words, 
            n=3, 
            cutoff=0.6
        )
        
        if suggestions and suggestions[0] != keyword.lower():
            print(f"🔤 Did you mean: {', '.join(suggestions[:3])}?")
            return suggestions[0]
        
        return keyword
    
    def keyword_search(self, keyword: str) -> List[Dict]:
        """Traditional keyword search"""
        print(f"🔍 Keyword search for: '{keyword}'")
        
        matching_files = []
        keyword_lower = keyword.lower()
        
        for file_path, content in self.file_contents.items():
            if keyword_lower in content.lower():
                context = self._get_keyword_context(content, keyword)
                matching_files.append({
                    'file_path': file_path,
                    'file_name': Path(file_path).name,
                    'folder': str(Path(file_path).parent),
                    'context': context,
                    'file_size': len(content),
                    'search_type': 'keyword',
                    'relevance_score': 1.0
                })
        
        return matching_files
    
    def semantic_search(self, query: str, top_k: int = 10) -> List[Dict]:
        """Semantic similarity search using AI"""
        if not SEMANTIC_SEARCH_AVAILABLE or not self.model:
            print("❌ Semantic search not available. Install sentence-transformers.")
            return []
        
        print(f"🧠 Semantic search for: '{query}'")
        print("   Processing document chunks...")
        
        # Create corpus of chunks
        all_chunks = []
        chunk_info = []
        
        for file_path, content in self.file_contents.items():
            chunks = self._chunk_document(content)
            
            for i, chunk in enumerate(chunks):
                if len(chunk.strip()) > 100:  # Skip very short chunks
                    all_chunks.append(chunk)
                    chunk_info.append({
                        'file_path': file_path,
                        'chunk_index': i,
                        'chunk_content': chunk
                    })
        
        if not all_chunks:
            print("❌ No content chunks found")
            return []
        
        print(f"   Encoding {len(all_chunks)} chunks with AI model...")
        
        # Get embeddings
        try:
            query_embedding = self.model.encode([query])[0]
            chunk_embeddings = self.model.encode(all_chunks, show_progress_bar=True)
        except Exception as e:
            print(f"❌ Error during encoding: {e}")
            return []
        
        # Calculate similarities
        similarities = []
        for i, chunk_emb in enumerate(chunk_embeddings):
            similarity = np.dot(query_embedding, chunk_emb) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(chunk_emb)
            )
            similarities.append((similarity, chunk_info[i]))
        
        # Sort by similarity
        similarities.sort(key=lambda x: x[0], reverse=True)
        
        # Group by file and get top results
        file_results = {}
        for similarity, info in similarities:
            file_path = info['file_path']
            
            if file_path not in file_results:
                file_results[file_path] = {
                    'file_path': file_path,
                    'file_name': Path(file_path).name,
                    'folder': str(Path(file_path).parent),
                    'context': info['chunk_content'][:500] + "...",
                    'file_size': len(self.file_contents[file_path]),
                    'search_type': 'semantic',
                    'relevance_score': similarity,
                    'best_chunk': info['chunk_content']
                }
            elif similarity > file_results[file_path]['relevance_score']:
                # Update with better matching chunk
                file_results[file_path]['relevance_score'] = similarity
                file_results[file_path]['context'] = info['chunk_content'][:500] + "..."
                file_results[file_path]['best_chunk'] = info['chunk_content']
        
        # Return top results
        results = list(file_results.values())
        results.sort(key=lambda x: x['relevance_score'], reverse=True)
        
        return results[:top_k]
    
    def smart_search(self, query: str, auto_correct: bool = True) -> List[Dict]:
        """Automatically choose between keyword and semantic search"""
        # Clean up query
        query = query.strip()
        
        if not query:
            return []
        
        # Decide search type based on query length
        word_count = len(query.split())
        
        if word_count <= 3:
            # Short queries: use keyword search
            print(f"🎯 Using KEYWORD search ({word_count} words)")
            results = self.keyword_search(query)
            
            # Try spell correction if no results
            if not results and auto_correct:
                corrected = self.spell_check_keyword(query)
                if corrected != query.lower():
                    print(f"🔄 Trying corrected spelling: '{corrected}'")
                    results = self.keyword_search(corrected)
        else:
            # Long queries: use semantic search
            print(f"🎯 Using SEMANTIC search ({word_count} words)")
            results = self.semantic_search(query)
            
            # Fallback to keyword search if semantic fails
            if not results:
                print("🔄 Falling back to keyword search...")
                # Extract key terms from query
                key_terms = [word for word in query.split() if len(word) > 3]
                if key_terms:
                    results = self.keyword_search(" ".join(key_terms[:2]))
        
        return results
    
    def _get_keyword_context(self, content: str, keyword: str) -> str:
        """Get context around keyword"""
        lines = content.split('\n')
        context_lines = []
        
        for i, line in enumerate(lines):
            if keyword.lower() in line.lower():
                start = max(0, i-1)
                end = min(len(lines), i+2)
                context = '\n'.join(lines[start:end])
                context_lines.append(context)
                if len(context_lines) >= 2:
                    break
        
        return '\n...\n'.join(context_lines) if context_lines else "No context found"
    
    def display_results(self, results: List[Dict]):
        """Print a numbered summary of the search results.

        For each result the file name, folder, size and search type are
        shown, followed by the relevance score (semantic results only) and
        the first three lines of the context, each cut to 100 characters.
        """
        if not results:
            print("❌ No matching documents found")
            return

        print(f"\n📋 FOUND {len(results)} RELEVANT DOCUMENTS:")
        print("=" * 80)

        for rank, entry in enumerate(results, 1):
            print(f"\n{rank}. {entry['file_name']}")
            print(f"   📂 Folder: {entry['folder']}")
            print(f"   📏 Size: {entry['file_size']:,} chars")
            print(f"   🔍 Search: {entry['search_type'].upper()}")

            # Keyword hits all share the same score, so only semantic
            # results display one.
            if entry['search_type'] == 'semantic':
                print(f"   📈 Relevance: {entry['relevance_score']:.3f}")

            print("   📝 Preview:")
            for preview_line in entry['context'].split('\n')[:3]:
                suffix = '...' if len(preview_line) > 100 else ''
                print(f"      {preview_line[:100]}{suffix}")
    
    def select_and_view_files(self, results: List[Dict]):
        """Ask which results to open and print the selected files.

        The selection accepts comma-separated numbers and ranges (``1,3,5``,
        ``1-3``), ``all``, or ``quit``/``exit``/``q``. Numbers outside the
        result list are ignored and a document chosen more than once is shown
        only once, in the order it was first selected. Each file is re-read
        from disk before it is printed.

        Args:
            results: Result dicts as produced by the search methods.

        Raises:
            SystemExit: When the user enters ``quit``, ``exit`` or ``q``.
        """
        if not results:
            return

        def parse_selection(choice):
            """Convert a selection string to valid, unique 0-based indices.

            Raises ValueError when a part is not a number or a ``start-end``
            range.
            """
            indices = []
            for part in choice.split(','):
                part = part.strip()
                if '-' in part:
                    start, end = map(int, part.split('-'))
                    # Clamp to the result list so a huge upper bound does not
                    # build a huge intermediate sequence.
                    candidates = range(max(start - 1, 0), min(end, len(results)))
                else:
                    candidates = [int(part) - 1]

                for index in candidates:
                    if 0 <= index < len(results) and index not in indices:
                        indices.append(index)
            return indices

        print("\n🎯 SELECT DOCUMENTS (Multiple Selection Supported):")
        print("Examples: 1,3,5 | 1-3 | all | quit")

        while True:
            choice = input(f"\nYour selection (1-{len(results)}): ").strip()

            if not choice:
                print("❌ Please make a selection")
                continue

            if choice.lower() in ['quit', 'exit', 'q']:
                print("👋 Exiting...")
                sys.exit(0)

            if choice.lower() == 'all':
                selected = results
                break

            try:
                valid_indices = parse_selection(choice)
            except ValueError:
                print("❌ Invalid format")
                continue

            if not valid_indices:
                print("❌ No valid selections")
                continue

            selected = [results[i] for i in valid_indices]
            break

        # View selected files
        print(f"\n📚 VIEWING {len(selected)} SELECTED DOCUMENTS:")
        print("=" * 100)

        for i, result in enumerate(selected, 1):
            print(f"\n📖 DOCUMENT {i}/{len(selected)}: {result['file_name']}")
            print(f"📂 Path: {result['file_path']}")
            print(f"🔍 Found via: {result['search_type'].upper()} search")
            print("-" * 100)

            try:
                with open(result['file_path'], 'r', encoding='utf-8') as f:
                    print(f.read())
            except Exception as e:
                print(f"❌ Error reading file: {e}")

            print("-" * 100)

            # Pause between documents (not after the last one).
            if i < len(selected):
                user_input = input("\nPress Enter to view next document or 'quit' to exit: ").strip().lower()
                if user_input in ['quit', 'exit', 'q']:
                    print("👋 Exiting...")
                    sys.exit(0)
    
    def interactive_search(self):
        """Run the prompt loop: read a query, search, show and open results.

        The banner is printed before every prompt. Entering ``quit``,
        ``exit`` or ``q`` as the query ends the loop.
        """
        divider = "=" * 80
        exit_words = ['quit', 'exit', 'q']

        while True:
            semantic_status = (
                "🧠 AI semantic search: ENABLED"
                if SEMANTIC_SEARCH_AVAILABLE
                else "🧠 AI semantic search: DISABLED (install sentence-transformers)"
            )

            print("\n" + divider)
            print("🤖 SMART DOCUMENT FINDER")
            print("✨ Features: Auto keyword/semantic search • Multi-select • Spell check")
            print(f"📁 Vault: {len(self.file_contents)} documents loaded")
            print(semantic_status)
            print(divider)

            query = input("\n💭 Enter your search query: ").strip()

            if not query:
                print("❌ Please enter a query")
                continue

            if query.lower() in exit_words:
                print("👋 Goodbye!")
                break

            # Perform smart search
            results = self.smart_search(query)

            if results:
                # Display and handle selection
                self.display_results(results)
                self.select_and_view_files(results)
            else:
                print("❌ No documents found. Try a different query.")

if __name__ == "__main__":
    # Build the finder (which loads the vault) and start the interactive prompt loop.
    try:
        finder = SmartDocumentFinder()
        finder.interactive_search()
    except KeyboardInterrupt:
        # Ctrl+C / kernel interrupt: print a short farewell instead of a traceback.
        print("\n👋 Program interrupted. Goodbye!")
    except Exception as e:
        # Any other error is reduced to a one-line message; no traceback is shown.
        # SystemExit (raised by the sys.exit() calls in select_and_view_files) is not
        # an Exception subclass, so it is not caught here.
        print(f"❌ Error: {e}")


📁 Loading vault: D:\LOST.DIR\Obsidian Vault
📥 Loading 227 markdown files...
   Loaded 100/227 files...
   Loaded 200/227 files...
✅ Loaded 227 files successfully
🔧 Building vocabulary for spell checking...
✅ Built vocabulary with 2984 words
🤖 Loading AI model for semantic search...

🤖 SMART DOCUMENT FINDER
✨ Features: Auto keyword/semantic search • Multi-select • Spell check
📁 Vault: 227 documents loaded
🧠 AI semantic search: ENABLED
🎯 Using SEMANTIC search (14 words)
🧠 Semantic search for: 'model context protocol is a smart way to add external database to your llm'
   Processing document chunks...
   Encoding 487 chunks with AI model...


Batches: 100%|██████████| 16/16 [00:34<00:00,  2.17s/it]


📋 FOUND 10 RELEVANT DOCUMENTS:

1. 1. Intro.md
   📂 Folder: D:\LOST.DIR\Obsidian Vault\4. MCP
   📏 Size: 1,756 chars
   🔍 Search: SEMANTIC
   📈 Relevance: 0.541
   📝 Preview:
      ---
      title: 1. Intro
      updated: 2025-07-15 06:10:31Z

2. Intro.md
   📂 Folder: D:\LOST.DIR\Obsidian Vault\16. HuggingFace\MCP
   📏 Size: 1,865 chars
   🔍 Search: SEMANTIC
   📈 Relevance: 0.526
   📝 Preview:
      ---
      title: Intro
      updated: 2025-07-26 16:49:21Z

3. Things left to do in RAG.md
   📂 Folder: D:\LOST.DIR\Obsidian Vault\11. RAG
   📏 Size: 3,629 chars
   🔍 Search: SEMANTIC
   📈 Relevance: 0.402
   📝 Preview:
      This is similar to **RAG-Fusion**, but with **model-driven feedback loops**.
      
      ### Key Ideas:

4. Revision 15-7-25.md
   📂 Folder: D:\LOST.DIR\Obsidian Vault\4. MCP
   📏 Size: 2,140 chars
   🔍 Search: SEMANTIC
   📈 Relevance: 0.390
   📝 Preview:
      ---
      title: Revision 15-7-25
      updated: 2025-07-15 07:32:41Z

5. In-Context Learning, how it works





📚 VIEWING 2 SELECTED DOCUMENTS:

📖 DOCUMENT 1/2: 1. Intro.md
📂 Path: D:\LOST.DIR\Obsidian Vault\4. MCP\1. Intro.md
🔍 Found via: SEMANTIC search
----------------------------------------------------------------------------------------------------
---
title: 1. Intro
updated: 2025-07-15 06:10:31Z
created: 2025-07-14 07:49:58Z
---

MCP is a framework through which developers can embedded context and meaning to the model without explicitly mentioning what tool and which model to use. The MCP server takes care of that. You just declare what needs to happen instead of coding manually.

MCP (Model Context Protocol) is a framework that lets developers describe what context (like tools, documents, memory, or functions) a model should use—without manually coding how to connect them. Instead of wiring everything together, you just declare what you need, and the system sets it up so the model can use it intelligently.

MCP lets you describe what a model should have access to—like tools or memory—w