In [3]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

In [9]:
# 1. Create documents
# Sample corpus grouped by topic: five categories with ten short passages each.
# Dict insertion order is the listing order, and it drives the running ids below.
corpus_by_category = {
    # Python Programming (10 docs)
    "python": [
        "Python is a high-level programming language",
        "Python tutorial for beginners with examples",
        "Advanced Python programming techniques and patterns",
        "Python data structures lists tuples and dictionaries",
        "Object oriented programming in Python with classes",
        "Python web development with Django and Flask",
        "Python for data science and machine learning",
        "Python scripting and automation tasks",
        "Python best practices and coding standards",
        "Python debugging and testing strategies",
    ],
    # Machine Learning (10 docs)
    "ml": [
        "Machine learning algorithms and applications",
        "Introduction to supervised learning techniques",
        "Unsupervised learning clustering and dimensionality reduction",
        "Deep learning neural networks and backpropagation",
        "Machine learning model training and evaluation",
        "Feature engineering for machine learning models",
        "Machine learning deployment in production systems",
        "Natural language processing with machine learning",
        "Computer vision and image recognition with ML",
        "Reinforcement learning and decision making",
    ],
    # RAG and Vector Databases (10 docs)
    "rag": [
        "Retrieval augmented generation for LLM applications",
        "Vector databases for semantic search",
        "Embeddings and vector representations of text",
        "HNSW algorithm for approximate nearest neighbor search",
        "Cosine similarity for document retrieval",
        "Chunking strategies for document processing",
        "Hybrid search combining dense and sparse vectors",
        "BM25 algorithm for keyword based search",
        "Pinecone Weaviate and other vector database platforms",
        "RAG evaluation metrics and benchmarking",
    ],
    # Web Development (10 docs)
    "web": [
        "JavaScript programming language fundamentals",
        "React framework for building user interfaces",
        "Node.js backend development and APIs",
        "HTML CSS and responsive web design",
        "RESTful API design principles and best practices",
        "Database design with SQL and NoSQL",
        "Web security authentication and authorization",
        "Frontend performance optimization techniques",
        "GraphQL for efficient data fetching",
        "Microservices architecture and containerization",
    ],
    # Data Science (10 docs)
    "data": [
        "Data analysis with pandas and numpy",
        "Data visualization using matplotlib and seaborn",
        "Statistical analysis and hypothesis testing",
        "Big data processing with Apache Spark",
        "Time series analysis and forecasting",
        "Data cleaning and preprocessing techniques",
        "Exploratory data analysis methods",
        "Data warehousing and ETL pipelines",
        "A/B testing and experimental design",
        "Data ethics and privacy considerations",
    ],
}

# Flatten the table into Document objects. Ids run "doc1" .. "doc50" across the
# whole corpus (not per category), and each Document records the topic key it
# was listed under as its "category".
documents = [
    Document(page_content=passage, metadata={"id": f"doc{number}", "category": label})
    for number, (label, passage) in enumerate(
        (
            (topic, text)
            for topic, texts in corpus_by_category.items()
            for text in texts
        ),
        start=1,
    )
]

In [10]:
# 2. Create BM25 retriever
def lowercase_tokens(text):
    """Return the whitespace-separated tokens of ``text``, lowercased."""
    return text.lower().split()


# BM25Retriever's default preprocessing only splits on whitespace, so matching
# is case-sensitive: a query for "python" would not hit documents that say
# "Python". Passing a lowercasing tokenizer fixes that; the retriever applies
# the same function to the documents when indexing and to every query.
retriever = BM25Retriever.from_documents(
    documents,
    preprocess_func=lowercase_tokens,
    k=2,  # Return top 2 results
)

In [11]:
# 3. Query
# Free-text keyword query; the retriever scores it against the indexed corpus.
query = "Python programming"

# `results` holds the documents the retriever returns for the query; the
# next cell iterates over them.
results = retriever.invoke(query)

In [13]:
# 4. Print results
# One three-line record per retrieved document: its text, the category tag from
# its metadata (shown under the "Source" label), and a separator.
for doc in results:
    print(
        f"Content: {doc.page_content}",
        f"Source: {doc.metadata['category']}",
        "---",
        sep="\n",
    )

Content: Python is a high-level programming language
Source: python
---
Content: Advanced Python programming techniques and patterns
Source: python
---


HYBRID