Creating Knowledge Base
- https://docs.langchain.com/oss/python/langchain/knowledge-base

# Import Libraries

In [1]:
import os
import sys

from dotenv import load_dotenv
from pathlib import Path

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus import Milvus

from functools import lru_cache
from pymilvus import connections, utility

  from .autonotebook import tqdm as notebook_tqdm


# Setting Up Location

In [2]:
# Locate the project root and make it importable.
try:
    # __file__ is only defined when this code runs as a script/module.
    current_file = Path(__file__).resolve()

    # The project root is the directory one level above this file's folder.
    PROJECT_DIR = current_file.parents[1]

except NameError:
    # Interactive kernels do not define __file__; fall back to the parent of
    # the current working directory. Only NameError is caught so that any
    # other failure (or Ctrl-C) is not silently swallowed.
    PROJECT_DIR = Path.cwd().parent

# Insert at the front so project modules win over same-named installed ones;
# sys.path entries must be str, so do not append the Path object itself.
sys.path.insert(0, str(PROJECT_DIR))
print(PROJECT_DIR)

/Users/abhishek/Desktop/This PC/D drive/After BU/IndicConversations


In [3]:
# Load variables from a .env file into the process environment
# (HF_TOKEN is read from os.environ further down).
load_dotenv()

True

In [4]:
from huggingface_hub import login

# Login - Required for Open Embedding Model
# The token comes from the HF_TOKEN environment variable rather than being
# hardcoded; if the variable is missing, os.environ[...] raises KeyError here.
login(token=os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Document Loading

In [5]:
# Read the text file
# The encoding is given explicitly: the article text contains non-ASCII
# characters (e.g. "§"), and the platform default encoding is not guaranteed
# to be UTF-8.
filename = f"{PROJECT_DIR}/wikipedia_pages/Python_(programming_language)_wikipedia.txt"
with open(filename, "r", encoding="utf-8") as source_file:
    content = source_file.read()

# Wrap the raw text in a single Document; the splitter copies this metadata
# onto every chunk it produces.
# NOTE(review): "https://example.com" looks like a placeholder source URL —
# confirm whether the real page URL should be recorded instead.
document = Document(
    page_content=content, metadata={"source": "https://example.com"}
)

# Chunking

In [None]:
# Splitter configuration. Sizes are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500, 
    chunk_overlap=200, 
    add_start_index=True, # record each chunk's offset in the original document as metadata
    separators=["\n\n==", "\n\n", "\n", ". ", " ", ""],  # tried in order: section headers first, then paragraphs, lines, sentences, words
    length_function=len,
)

In [7]:
# Split the single loaded document into chunks with the splitter configured above.
all_splits = text_splitter.split_documents([document])

In [8]:
# Spot-check one chunk (index 3) together with its metadata.
all_splits[3]

Document(metadata={'source': 'https://example.com', 'start_index': 2244}, page_content='The name Python derives from the British comedy series Monty Python\'s Flying Circus. (See § Naming.)\nPython 2.0 was released on 16 October 2000, featuring many new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 2.7\'s end-of-life was initially set for 2015, and then postponed to 2020 out of concern that a large body of existing code could not easily be forward-ported to Python 3. It no longer receives security patches or updates. While Python 2.7 and older versions are officially unsupported, a different unofficial Python implementation, PyPy, continues to support Python 2, i.e., "2.7.18+" (plus 3.11), with the plus signifying (at least some) "backported security updates".\nPython 3.0 was released on 3 December 2008, and was a major revision and not completely backward-compatible with earlier versions, with some new semanti

In [10]:
def merge_small_documents_with_metadata(docs, min_size=150):
    """Merge undersized documents into their neighbours, keeping metadata.

    A document whose stripped text is shorter than ``min_size`` characters is
    joined (separated by a blank line) with the documents that follow it until
    the combined text reaches ``min_size``. If the input runs out before that
    happens, the leftover text is appended to the previously emitted document
    instead, so no undersized chunk is left dangling at the end.

    Args:
        docs: Sequence of objects exposing ``page_content`` (str) and
            ``metadata`` (dict). The inputs are not mutated.
        min_size: Minimum stripped length, in characters, that a chunk must
            have to be kept on its own.

    Returns:
        A new list. Documents that are already large enough are passed through
        as the same objects. Merged documents are new ``Document`` instances
        carrying a copy of the metadata of the first document in the merged
        run (or of the previous document when a trailing small chunk is
        folded backwards).
    """
    merged = []
    i = 0
    total = len(docs)

    while i < total:
        current_doc = docs[i]
        i += 1

        content = current_doc.page_content.strip()
        if len(content) >= min_size:
            merged.append(current_doc)
            continue

        # Undersized: keep absorbing following chunks until the combined text
        # is large enough. A single pairwise merge could still be too small
        # when several tiny chunks occur in a row.
        absorbed = 0
        while len(content.strip()) < min_size and i < total:
            content = content.strip() + "\n\n" + docs[i].page_content
            i += 1
            absorbed += 1

        still_small = len(content.strip()) < min_size

        if still_small and merged:
            # Ran out of following chunks: fold the remainder into the
            # previous result. A new Document is built so that a passed-through
            # input object is never modified in place.
            previous = merged.pop()
            merged.append(
                Document(
                    page_content=previous.page_content + "\n\n" + content,
                    metadata=previous.metadata.copy(),
                )
            )
        elif absorbed:
            # The metadata is simply copied from the first document of the run.
            merged.append(
                Document(
                    page_content=content,
                    metadata=current_doc.metadata.copy(),
                )
            )
        else:
            # Lone undersized document with nothing to merge with.
            merged.append(current_doc)

    return merged

In [11]:
# Merge chunks whose stripped text is shorter than 150 characters with an adjacent chunk.
chunks = merge_small_documents_with_metadata(all_splits, min_size=150)

In [12]:
# Compare chunk counts before and after merging undersized chunks.
print("Before Merging", len(all_splits))
print("After Merging", len(chunks))

Before Merging 43
After Merging 40


# Embedding Model

In [13]:
# Embedding model configuration.
# NOTE(review): the same settings are repeated inside get_embeddings() further
# down; keep the two in sync or drop one of them.
model_name = "sentence-transformers/all-mpnet-base-v2"   # alternative: "BAAI/bge-m3"
model_kwargs = {"device": "cpu"}   # or "cuda"
encode_kwargs = {"normalize_embeddings": True}   # ask the encoder for normalized vectors

embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)


In [14]:
# Test if it works: embed a short string and report the vector length.
test = embeddings.embed_query("hello world")
print(f"Success! Embedding dimension: {len(test)}")

Success! Embedding dimension: 768


In [5]:
# Initialize once
@lru_cache(maxsize=1)
def get_embeddings():
    """Return the shared HuggingFace embedding model.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    model object is constructed on the first call and the same instance is
    handed back on every later call.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",  # alternative: "BAAI/bge-m3"
        model_kwargs={"device": "cpu"},  # or "cuda"
        encode_kwargs={"normalize_embeddings": True},
    )

In [6]:
# The first call builds the model; lru_cache returns the same instance afterwards.
get_embeddings()

HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [7]:
# Use everywhere without reloading
def create_load_vector_store(COLLECTION_NAME, URI):
    """Open the Milvus collection if it exists, otherwise set up a new one.

    Args:
        COLLECTION_NAME: Name of the Milvus collection to load or create.
        URI: Connection URI handed to the Milvus wrapper via ``connection_args``.

    Returns:
        A ``Milvus`` vector store bound to the cached embedding model. For a
        collection that does not exist yet, FLAT / L2 index parameters are
        supplied as well.

    NOTE(review): ``utility.has_collection`` is called without an explicit
    connection alias, so this presumably relies on a pymilvus connection
    having been opened beforehand (e.g. ``connections.connect``) — confirm.
    """
    # Arguments common to both the "load" and the "create" path.
    store_kwargs = {
        "embedding_function": get_embeddings(),  # cached, not reloaded
        "collection_name": COLLECTION_NAME,
        "connection_args": {"uri": URI},
    }

    if utility.has_collection(COLLECTION_NAME):
        print(f"Collection '{COLLECTION_NAME}' exists. Loading...")
    else:
        # Only a brand-new collection gets explicit index parameters.
        store_kwargs["index_params"] = {"index_type": "FLAT", "metric_type": "L2"}

    return Milvus(**store_kwargs)

# Ingest Chunks Into Vector Database

In [9]:
# Load / Initialize Vector Store

# NOTE(review): this relative URI resolves against the current working
# directory, whereas a later cell uses f"{PROJECT_DIR}/db/milvus_example.db" —
# confirm which database file is the intended one.
COLLECTION_NAME = "wikipedia_docs"
URI = "./milvus_example.db"   # alternative (server): "http://localhost:19530"

# NOTE(review): create_load_vector_store calls utility.has_collection, and no
# connections.connect(...) appears before this cell — verify that this runs on
# a fresh kernel.
vector_store = create_load_vector_store(COLLECTION_NAME, URI)

  from pkg_resources import DistributionNotFound, get_distribution


Collection 'wikipedia_docs' exists. Loading...


In [None]:
# Add chunks to Vector DB; the call returns the ids of the inserted entries.
# NOTE(review): nothing here guards against re-execution — presumably running
# this cell again inserts the same chunks a second time; confirm.
vector_store.add_documents(chunks)



[463881024414154752,
 463881024414154753,
 463881024414154754,
 463881024414154755,
 463881024414154756,
 463881024414154757,
 463881024414154758,
 463881024414154759,
 463881024414154760,
 463881024414154761,
 463881024414154762,
 463881024414154763,
 463881024414154764,
 463881024414154765,
 463881024414154766,
 463881024414154767,
 463881024414154768,
 463881024414154769,
 463881024414154770,
 463881024414154771,
 463881024414154772,
 463881024414154773,
 463881024414154774,
 463881024414154775,
 463881024414154776,
 463881024414154777,
 463881024414154778,
 463881024414154779,
 463881024414154780,
 463881024414154781,
 463881024414154782,
 463881024414154783,
 463881024414154784,
 463881024414154785,
 463881024414154786,
 463881024414154787,
 463881024414154788,
 463881024414154789,
 463881024414154790,
 463881024414154791]

# Inspect Vector Store

In [11]:
from pymilvus import connections, Collection

In [None]:
# from pymilvus import connections, Collection

# def inspect_vector_store(COLLECTION_NAME, URI):

#     # Check if collection exists
#     if utility.has_collection(COLLECTION_NAME):

#         print(f"Collection '{COLLECTION_NAME}' exists.")

#         collection = Collection(COLLECTION_NAME)

#         # Inspect
#         print("Number of vectors:", collection.num_entities)
#         print("Collection schema:", collection.schema)

#     return collection

In [None]:
# COLLECTION_NAME = "wikipedia_docs"
# URI = "./milvus_example.db"

# collection = inspect_vector_store(COLLECTION_NAME, URI)

  from pkg_resources import DistributionNotFound, get_distribution


NameError: name 'utility' is not defined

In [8]:
# Show the resolved project root that the database path below is built from.
PROJECT_DIR

PosixPath('/Users/abhishek/Desktop/This PC/D drive/After BU/IndicConversations')

In [10]:
# Rebinds COLLECTION_NAME, URI and vector_store. Unlike the ingestion cell,
# the database file is addressed under PROJECT_DIR/db.
COLLECTION_NAME = "wikipedia_docs"
URI = f"{PROJECT_DIR}/db/milvus_example.db"

# Connect to Milvus before create_load_vector_store checks for the collection
# through pymilvus' utility module.
connections.connect(uri=URI)

vector_store = create_load_vector_store(COLLECTION_NAME, URI)

# collection = inspect_vector_store(COLLECTION_NAME, URI)

Collection 'wikipedia_docs' exists. Loading...


# Vector Search

In [14]:

# Example questions. Previously each was assigned to `query` in turn, so only
# the last assignment had any effect; the selection is now explicit.
example_queries = (
    "What is Python Language?",
    "How many packages does PyPi contains?",
    "How Rome was built",
)
query = example_queries[-1]

# Fetch the 10 most similar chunks for the selected query.
retrieved_docs = vector_store.similarity_search(query, k=10)

# Print each hit's text and source, followed by a visual separator.
for doc in retrieved_docs:
    print(doc.page_content)
    print(doc.metadata["source"])
    print("*-"*20)

== Legacy ==

The Roman Empire was not only a political unity enforced by the use of military power; it was also the combined and elaborated civilization of the Mediterranean Basin and beyond. It included manufacture, trade, and architecture, widespread secular literacy, written law, and an international language of science and literature. The Western barbarians lost much of these higher cultural practices, but their redevelopment in the Middle Ages by polities aware of the Roman achievement formed the basis for the later development of Europe.
Observing the cultural and archaeological continuities through and beyond the period of lost political control, the process has been described as a complex cultural transformation, rather than a fall.


== See also ==

Succession of the Roman Empire
Comparative studies of the Roman and Han empires
Decline of the Byzantine Empire (Fall of the Eastern Roman Empire)
Historiography of the fall of the Western Roman Empire
Last of the Romans
Late Roma

In [31]:
# Inspect merged chunk 0 (text and metadata).
chunks[0]



Document(metadata={'source': 'https://example.com', 'start_index': 0}, page_content="Python (programming language)\n\nPython is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming.\nGuido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language. Python 3.0, released in 2008, was a major revision and not completely backward-compatible with earlier versions. Beginning with Python 3.5, capabilities and keywords for typing were added to the language, allowing optional static typing. As of 2026, the Python Software Foundation supports Python 3.10, 3.11, 3.12, 3.13, and 3.14, following the project's annual release cycle and five-year support policy. Earlier versions in t

In [32]:
# Inspect merged chunk 1 (text and metadata).
chunks[1]

Document(metadata={'source': 'https://example.com', 'start_index': 1312}, page_content='== History ==\n\nPython was conceived in the late 1980s by Guido van Rossum at Centrum Wiskunde & Informatica (CWI) in the Netherlands. It was designed as a successor to the ABC programming language, which was inspired by SETL, capable of exception handling and interfacing with the Amoeba operating system. Python implementation began in December 1989. Van Rossum first released it in 1991 as Python 0.9.0. Van Rossum assumed sole responsibility for the project, as the lead developer, until 12 July 2018, when he announced his "permanent vacation" from responsibilities as Python\'s "benevolent dictator for life" (BDFL); this title was bestowed on him by the Python community to reflect his long-term commitment as the project\'s chief decision-maker. (He has since come out of retirement and is self-titled "BDFL-emeritus".) In January 2019, active Python core developers elected a five-member Steering Counc

In [33]:
# Inspect merged chunk 2 (text and metadata).
chunks[2]

Document(metadata={'source': 'https://example.com', 'start_index': 2244}, page_content='The name Python derives from the British comedy series Monty Python\'s Flying Circus. (See § Naming.)\nPython 2.0 was released on 16 October 2000, featuring many new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 2.7\'s end-of-life was initially set for 2015, and then postponed to 2020 out of concern that a large body of existing code could not easily be forward-ported to Python 3. It no longer receives security patches or updates. While Python 2.7 and older versions are officially unsupported, a different unofficial Python implementation, PyPy, continues to support Python 2, i.e., "2.7.18+" (plus 3.11), with the plus signifying (at least some) "backported security updates".\nPython 3.0 was released on 3 December 2008, and was a major revision and not completely backward-compatible with earlier versions, with some new semanti

In [34]:
# Inspect merged chunk 3 (text and metadata).
chunks[3]

Document(metadata={'source': 'https://example.com', 'start_index': 3393}, page_content='As of January 2026, Python 3.14.2 is the latest stable release. All older 3.x versions had a security update down to Python 3.9.24 then again with 3.9.25, the final version in 3.9 series. Python 3.10 is, since November 2025, the oldest supported branch. Python 3.15 has an alpha released, and Android has an official downloadable executable available for Python 3.14. Releases receive two years of full support followed by three years of security support.')

In [35]:
# Inspect merged chunk 4 (text and metadata).
chunks[4]

Document(metadata={'source': 'https://example.com', 'start_index': 3851}, page_content='== Design philosophy and features ==\nPython is a multi-paradigm programming language. Object-oriented programming and structured programming are fully supported, and many of their features support functional programming and aspect-oriented programming – including metaprogramming and metaobjects. Many other paradigms are supported via extensions, including design by contract and logic programming. Python is often referred to as a \'glue language\' because it is purposely designed to be able to integrate components written in other languages.\nPython uses dynamic typing and a combination of reference counting and a cycle-detecting garbage collector for memory management. It uses dynamic name resolution (late binding), which binds method and variable names during program execution.\nPython\'s design offers some support for functional programming in the "Lisp tradition". It has filter, map, and reduce 

In [36]:
# Inspect merged chunk 5 (text and metadata).
chunks[5]

Document(metadata={'source': 'https://example.com', 'start_index': 5103}, page_content="Explicit is better than implicit.\nSimple is better than complex.\nReadability counts.\nSpecial cases aren't special enough to break the rules.\nAlthough practicality beats purity,  errors should never pass silently, unless explicitly silenced.\nThere should be one-- and preferably only one --obvious way to do it.\nHowever, Python has received criticism for violating these principles and adding unnecessary language bloat. Responses to these criticisms note that the Zen of Python is a guideline rather than a rule. The addition of some new features had been controversial: Guido van Rossum resigned as Benevolent Dictator for Life after conflict about adding the assignment expression operator in Python 3.8.\nNevertheless, rather than building all functionality into its core, Python was designed to be highly extensible via modules. This compact modularity has made it particularly popular as a means of ad