intro to data ingestion and parsing

In [65]:
import os
from typing import List, Dict, Any
import pandas as pd



In [66]:
from langchain_core.documents import Document   
from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    MarkdownHeaderTextSplitter,
    MarkdownTextSplitter,
    TextSplitter,
    TokenTextSplitter
)
print("Text splitters imported successfully.")
print("setup comompleted successfully.")

Text splitters imported successfully.
setup comompleted successfully.


### Understand document structure in langchain

In [67]:
### crete a simple document
doc = Document(
    page_content="This is a sample document. It contains multiple sentences. "
                 "The purpose is to demonstrate text splitting. "
                    "Each sentence will be treated as a separate chunk.",
    metadata={
        "source": "sample_document.txt",
        "date": "2025-08-16",
        "page_number": 1,
        "author": "sumanth anem",
        "custom_field":"any_value"
    }
)
print("Document Structure.")
print (f"content: {doc.page_content}")
print (f"metadata: {doc.metadata}")

Document Structure.
content: This is a sample document. It contains multiple sentences. The purpose is to demonstrate text splitting. Each sentence will be treated as a separate chunk.
metadata: {'source': 'sample_document.txt', 'date': '2025-08-16', 'page_number': 1, 'author': 'sumanth anem', 'custom_field': 'any_value'}


###Textfiles (.txt) reading and creating textfiles


In [82]:
import os
os.makedirs("data/text_files", exist_ok=True)
os.makedirs("data/pdf", exist_ok=True)
os.makedirs("data/word_files", exist_ok=True)

In [69]:
sample_text = {
    "data/text_files/mcp.txt":"""
Letâ€™s talk about MCP server â€” but just to clarify, "MCP" can mean di    fferent things depending on context (networking, Splunk, databases, gaming, etc.).
Iâ€™ll give you a breakdown of the most common meanings, and you can tell me which one youâ€™re after:

1. MCP in Splunk / Observability Context

MCP (Mission Control Platform) is sometimes used to describe Splunkâ€™s centralized server/service orchestration in SOAR or Observability products.

It acts like a control plane â€” managing data flows, orchestration of tasks, and communication between distributed components.

Think of it as the "brain" that schedules and directs activity across different Splunk instances.

ðŸ”¹ 2. MCP in Networking (Media Control Protocol / Management Control Protocol)

In networking, MCP often refers to Management Control Protocols or Media Control Protocols.

Example: PPP (Point-to-Point Protocol) has multiple MCPs that negotiate features like IP, IPv6, authentication, compression, etc.

An MCP server in this sense manages those negotiations or acts as a control endpoint.

ðŸ”¹ 3. MCP in Gaming (Minecraft Server Context)

Sometimes "MCP" is shorthand for Minecraft Coder Pack or Minecraft Protocol.

An MCP server could mean a Minecraft Protocol-compatible server, which implements the gameâ€™s networking protocol without running the official Mojang server software (common in custom server projects).

ðŸ”¹ 4. MCP in Enterprise Systems

Some vendors use MCP = Master Control Program (borrowed from TRON ðŸ˜…).

It refers to the main orchestration/control server in large enterprise setups that coordinates sub-systems.

ðŸ‘‰ To guide better:
Do you mean MCP server in Splunk/observability, in networking protocols, or in gaming (Minecraft)?
"""
}

for file_path, content in sample_text.items():
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Created file: {file_path}")

Created file: data/text_files/mcp.txt


#### TestLoader -- Read single file 

In [70]:
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader

class TestLoader(TextLoader):
    """Test Loader to read a single file."""

    def __init__(self, file_path: str, encoding: str = "utf-8"):
        super().__init__(file_path, encoding=encoding)

    def load(self) -> List[Document]:
        """Load a single file and return it as a list of Document."""
        documents = super().load()
        return documents

loader=TextLoader("data/text_files/mcp.txt", encoding="utf-8")
documents= loader.load()
print(f"Loaded {len(documents)} document(s) from {loader.file_path}")
print(f"Document content: {documents[0].page_content[:100]}...")  # Print first 100 characters
print(f"Document metadata: {documents[0].metadata}")

Loaded 1 document(s) from data/text_files/mcp.txt
Document content: 
Letâ€™s talk about MCP server â€” but just to clarify, "MCP" can mean di    fferent things depending on...
Document metadata: {'source': 'data/text_files/mcp.txt'}


#### Directory Loader - multiple files

In [71]:
#### Directory Loader - multiple files
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader( 
    path="data/text_files", 
    glob="**/*.txt", 
    loader_cls=TextLoader, ##LODER CLASS NAME "class TestLoader(TextLoader):""
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True
)

documents = dir_loader.load()
print(f"Loaded {len(documents)} document(s) from directory {dir_loader.path}")
for i, doc in enumerate(documents):
    print(f"Document {i+1}:")
    print(f"Content: {doc.page_content[:100]}...")  # Print first 100 characters
    print(f"Metadata: {doc.metadata}")
    print("-" * 40)
    

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 150.72it/s]

Loaded 1 document(s) from directory data/text_files
Document 1:
Content: 
Letâ€™s talk about MCP server â€” but just to clarify, "MCP" can mean di    fferent things depending on...
Metadata: {'source': 'data\\text_files\\mcp.txt'}
----------------------------------------





### Text spliting stragies 

In [64]:
##Different Text Splitters
### Text splitting strategies

from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    MarkdownHeaderTextSplitter,
    MarkdownTextSplitter,
    TextSplitter,
    TokenTextSplitter
)

print(documents)


#METHOD  1        CharacterTextSplitter

text=documents[0].page_content  # Use the content of the first document for splitting
print(f"Text to be split: {text[:100]}...")  # Print first






CharacterTextSplitter = CharacterTextSplitter(
    chunk_size=500,  # Maximum size of each chunk
    chunk_overlap=20,  # Overlap between chunks
    length_function=len,  # Function to calculate length of text
    add_start_index=True,  # Add start index to each chunk
    separator="\n",  # Use newline as separator
    keep_separator=True  # Keep the separator in the chunks
)   


doc=CharacterTextSplitter.split_text(text)
print(doc)
print(f"First chunk: {doc[0][:100]}...")  # Print first 100 characters of the first chunk
print(f"Number of chunks: {len(doc)}")  # Print number of chunks created
print(f"second chunk : {doc[1][:100]}")  # Print first 100 characters of the second chunk

for i, doc in enumerate(doc):
    print(f"Chunk {i+1}:")
    print(f"Content: {doc[:100]}...")  # Print first 100 characters of each chunk
    print("-" * 40) 







#METHOD 2        RecursiveCharacterTextSplitter 
RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Maximum size of each chunk
    chunk_overlap=20,  # Overlap between chunks
    length_function=len,  # Function to calculate length of text
    add_start_index=True,  # Add start index to each chunk
    separators=["\n\n", "\n", " ", ""]  # Use multiple separators for splitting
)
doc = RecursiveCharacterTextSplitter.split_text(text)
print(doc)
print(f"First chunk: {doc[0][:100]}...")  # Print first 100 characters of the first chunk
print(f"Number of chunks: {len(doc)}")  # Print number of chunks created    
print(f"second chunk : {doc[1][:100]}")  # Print first 100 characters of the second chunk
for i, doc in enumerate(doc):
    print(f"Chunk {i+1}:")
    print(f"Content: {doc[:100]}...")  # Print first 100 characters of each chunk
    print("-" * 40)



[Document(metadata={'source': 'data\\text_files\\mcp.txt'}, page_content='\nLetâ€™s talk about MCP server â€” but just to clarify, "MCP" can mean di    fferent things depending on context (networking, Splunk, databases, gaming, etc.).\nIâ€™ll give you a breakdown of the most common meanings, and you can tell me which one youâ€™re after:\n\n1. MCP in Splunk / Observability Context\n\nMCP (Mission Control Platform) is sometimes used to describe Splunkâ€™s centralized server/service orchestration in SOAR or Observability products.\n\nIt acts like a control plane â€” managing data flows, orchestration of tasks, and communication between distributed components.\n\nThink of it as the "brain" that schedules and directs activity across different Splunk instances.\n\nðŸ”¹ 2. MCP in Networking (Media Control Protocol / Management Control Protocol)\n\nIn networking, MCP often refers to Management Control Protocols or Media Control Protocols.\n\nExample: PPP (Point-to-Point Protocol) has multiple 

In [79]:
#Method 3 TokenTextSplitter

from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    MarkdownHeaderTextSplitter,
    MarkdownTextSplitter,
    TextSplitter,
    TokenTextSplitter
)
TokenSplitter = TokenTextSplitter(
    chunk_size=50,  # Maximum size of each chunk in tokens
    chunk_overlap=10,  # Overlap between chunks in tokens
)
doc = TokenSplitter.split_text(text)
print(doc)
print(f"First chunk: {doc[0][:100]}...")  # Print first 100 characters of the first chunk
print(f"Number of chunks: {len(doc)}")  # Print number of chunks created

['\nLetâ€™s talk about MCP server â€” but just to clarify, "MCP" can mean di    fferent things depending on context (networking, Splunk, databases, gaming, etc.).\nIâ€™ll', ', gaming, etc.).\nIâ€™ll give you a breakdown of the most common meanings, and you can tell me which one youâ€™re after:\n\n1. MCP in Splunk / Observability Context\n\nMCP', ' Splunk / Observability Context\n\nMCP (Mission Control Platform) is sometimes used to describe Splunkâ€™s centralized server/service orchestration in SOAR or Observability products.\n\nIt acts like a control plane â€” managing data', '\nIt acts like a control plane â€” managing data flows, orchestration of tasks, and communication between distributed components.\n\nThink of it as the "brain" that schedules and directs activity across different Splunk instances.\n\nðŸ”¹ 2', ' Splunk instances.\n\nðŸ”¹ 2. MCP in Networking (Media Control Protocol / Management Control Protocol)\n\nIn networking, MCP often refers to Management Control Protocols o