# Import Section

In [1]:
import os
from typing import List, Dict, Any
import pandas as pd

In [5]:
from langchain_core.documents import Document
from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)

# Reached only if every import above succeeded; acts as a quick sanity signal for the cell.
print("setup completed")

setup completed


# Document Structure in LangChain

In [None]:
## Build a Document by hand, then show its text and its metadata.
# Metadata is an ordinary dict; here it mixes str, int and list values.
doc_metadata = {
    "source": "example.txt",
    "page": 1,
    "author": "Akhil",
    "date": "2025-08-17",
    "tags": ["data", "ingestion", "langchain"],
}

doc = Document(
    page_content="this is main text content that is used for processing",
    metadata=doc_metadata,
)

# First line announces creation and echoes the text; second line dumps the metadata dict.
print(f"document created..!\ncontent: {doc.page_content}")
print(f"metadata: {doc.metadata}")

document created..!
content: this is main text content that is used for processing
metadata: {'source': 'example.txt', 'page': 1, 'author': 'Akhil', 'date': '2025-08-17', 'tags': ['data', 'ingestion', 'langchain']}


# Text File Reading

## Create sample files under `data/text_files`

In [12]:
import os

# Create the output directory (and any missing parents); exist_ok makes re-running the cell safe.
os.makedirs(name="data/text_files", exist_ok=True)

In [13]:
# Maps each destination path to the exact text that will be written to it.
sample_text = {
    "data/text_files/python_intro.txt": """Python Programming Introduction
Python is a high-level, interpreted programming language known for its readability and versatility. 
It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. 
Python's extensive standard library and active community make it a popular choice for web development, data analysis, 
artificial intelligence, scientific computing, and more.
Python's syntax is designed to be clear and straightforward, which makes it an excellent language for beginners.
Python supports dynamic typing, meaning that variable types are determined at runtime, allowing for flexibility in coding.
Python's extensive libraries and frameworks, such as Django for web development and Pandas for data analysis,
make it a powerful tool for developers across various domains.""",

    "data/text_files/machine_learning.txt": """Machine Learning Overview
Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from and make predictions based on data.
It involves the use of algorithms and statistical models to enable computers to improve their performance on a specific task through experience.
Machine learning can be categorized into three main types: supervised learning, unsupervised learning, and  reinforcement learning.
Supervised learning involves training a model on labeled data, where the input data is paired with the  correct output.
Unsupervised learning involves training a model on unlabeled data, allowing the model to find patterns and relationships in the data without explicit instructions.
Reinforcement learning involves training an agent to make decisions by rewarding it for correct actions and penalizing it for incorrect actions.
Machine learning is widely used in various applications, including image recognition, natural language processing, and recommendation systems.""",
}

# Write each sample to disk as UTF-8, overwriting any existing file.
# The target directory must already exist (it is created by the preceding cell).
for filepath in sample_text:
    content = sample_text[filepath]
    with open(filepath, mode="w", encoding="utf-8") as file:
        file.write(content)

print("Sample text files created in 'data/text_files' directory.")

Sample text files created in 'data/text_files' directory.


## TextLoader - Read a Single File

In [4]:
# NOTE(review): `langchain.document_loaders` looks like the legacy import path; newer
# LangChain releases appear to expose loaders from a separate community package — confirm
# against the installed version before changing it.
from langchain.document_loaders import TextLoader

# Point the loader at one UTF-8 text file.
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")

# load() returns a list, so `document` is indexed below even though only one file is read.
document = loader.load()
first_doc = document[0]

print(f"content preview: {first_doc.page_content[:100]}...")  # only the first 100 characters
print(f"metadata: {first_doc.metadata}")


content preview: Python Programming Introduction
Python is a high-level, interpreted programming language known for i...
metadata: {'source': 'data/text_files/python_intro.txt'}


## DirectoryLoader - Load Multiple Text Files

In [9]:
from langchain.document_loaders import DirectoryLoader

# Load the files matching *.txt under data/text_files. Each file is handed to
# TextLoader (imported in the single-file cell) with an explicit UTF-8 encoding.
directory_loader = DirectoryLoader(
    "data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,  # show a progress bar while the files are read
)
documents = directory_loader.load()

print(f"Number of documents loaded: {len(documents)}")
# NOTE: the loop variable `doc` rebinds the `doc` created in the Document example cell;
# after this cell runs, `doc` refers to the last loaded file.
for i, doc in enumerate(documents):
    print(f"document {i+1}:")
    # Single quotes inside the replacement field: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"Document source: {doc.metadata['source']}")  # path the document was loaded from
    print(f"Document length: {len(doc.page_content)} characters\n")

100%|██████████| 2/2 [00:00<00:00, 1242.20it/s]

Number of documents loaded: 2
document 1:
Document source: data\text_files\machine_learning.txt
Document length: 1019 characters

document 2:
Document source: data\text_files\python_intro.txt
Document length: 829 characters




