In [1]:

## Introduction to Data Ingestion

In [2]:
import os
from typing import List, Dict, Any
from unittest import loader
import pandas as pd

In [3]:
from langchain_core.documents import Document
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)
# Runs only after the imports earlier in this cell have succeeded, so seeing
# this line in the output confirms the LangChain names are available.
print("Setup Complete")

Setup Complete


In [4]:
## Understanding Document Structure In LangChain

In [5]:
### create a simple document
# A Document pairs the text that gets embedded (page_content) with a free-form
# metadata dict that travels alongside it.
doc = Document(
    page_content="This is the main content that will be embedded and searched.",
    metadata={
        "source": "example.txt",
        "page": 1,
        "author": "azwri",
        "date_created": "2025-09-04",
        "custom_field": "any_value",
    },
)

# Show the whole document, then each part on its own line.
print(doc, doc.page_content, doc.metadata, doc.metadata['source'], sep="\n")

# why metadata is important?
print(
    "\n Metadata is crucial for:",
    "- Filtering search results",
    "- Tracking document sources",
    "- Providing context in responses",
    "- Debugging and auditing",
    sep="\n",
)
print(type(doc))

page_content='This is the main content that will be embedded and searched.' metadata={'source': 'example.txt', 'page': 1, 'author': 'azwri', 'date_created': '2025-09-04', 'custom_field': 'any_value'}
This is the main content that will be embedded and searched.
{'source': 'example.txt', 'page': 1, 'author': 'azwri', 'date_created': '2025-09-04', 'custom_field': 'any_value'}
example.txt

 Metadata is crucial for:
- Filtering search results
- Tracking document sources
- Providing context in responses
- Debugging and auditing
<class 'langchain_core.documents.base.Document'>


In [6]:
### text file (.txt) - the simplest form of text data
# Sample corpus: each key is the path to write, each value is the exact text
# stored at that path.
simple_text = {
    "data/text_files/python_intro.txt": """Introduction to Python
Python is a versatile programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming.

Python is widely used in various fields such as web development, data analysis, artificial intelligence, scientific computing, and more. Its extensive standard library and active community contribute to its popularity.

Key features of Python include:
- Easy to learn and use
- Extensive libraries and frameworks
- Strong community support
- Cross-platform compatibility
- Integration capabilities with other languages and tools

""",
    "data/text_files/machine_learning.txt": """Machine Learning Basics
Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from and make decisions based on data. It involves training algorithms on datasets to identify patterns and make predictions.
There are several types of machine learning, including supervised learning, unsupervised learning, and reinforcement learning. Common algorithms include decision trees, support vector machines, and neural networks.
Applications of machine learning span various domains such as healthcare, finance, marketing, and more. It is used for tasks like image recognition, natural language processing, recommendation systems, and fraud detection.

Key concepts in machine learning include:
- Training and testing datasets
- Model evaluation and validation
- Overfitting and underfitting
- Feature selection and engineering
- Hyperparameter tuning

Types of Machine Learning:
- Supervised Learning: Learning from labeled data
- Unsupervised Learning: Finding patterns in unlabeled data
- Reinforcement Learning: Learning through rewards and penalties

"""}

# Derive the folder from each path instead of hard-coding it once, so adding an
# entry under a different directory does not fail with FileNotFoundError.
for file_path, content in simple_text.items():
    parent_dir = os.path.dirname(file_path)
    if parent_dir:  # a bare filename has no parent; os.makedirs("") would raise
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

print("Sample text files created.")

Sample text files created.


In [7]:
### TextFileLoader - Read Single Text File
# Import from langchain_community, the same package the DirectoryLoader cell
# uses; the `langchain.document_loaders` path is a deprecated re-export.
from langchain_community.document_loaders import TextLoader

# NOTE: this rebinds `loader`, shadowing the `from unittest import loader`
# name brought in by the imports cell.
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()
print(documents)
print(f"Number of documents loaded: {len(documents)}")
# The document text lives in page_content; the loader records the path it read
# under metadata (see the printed output).
print(f"Content of the document:\n{documents[0].page_content}")
print(f"Metadata of the document:\n{documents[0].metadata}")


[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Introduction to Python\nPython is a versatile programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming.\n\nPython is widely used in various fields such as web development, data analysis, artificial intelligence, scientific computing, and more. Its extensive standard library and active community contribute to its popularity.\n\nKey features of Python include:\n- Easy to learn and use\n- Extensive libraries and frameworks\n- Strong community support\n- Cross-platform compatibility\n- Integration capabilities with other languages and tools\n\n')]
Number of documents loaded: 1
Content of the document:
Introduction to Python
Python is a versatile programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and fun

In [8]:
### DirectoryLoader - Read Multiple Text Files from a Directory
# Both loaders come from langchain_community; the `langchain.document_loaders`
# path previously used for TextLoader is a deprecated re-export.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",  # every .txt file, including those in sub-folders
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},  # forwarded to each TextLoader
    show_progress=True,
)
# Sort by source path: the order files come back in follows the directory
# listing, which is not guaranteed, yet later cells rely on documents[0].
documents = sorted(loader.load(), key=lambda d: d.metadata["source"])
print(f"Number of documents loaded: {len(documents)}")
for i, doc in enumerate(documents):
    print(f"\nDocument {i+1} Content:\n{doc.page_content[:200]}...")  # Print first 200 characters
    print(f"Metadata: {doc.metadata}")


100%|██████████| 2/2 [00:00<00:00, 681.72it/s]

Number of documents loaded: 2

Document 1 Content:
Machine Learning Basics
Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from and make decisions based on data. It involves training algorithms o...
Metadata: {'source': 'data\\text_files\\machine_learning.txt'}

Document 2 Content:
Introduction to Python
Python is a versatile programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and funct...
Metadata: {'source': 'data\\text_files\\python_intro.txt'}





In [9]:
### text splitting - splitting large documents into smaller chunks
# Why text splitting is important?
# One print call; sep="\n" places each reason on its own line.
print(
    "\nText splitting is important for:",
    "- Efficient processing and embedding",
    "- Improved search relevance",
    "- Handling model input size limitations",
    "- Better context management",
    sep="\n",
)


Text splitting is important for:
- Efficient processing and embedding
- Improved search relevance
- Handling model input size limitations
- Better context management


In [10]:
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Dump the full list of loaded documents, then the raw text of the first one
# (the text the splitter cells below operate on).
print(documents, documents[0].page_content, sep="\n")

[Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\nMachine learning is a subset of artificial intelligence that focuses on building systems that can learn from and make decisions based on data. It involves training algorithms on datasets to identify patterns and make predictions.\nThere are several types of machine learning, including supervised learning, unsupervised learning, and reinforcement learning. Common algorithms include decision trees, support vector machines, and neural networks.\nApplications of machine learning span various domains such as healthcare, finance, marketing, and more. It is used for tasks like image recognition, natural language processing, recommendation systems, and fraud detection.\n\nKey concepts in machine learning include:\n- Training and testing datasets\n- Model evaluation and validation\n- Overfitting and underfitting\n- Feature selection and engineering\n- Hyperparameter tuning\n\nTypes of

In [30]:
#### method 1: CharacterTextSplitter
# Split the first loaded document on single spaces; chunk_size and
# chunk_overlap are measured with len, i.e. in characters.
text_splitter = CharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    separator=" ",
    length_function=len,
)
texts = text_splitter.split_text(documents[0].page_content)
print(f"Number of chunks created: {len(texts)}")

# Number the chunks from 1 for display.
for chunk_number, chunk in enumerate(texts, start=1):
    print(f"\nChunk {chunk_number}:\n{chunk}")

Number of chunks created: 13

Chunk 1:
Machine Learning Basics
Machine learning is a subset of artificial intelligence that focuses on

Chunk 2:
focuses on building systems that can learn from and make decisions based on data. It involves

Chunk 3:
involves training algorithms on datasets to identify patterns and make predictions.
There are

Chunk 4:
are several types of machine learning, including supervised learning, unsupervised learning, and

Chunk 5:
and reinforcement learning. Common algorithms include decision trees, support vector machines, and

Chunk 6:
and neural networks.
Applications of machine learning span various domains such as healthcare,

Chunk 7:
finance, marketing, and more. It is used for tasks like image recognition, natural language

Chunk 8:
language processing, recommendation systems, and fraud detection.

Key concepts in machine learning

Chunk 9:
learning include:
- Training and testing datasets
- Model evaluation and validation
- Overfitting

Chunk 10:
and u

In [26]:
##### method 2: RecursiveCharacterTextSplitter
# NOTE(review): only one separator (" ") is supplied, so there is presumably
# nothing else for the splitter to fall back to — confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    separators=[" "],
    length_function=len,
)

texts = text_splitter.split_text(documents[0].page_content)
print(f"Number of chunks created: {len(texts)}")
# Report each chunk's character count next to its 1-based position.
for chunk_number, chunk in enumerate(texts, start=1):
    print(f"\nChunk {chunk_number} length {len(chunk)}:\n{chunk}")


Number of chunks created: 13

Chunk 1 length 95:
Machine Learning Basics
Machine learning is a subset of artificial intelligence that focuses on

Chunk 2 length 94:
on building systems that can learn from and make decisions based on data. It involves training

Chunk 3 length 98:
training algorithms on datasets to identify patterns and make predictions.
There are several types

Chunk 4 length 98:
types of machine learning, including supervised learning, unsupervised learning, and reinforcement

Chunk 5 length 87:
learning. Common algorithms include decision trees, support vector machines, and neural

Chunk 6 length 99:
neural networks.
Applications of machine learning span various domains such as healthcare, finance,

Chunk 7 length 91:
finance, marketing, and more. It is used for tasks like image recognition, natural language

Chunk 8 length 99:
language processing, recommendation systems, and fraud detection.

Key concepts in machine learning

Chunk 9 length 97:
learning include:
- Tr

In [39]:
#### method 3: TokenTextSplitter
# chunk_size and chunk_overlap are counted in tokens here, not characters,
# which is why each chunk's character count is well above 100.
text_splitter = TokenTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
texts = text_splitter.split_text(documents[0].page_content)
print(f"Number of chunks created: {len(texts)}")
for i, chunk in enumerate(texts):
    # len(chunk) is a character count, so label it as such (the old label
    # misspelled "Token" and presented this number as a token length).
    print(f"\nChunk {i+1} character length {len(chunk)}:\n{chunk}")

Number of chunks created: 3

Toekn length 587 Chunk 1:
Machine Learning Basics
Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from and make decisions based on data. It involves training algorithms on datasets to identify patterns and make predictions.
There are several types of machine learning, including supervised learning, unsupervised learning, and reinforcement learning. Common algorithms include decision trees, support vector machines, and neural networks.
Applications of machine learning span various domains such as healthcare, finance, marketing, and more. It is used for

Toekn length 510 Chunk 2:
 machine learning span various domains such as healthcare, finance, marketing, and more. It is used for tasks like image recognition, natural language processing, recommendation systems, and fraud detection.

Key concepts in machine learning include:
- Training and testing datasets
- Model evaluation and validation
- Overfitting