### Introduction to Data Ingestion

In [1]:
import langchain
import pandas as pd
import os

In [2]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter, 
    CharacterTextSplitter, 
    TokenTextSplitter)

### Understand the document structure in langchain

In [3]:
#create a sample document
doc = Document(
    page_content="This is a sample document for data ingestion.",
    metadata={"source": "sample_source.txt", "author": "Aayush"}
)
print("Document Structure:")
print(f"Content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")

Document Structure:
Content: This is a sample document for data ingestion.
Metadata: {'source': 'sample_source.txt', 'author': 'Aayush'}


### Reading Text Files

In [4]:
## create a simple text file

import os
os.makedirs("data/text_files", exist_ok=True)

In [5]:
sample_texts = { 
    "data/text_files/sample1.txt": 
    """This is the content of sample text file which has information about Langchain.
    Langchain is a powerful library for building applications with LLMs.
    It provides tools for data ingestion, parsing, and more.
    And it is widely used in the AI community. What more, it supports various data formats.
    Again it has information about text splitting techniques.
    what else, it is very useful for developers working with large language models.
    Additionally, it has a vibrant community and extensive documentation.
    Developers can leverage Langchain to create innovative AI applications.
    Finally, it is an open-source project that encourages contributions from the community.
    Get started with Langchain today and explore its capabilities!
    Happy coding with Langchain!
    Just another line to increase the size of the file for testing purposes.
    This line is added to ensure the file has enough content for text splitting demonstrations.
    More lines to make the file larger.
    Yet another line to enhance the sample text file.let's keep adding lines.
    This should be sufficient for our current testing needs.
    """,
    "data/text_files/sample2.txt":
    """Another sample text file is created here.It contains different information.
    This file talks about Python programming. Python is a versatile language used for web development, data science, automation, and more.
    It has a rich ecosystem of libraries and frameworks that make development easier."""
    }
for file_path, content in sample_texts.items():
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(content)
print(f"Created sample text file at: {file_path}")

Created sample text file at: data/text_files/sample2.txt


### TextLoader: Read Single File

In [6]:
from langchain_community.document_loaders import TextLoader
# Load the text file using Langchain's TextLoader

loader = TextLoader("data/text_files/sample2.txt", encoding = "utf-8")
documents = loader.load()
print(type(documents))
print(documents)
print(f"loaded {len(documents)} document(s).")
print(f"Content Preview: {documents[0].page_content[:100]}")
print(f"metadata: {documents[0].metadata}")

<class 'list'>
[Document(metadata={'source': 'data/text_files/sample2.txt'}, page_content='Another sample text file is created here.It contains different information.\n    This file talks about Python programming. Python is a versatile language used for web development, data science, automation, and more.\n    It has a rich ecosystem of libraries and frameworks that make development easier.')]
loaded 1 document(s).
Content Preview: Another sample text file is created here.It contains different information.
    This file talks abou
metadata: {'source': 'data/text_files/sample2.txt'}


## DirectoryLoader - Multiple Text Files

In [7]:
from langchain_community.document_loaders import DirectoryLoader
# Load all text files from the directory
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt", ##pattern to match files
    loader_cls=TextLoader, ##loader class to use
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True
)
all_documents = dir_loader.load()

for i, document in enumerate(all_documents):
    print(f"\nDocument {i+1}:")
    print(f"Source: {document.metadata}")
    print(f"Content Preview: {document.page_content[:50]}")
    print(f"length: {len(document.page_content)} characters")
##why enumerate?
#enumerate() adds a counter to an iterable and returns it as an enumerate object.
# This is useful when you need both the index and the value from a list or other iterable.
# In this case, it helps to number the documents as they are printed.
# Without enumerate, you would have to manage a separate counter variable.
# Using enumerate makes the code cleaner and more Pythonic.
# It improves readability and reduces the chance of errors related to manual index management.
print(f"\nTotal documents loaded: {len(all_documents)}")

## Major Disadvantage of DirectoryLoader: ALL FILES MUST BE OF SAME TYPE AND USE SAME LOADER CLASS#####

100%|██████████| 2/2 [00:00<00:00, 623.87it/s]


Document 1:
Source: {'source': 'data\\text_files\\sample1.txt'}
Content Preview: This is the content of sample text file which has 
length: 1149 characters

Document 2:
Source: {'source': 'data\\text_files\\sample2.txt'}
Content Preview: Another sample text file is created here.It contai
length: 300 characters

Total documents loaded: 2





## Text Splitting Techniques

In [8]:
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter, 
    CharacterTextSplitter, 
    TokenTextSplitter)
print(all_documents)

[Document(metadata={'source': 'data\\text_files\\sample1.txt'}, page_content="This is the content of sample text file which has information about Langchain.\n    Langchain is a powerful library for building applications with LLMs.\n    It provides tools for data ingestion, parsing, and more.\n    And it is widely used in the AI community. What more, it supports various data formats.\n    Again it has information about text splitting techniques.\n    what else, it is very useful for developers working with large language models.\n    Additionally, it has a vibrant community and extensive documentation.\n    Developers can leverage Langchain to create innovative AI applications.\n    Finally, it is an open-source project that encourages contributions from the community.\n    Get started with Langchain today and explore its capabilities!\n    Happy coding with Langchain!\n    Just another line to increase the size of the file for testing purposes.\n    This line is added to ensure the fil

In [10]:
## Method 1: Character Text Splitter

text = all_documents[0].page_content
print(all_documents)
print("\n--- 1. Character Text Splitter ---")
char_splitter = CharacterTextSplitter(
    separator="\n", #Split at new lines
    chunk_size=200, #max 200 characters per chunk
    chunk_overlap=20, #20 characters overlap between chunks
    length_function=len #function to measure length (len for characters
)

char_chunks = char_splitter.split_text(text)
print(f"Total Chunks Created: {len(char_chunks)}")
print("-----------------------------------")
print(char_chunks)
print(f"first chunk preview: {char_chunks[0]}")
print("-----------------------------------")
print(f"second chunk preview: {char_chunks[1]}")

[Document(metadata={'source': 'data\\text_files\\sample1.txt'}, page_content="This is the content of sample text file which has information about Langchain.\n    Langchain is a powerful library for building applications with LLMs.\n    It provides tools for data ingestion, parsing, and more.\n    And it is widely used in the AI community. What more, it supports various data formats.\n    Again it has information about text splitting techniques.\n    what else, it is very useful for developers working with large language models.\n    Additionally, it has a vibrant community and extensive documentation.\n    Developers can leverage Langchain to create innovative AI applications.\n    Finally, it is an open-source project that encourages contributions from the community.\n    Get started with Langchain today and explore its capabilities!\n    Happy coding with Langchain!\n    Just another line to increase the size of the file for testing purposes.\n    This line is added to ensure the fil