In [20]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

import os

In [21]:
# Load every file in `data_dir`, split its text into overlapping chunks, and
# collect one Document per chunk in `text_data_from_files`.

# Configurable parameters
data_dir = "./data"
# Size/overlap handed to the tiktoken-based splitter (presumably measured in
# tokens rather than characters -- confirm against the LangChain docs).
chunk_size = 128
chunk_overlap = 32

# Ensure data directory exists; exist_ok avoids the check-then-create race.
os.makedirs(data_dir, exist_ok=True)

# Get list of files. os.listdir returns entries in arbitrary order, so sort
# them to keep the chunk order reproducible across runs and platforms.
files = sorted(os.listdir(data_dir))
print(f"List of files: {files}")

# The splitter depends only on the parameters above, so build it once instead
# of once per file.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

text_data_from_files = []
for file in files:
    # Best-effort per file: an unreadable entry (e.g. a sub-directory or a
    # binary file) is reported and skipped so the remaining files still load.
    try:
        # Explicit encoding: without it the platform locale decides how the
        # bytes are decoded, which breaks UTF-8 files on e.g. Windows.
        with open(os.path.join(data_dir, file), "r", encoding="utf-8") as f:
            file_data = f.read()

        split_text = text_splitter.split_text(file_data)
        print("Split text: ", len(split_text))

        # splitext strips only the final extension, so "a.b.md" keeps the
        # title "a.b" (split(".")[0] would have truncated it to "a").
        title = os.path.splitext(file)[0]

        for index, chunked_text in enumerate(split_text):
            text_data_from_files.append(Document(
                page_content=chunked_text,
                metadata={
                    "source": file,
                    # The chunk text is duplicated into the metadata as well.
                    "page_content": chunked_text,
                    "title": title,
                    "chunk_index": index,
                },
            ))
    except Exception as e:
        print(f"Error processing file {file}: {e}")

print("Text data from files: ", [doc.page_content for doc in text_data_from_files])

List of files: ['product.md']
Split text:  7
Text data from files:  ['Synthetic Product Data for RAG App\nBelow is a collection of synthetic data representing imaginary products for use in a sample Retrieval-Augmented Generation (RAG) application. Each product includes a name, description, category, price, and unique identifier.\nProduct 1\n\nID: PRD-001\nName: Quantum Widget\nCategory: Electronics\nPrice: $149.99\nDescription: A cutting-edge quantum widget designed to optimize energy flow in smart home devices. Features a sleek design with voice-activated controls and compatibility with all major IoT platforms.\n\nProduct 2', 'Product 2\n\nID: PRD-002\nName: EcoGlow Lamp\nCategory: Home Decor\nPrice: $79.50\nDescription: An eco-friendly lamp that adjusts brightness based on ambient light. Made from recycled materials, it offers a warm, inviting glow and a battery life of up to 48 hours.\n\nProduct 3', 'Product 3\n\nID: PRD-003\nName: HyperFit Tracker\nCategory: Wearables\nPrice: $199.