In [1]:
import os 
from typing import List, Dict, Any
import pandas as pd

In [16]:
from langchain_core.documents import Document
from langchain_text_splitters import (RecursiveCharacterTextSplitter,
                                     CharacterTextSplitter,
                                     TokenTextSplitter)
# Confirmation banner: reaching this line means the splitter imports above succeeded.
print("Text Splitters Loaded\nSetUp Completed!")

Text Splitters Loaded
SetUp Completed!


## Understanding Document Structure in LangChain

In [18]:
# Build a minimal Document: a text payload (`page_content`) plus a free-form
# `metadata` dict describing where the text came from.
# NOTE(review): "embedde"/"serached" look like typos for "embedded"/"searched",
# and the "data_created" key is presumably meant to be "date_created". They are
# left untouched here so the recorded cell output stays valid -- confirm before
# changing.
doc = Document(
    page_content="This is the main text content that will be embedde and serached",
    metadata=dict(
        source="example.txt",
        page=1,
        author="Ayman El Ouardiji",
        data_created="2025-11-11",
    ),
)

# Show both halves of the Document, one per line.
print(
    "Doc structure",
    f"Content : {doc.page_content}",
    f"metadata : {doc.metadata}",
    sep="\n",
)

Doc structure
Content : This is the main text content that will be embedde and serached
metadata : {'source': 'example.txt', 'page': 1, 'author': 'Ayman El Ouardiji', 'data_created': '2025-11-11'}


### Text Files (.txt) - Simplest Case

In [19]:
# Make sure the folder that will hold the sample .txt files exists.
# `exist_ok=True` keeps this cell safe to re-run.
import os

os.makedirs(os.path.join("data", "text_files"), exist_ok=True)

In [None]:
# Sample corpus: maps each target file path to the exact text written to it.
# The triple-quoted literals are written verbatim, so the 4-space indentation
# (and the leading newline of the first entry) ends up inside the files.
sample_texts={
    "data/text_files/python_intro.txt": """
    Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python has become one of the most popular programming languages in the world.

    Key Features:

    Easy to learn and use

    Extensive standard library

    Cross-platform compatibility

    Strong community support

    Python is widely used in web development, data science, artificial intelligence, and automation.""",
    "data/text_files/machine_learning.txt": """Machine Learning Basics

    Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves.

    Types of Machine Learning:

    Supervised Learning: Learning with labeled data

    Unsupervised Learning: Finding patterns in unlabeled data

    Reinforcement Learning: Learning through rewards and penalties

    Applications include image recognition, speech processing, and recommendation systems
    """
}

# Write every sample to disk as UTF-8. The parent directory is created on
# demand so this cell does not fail with FileNotFoundError when it is run
# without the directory-creating cell having been executed first.
for filepath, content in sample_texts.items():
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # os.makedirs("") would raise for a bare filename
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
print("Sample text files created!")

Sample text files created!
Sample text files created!


### TextLoader - Read a Single File

In [6]:
# TextLoader is imported from langchain_community (this replaces the older
# `langchain.document_loaders` import that had been commented out here).
from langchain_community.document_loaders import TextLoader

# Load a single text file, decoding it as UTF-8.
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()

# Report how many documents came back, then inspect the first one:
# the first 100 characters of its text and its metadata.
print(f"Loaded {len(documents)} document")
first_document = documents[0]
preview = first_document.page_content[:100]
print(f"Content preview : {preview}...")
print(f"Metadata: {first_document.metadata}")


Loaded 1 document
Content preview : 
    Python is a high-level, interpreted programming language known for its simplicity and readabili...
Metadata: {'source': 'data/text_files/python_intro.txt'}


### DirectoryLoader - Multiple Text Files

In [7]:
from langchain_community.document_loaders import DirectoryLoader

# Load every file under data/text_files that matches the "**/*.txt" glob,
# reading each one with TextLoader as UTF-8 and showing a progress bar.
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",  # pattern to match files
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)

documents = dir_loader.load()
print(f"Loaded {len(documents)} documents")

# Summarise each loaded document (1-based numbering).
# A dedicated loop name is used instead of `doc` so that the sample `doc`
# Document built earlier in the notebook is not silently overwritten.
for position, loaded_doc in enumerate(documents, start=1):
    print(f"\nDocuments {position}:")
    print(f" Source: {loaded_doc.metadata['source']}")
    print(f" Length: {len(loaded_doc.page_content)} characters")



100%|██████████| 2/2 [00:00<00:00, 355.34it/s]

Loaded 2 documents

Documents 1:
 Source: data\text_files\machine_learning.txt
 Length: 591 characters

Documents 2:
 Source: data\text_files\python_intro.txt
 Length: 481 characters



