## Load from PDF

In [None]:
import re

from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader

# Open the PDF (PDF_PATH is expected to be defined in an earlier cell) and
# report how many pages it contains.
pdf_reader = PdfReader(PDF_PATH)
print(f"📄 PDF loaded with {len(pdf_reader.pages)} pages")

# Extract the text of every page. Pages are joined with a newline so the last
# word of one page cannot fuse with the first word of the next; the later
# whitespace normalization turns that newline into a single space.
# `or ""` keeps the join safe should a page yield no text at all.
page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
raw_text = "\n".join(page_texts)
print(f"📊 Extracted {len(raw_text)} characters total")

# Clean the extracted text
def clean_extracted_text(text: str) -> str:
    """Normalize text extracted from a PDF into a single clean line.

    Non-whitespace control characters are removed, every run of whitespace
    is collapsed to one space, and leading/trailing whitespace is stripped.

    The control characters are removed *before* whitespace is collapsed.
    Doing it the other way round leaves a double space wherever a control
    character sat between two whitespace runs (e.g. ``"a \\x00 b"``).

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text; an empty string if nothing printable remains.
    """
    # Only strip control characters that `\s` does NOT treat as whitespace.
    # Tab, newline, vertical tab, form feed, carriage return and \x1C-\x1F
    # are whitespace and must become a space, not vanish, or words would fuse.
    without_controls = re.sub(r'[\x00-\x08\x0E-\x1B\x7F]', '', text)
    # Collapse every whitespace run (including newlines) to one space.
    collapsed = re.sub(r'\s+', ' ', without_controls)
    return collapsed.strip()
# Normalize the raw PDF text, then report the cleaned size and show the
# first 200 characters as a quick sanity check of the extraction.
document_text = clean_extracted_text(raw_text)
print(f"🧹 Cleaned text: {len(document_text)} characters")
print(f"📝 Preview: {document_text[:200]}...")

## Load from Web

In [2]:
! pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Using cached beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Using cached soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Using cached beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Using cached soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.13.4 bs4-0.0.2 soupsieve-2.7


In [3]:
from langchain_community.document_loaders import WebBaseLoader

# Fetch the article and parse it into documents; `data` feeds the
# splitting/embedding cell further down.
ETHICS_ARTICLE_URL = "https://en.wikipedia.org/wiki/Buddhist_ethics"

loader = WebBaseLoader(ETHICS_ARTICLE_URL)
data = loader.load()

## Embed and Create DB

In [8]:
! pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Better than CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings

# 1. Configure the splitter. Lengths are measured with `len`, i.e. in
#    characters. Separators are tried in order: paragraph breaks first, then
#    line breaks, then spaces, and finally individual characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
SPLIT_SEPARATORS = ["\n\n", "\n", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SPLIT_SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# 2. Clean your data first
def clean_documents(data):
    """Return the documents in `data` whose text is not blank.

    A document is kept when its `page_content`, stripped of surrounding
    whitespace, is non-empty. Order is preserved and the kept objects are
    returned as-is (not copied).
    """
    kept = []
    for candidate in data:
        if candidate.page_content.strip():
            kept.append(candidate)
    return kept

# 2. Drop blank documents, then split what remains into chunks.
cleaned_data = clean_documents(data)
docs = text_splitter.split_documents(cleaned_data)

# 3. Sanity-check the chunks before embedding: total count, plus the length
#    and metadata of (at most) the first three.
print(f"Total documents after splitting: {len(docs)}")
for idx in range(min(3, len(docs))):
    sample = docs[idx]
    print(f"Doc {idx} length: {len(sample.page_content)}")
    print(f"Metadata: {sample.metadata}")

# 4. Embed the chunks and build the Chroma store persisted under ./ethics_db.
#
# On failure the error and any suspicious chunks (empty, or longer than 2000
# characters) are printed, and then the exception is re-raised. Swallowing it
# would leave `db` undefined, so later cells would fail with an unrelated
# NameError and hide the real cause.
#
# NOTE(review): with a persist_directory, re-running this cell presumably adds
# the chunks again to whatever is already stored there — confirm, and clear
# ./ethics_db (or pass stable ids) if duplicates matter.
try:
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_documents(
        documents=docs,
        embedding=embedding_function,
        persist_directory='./ethics_db'
    )
except Exception as e:
    print(f"Error creating DB: {e}")
    # Inspect problematic documents
    for i, doc in enumerate(docs):
        if not doc.page_content or len(doc.page_content) > 2000:
            print(f"Problematic doc {i}: Length={len(doc.page_content)}")
    raise  # keep the original traceback; do not continue without `db`
else:
    print("Database created successfully!")

Total documents after splitting: 160
Doc 0 length: 982
Metadata: {'source': 'https://en.wikipedia.org/wiki/Buddhist_ethics', 'title': 'Buddhist ethics - Wikipedia', 'language': 'en'}
Doc 1 length: 891
Metadata: {'source': 'https://en.wikipedia.org/wiki/Buddhist_ethics', 'title': 'Buddhist ethics - Wikipedia', 'language': 'en'}
Doc 2 length: 811
Metadata: {'source': 'https://en.wikipedia.org/wiki/Buddhist_ethics', 'title': 'Buddhist ethics - Wikipedia', 'language': 'en'}


  return forward_call(*args, **kwargs)


Database created successfully!


  db.persist()
