# Document Loading and Processing - Interactive Notebook

This notebook provides hands-on examples for:
- Loading documents from files, web, and specialized formats
- Text splitting strategies (character, recursive, token, semantic)
- Cleaning, transforming, and metadata extraction
- Building document processing pipelines
- Analyzing document similarity
- A simple document Q&A system

Prerequisites: Make sure your environment is set up and your API keys are configured in your `.env` file. An OpenAI key is used for the embeddings and for the Q&A model; the semantic-splitting example is skipped, and the similarity examples fall back to simple word counts, when embeddings are not available.

## 1. Setup and Imports

In [None]:
# Core imports
import os
import re
import json
import time
import hashlib
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Iterator, Optional

# Numpy for similarity computations
import numpy as np

# Environment variables
from dotenv import load_dotenv

# LangChain document types, prompts and output parsers
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# Loaders (community)
from langchain_community.document_loaders import (
    TextLoader, DirectoryLoader,
    PyPDFLoader, OnlinePDFLoader,
    WebBaseLoader, SeleniumURLLoader,
    CSVLoader, JSONLoader,
    UnstructuredMarkdownLoader, UnstructuredWordDocumentLoader,
    WikipediaLoader
)

# Text splitters
from langchain_text_splitters import (
    CharacterTextSplitter, RecursiveCharacterTextSplitter,
    TokenTextSplitter
)

# Chat model and embeddings (embeddings are optional for semantic operations)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

load_dotenv()

print("Setup complete.")

## 2. Sample Text and Utility Functions

In [None]:
# Short multi-paragraph passage reused by the loader, splitter and pipeline demos.
sample_text = """
LangChain is a framework for developing applications powered by language models.
It enables applications that are context-aware and reason through problems.
With LangChain, you can connect language models to various data sources.

The main components of LangChain include:
1. Models - Interface to various LLMs
2. Prompts - Template management and optimization
3. Chains - Sequences of calls to components
4. Agents - Systems that use LLMs to decide actions
5. Memory - Persistence of state between calls

Each component can be used independently or combined to create complex applications.
"""


def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    The similarity is the dot product of ``a`` and ``b`` divided by the
    product of their Euclidean norms.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The similarity as a Python ``float``; ``0.0`` when the product of the
        norms is zero, so a zero vector never causes a division by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product != 0:
        return float(np.dot(a, b) / norm_product)
    return 0.0

## 3. Document Loaders

In [None]:
# 3.1 Text file loaders
# The demo files are written as UTF-8, so the loaders are told to read UTF-8 as
# well instead of relying on the platform's default text encoding.
try:
    # Create a sample file for demonstration
    with open("sample.txt", "w", encoding="utf-8") as f:
        f.write(sample_text)

    text_loader = TextLoader("./sample.txt", encoding="utf-8")
    documents = text_loader.load()
    print(f"Loaded {len(documents)} documents from sample.txt")
    print("Content preview:", documents[0].page_content[:100], "...")
except Exception as e:
    print("TextLoader error:", e)

# Directory loader (loads all .txt under ./documents)
try:
    os.makedirs("./documents", exist_ok=True)
    with open("./documents/doc1.txt", "w", encoding="utf-8") as f:
        f.write("Doc1: LangChain basics.")
    with open("./documents/doc2.txt", "w", encoding="utf-8") as f:
        f.write("Doc2: Document processing in LLM apps.")

    directory_loader = DirectoryLoader(
        "./documents/",
        glob="**/*.txt",
        loader_cls=TextLoader,
        # Forwarded to every TextLoader the directory loader creates.
        loader_kwargs={"encoding": "utf-8"},
        show_progress=True,
        use_multithreading=True
    )
    dir_docs = directory_loader.load()
    print(f"Loaded {len(dir_docs)} documents from directory ./documents")
except Exception as e:
    print("DirectoryLoader error:", e)

In [None]:
# 3.2 PDF loaders
# Local PDF: point pdf_path at a real file to exercise PyPDFLoader.
pdf_path = "./document.pdf"
try:
    if not os.path.exists(pdf_path):
        print("PDF file not found; skipping local PDF example.")
    else:
        pdf_loader = PyPDFLoader(pdf_path)
        pdf_documents = pdf_loader.load()
        print(f"Loaded {len(pdf_documents)} pages from {pdf_path}")
        # Show the page metadata and content length of the first two documents only.
        for page_doc in pdf_documents[:2]:
            page_label = page_doc.metadata.get('page', 'Unknown')
            print(page_label, len(page_doc.page_content), "chars")
except Exception as e:
    print("PyPDFLoader error:", e)

try:
    # Online PDF (requires a valid URL)
    # online_pdf_loader = OnlinePDFLoader("https://example.com/document.pdf")
    # online_documents = online_pdf_loader.load()
    # print(f"Loaded {len(online_documents)} pages from online PDF")
    print("Online PDF example commented; provide a valid URL to test.")
except Exception as e:
    print("OnlinePDFLoader error:", e)

In [None]:
# 3.3 Web content loaders
# Pages fetched by the static (non-JavaScript) loader; requires internet access.
web_urls = [
    "https://python.langchain.com/docs/get_started/introduction"
]
try:
    web_loader = WebBaseLoader(web_urls)
    web_documents = web_loader.load()
    print(f"Loaded {len(web_documents)} web pages via WebBaseLoader")
except Exception as e:
    print("WebBaseLoader error:", e)

try:
    # Selenium URL loader (requires Selenium + browser driver installed)
    # selenium_loader = SeleniumURLLoader(urls=["https://example.com/dynamic"], browser="chrome")
    # selenium_documents = selenium_loader.load()
    # print(f"Loaded {len(selenium_documents)} dynamic pages with Selenium")
    print("SeleniumURLLoader example commented; requires Selenium setup.")
except Exception as e:
    print("SeleniumURLLoader error:", e)

In [None]:
# 3.4 Specialized loaders: CSV, JSON, Markdown, Word, Wikipedia
# Each loader gets its own try/except so one missing optional dependency
# does not prevent the remaining examples from running.

# CSV example: write a header plus two data rows, then load them.
try:
    with open("data.csv", "w", encoding="utf-8") as f:
        f.writelines([
            "column1,column2,column3\n",
            "a,b,c\n",
            "d,e,f\n",
        ])
    csv_loader = CSVLoader(file_path="./data.csv")
    csv_documents = csv_loader.load()
    print("CSV docs:", len(csv_documents))
except Exception as e:
    print("CSVLoader error:", e)

# JSON example: the jq schema ".[]" selects each element of the top-level array.
try:
    json_records = [{"content": "Item1"}, {"content": "Item2"}]
    with open("data.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(json_records))
    json_loader = JSONLoader(file_path="./data.json", jq_schema=".[]", text_content=False)
    json_documents = json_loader.load()
    print("JSON docs:", len(json_documents))
except Exception as e:
    print("JSONLoader error:", e)

# Markdown example
try:
    with open("doc.md", "w", encoding="utf-8") as f:
        f.write("# Title\n\nSome markdown content.")
    md_loader = UnstructuredMarkdownLoader("./doc.md")
    md_documents = md_loader.load()
    print("Markdown docs:", len(md_documents))
except Exception as e:
    print("UnstructuredMarkdownLoader error:", e)

try:
    # Word document requires an actual .docx; skipping creation here
    # word_loader = UnstructuredWordDocumentLoader("./document.docx")
    # word_documents = word_loader.load()
    # print("Word docs:", len(word_documents))
    print("UnstructuredWordDocumentLoader example commented; provide a .docx to test.")
except Exception as e:
    print("UnstructuredWordDocumentLoader error:", e)

# Wikipedia loader (optional; requires internet)
try:
    wiki_loader = WikipediaLoader(query="Artificial Intelligence", load_max_docs=1, lang="en")
    wiki_documents = wiki_loader.load()
    print("Wikipedia docs:", len(wiki_documents))
except Exception as e:
    print("WikipediaLoader error:", e)

## 4. Text Splitters

In [None]:
# 4.1 Character-based splitting
# Splits only on blank lines (paragraph boundaries), then merges pieces up to chunk_size.
character_splitter_options = dict(
    separator="\n\n",
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)
character_splitter = CharacterTextSplitter(**character_splitter_options)
char_chunks = character_splitter.split_text(sample_text)
print(f"Character splitter produced {len(char_chunks)} chunks")
for chunk_number, chunk in enumerate(char_chunks, start=1):
    print(f"Chunk {chunk_number}: {len(chunk)} chars")

In [None]:
# 4.2 Recursive character splitting
# Separators are tried in order: paragraphs, lines, words, then single characters.
recursive_splitter_options = dict(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=250,
    chunk_overlap=25,
    length_function=len,
    is_separator_regex=False,
)
recursive_splitter = RecursiveCharacterTextSplitter(**recursive_splitter_options)
recursive_chunks = recursive_splitter.split_text(sample_text)
print(f"Recursive splitter produced {len(recursive_chunks)} chunks")
# Only the first five chunks are listed to keep the output short.
for chunk_number, chunk in enumerate(recursive_chunks[:5], start=1):
    print(f"Chunk {chunk_number}: {len(chunk)} chars")

In [None]:
# 4.3 Token-based splitter
# chunk_size and chunk_overlap are measured in tokens of the named encoding,
# while the report below prints each chunk's length in characters.
token_splitter_options = dict(
    chunk_size=80,
    chunk_overlap=20,
    encoding_name="cl100k_base",
)
token_splitter = TokenTextSplitter(**token_splitter_options)
token_chunks = token_splitter.split_text(sample_text)
print(f"Token splitter produced {len(token_chunks)} chunks")
for chunk_number, chunk in enumerate(token_chunks[:5], start=1):
    print(f"Chunk {chunk_number}: {len(chunk)} chars")

In [None]:
# 4.4 Semantic splitter (requires embeddings; will fallback if not available)
# Any failure (missing key, network error, ...) is reported and the cell is skipped.
try:
    embeddings = OpenAIEmbeddings()
    # Lightweight semantic chunking approximation: split by paragraphs, score with embeddings
    paragraphs = sample_text.split("\n\n")
    para_embeddings = embeddings.embed_documents(paragraphs)
    print(f"Computed {len(para_embeddings)} paragraph embeddings.")
    # Display the Euclidean norm of each paragraph vector
    paragraph_norms = map(np.linalg.norm, para_embeddings)
    for para_number, para_norm in enumerate(paragraph_norms, start=1):
        print(f"Para {para_number} norm: {para_norm:.4f}")
except Exception as e:
    print("Semantic splitting skipped (embeddings not available):", e)

## 5. Document Transformers: Cleaning and Metadata

In [None]:
class DocumentCleaner:
    """Remove URLs and e-mail addresses from text and collapse whitespace."""

    def __init__(self):
        # Regular expressions applied by clean_text, keyed by what they match.
        self.patterns = {
            'extra_whitespace': r'\s+',
            'urls': r'http[s]?://[^\s]+',
            'emails': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        }

    def clean_text(self, text: str) -> str:
        """Return ``text`` without URLs/e-mails and with single-space separation.

        URLs are removed before e-mail addresses, then every run of whitespace
        (including newlines) becomes one space and the ends are trimmed.
        """
        text = re.sub(self.patterns['urls'], '', text)
        text = re.sub(self.patterns['emails'], '', text)
        text = re.sub(self.patterns['extra_whitespace'], ' ', text)
        text = ' '.join(text.split())
        return text.strip()

    def transform_documents(self, documents: List[Document]) -> List[Document]:
        """Return new documents whose ``page_content`` has been cleaned.

        The input documents are not modified. Each output document receives its
        own shallow copy of the source metadata, so adding or changing keys on a
        cleaned document cannot leak back into the original.
        """
        return [
            Document(
                page_content=self.clean_text(doc.page_content),
                metadata=dict(doc.metadata or {}),
            )
            for doc in documents
        ]


docs_for_cleaning = [Document(page_content=sample_text, metadata={"source": "sample"})]
cleaner = DocumentCleaner()
cleaned_docs = cleaner.transform_documents(docs_for_cleaning)
print("Original chars:", len(docs_for_cleaning[0].page_content))
print("Cleaned chars:", len(cleaned_docs[0].page_content))
print("Cleaned preview:", cleaned_docs[0].page_content[:120], "...")

In [None]:
class MetadataExtractor:
    """Derive simple statistics and keywords from a document's text."""

    # Common short words ignored by extract_keywords; built once for the class.
    STOP_WORDS = frozenset({
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'was', 'one',
        'our', 'out', 'get', 'has', 'how', 'new', 'now', 'see', 'two', 'way', 'its',
    })

    def extract_basic_metadata(self, document: Document) -> Dict[str, Any]:
        """Return size statistics, a content hash and a timestamp for ``document``.

        Sentence and paragraph counts are heuristics: sentences are the
        non-blank pieces between runs of ``.``, ``!`` or ``?`` (so numbered
        list markers such as ``1.`` also end a "sentence"), and paragraphs are
        the non-blank pieces between blank lines. Empty pieces are not counted,
        so text that ends in punctuation is not over-counted and empty text
        yields 0 for both.
        """
        content = document.page_content
        sentences = [part for part in re.split(r'[.!?]+', content) if part.strip()]
        paragraphs = [part for part in content.split('\n\n') if part.strip()]
        return {
            'character_count': len(content),
            'word_count': len(content.split()),
            'sentence_count': len(sentences),
            'paragraph_count': len(paragraphs),
            # MD5 serves as a content fingerprint here, not as a security measure.
            'content_hash': hashlib.md5(content.encode()).hexdigest(),
            'extraction_timestamp': datetime.now().isoformat()
        }

    def extract_keywords(self, document: Document, top_k: int = 10) -> List[str]:
        """Return up to ``top_k`` most frequent words of three or more letters.

        Words are lower-cased, stop words are dropped, and ties keep the order
        in which the words first appear in the text.
        """
        content = document.page_content.lower()
        freq: Dict[str, int] = {}
        for word in re.findall(r'\b[a-zA-Z]{3,}\b', content):
            if word not in self.STOP_WORDS:
                freq[word] = freq.get(word, 0) + 1
        # sorted() is stable, so equally frequent words stay in first-seen order.
        top = sorted(freq.items(), key=lambda item: item[1], reverse=True)[:top_k]
        return [word for word, _ in top]

    def enhance_document(self, document: Document) -> Document:
        """Return a copy of ``document`` whose metadata also holds the extracted fields.

        Extracted values overwrite existing metadata keys of the same name; the
        input document and its metadata dict are left untouched.
        """
        meta = self.extract_basic_metadata(document)
        meta['keywords'] = self.extract_keywords(document)
        merged = {**document.metadata, **meta}
        return Document(page_content=document.page_content, metadata=merged)


extractor = MetadataExtractor()
enhanced = extractor.enhance_document(cleaned_docs[0])
print("Enhanced metadata keys:", list(enhanced.metadata.keys()))
print("Keywords:", enhanced.metadata.get('keywords', [])[:5])

## 6. Document Processing Pipeline

In [None]:
class DocumentProcessingPipeline:
    """Run a list of document-transforming steps in the order they were added."""

    def __init__(self):
        # Each entry is {'name': <label>, 'function': <callable taking and returning a list>}.
        self.steps: List[Dict[str, Any]] = []

    def add_step(self, fn, step_name: Optional[str] = None):
        """Append ``fn`` to the pipeline.

        Args:
            fn: Callable that takes a list of documents and returns a list of documents.
            step_name: Label printed while processing. When omitted, the
                callable's ``__name__`` is used; callables without one (for
                example ``functools.partial`` objects) fall back to their type name.
        """
        name = step_name or getattr(fn, '__name__', type(fn).__name__)
        self.steps.append({'name': name, 'function': fn})

    def process(self, documents: List[Document]) -> List[Document]:
        """Feed ``documents`` through every step and return the final list.

        Progress (step name and output size) is printed after each step.
        """
        out = documents
        for step in self.steps:
            print(f"Applying step: {step['name']}")
            out = step['function'](out)
            print(f"  Output: {len(out)} documents")
        return out


def clean_documents(documents: List[Document]) -> List[Document]:
    """Pipeline step: clean every document with DocumentCleaner."""
    return DocumentCleaner().transform_documents(documents)


def split_documents(documents: List[Document]) -> List[Document]:
    """Pipeline step: split documents into chunks of up to 300 characters with 50 overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    return splitter.split_documents(documents)


def enhance_metadata(documents: List[Document]) -> List[Document]:
    """Pipeline step: add extracted statistics and keywords to each document's metadata."""
    extractor = MetadataExtractor()
    return [extractor.enhance_document(doc) for doc in documents]


pipeline = DocumentProcessingPipeline()
pipeline.add_step(lambda docs: docs, "Initial")
pipeline.add_step(clean_documents, "Clean")
pipeline.add_step(split_documents, "Split")
pipeline.add_step(enhance_metadata, "Enhance metadata")

sample_docs = [
    Document(page_content=sample_text, metadata={"source": "sample1.txt"}),
    Document(page_content="This is another sample document for pipeline testing.", metadata={"source": "sample2.txt"})
]
processed_docs = pipeline.process(sample_docs)
print(f"Final output: {len(processed_docs)} processed chunks")
print("Preview chunk 1:", processed_docs[0].page_content[:120], "...")

## 7. Multi-format Directory Processor

In [None]:
class MultiFormatProcessor:
    """Load every supported file under a directory and split the result into chunks."""

    def __init__(self):
        # Maps a lower-case file extension to the loader class used for it.
        self.loaders = {
            '.txt': TextLoader,
            '.pdf': PyPDFLoader,
            '.docx': UnstructuredWordDocumentLoader,
            '.md': UnstructuredMarkdownLoader
        }
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=60)

    def process_directory(self, directory_path: str) -> List[Document]:
        """Recursively load supported files below ``directory_path`` and chunk them.

        Files are visited in sorted path order so repeated runs produce the
        same chunk order regardless of how the filesystem lists entries. Files
        with an unknown extension are ignored; a file whose loader fails is
        reported and skipped.

        Returns:
            The chunks of all loaded documents, each carrying ``file_path``,
            ``file_name``, ``file_type`` and ``file_size`` metadata; an empty
            list when nothing could be loaded.
        """
        documents: List[Document] = []
        for file_path in sorted(Path(directory_path).rglob('*')):
            if not file_path.is_file():
                continue
            ext = file_path.suffix.lower()
            loader_cls = self.loaders.get(ext)
            if loader_cls is None:
                continue
            try:
                file_docs = loader_cls(str(file_path)).load()
                # Identical for every document of this file, so it is built (and the
                # file stat'ed) once rather than once per document.
                file_metadata = {
                    'file_path': str(file_path),
                    'file_name': file_path.name,
                    'file_type': ext,
                    'file_size': file_path.stat().st_size
                }
                for doc in file_docs:
                    doc.metadata.update(file_metadata)
                documents.extend(file_docs)
                print(f"Processed: {file_path}")
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
        if documents:
            documents = self.splitter.split_documents(documents)
        return documents

# Example usage (commented):
# processor = MultiFormatProcessor()
# all_docs = processor.process_directory('./documents')
# print(f"Processed {len(all_docs)} document chunks")

## 8. Document Similarity Analysis

In [None]:
class DocumentAnalyzer:
    """Rank documents by cosine similarity to a query.

    OpenAI embeddings are used when they can be constructed; otherwise the
    class falls back to bag-of-words count vectors over words of three or more
    letters.
    """

    # Word pattern shared by every bag-of-words helper so that document and
    # query vectors are always built with the same tokenisation.
    _WORD_RE = re.compile(r'\b[a-zA-Z]{3,}\b')

    def __init__(self):
        self.embeddings = None
        try:
            self.embeddings = OpenAIEmbeddings()
        except Exception as e:
            print("Embeddings unavailable:", e)

    def _tokenize(self, text: str) -> List[str]:
        """Return the lower-cased words of ``text`` that match the word pattern."""
        return self._WORD_RE.findall(text.lower())

    def _build_vocab(self, texts: List[str]) -> List[str]:
        """Return the sorted list of distinct words appearing in ``texts``."""
        return sorted({word for text in texts for word in self._tokenize(text)})

    def _bow_vector(self, text: str, vocab_index: Dict[str, int]) -> np.ndarray:
        """Count the words of ``text`` into a vector laid out by ``vocab_index``.

        Words missing from the vocabulary are ignored.
        """
        vec = np.zeros(len(vocab_index), dtype=float)
        for word in self._tokenize(text):
            position = vocab_index.get(word)
            if position is not None:
                vec[position] += 1
        return vec

    def _bow_matrix(self, texts: List[str], vocab: List[str]) -> np.ndarray:
        """Stack one bag-of-words row per text, using ``vocab`` for the columns."""
        vocab_index = {word: i for i, word in enumerate(vocab)}
        return np.vstack([self._bow_vector(text, vocab_index) for text in texts])

    def compute_embeddings(self, documents: List[Document]) -> np.ndarray:
        """Return one vector per document as the rows of a 2-D array.

        Uses the embedding model when available, else bag-of-words counts over
        the vocabulary of ``documents``. An empty input yields an empty array.
        """
        texts = [doc.page_content for doc in documents]
        if not texts:
            return np.empty((0, 0))
        if self.embeddings is None:
            # Fallback: simple bag-of-words frequency vector
            return self._bow_matrix(texts, self._build_vocab(texts))
        return np.array(self.embeddings.embed_documents(texts))

    def find_similar_documents(self, query: str, documents: List[Document], top_k: int = 3) -> List[tuple]:
        """Return up to ``top_k`` ``(document, similarity)`` pairs, best first.

        Returns an empty list when ``documents`` is empty.
        """
        if not documents:
            return []
        if self.embeddings is None:
            # Fallback: the query is featurised with the documents' vocabulary so
            # both sides share the same dimensions.
            texts = [doc.page_content for doc in documents]
            vocab = self._build_vocab(texts)
            doc_vecs = self._bow_matrix(texts, vocab)
            q_vec = self._bow_vector(query, {word: i for i, word in enumerate(vocab)})
        else:
            doc_vecs = self.compute_embeddings(documents)
            q_vec = np.array(self.embeddings.embed_query(query))
        sims = [cosine_sim(q_vec, doc_vec) for doc_vec in doc_vecs]
        top_idx = np.argsort(sims)[::-1][:top_k]
        return [(documents[int(i)], sims[int(i)]) for i in top_idx]


analyzer = DocumentAnalyzer()
subset_docs = [Document(page_content=sample_text), Document(page_content="LLM apps need document chunking and retrieval.")]
results = analyzer.find_similar_documents("What are LangChain components?", subset_docs, top_k=2)
for doc, sim in results:
    print(f"Similarity: {sim:.3f} | Preview: {doc.page_content[:60]}...")

## 9. Document Q&A System (Exercise Solution)

In [None]:
# PromptTemplate, StrOutputParser and ChatOpenAI are imported in the setup cell at
# the top of the notebook. The previous `langchain_core.chains.LLMChain` import
# referred to a module that does not exist; the answer is now produced with a
# prompt | model | parser pipeline instead.

class DocumentQASystem:
    """Minimal retrieval-augmented Q&A over a handful of local files.

    Documents are loaded, split into chunks and vectorised (OpenAI embeddings
    when available, otherwise term-frequency vectors). A question is answered
    by retrieving the most similar chunks and passing them to the chat model
    as context.
    """

    # Words of three or more letters; used by the term-frequency fallback.
    _WORD_RE = re.compile(r'\b[a-zA-Z]{3,}\b')

    def __init__(self):
        try:
            self.embeddings = OpenAIEmbeddings()
        except Exception as e:
            print("Embeddings not available:", e)
            self.embeddings = None
        # Guarded like the embeddings so that loading and retrieval still work
        # when the chat model cannot be constructed.
        try:
            self.llm = ChatOpenAI(model="gpt-3.5-turbo")
        except Exception as e:
            print("LLM not available:", e)
            self.llm = None
        self.documents: List[Document] = []
        self.doc_embeddings = None
        # Vocabulary of the term-frequency fallback; set by process_documents.
        self._vocab: Optional[List[str]] = None

    def _build_vocab(self, texts: List[str]) -> List[str]:
        """Return the sorted list of distinct lower-cased words in ``texts``."""
        return sorted({word for text in texts for word in self._WORD_RE.findall(text.lower())})

    def _tf_vector(self, text: str, vocab_index: Dict[str, int]) -> np.ndarray:
        """Count the words of ``text`` into a vector laid out by ``vocab_index``."""
        vec = np.zeros(len(vocab_index), dtype=float)
        for word in self._WORD_RE.findall(text.lower()):
            position = vocab_index.get(word)
            if position is not None:
                vec[position] += 1
        return vec

    def load_documents(self, file_paths: List[str]) -> None:
        """Replace the current documents with the contents of ``file_paths``.

        Only existing ``.txt`` and ``.pdf`` files are loaded; anything else is
        reported and skipped, as is any file whose loader raises.
        """
        self.documents = []
        for fp in file_paths:
            try:
                if not os.path.exists(fp):
                    print(f"Skipping unsupported or missing file: {fp}")
                    continue
                if fp.endswith('.txt'):
                    loader = TextLoader(fp, encoding="utf-8")
                elif fp.endswith('.pdf'):
                    loader = PyPDFLoader(fp)
                else:
                    print(f"Skipping unsupported or missing file: {fp}")
                    continue
                self.documents.extend(loader.load())
                print(f"Loaded: {fp}")
            except Exception as e:
                print(f"Error loading {fp}: {e}")

    def process_documents(self, chunk_size: int = 500, chunk_overlap: int = 100) -> None:
        """Split the loaded documents into chunks and vectorise every chunk.

        ``self.documents`` is replaced by the chunks, so calling this twice
        without reloading splits the chunks again.
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        self.documents = splitter.split_documents(self.documents)
        texts = [doc.page_content for doc in self.documents]
        self._vocab = None
        if not texts:
            # Nothing to embed; retrieval will report that no documents were found.
            self.doc_embeddings = None
        elif self.embeddings is not None:
            self.doc_embeddings = np.array(self.embeddings.embed_documents(texts))
        else:
            # Fallback: simple TF (term frequency) vectors
            self._vocab = self._build_vocab(texts)
            vocab_index = {word: i for i, word in enumerate(self._vocab)}
            self.doc_embeddings = np.vstack([self._tf_vector(text, vocab_index) for text in texts])
        print(f"Processed {len(self.documents)} chunks")

    def retrieve_documents(self, query: str, top_k: int = 3) -> List[Document]:
        """Return up to ``top_k`` chunks most similar to ``query``, best first.

        Returns an empty list when no chunks have been processed.
        """
        if not self.documents or self.doc_embeddings is None:
            return []
        if self.embeddings is not None:
            q_vec = np.array(self.embeddings.embed_query(query))
        else:
            vocab = self._vocab
            if vocab is None:
                # Vectors were supplied without process_documents; rebuild the vocabulary.
                vocab = self._build_vocab([d.page_content for d in self.documents])
            q_vec = self._tf_vector(query, {word: i for i, word in enumerate(vocab)})
        sims = [cosine_sim(q_vec, doc_vec) for doc_vec in self.doc_embeddings]
        idx = np.argsort(sims)[::-1][:top_k]
        return [self.documents[int(i)] for i in idx]

    def answer_question(self, question: str) -> str:
        """Answer ``question`` from the three most similar chunks.

        Returns a fixed message instead of calling the model when nothing was
        retrieved or when the chat model could not be constructed.
        """
        retrieved = self.retrieve_documents(question, top_k=3)
        if not retrieved:
            return "No relevant documents found."
        if self.llm is None:
            return "LLM not available; cannot generate an answer."
        context = "\n\n".join([d.page_content for d in retrieved])
        prompt = PromptTemplate(
            template=(
                """Based on the following context, answer the question carefully.

Context:
{context}

Question: {question}

Answer:"""
            ),
            input_variables=["context", "question"]
        )
        # prompt -> chat model -> plain string
        chain = prompt | self.llm | StrOutputParser()
        return chain.invoke({"context": context, "question": question})


# Demo: build small corpus and Q&A
with open("qa1.txt", "w", encoding="utf-8") as f:
    f.write("LangChain provides chains, prompts, memory, agents, and integrations.")
with open("qa2.txt", "w", encoding="utf-8") as f:
    f.write("Text splitting helps chunk large documents for better retrieval and processing.")

qa = DocumentQASystem()
qa.load_documents(["qa1.txt", "qa2.txt"])
qa.process_documents()
answer = qa.answer_question("What components does LangChain provide?")
print("Q&A Answer:", answer[:200], "...")

## 10. Summary

In this notebook, we've covered:

- Loading documents from various sources and formats
- Applying different splitting strategies
- Cleaning and enriching documents with metadata
- Building a reusable processing pipeline
- Computing and using similarity for retrieval
- Implementing a simple document Q&A system

These techniques are foundational to RAG and other document-centric LLM applications.