# [Langchain Document loaders](https://python.langchain.com/docs/integrations/document_loaders/)

In [1]:
#! source ./.venv/bin/activate

In [None]:
#! uv pip install -r requirements.txt

In [1]:
## text loader
from langchain_community.document_loaders import TextLoader
loader = TextLoader("../data/input.txt")
documents = loader.load()
print(f"Number of documents: {len(documents)}")
print(f"First document content: {documents[0].page_content}")

Number of documents: 1
First document content: Oriental Stories was an American pulp magazine published by Popular Fiction and edited by Farnsworth Wright. It was launched in 1930 as a companion to Popular Fiction's Weird Tales, and carried stories with Far Eastern settings, including some fantasy. Contributors included Robert E. Howard, Frank Owen, and E. Hoffmann Price. In 1932 publication was paused; it was relaunched in 1933 under the title The Magic Carpet Magazine, with an expanded editorial policy that now included any story set in an exotic location, including other planets. Some science fiction began to appear alongside the fantasy and adventure material as a result, including work by Edmond Hamilton. Wright obtained stories from H. Bedford Jones, who was a popular pulp writer, and Seabury Quinn. Most of the covers of The Magic Carpet Magazine were by Margaret Brundage. Competition from established pulps in the same niche was too strong, and after five issues under the new tit

In [13]:
## pdf loader
from langchain_community.document_loaders import PyPDFLoader
pdf_loader = PyPDFLoader("../data/attention.pdf")
pdf_documents = pdf_loader.load()
print(f"Number of PDF documents: {len(pdf_documents)}")
print(f"First PDF document content: {pdf_documents[0].page_content}")   

Number of PDF documents: 15
First PDF document content: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗ ‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, d

In [None]:
# web base loader
from langchain_community.document_loaders import WebBaseLoader
web_loader = WebBaseLoader("https://python.langchain.com/docs/integrations/document_loaders/")
web_documents = web_loader.load()
print(f"Number of web documents: {len(web_documents)}")
print(f"First web document content: {web_documents[0].page_content[:500]}...")

In [20]:
# CSV loader
from langchain_community.document_loaders import CSVLoader
import pandas as pd

# Create a sample CSV file
sample_data = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['New York', 'London', 'Paris']
})
sample_data.to_csv('../data/sample.csv', index=False)

# Load the CSV file
csv_loader = CSVLoader(file_path='../data/sample.csv')
csv_documents = csv_loader.load()
print(f"Number of CSV documents: {len(csv_documents)}")
print(f"First CSV document content: {csv_documents[0].page_content}")

Number of CSV documents: 3
First CSV document content: name: Alice
age: 25
city: New York


In [22]:
# JSON loader
from langchain_community.document_loaders import JSONLoader
import json

# Create a sample JSON file
sample_json = {
    'employees': [
        {'name': 'Alice', 'role': 'Developer'},
        {'name': 'Bob', 'role': 'Designer'},
        {'name': 'Charlie', 'role': 'Manager'}
    ]
}

with open('../data/sample.json', 'w') as f:
    json.dump(sample_json, f)

# Define a simple jq-style function to extract data
def extract_data(record: dict):
    return record.get('employees', [])

# Load the JSON file
json_loader = JSONLoader(
    file_path='../data/sample.json',
    jq_schema='.employees[]',
    text_content=False
)
json_documents = json_loader.load()
print(f"Number of JSON documents: {len(json_documents)}")
print(f"First JSON document content: {json_documents[0].page_content}")

Number of JSON documents: 3
First JSON document content: {"name": "Alice", "role": "Developer"}


In [35]:
# HTML loader
from langchain_community.document_loaders import BSHTMLLoader
import bs4

# Create a sample HTML file
sample_html = """
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <h1>Welcome to the Sample Page</h1>
    <p>This is a paragraph of text.</p>
    <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
    </ul>
</body>
</html>
"""

with open('../data/sample.html', 'w') as f:
    f.write(sample_html)

# Load the HTML file
html_loader = BSHTMLLoader('../data/sample.html', bs_kwargs={'features': 'lxml'})
html_documents = html_loader.load()
print(f"Number of HTML documents: {len(html_documents)}")
print(f"First HTML document content: {html_documents[0].page_content}")

FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [37]:
# XML loader
from langchain_community.document_loaders import UnstructuredXMLLoader

# Create a sample XML file
sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
<library>
    <book>
        <title>The Great Gatsby</title>
        <author>F. Scott Fitzgerald</author>
        <year>1925</year>
    </book>
    <book>
        <title>1984</title>
        <author>George Orwell</author>
        <year>1949</year>
    </book>
</library>"""

with open('../data/sample.xml', 'w') as f:
    f.write(sample_xml)

# Load the XML file
xml_loader = UnstructuredXMLLoader('../data/sample.xml')
xml_documents = xml_loader.load()
print(f"Number of XML documents: {len(xml_documents)}")
print(f"XML document content: {xml_documents[0].page_content}")

Number of XML documents: 1
XML document content: The Great Gatsby

F. Scott Fitzgerald

1925

1984

George Orwell

1949


In [39]:
# Markdown loader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Create a sample Markdown file
sample_markdown = """# Project Documentation

## Introduction
This is a sample markdown file for testing the document loader.

### Features
- Easy to read
- Simple formatting
- Supports **bold** and *italic* text

## Code Example
```python
def hello_world():
    print("Hello, World!")
```
"""

with open('../data/sample.md', 'w') as f:
    f.write(sample_markdown)

# Load the Markdown file
md_loader = UnstructuredMarkdownLoader('../data/sample.md')
md_documents = md_loader.load()
print(f"Number of Markdown documents: {len(md_documents)}")
print(f"Markdown document content: {md_documents[0].page_content}")

Number of Markdown documents: 1
Markdown document content: Project Documentation

Introduction

This is a sample markdown file for testing the document loader.

Features

Easy to read

Simple formatting

Supports bold and italic text

Code Example

def hello_world():
    print("Hello, World!")


In [42]:
# Directory loader
from langchain_community.document_loaders import DirectoryLoader
import os

# Create a directory with multiple files
os.makedirs('../data/sample_dir', exist_ok=True)

# Create a few text files in the directory
files_content = {
    'file1.txt': 'This is the content of file 1.',
    'file2.txt': 'This is the content of file 2.',
    'file3.txt': 'This is the content of file 3.'
}

for filename, content in files_content.items():
    with open(f'../data/sample_dir/{filename}', 'w') as f:
        f.write(content)

# Load all text files from the directory
dir_loader = DirectoryLoader(
    '../data/sample_dir',
    glob='**/*.txt',  # Load all .txt files recursively
    show_progress=True
)
dir_documents = dir_loader.load()
print(f"Number of documents loaded from directory: {len(dir_documents)}")
for doc in dir_documents:
    print(f"\nDocument source: {doc.metadata['source']}")
    print(f"Content: {doc.page_content}")

  0%|          | 0/3 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 3/3 [00:00<00:00, 289.88it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 3/3 [00:00<00:00, 289.88it/s]

Number of documents loaded from directory: 3

Document source: ../data/sample_dir/file2.txt
Content: This is the content of file 2.

Document source: ../data/sample_dir/file3.txt
Content: This is the content of file 3.

Document source: ../data/sample_dir/file1.txt
Content: This is the content of file 1.





In [43]:
# Email loader
from langchain_community.document_loaders import UnstructuredEmailLoader

# Create a sample email file
sample_email = """From: sender@example.com
To: recipient@example.com
Subject: Test Email
Date: Sat, 26 Oct 2025 10:00:00 -0500

Dear User,

This is a test email for demonstrating the UnstructuredEmailLoader.

Best regards,
Sender
"""

with open('../data/sample.eml', 'w') as f:
    f.write(sample_email)

# Load the email file
email_loader = UnstructuredEmailLoader('../data/sample.eml')
email_documents = email_loader.load()
print(f"Number of email documents: {len(email_documents)}")
print(f"Email document content: {email_documents[0].page_content}")

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


Number of email documents: 1
Email document content: Dear User,

This is a test email for demonstrating the UnstructuredEmailLoader.

Best regards,

Sender


In [49]:
# Install additional required packages
#!uv pip install "unstructured[all-docs]" pytesseract "pdfminer.six>=20221105" python-pptx openpyxl pillow lxml feedparser tesseract tesseract-ocr

In [None]:
# Install python-dotenv if not already installed
!pip install python-dotenv

In [50]:
# Image loader
from langchain_community.document_loaders import UnstructuredImageLoader
from PIL import Image, ImageDraw, ImageFont
import numpy as np

# Create a sample image with text
img = Image.new('RGB', (400, 100), color='white')
d = ImageDraw.Draw(img)
text = "This is sample text in an image"
d.text((10, 40), text, fill='black')
img.save('../data/sample_text.png')

# Alternative approach using unstructured directly
from unstructured.partition.image import partition_image
from langchain_core.documents import Document

# Load and extract text from the image
elements = partition_image(filename='../data/sample_text.png')
image_documents = [Document(page_content="\n\n".join([str(el) for el in elements]))]
print(f"Number of documents from image: {len(image_documents)}")
print(f"Extracted text from image: {image_documents[0].page_content}")

ImportError: cannot import name 'open_filename' from 'pdfminer.utils' (/Users/aamanlamba/Code/langchain/.venv/lib/python3.12/site-packages/pdfminer/utils.py)

In [53]:
# PowerPoint loader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from pptx import Presentation

# Create a sample PowerPoint file
prs = Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
body = slide.shapes.placeholders[1]
title.text = "Sample Presentation"
body.text = "This is a bullet point\nThis is another bullet point"
prs.save('../data/sample.pptx')

# Load the PowerPoint file
pptx_loader = UnstructuredPowerPointLoader('../data/sample.pptx')
pptx_documents = pptx_loader.load()
print(f"Number of documents from PowerPoint: {len(pptx_documents)}")
print(f"Extracted text from PowerPoint: {pptx_documents[0].page_content}")

Number of documents from PowerPoint: 1
Extracted text from PowerPoint: Sample Presentation

This is a bullet point

This is another bullet point


In [54]:
# Excel loader
from langchain_community.document_loaders import UnstructuredExcelLoader
import pandas as pd

# Create a sample Excel file
data = {
    'Name': ['John', 'Alice', 'Bob'],
    'Age': [30, 25, 35],
    'Department': ['IT', 'HR', 'Finance']
}
df = pd.DataFrame(data)
df.to_excel('../data/sample.xlsx', index=False)

# Load the Excel file
excel_loader = UnstructuredExcelLoader('../data/sample.xlsx')
excel_documents = excel_loader.load()
print(f"Number of documents from Excel: {len(excel_documents)}")
print(f"Extracted text from Excel: {excel_documents[0].page_content}")

Number of documents from Excel: 1
Extracted text from Excel: Name Age Department John 30 IT Alice 25 HR Bob 35 Finance


In [56]:
# RSS Feed loader
from langchain_community.document_loaders import RSSFeedLoader

# Load content from an RSS feed
rss_url = "http://rss.cnn.com/rss/cnn_topstories.rss"  # Example RSS feed URL
rss_loader = RSSFeedLoader(urls=[rss_url])

try:
    rss_documents = rss_loader.load()
    print(f"Number of RSS feed items: {len(rss_documents)}")
    print("\nFirst RSS item content:")
    print(f"Title: {rss_documents[0].metadata.get('title', 'No title')}")
    print(f"Link: {rss_documents[0].metadata.get('link', 'No link')}")
    print(f"Content: {rss_documents[0].page_content[:200]}...")
except Exception as e:
    print(f"Error loading RSS feed: {e}")

Error processing entry https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html, exception: newspaper package not found, please install it with `pip install newspaper3k`


Number of RSS feed items: 0

First RSS item content:
Error loading RSS feed: list index out of range


In [8]:
# Load environment variables
import os
from dotenv import load_dotenv  # Correct import from python-dotenv package
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")
print(f"GitHub Token loaded: {'Yes' if github_token else 'No'}")

GitHub Token loaded: No


In [None]:
# GitHub loader
from langchain_community.document_loaders import GitHubIssuesLoader

# To use the GitHub loader, you'll need a GitHub access token
# You can create one at https://github.com/settings/tokens
print("Using GitHub token from environment variables...")

# Load the token from environment variables
import os
from dotenv import load_dotenv  # Correct import from python-dotenv package
load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")

if not github_token:
    raise ValueError("GitHub token not found in environment variables. Please check your .env file.")

# Load issues from a GitHub repository
repo = "langchain-ai/langchain"  # Example repository
try:
    github_loader = GitHubIssuesLoader(
        repo=repo,
        access_token=github_token,
        state="open",  # can be "open", "closed", or "all"
        labels=["bug"],  # filter issues by labels
    )    
    github_documents = github_loader.load()
    print(f"\nNumber of GitHub issues loaded: {len(github_documents)}")
    print("\nFirst issue content:")
    print(f"Title: {github_documents[0].metadata.get('title', 'No title')}")
    print(f"State: {github_documents[0].metadata.get('state', 'Unknown')}")
    print(f"URL: {github_documents[0].metadata.get('url', 'No URL')}")
    print(f"Content preview: {github_documents[0].page_content[:200]}...")
except Exception as e:
    print(f"Error loading GitHub issues: {e}")
    print("Make sure your token has the necessary permissions")

To access GitHub, you'll need a GitHub Personal Access Token
Create one at https://github.com/settings/tokens with 'repo' scope
None
Error loading GitHub issues: 1 validation error for GitHubIssuesLoader
  Value error, Did not find access_token, please add an environment variable `GITHUB_PERSONAL_ACCESS_TOKEN` which contains it, or pass `access_token` as a named parameter. [type=value_error, input_value={'repo': 'langchain-ai/la...pen', 'labels': ['bug']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/value_error
Make sure your token has the necessary permissions
