# Document Loaders

Many types of documents can be loaded, including integrations, which we'll cover in the next notebook. All of the available document loaders are listed here: https://python.langchain.com/docs/modules/data_connection/document_loaders/

Keep in mind that many loaders depend on other libraries, so issues in those libraries can end up breaking the LangChain loaders.

In [1]:
import sys
# Extend the import search path with the directory two levels up so the
# `document_loaders` imports below can resolve.
# NOTE(review): assumes the `document_loaders` package lives there — confirm.
sys.path.append('../../')

from dotenv import find_dotenv, load_dotenv
# Load variables from the .env file located by find_dotenv(); override=True
# lets them replace values already present in the process environment.
load_dotenv(find_dotenv(), override=True)

from langchain_core.documents import Document

# Loader helpers used by the cells below (presumably project-local modules).
from document_loaders.load_document import load_document, load_document_lazy, DocumentLoader
from document_loaders.load_directory import load_directory, load_directory_lazy, DirectoryLoader

In [2]:
# Splitter settings passed via `splitter_kwargs` in the cells below.
CHUNK_OVERLAP = 50  # value supplied as "chunk_overlap"
CHUNK_SIZE = 500    # value supplied as "chunk_size"

## CSV

In [3]:
# Load the sample CSV with the "auto" text splitter.
# NOTE(review): the saved output of this cell is
# "RuntimeError: Error loading ./files/penguins.csv" — confirm the file
# exists relative to the notebook's working directory.
docs = load_document(
    './files/penguins.csv',
    text_splitter="auto",
)

RuntimeError: Error loading ./files/penguins.csv

In [None]:
# Show the type of the object currently bound to `docs`.
type(docs)

In [None]:
# Show the type of the first element of `docs`.
type(docs[0])

In [None]:
# Number of elements in `docs`.
len(docs)

In [None]:
# Display the first element of `docs`.
docs[0]

## HTML

In [None]:
# Load the sample HTML file with the "auto" text splitter, then report how
# many elements came back.
docs = load_document(
    "./files/some_website.html",
    text_splitter="auto",
)

len(docs)

In [None]:
# Display the first element of `docs`.
docs[0]

In [None]:
# Load the same HTML file again, this time passing mode="raw", and report
# how many elements came back for comparison.
docs = load_document(
    "./files/some_website.html",
    mode="raw",
    text_splitter="auto",
)

len(docs)

In [None]:
# Display the first element of `docs`.
docs[0]

## PDF

In [None]:
# Load the sample PDF with the "auto" text splitter and display the full
# result.
docs = load_document(
    './files/some_report.pdf',
    text_splitter="auto",
)

docs

## JSON

In [None]:
# Load the sample JSON file with the "auto" text splitter, then report how
# many elements came back.
docs = load_document(
    "./files/some_json.json",
    text_splitter="auto",
)

len(docs)

In [None]:
# Text of the last element of `docs` (negative index instead of len(docs)-1).
docs[-1].page_content

## Programming Language

In [None]:
# Load a Python source file with the "auto" text splitter, then report how
# many elements came back.
docs = load_document(
    '../multi_vectorstore.py',
    text_splitter="auto",
)

len(docs)

In [None]:
# Print the first 49 characters of the first element's text.
print(docs[0].page_content[:49])

## Wikipedia

In [None]:
# Load the "Lionel Messi" Wikipedia entry (lang='es', load_max_docs=1) and
# split it with the "recursive" splitter using CHUNK_SIZE / CHUNK_OVERLAP.
# NOTE(review): `load_from_wikipedia` is not imported in the setup cell of
# this notebook — confirm where it is defined and import it.
docs = load_from_wikipedia(
    "Lionel Messi",
    lang='es',
    load_max_docs=1,
    text_splitter="recursive",
    splitter_kwargs={
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
    },
)

len(docs)

In [None]:
# Print the text of the first element of `docs`.
print(docs[0].page_content)

## Web

In [None]:
# Load a web page using the "html.parser" parser, forwarding strip=True via
# `bs_get_text_kwargs`, and split it with the "recursive" splitter using
# CHUNK_SIZE / CHUNK_OVERLAP.
# NOTE(review): `load_from_web` is not imported in the setup cell of this
# notebook — confirm where it is defined and import it.
docs = load_from_web(
    "https://www.money.com",
    default_parser="html.parser",
    bs_get_text_kwargs={"strip": True},
    text_splitter="recursive",
    splitter_kwargs={
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
    },
)

len(docs)

In [None]:
# Print the text of the first element of `docs`.
print(docs[0].page_content)

In [None]:
# Load the same URL through `load_with_chromium` with transform=True,
# extracting only "span" tags, and split it with the "recursive" splitter
# using CHUNK_SIZE / CHUNK_OVERLAP.
# NOTE(review): `load_with_chromium` is not imported in the setup cell of
# this notebook — confirm where it is defined and import it.
docs = load_with_chromium(
    "https://www.money.com",
    transform=True,
    tags_to_extract=["span"],
    text_splitter="recursive",
    splitter_kwargs={
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
    },
)

len(docs)

In [None]:
# Print the text of the first element of `docs`.
print(docs[0].page_content)

## Load by file extension

In [None]:
# Load every ".py" file under the parent directory (recursive=True) and
# report how many elements came back.
# NOTE(review): `load_all_documents` is not imported in the setup cell of
# this notebook — confirm where it is defined and import it.
docs = load_all_documents(
    "../",
    ".py",
    recursive=True,
)

len(docs)

In [None]:
# Count what the lazy variant yields without first collecting every item
# into a list.
# NOTE(review): `load_all_documents_lazy` is not imported in the setup cell
# of this notebook — confirm where it is defined and import it.
sum(1 for _ in load_all_documents_lazy("../", ".py", recursive=True))

In [None]:
# Build a reusable loader object for the same ".py" search.
# NOTE(review): `LoadAllDocuments` is not imported in the setup cell of this
# notebook — confirm where it is defined and import it.
loader = LoadAllDocuments(
    "../",
    ".py",
    recursive=True,
)

In [None]:
# Run the loader's load() method and report how many elements it returned.
docs = loader.load()

len(docs)

In [None]:
# Count what lazy_load() yields without first collecting every item into a
# list.
sum(1 for _ in loader.lazy_load())

### Load directories

In [None]:
# Import from langchain_community: the `langchain.document_loaders` path is a
# deprecated re-export of the community package.
from langchain_community.document_loaders import PythonLoader

In [None]:
# Load every file matching "**/*.py" under the parent directory with
# PythonLoader, passing silent_errors=True, then report how many elements
# came back.
docs = load_directory(
    "../",
    glob="**/*.py",
    loader_cls=PythonLoader,
    silent_errors=True,
)

len(docs)

In [None]:
# Count what the directory scan yields without first collecting every item
# into a list.
# NOTE(review): `scan_load_directory` is not imported in the setup cell of
# this notebook (which imports `load_directory_lazy` but never uses it) —
# confirm which name is intended.
sum(
    1
    for _ in scan_load_directory(
        "../",
        recursive=True,
        file_filter="*.py",
        autodetect_encoding=True,
    )
)

In [None]:
# Build a reusable loader object for the same "*.py" directory scan.
# NOTE(review): `ScanLoadDirectory` is not imported in the setup cell of this
# notebook (which imports `DirectoryLoader` but never uses it) — confirm
# which name is intended.
loader = ScanLoadDirectory(
    "../",
    recursive=True,
    file_filter="*.py",
    autodetect_encoding=True,
)

In [None]:
# Run the loader's load() method and report how many elements it returned.
docs = loader.load()

len(docs)

In [None]:
# Count what lazy_load() yields without first collecting every item into a
# list.
sum(1 for _ in loader.lazy_load())