# Directory Loaders

Directory loaders scan the files and subdirectories under the given directory path and call the load_document function on
each file that matches the directory filter and the file filter.

load_document uses the file extension to choose the right loader and applies the prefixed parameters for that loader type.

In [1]:
import sys
sys.path.append('../../')

from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv(), override=True)

from pprint import pprint

from document_loaders.load_directory import load_directory, load_directory_lazy, DirectoryLoader
from document_loaders.file_error_handling import FileErrorDB

In [2]:
file_error_db = FileErrorDB("file_errors.sql")

In [3]:
file_error_db.get_all_errors(group_by='extension').keys()

dict_keys(['', 'MOV', 'afpub', 'atx', 'bak', 'bin', 'blurb', 'css', 'epub', 'essentialsound', 'ffs_batch', 'ffs_gui', 'guides', 'h', 'in', 'ini', 'json', 'key', 'mom', 'mp4', 'numbers', 'olp', 'omo', 'pages', 'pbg', 'pbk', 'pfx', 'pjd', 'plist', 'pyw', 'rc', 'rdp', 'saproj', 'sql', 'uasset', 'umap', 'uproject', 'url', 'ush', 'vcf', 'xml', 'xpm', 'zip'])

In [None]:
def print_docs_per_extension(docs):
    """Pretty-print how many documents were loaded per file extension.

    Args:
        docs: Iterable of document objects. Each must expose a ``metadata``
            mapping whose ``"source"`` entry is the path of the originating
            file.

    The extension is taken from the final path component only and is reported
    without its leading dot. Files that have no extension are counted under
    the empty string ``''`` instead of under their whole path, and a dot in a
    parent directory name is never mistaken for an extension separator.
    """
    # Local imports keep this cell self-contained.
    from collections import Counter
    from pathlib import PurePath

    docs_per_extension = Counter(
        # ``suffix`` is either '' or '.ext'; slicing drops the leading dot.
        PurePath(doc.metadata["source"]).suffix[1:]
        for doc in docs
    )
    # Convert back to a plain dict so the printed form stays ``{...}``.
    pprint(dict(docs_per_extension))

In [None]:
DIR_PATH = "/Users/antonio/Documents"

## load_directory

In [None]:
# Eagerly load everything under DIR_PATH with a single load_directory call.
# NOTE(review): both filters look like glob patterns that skip names starting
# with "." (hidden files/directories) — confirm against load_directory.
# NOTE(review): on_file_error is presumably invoked for files that fail to
# load, recording them in file_error_db — confirm against load_directory.
docs = load_directory(
            DIR_PATH,
            dir_filter="**/[!.]*",
            file_filter="[!.]*",
            text_splitter="auto",
            on_file_error=file_error_db.add_file_error
        )

# Number of items returned by the call above.
len(docs)

In [None]:
print_docs_per_extension(docs)

## load_directory_lazy

In [None]:
# Iterate over load_directory_lazy one document at a time, collecting the
# results into a list so they can be counted and summarized afterwards.
# NOTE(review): unlike the load_directory call earlier in this notebook, no
# on_file_error callback is passed here — confirm whether that is intentional.
docs = []
for doc in load_directory_lazy(DIR_PATH, dir_filter="**/[!.]*", file_filter="[!.]*", text_splitter="auto"):
    docs.append(doc)

# Number of documents collected from the iteration above.
len(docs)

In [None]:
print_docs_per_extension(docs)

## DirectoryLoader

In [None]:
loader = DirectoryLoader(DIR_PATH, dir_filter="**/[!.]*", file_filter="[!.]*", text_splitter="auto")

In [None]:
docs = loader.load()

len(docs)

In [None]:
print_docs_per_extension(docs)

In [None]:
# Same DirectoryLoader instance as above, consumed through lazy_load():
# documents are appended one by one as the loop iterates.
docs = []
for doc in loader.lazy_load():
    docs.append(doc)

# Number of documents collected from the iteration above.
len(docs)

In [None]:
print_docs_per_extension(docs)