# Directory Loaders

Directory loaders scan files and subdirectories in the given directory path and calls the load_document function on
each file that matches the directory filter and the file filter.

load_document will use the file extension to choose the right loader and use the prefixed parameters for each loader type.

In [1]:
import sys
sys.path.append('../../')

from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv(), override=True)

from pprint import pprint

from document_loaders.load_directory import load_directory, load_directory_lazy, DirectoryLoader

In [2]:
def print_docs_per_source(docs):
    docs_per_source = {}
    for doc in docs:
        source = doc.metadata["source"].split("/")[-1]
        if doc.metadata["source"] not in docs_per_source:
            docs_per_source[source] = 0
        docs_per_source[source] += 1
    pprint(docs_per_source)

## load_directory

In [3]:
docs = load_directory("./", dir_filter="**/[!.]*", file_filter="[!.]*", text_splitter="auto")

len(docs)



66

In [4]:
print_docs_per_source(docs)

{'document_loaders.ipynb': 2,
 'facebook_chat.json': 1,
 'facebook_chat_messages.jsonl': 1,
 'load_directory.ipynb': 1,
 'load_document_csv.ipynb': 1,
 'load_document_json.ipynb': 1,
 'mlb_teams_2012.csv': 1,
 'some_json.json': 1,
 'some_report.pdf': 1,
 'some_website.html': 1,
 'state_of_the_union.txt': 1,
 'us_constitution.pdf': 1}


## load_directory_lazy

In [5]:
docs = []
for doc in load_directory_lazy("./", dir_filter="**/[!.]*", file_filter="[!.]*", text_splitter="auto"):
    docs.append(doc)

len(docs)



66

In [6]:
print_docs_per_source(docs)

{'document_loaders.ipynb': 2,
 'facebook_chat.json': 1,
 'facebook_chat_messages.jsonl': 1,
 'load_directory.ipynb': 1,
 'load_document_csv.ipynb': 1,
 'load_document_json.ipynb': 1,
 'mlb_teams_2012.csv': 1,
 'some_json.json': 1,
 'some_report.pdf': 1,
 'some_website.html': 1,
 'state_of_the_union.txt': 1,
 'us_constitution.pdf': 1}


## DirectoryLoader

In [7]:
loader = DirectoryLoader("./", dir_filter="**/[!.]*", file_filter="[!.]*", text_splitter="auto")

In [8]:
docs = loader.load()

len(docs)



66

In [9]:
print_docs_per_source(docs)

{'document_loaders.ipynb': 2,
 'facebook_chat.json': 1,
 'facebook_chat_messages.jsonl': 1,
 'load_directory.ipynb': 1,
 'load_document_csv.ipynb': 1,
 'load_document_json.ipynb': 1,
 'mlb_teams_2012.csv': 1,
 'some_json.json': 1,
 'some_report.pdf': 1,
 'some_website.html': 1,
 'state_of_the_union.txt': 1,
 'us_constitution.pdf': 1}


In [10]:
docs = []
for doc in loader.lazy_load():
    docs.append(doc)

len(docs)



66

In [11]:
print_docs_per_source(docs)

{'document_loaders.ipynb': 2,
 'facebook_chat.json': 1,
 'facebook_chat_messages.jsonl': 1,
 'load_directory.ipynb': 1,
 'load_document_csv.ipynb': 1,
 'load_document_json.ipynb': 1,
 'mlb_teams_2012.csv': 1,
 'some_json.json': 1,
 'some_report.pdf': 1,
 'some_website.html': 1,
 'state_of_the_union.txt': 1,
 'us_constitution.pdf': 1}
