# Recursive URL Loader
Web上のドキュメントをクローリングし、ローカルに保存します。

In [None]:
!pip install langchain-community
!pip install bs4

In [8]:
# recursive_url_loader の動作確認用コード
# 再帰的なコンテンツ取得ができているかを確認する
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup

# Crawl the document-loader section of the LangChain docs and reduce each
# fetched page to plain text with BeautifulSoup.
url = "https://python.langchain.com/docs/integrations/document_loaders/"
loader = RecursiveUrlLoader(
    url=url,
    max_depth=3,
    extractor=lambda raw_html: Soup(raw_html, "html.parser").text,
)
docs = loader.load()

# Report how many pages were fetched, then show a one-line preview and the
# metadata of the last page as a sanity check.
print(f"ページ数:{len(docs)}")
last_doc = docs[-1]
preview = last_doc.page_content[:50].replace("\n", " ")
print("テキスト:" + preview)
print(f"メタデータ:{last_doc.metadata}")

ページ数:159
テキスト:     Dropbox | 🦜️🔗 Langchain        Skip to main c
メタデータ:{'source': 'https://python.langchain.com/docs/integrations/document_loaders/dropbox', 'title': 'Dropbox | 🦜️🔗 Langchain', 'description': 'Dropbox is a file hosting', 'language': 'en'}


In [48]:
# URL配下のドキュメントを再帰的に取得し、CSVファイルに保存
import os
from datetime import datetime
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup
import os
import csv

# Root URL handed to RecursiveUrlLoader as the starting point of the crawl.
URL_ROOT = "https://python.langchain.com/docs/integrations/document_loaders/"
# Value passed as ``max_depth`` to RecursiveUrlLoader in main().
MAX_DEPTH = 2

def save_html_to_file(content: str, path) -> None:
    """Write ``content`` to ``path`` as UTF-8 text.

    The file is opened in write mode, so an existing file at ``path`` is
    overwritten.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        out_file.write(content)

def save_to_csv(docs, directory):
    """Write one CSV row per document to ``contents.csv`` inside ``directory``.

    Args:
        docs: Iterable of document objects exposing ``page_content`` (str)
            and ``metadata`` (dict).
        directory: Existing directory in which ``contents.csv`` is created
            (an existing file of that name is overwritten).

    Raises:
        KeyError: If a document's metadata has no ``"source"`` entry.

    The columns are ``source``, ``title``, ``description``, ``content``,
    ``language`` and ``docs_updated_at``. Metadata keys other than
    ``source`` are optional: a page without, for example, a title or a
    language is written with an empty cell instead of aborting the export
    with a ``KeyError``.
    """
    csv_path = os.path.join(directory, "contents.csv")
    fieldnames = [
        "source",
        "title",
        "description",
        "content",
        "language",
        "docs_updated_at",
    ]
    # Metadata entries that may legitimately be absent for a given page.
    optional_fields = ("title", "description", "language", "docs_updated_at")

    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for doc in docs:
            metadata = doc.metadata
            row = {key: metadata.get(key, "") for key in optional_fields}
            row["source"] = metadata["source"]
            # Store newlines as a literal backslash-n so each document's
            # text stays on a single physical line of the CSV.
            row["content"] = doc.page_content.replace("\n", "\\n")
            writer.writerow(row)

def main():
    """Crawl ``URL_ROOT`` and save the fetched pages as a CSV file.

    Pages are loaded with RecursiveUrlLoader using ``MAX_DEPTH`` and reduced
    to plain text with BeautifulSoup. The result is passed to
    ``save_to_csv`` together with a freshly created, timestamp-named
    directory under the current working directory.
    """

    def _html_to_text(raw_html):
        # Strip the markup and keep only the text of the page.
        return Soup(raw_html, "html.parser").text

    crawler = RecursiveUrlLoader(
        url=URL_ROOT,
        max_depth=MAX_DEPTH,
        extractor=_html_to_text,
    )
    pages = crawler.load()

    # Name the output directory after the current local time so that each
    # run writes to its own directory.
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    output_dir = f"./page_contents_{timestamp}/"
    os.makedirs(output_dir, exist_ok=True)

    save_to_csv(pages, output_dir)

# Run the crawl-and-save pipeline when this cell is executed.
main()