# 2025-03-02 Collect Data from Dhamma Talks
In this notebook, we will collect data from the Dhamma Talks website. 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import requests
from dataclasses import dataclass
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

from scripture_search.config import Config


config = Config()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
BASE_URL = "https://www.dhammatalks.org"
DHAMMA_TALKS_DN_URL = BASE_URL + "/suttas/DN"
INDEX_PAGE_EXTENSIONS = [
    "/suttas/AN",
    "/suttas/DN",
    "/suttas/KN",
    "/suttas/MN/",
    "/suttas/SN/",
]
INDEX_PAGE_URLS = [BASE_URL + ext for ext in INDEX_PAGE_EXTENSIONS]

In [4]:
index_page_to_sutta_page_links = {}
for index_page_url in INDEX_PAGE_URLS:
    index_page = BeautifulSoup(requests.get(index_page_url).text, "html.parser")
    index_toc = index_page.find("div", class_="suttatoc")
    sutta_page_links = [
        BASE_URL + anchor["href"]
        for anchor in index_toc.find_all("a")
        if ".html" in anchor["href"]
    ]
    index_page_to_sutta_page_links[index_page_url] = sutta_page_links

for index_page_url, sutta_page_links in index_page_to_sutta_page_links.items():
    print(f"Index page URL: {index_page_url} example pages:")
    for sutta_page_link in sutta_page_links[:3]:
        print(f"  Sutta page link: {sutta_page_link}")

Index page URL: https://www.dhammatalks.org/suttas/AN example pages:
  Sutta page link: https://www.dhammatalks.org/suttas/AN/AN1_21.html
  Sutta page link: https://www.dhammatalks.org/suttas/AN/AN1_45.html
  Sutta page link: https://www.dhammatalks.org/suttas/AN/AN1_48.html
Index page URL: https://www.dhammatalks.org/suttas/DN example pages:
  Sutta page link: https://www.dhammatalks.org/suttas/DN/DN01.html
  Sutta page link: https://www.dhammatalks.org/suttas/DN/DN02.html
  Sutta page link: https://www.dhammatalks.org/suttas/DN/DN09.html
Index page URL: https://www.dhammatalks.org/suttas/KN example pages:
Index page URL: https://www.dhammatalks.org/suttas/MN/ example pages:
  Sutta page link: https://www.dhammatalks.org/suttas/MN/MN_syllabus.html
  Sutta page link: https://www.dhammatalks.org/suttas/MN/MN_syllabus.html
  Sutta page link: https://www.dhammatalks.org/suttas/MN/MN1.html
Index page URL: https://www.dhammatalks.org/suttas/SN/ example pages:
  Sutta page link: https://www.

In [5]:
@dataclass
class SuttaText:
    collection: str
    title: str
    paragraphs: list[str]
    url_source: str


data = []
for index_page_url, sutta_page_links in tqdm(index_page_to_sutta_page_links.items()):
    collection = list(filter(lambda x: len(x) > 0, index_page_url.split("/")))[-1]
    print(f"Index page URL: {index_page_url}")
    for sutta_page_link in tqdm(sutta_page_links):
        sutta_page = BeautifulSoup(requests.get(sutta_page_link).text, "html.parser")
        sutta_body = sutta_page.find("div", id="sutta")
        sutta_title = sutta_body.find("h1").text.split("\n")[0].strip()
        sutta_paragraphs = [p.text.strip() for p in sutta_body.find_all("p")]
        data.append(
            SuttaText(
                collection,
                sutta_title,
                sutta_paragraphs,
                sutta_page_link,
            )
        )
        sleep(0.25)

  0%|          | 0/5 [00:00<?, ?it/s]

Index page URL: https://www.dhammatalks.org/suttas/AN


100%|██████████| 7/7 [00:02<00:00,  2.92it/s]
 20%|██        | 1/5 [00:02<00:09,  2.40s/it]

Index page URL: https://www.dhammatalks.org/suttas/DN


100%|██████████| 14/14 [00:05<00:00,  2.67it/s]
 40%|████      | 2/5 [00:07<00:12,  4.08s/it]

Index page URL: https://www.dhammatalks.org/suttas/KN


0it [00:00, ?it/s]


Index page URL: https://www.dhammatalks.org/suttas/MN/


100%|██████████| 111/111 [00:38<00:00,  2.86it/s]
 80%|████████  | 4/5 [00:46<00:13, 13.67s/it]

Index page URL: https://www.dhammatalks.org/suttas/SN/


100%|██████████| 108/108 [00:36<00:00,  2.94it/s]
100%|██████████| 5/5 [01:23<00:00, 16.64s/it]


In [9]:
data = pd.DataFrame([sutta.__dict__ for sutta in data])
data.assign(
    religion="Buddhism",
    subgroup="Theravada",
    source="Dhamma Talks",
    translation_source="Thanissaro Bhikkhu",
)

Unnamed: 0,collection,title,paragraphs,url_source,religion,subgroup,source,translation_source
0,AN,A Single Thing,[21. âI donât envision a single thing that...,https://www.dhammatalks.org/suttas/AN/AN1_21.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
1,AN,A Pool of Water,[45. âSuppose there were a pool of waterâs...,https://www.dhammatalks.org/suttas/AN/AN1_45.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
2,AN,Soft,"[âJust as, of all trees, the balsam is forem...",https://www.dhammatalks.org/suttas/AN/AN1_48.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
3,AN,Quick to Reverse Itself,[âI donât envision a single thing that is ...,https://www.dhammatalks.org/suttas/AN/AN1_49.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
4,AN,Luminous,"[âLuminous, monks, is the mind.1 And it is d...",https://www.dhammatalks.org/suttas/AN/AN1_50.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
...,...,...,...,...,...,...,...,...
235,SN,Victory Through What is Well Spoken,[On one occasion the Blessed One was staying n...,https://www.dhammatalks.org/suttas/SN/SN11_5.html,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
236,SN,Poor,[On one occasion the Blessed One was staying n...,https://www.dhammatalks.org/suttas/SN/SN11_14....,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
237,SN,A Delightful Place,[On one occasion the Blessed One was staying n...,https://www.dhammatalks.org/suttas/SN/SN11_15....,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu
238,SN,Ugly,"[Near SÄvatthÄ«. âOnce, monks, a certain ya...",https://www.dhammatalks.org/suttas/SN/SN11_22....,Buddhism,Theravada,Dhamma Talks,Thanissaro Bhikkhu


In [11]:
data.to_csv(config.paths.suttas_data_file)