In [None]:
!pip install scrapy
!apt install libicu-dev pkg-config
!pip install ICU-Tokenizer

In [None]:
from pathlib import Path
import requests
from scrapy import Spider, Request
from icu_tokenizer import SentSplitter
import yaml

In [None]:
# Module-level sentence splitter; MySpider.parse calls SPLITTER.split(text)
# to break extracted chapter text into one sentence per output line.
# "ff" is presumably a language code (ISO 639-1 "ff" is Fulah) -- TODO confirm
# against the icu_tokenizer documentation.
# NOTE(review): the same splitter is applied to every language listed in
# extra/links.yml; verify that this is intended.
SPLITTER = SentSplitter("ff")

In [None]:
class MySpider(Spider):
    """Crawl chapter pages and store each chapter's text and audio on disk.

    The start URLs come from ``extra/links.yml``, which is read while the
    class body executes. The file is expected to be a mapping whose keys are
    language names and whose values are the first URL to crawl for that
    language.

    For every page handled by :meth:`parse` the spider writes:

    * ``bible/<code>.books`` -- one appended ``book<TAB>chapter<TAB>title`` line;
    * ``bible/raw/<language>/<book>_<chapter>_<code>.txt`` -- the page title
      followed by the chapter text, one sentence per line;
    * ``bible/raw/<language>/<book>_<chapter>_<code>.mp3`` -- the audio file,
      when the page exposes an ``<audio>`` element with a ``src`` attribute.

    It then requests the page linked by the "next" anchor, if there is one.
    """

    name = "bible"

    # Seconds to wait for the audio server before giving up on a download.
    AUDIO_TIMEOUT = 60

    # Elements whose ``class`` attribute is exactly one of these values are
    # left out of the extracted text.
    SKIPPED_CLASSES = frozenset({
        "ChapterContent_r___3KRx", "ChapterContent_label__R2PLt", "ChapterContent_note__YlDW0",
        "ChapterContent_fr__0KsID", "ChapterContent_body__O3qjr", "ft", "w",
    })

    # Read at class-definition time: the path is resolved against the current
    # working directory of the kernel.
    with open("extra/links.yml", encoding="utf-8") as links_file:
        links = yaml.safe_load(links_file)
    languages, start_urls = zip(*links.items())
    # The text after the last "." of each start URL is used as the key that
    # maps a crawled page back to its language name.
    codes = [url.split(".")[-1] for url in start_urls]
    mapping = dict(zip(codes, languages))

    def download_audio(self, audio_src, output_file):
        """Download ``audio_src`` and save the response body to ``output_file``.

        The download is synchronous (``requests.get``).

        Args:
            audio_src: URL of the audio file.
            output_file: Destination path of the downloaded file.

        Returns:
            A ``(ok, stem)`` tuple. ``ok`` is ``True`` when the file was
            written and ``False`` when the request failed or returned an HTTP
            error status (nothing is written in that case). ``stem`` is the
            file name of ``output_file`` without its extension.
        """
        stem = Path(output_file).stem
        try:
            data = requests.get(audio_src, timeout=self.AUDIO_TIMEOUT)
            # Without this check an HTML error page would be saved as .mp3.
            data.raise_for_status()
        except requests.RequestException as error:
            self.logger.warning("Could not download audio %s: %s", audio_src, error)
            return False, stem
        with open(output_file, 'wb') as out_mp3:
            out_mp3.write(data.content)
        return True, stem

    @staticmethod
    def _write_sentences(output_file, fragments):
        """Join text fragments, split them into sentences and write one per line."""
        text = " ".join(fragment.strip() for fragment in fragments).strip()
        if not text:
            return
        for sent in SPLITTER.split(text):
            output_file.write(f"{sent}\n")

    def parse(self, response):
        """Save the text and audio of one chapter page, then follow the next link.

        The last path segment of ``response.url`` must end in
        ``<book>.<chapter>.<code>``; ``code`` must be a key of ``mapping``.

        Yields:
            A ``Request`` for the next page when the page links to one.
        """
        # An absent <h1> yields an empty title instead of the string "None".
        title = response.css("h1::text").get(default="")
        book, chapter, code = response.url.split("/")[-1].split(".")[-3:]
        language = self.mapping[code]

        # Creating bible/raw/<language> also creates bible/, which the
        # .books file below lives in.
        output_folder = Path(f"bible/raw/{language}")
        output_folder.mkdir(exist_ok=True, parents=True)
        output_filename = output_folder / f"{book}_{chapter}_{code}"

        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(f"bible/{code}.books", "a+", encoding="utf-8") as titles:
            titles.write(f"{book}\t{chapter}\t{title}\n")

        audio = response.css("div.pli-1:nth-child(4) > div:nth-child(1) > audio:nth-child(1)")
        audio_src = audio.attrib.get("src")
        if audio_src:
            self.download_audio(audio_src, f"{output_filename}.mp3")

        with open(f"{output_filename}.txt", "w", encoding="utf-8") as output_file:
            output_file.write(f"{title}\n")
            for content in response.css("div.ChapterContent_chapter__uvbXo div"):
                # .get() because an element may carry no class attribute.
                if content.attrib.get("class") in self.SKIPPED_CLASSES:
                    continue
                spans = content.css("span span")
                if spans:
                    # Nested spans: keep the text of every span that is not
                    # one of the skipped classes.
                    fragments = [
                        fragment
                        for span in spans
                        if span.attrib.get("class") not in self.SKIPPED_CLASSES
                        for fragment in span.css("*::text").getall()
                    ]
                else:
                    fragments = content.css("span *::text").getall()
                self._write_sentences(output_file, fragments)

        # Raw string: the backslashes are CSS escapes for "[", ":" and "]"
        # inside the class name, not Python escape sequences.
        next_page = response.css(r"div.\[pointer-events\:all\]:nth-child(2) > a:nth-child(1)")
        next_href = next_page.attrib.get("href")
        if next_href:
            yield Request(url=f"https://www.bible.com{next_href}")

In [None]:
%%capture
process = CrawlerProcess()

process.crawl(MySpider, name="bible")
process.start()