In [None]:
!mkdir -P ../downloads/zips/
!mkdir -P ../downloads/raw_audio/

In [None]:
import pandas as pd

# Load the tab-separated URL list. The file has no header row, so the two
# columns are labelled "book" and "url" here.
df = pd.read_csv(
    "../data/openbible_swahili_urls.tsv",
    sep="\t",
    header=None,
    names=["book", "url"],
)

In [None]:
for url in df["url"]:
    !wget {url} -P ../downloads/zips/

In [2]:
from glob import glob
from tqdm.auto import tqdm

zips = sorted(glob("../downloads/zips/*"))
for z in tqdm(zips):
    o = z.split("/")[-1]
    !mkdir ../downloads/raw_audio/{o}
    !unzip -q {z} -d ../downloads/raw_audio/{o}

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 66/66 [00:00<00:00, 788672.55it/s]


In [11]:
from pydub import AudioSegment
from pathlib import Path
from tqdm.contrib.concurrent import process_map

def convert_mp3_to_wav(mp3_path):
    """Convert one MP3 file into two mono WAV files (16 kHz and 44.1 kHz).

    The output name is derived from the input file name: the file name without
    its ".mp3" suffix is the chapter id, and the part before the first "_" is
    the book id. Files are written to
    ``../downloads/wavs_16/{book}/{chapter}.wav`` and
    ``../downloads/wavs_44/{book}/{chapter}.wav``; missing folders are created.

    Args:
        mp3_path: Path of the source MP3 file (string).

    Returns:
        None. The function is called for its side effect of writing WAV files.
    """
    # Path.stem removes only the final suffix. The previous
    # `.strip(".mp3")` treated ".mp3" as a *set of characters* and stripped any
    # leading/trailing '.', 'm', 'p' or '3' from the name as well
    # (e.g. "mark_003.mp3" -> "ark_00"), producing wrong output names.
    chapter = Path(mp3_path).stem
    book = chapter.split("_")[0]

    output_path_16 = f"../downloads/wavs_16/{book}/{chapter}.wav"
    Path(output_path_16).parent.mkdir(parents=True, exist_ok=True)
    output_path_44 = f"../downloads/wavs_44/{book}/{chapter}.wav"
    Path(output_path_44).parent.mkdir(parents=True, exist_ok=True)

    # Decode once, export twice. The extra parameters are handed to the
    # encoder: "-ar" sets the sample rate and "-ac 1" requests a single channel.
    audio = AudioSegment.from_mp3(mp3_path)
    audio.export(output_path_16, format="wav", parameters=["-ar", "16000", "-ac", "1"])
    audio.export(output_path_44, format="wav", parameters=["-ar", "44100", "-ac", "1"])

# Convert every extracted MP3 in parallel, one worker process per task.
raw_audios = sorted(glob("../downloads/raw_audio/*/*.mp3"))
# chunksize is passed explicitly: tqdm emits a warning when an iterable longer
# than 1000 items is given without it. A value of 1 keeps the same scheduling
# as the default while silencing that warning.
# The result is bound to `_` only to keep the returned list out of the cell output.
_ = process_map(convert_mp3_to_wav, raw_audios, chunksize=1)

  _ = process_map(convert_mp3_to_wav, raw_audios)
  0%|          | 0/1189 [00:00<?, ?it/s]

100%|██████████| 1189/1189 [02:50<00:00,  6.97it/s]


In [1]:
from glob import glob

# Collect the USX files of the release and put them in lexicographic order so
# later cells process the books in a stable sequence.
usxs = glob("../downloads/release/USX_1/*.usx")
usxs.sort()

In [2]:
import re

def parse_usx(usx_path):
    """Parse a USX file into a list of plain-text verses.

    The file is read as one string, whitespace runs are collapsed to single
    spaces, and every ``<verse ... sid="..."/>TEXT<verse eid="..."/>`` span is
    extracted with regular expressions (no XML parser is involved). Inline
    markup inside each verse is then removed or unwrapped: notes and
    parenthesised references are dropped, ``char``/``cell`` tags are unwrapped,
    and row/para/table/chapter-end tags are removed.

    Args:
        usx_path: Path of the ``.usx`` file to read.

    Returns:
        A list of ``{"verseNumber": sid, "verseText": text}`` dicts in document
        order. The list is empty when the file contains no verse span.
    """
    # leftover tags that are replaced by a space after the regex passes
    REMOVE_TAGS = ["<optbreak/>", "</cell>", "</row>", "</table>", "<table>", "</para>"]

    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding (assumes the USX files are UTF-8, the XML default).
    with open(usx_path, encoding="utf-8") as f:
        # remove newlines since verses can be multi-line
        lines = re.sub(r"\s+", " ", f.read())

    verses = []
    # verses follow this format: <verse ... sid="XX:YY"/>VERSE<verse eid="XX:YY"/>
    verse_matches = re.findall(r"<verse.+?sid=\"(.+?)\"\/>(.*?)<verse eid=\".+?\"\/>", lines)
    for vid, verse_text in verse_matches:
        # remove notes
        for note_match in re.findall(r"(<note.+?<\/note>)", verse_text):
            verse_text = verse_text.replace(note_match, " ")

        # uppercase names of God: only the text wrapped in <char style="nd">.
        # Substituting on the tag itself (rather than str.replace on the inner
        # text) leaves identical words elsewhere in the verse, and the markup,
        # untouched.
        verse_text = re.sub(
            r"(<char style=\"nd\">)(.+?)(<\/char>)",
            lambda m: m.group(1) + m.group(2).upper() + m.group(3),
            verse_text,
        )

        # remove char tags, keep content; the second pass handles nested char tags
        for _ in range(2):
            for char_match in re.findall(r"(<char.+?>(.+?)<\/char>)", verse_text):
                verse_text = verse_text.replace(char_match[0], char_match[1])

        # remove cell tags, keep content
        for cell_match in re.findall(r"(<cell.+?>(.+?)<\/cell>)", verse_text):
            verse_text = verse_text.replace(cell_match[0], cell_match[1])
        # remove remaining cell, row, and ref tags
        for cell_tag in re.findall(r"(<cell.+?>)", verse_text):
            verse_text = verse_text.replace(cell_tag, " ")
        for row_tag in re.findall(r"(<row.+?>)", verse_text):
            verse_text = verse_text.replace(row_tag, " ")
        for ref_match in re.findall(r"(\(<ref.+?<\/ref>\))", verse_text):
            verse_text = verse_text.replace(ref_match, " ")

        # remove para tags
        for para_match in re.findall(r"(<para style=\"((li.|b|m|pi.|q|p|q.+?|p.+?|mi|s1|sp|r))\" vid=\".+?\"(>|/>))", verse_text):
            verse_text = verse_text.replace(para_match[0], " ")

        # remove leftover tags
        for tag in REMOVE_TAGS:
            verse_text = verse_text.replace(tag, " ")

        # NOTE: special case for 1CH 25:31 Swahili, where the chapter end/start
        # markers fall inside the captured verse span
        for chapter_match in re.findall(r"(<chapter eid=\".+?\"/>)", verse_text):
            verse_text = verse_text.replace(chapter_match, " ")
        verse_text = verse_text.split('<chapter number="26" style="c" sid="1CH 26"/>')[0]

        # normalise the whitespace introduced by the replacements above
        verse_text = re.sub(r"\s+", " ", verse_text).strip()
        verses.append({"verseNumber": vid, "verseText": verse_text})

    return verses

In [3]:
# Parse every USX file; books[i] holds the verse list for usxs[i].
books = list(map(parse_usx, usxs))

In [4]:
from pathlib import Path
import json

# Write one JSON file per book to ../data/openbible_swahili/, named after the
# source USX file.
for usx_file, verse_list in zip(usxs, books):
    # keep only the file name and drop the ".usx" marker, e.g. ".../GEN.usx" -> "GEN"
    book_id = usx_file.rsplit("/", 1)[-1].replace(".usx", "")
    output_path = f"../data/openbible_swahili/{book_id}.json"
    # make sure the target folder exists before the first write
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as json_file:
        json.dump(verse_list, json_file, indent=2)