# In [55]:
import os
import re
import uuid
import json
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from bs4 import BeautifulSoup
from typing import List

# Project layout: every path is resolved relative to the parent of the
# current working directory.
basedir = Path.cwd().parent
# Input HTML document to be parsed.
srcfile = basedir.joinpath("data", "raw", "poe.html")
# Directory that receives the generated JSON.
tgtdir = basedir.joinpath("data", "interim")

# In [59]:
# Parse the source HTML document.
# The file is opened in binary mode so that Beautiful Soup determines the
# character encoding from the document itself instead of relying on the
# platform's default text encoding. The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
with open(srcfile, "rb") as f:
    soup = BeautifulSoup(f, "html.parser")

# Sentence boundary: one whitespace character that directly follows ".",
# "?" or "!", unless the text just before it looks like an abbreviation.
_SENTENCE_BOUNDARY = re.compile(
    r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![\w\s][A-Z]\.)(?<=\.|\?|\!)\s"
)


def split_sentences(text: str) -> List[str]:
    """Break ``text`` into sentences.

    A split happens at a single whitespace character preceded by ``.``,
    ``?`` or ``!``. No split is made when the preceding characters match
    one of these shapes:

    * word char, dot, word char, any char (e.g. ``i.e.``)
    * capital letter, lowercase letter, dot (e.g. ``Mr.``)
    * word char or whitespace, capital letter, dot (e.g. the initial in
      ``met J. Smith``)

    The whitespace character at the boundary is dropped; all other
    characters are kept as they are.

    Args:
        text: The text to split.

    Returns:
        The sentences in order. Text without any boundary (including the
        empty string) comes back as a one-element list.
    """
    pieces = _SENTENCE_BOUNDARY.split(text)
    return pieces

# Flatten the parsed document into one record per sentence. Each record
# carries the sentence text, the heading of its chapter and 1-based
# chapter / paragraph / sentence positions.
segments = []

for c, chapter in enumerate(soup.find_all(name="div", class_="chapter")):
    # Chapter heading, with line breaks inside it turned into spaces.
    title = chapter.h2.text
    title = title.replace("\n", " ")

    paras = chapter.find_all(name="p")

    for p, para in enumerate(paras):
        # Turn line breaks and non-breaking spaces into plain spaces, then
        # trim both ends. Without the trim, whitespace after the final
        # punctuation mark is treated as a sentence boundary and produces
        # an empty trailing "sentence".
        text = para.text.replace("\n", " ").replace("\xa0", " ").strip()
        if not text:
            # Blank paragraph: nothing to emit. The paragraph counter still
            # advances, so numbering of the other paragraphs is unchanged.
            continue

        sentences = split_sentences(text)

        for s, sentence in enumerate(sentences):
            # Collapse runs of spaces and trim the edges of the sentence.
            sentence = re.sub(r" +", " ", sentence).strip()
            segment = {
                # Short random identifier: first 8 characters of a UUID4.
                "id": f"utterance_{str(uuid.uuid4())[:8]}",
                "text": sentence,
                "title": title,
                "chapter": c + 1,
                "paragraph": p + 1,
                "sentence": s + 1,
            }
            segments.append(segment)

# Write all segments to <tgtdir>/poe.json as pretty-printed JSON.
# The output directory is created if it is missing. The file is opened
# explicitly as UTF-8 because ensure_ascii=False writes non-ASCII
# characters verbatim, which the platform's default encoding may not be
# able to represent.
tgtdir.mkdir(parents=True, exist_ok=True)
with open(tgtdir / "poe.json", "w", encoding="utf-8") as f:
    json.dump(segments, f, indent=2, ensure_ascii=False)