In [None]:
%pip install epub3 markdown2

In [8]:
def split_ebook(filename, output_dir):
  """Splits an ebook txt file into separate chapter files based on chapter titles.

  A new chapter starts at every line that begins with the lowercase word
  "chapter". The heading is expected to look like ``chapter <number> <title>``:
  the first two tokens are dropped and the rest of the line is used as the
  title. Chapters are numbered 1, 2, 3, ... in order of appearance (the number
  in the heading itself is not parsed). Text before the first heading is
  discarded, and a chapter without any body lines is not written.

  Each chapter goes to ``<output_dir>/Chapter <n> - <title>.txt`` (or
  ``Chapter <n>.txt`` when the heading carries no title), encoded as UTF-8.

  Args:
    filename: Path to the ebook txt file (read as latin1).
    output_dir: Directory to save the split chapter files; created if missing.
  """
  import os

  def _write_chapter(number, title, lines):
    """Write one collected chapter to disk, skipping chapters with no body."""
    if not lines:
      return
    # A path separator inside a title would be interpreted as a sub-directory.
    safe_title = title.replace("/", "-").replace("\\", "-")
    if safe_title:
      name = f"Chapter {number} - {safe_title}.txt"
    else:
      name = f"Chapter {number}.txt"
    # Explicit encoding: the platform default may be unable to represent
    # every character that latin1 decoding can produce.
    with open(os.path.join(output_dir, name), 'w', encoding='utf-8') as chapter_file:
      chapter_file.writelines(lines)

  os.makedirs(output_dir, exist_ok=True)

  chapter_num = 0  # 0 means "no heading seen yet"; the first heading makes it 1
  chapter_title = ""
  chapter_lines = []
  with open(filename, 'r', encoding='latin1') as ebook_file:
    for line in ebook_file:
      if line.startswith("chapter"):
        # Flush the chapter collected so far before starting the next one.
        if chapter_num:
          _write_chapter(chapter_num, chapter_title, chapter_lines)
        chapter_num += 1
        # Everything after "chapter" and the number is the title; slicing
        # (instead of indexing) tolerates headings that have no title at all.
        chapter_title = " ".join(line.split()[2:])
        chapter_lines = []
      else:
        chapter_lines.append(line)

  # Save the last chapter
  if chapter_num:
    _write_chapter(chapter_num, chapter_title, chapter_lines)

In [10]:
# Split the LOTR ebook text into per-chapter files written to data/LOTR-split-txt.
split_ebook("data/LOTR-1-3-John-Ronald-Reuel-Tolkien.txt", "data/LOTR-split-txt")

In [None]:
from epub3 import ePub
import markdown2

def split_epub_to_md(epub_file, output_dir, book_title):
  """Splits an epub book into separate per-chapter .md files.

  Every item of the book whose type is 'application/xhtml+xml' is treated as
  one chapter. Chapters are numbered consecutively (1, 2, 3, ...) counting
  only those items, and each one is written to
  ``<output_dir>/<book_title>-chapter-<n>-<chapter title>.md`` as UTF-8.
  When an item has no title, "Chapter <n>" is used instead.

  Args:
    epub_file: Path to the epub book file.
    output_dir: Directory to save the split chapter files; created if missing.
    book_title: Title of the book (used for consistent naming).
  """
  import os

  # NOTE(review): assumes the installed epub3 package exposes get_items() and
  # items with get_type()/get_title()/get_content() -- verify against its docs.
  book = ePub(epub_file)
  os.makedirs(output_dir, exist_ok=True)

  chapter_num = 0
  for item in book.get_items():
    if item.get_type() != 'application/xhtml+xml':  # Not chapter content
      continue
    # Count only chapter items so stylesheets, images, etc. leave no gaps
    # in the numbering.
    chapter_num += 1
    chapter_title = item.get_title() or f"Chapter {chapter_num}"  # Handle missing titles
    chapter_content = item.get_content()

    # NOTE(review): markdown2 renders Markdown *to* HTML, so this step does not
    # turn the chapter's XHTML into Markdown; a real conversion needs an
    # HTML-to-Markdown converter. markdown2 also documents this extra as
    # "break-on-newline", so the underscore spelling is probably ignored -- confirm.
    chapter_content = markdown2.markdown(chapter_content, extras=["break_on_newline"])
    chapter_content = chapter_content.replace("<p>", "").replace("</p>", "\n\n")  # Remove paragraph tags

    # A path separator inside a title would be interpreted as a sub-directory.
    safe_title = chapter_title.replace("/", "-").replace("\\", "-")
    filename = os.path.join(output_dir, f"{book_title}-chapter-{chapter_num}-{safe_title}.md")

    # Explicit encoding: book text commonly contains characters that the
    # platform default encoding cannot represent.
    with open(filename, 'w', encoding='utf-8') as chapter_file:
      chapter_file.write(chapter_content)

# Example invocation with placeholder arguments: replace them with your epub
# file path, output directory, and book title before running this cell.
split_epub_to_md("your_epub.epub", "split_chapters", "Your Book Title")