# Building a PDF to audiobook converter in Python + CoPilot
We're going to run through this iteratively, making improvements as we go.  To start, we'll write a program that will:
1. Convert a number of pages of a PDF file to a text file
1. Convert that text file into an mp3 file using Microsoft Edge's excellent text-to-speech AI service

Incrementally, we'll make improvements to:
1. Ignore headers and other text which we don't want read to us in the audiobook
1. Convert the entire PDF to an audiobook
1. Properly format the text file so that paragraphs aren't broken by newlines
1. Provide indication of progress being made in the audiobook conversion (as this can take a while)

Let's get started!

-----

## Version 04 -- Keep it Simple & Use PyMuPDF
Keep it simple!  This approach:
* Extract text blocks using PyMuPDF
* Filter out blocks that are in headers / footers
* Remove newlines within each block
* Push block text to file and add newline after it
* Every N pages, create a new file (rather than fiddle with chapters)
* Create mp3 of text file

In [None]:
import fitz
from pathlib import Path

# Project layout: everything lives under the user's home directory.
PROJECT_PATH = Path('~/projects/pdf_audiobook/').expanduser()

PDF_PATH = PROJECT_PATH / 'PDFs'
TEXT_OUTPUT_PATH = PROJECT_PATH / 'output/txt'
AUDIO_OUTPUT_PATH = PROJECT_PATH / 'output/mp3'

PDF_FILE = PDF_PATH / "Plato - Apology.pdf"
AUDIO_FILE = AUDIO_OUTPUT_PATH / "Plato - Apology.mp3"
TEXT_FILE = TEXT_OUTPUT_PATH / 'extracted_text.txt'

AUDIO_FILE_PREFIX = 'Plato - Apology'
TEXT_FILE_PREFIX = 'Plato - Apology'

pdf = fitz.open(PDF_FILE)

# Bounding rectangle of the first page; load it once and reuse it below.
PAGE_DIMENSIONS = pdf.load_page(0).bound()

# PDF coordinates are measured in points: 1 inch = 72 points.
POINTS_PER_INCH = 72

HEADER_INCHES = 0.75
LEFT_MARGIN_INCHES = 1.25

PAGE_LENGTH = PAGE_DIMENSIONS[3]  # y coordinate of the bottom edge
PAGE_WIDTH = PAGE_DIMENSIONS[2]   # x coordinate of the right edge

# Define header and footer bounding boxes
HEADER_FROM_TOP = round(HEADER_INCHES * POINTS_PER_INCH)  # this can be used to filter out the header
FOOTER_FROM_TOP = PAGE_LENGTH - HEADER_FROM_TOP # this can be used to filter out the footer
LEFT_MARGIN_FROM_LEFT = LEFT_MARGIN_INCHES * POINTS_PER_INCH
RIGHT_MARGIN_FROM_LEFT = PAGE_WIDTH - LEFT_MARGIN_FROM_LEFT

PyMuPDF will extract text blocks for us via [extractBLOCKS()](https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractBLOCKS); each block carries useful information for processing the text.

extractBLOCKS() -- which can be called via the convenience method ```page.get_text("blocks")``` -- returns a list of tuples where each tuple contains:
```
(x0, y0, x1, y1, block text, block number, block type)
```

... where block number is the block sequence number on the page, and block type = 0 for text, and 1 for image.

1 inch = 72 points in PDF terms.  Typical margins are:
* Top & bottom:  0.75 inches
* Right & left:  1.25 inches

Let's define some constants to help us work in these dimensions.  Recall, also, that PDFs are measured from the left and top sides of the page.  First, we'll get the dimensions of the page itself.

1. Extract text blocks using PyMuPDF
1. Filter out blocks that are in headers / footers
1. Remove newlines within each block
1. Push block text to file and add newline after it
1. Every N pages, create a new file (rather than fiddle with chapters)
1. Create mp3 of text file

In [None]:
import os

import subprocess

PAGE_START = 2
#PAGE_STOP = 6
PAGE_STOP = pdf.page_count

PAGES_PER_FILE = 10 #TODO: reconcile this 1-index with the 0-index of the page count

page_count = 0
file_label = 0


def _open_text_file(label):
    """Open, for writing, the zero-padded numbered text file for chunk `label`."""
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's default encoding.
    return open(file=TEXT_OUTPUT_PATH / (TEXT_FILE_PREFIX + f'_{label:02}' + '.txt'), mode='w', encoding='utf-8')


def _write_mp3(text_file_name, label):
    """Convert one text file into its numbered mp3 by running the edge-tts command."""
    audio_file_name = f'{AUDIO_OUTPUT_PATH}/{AUDIO_FILE_PREFIX}_{label:02}.mp3'
    print(f'Writing {audio_file_name}')
    # Pass an argument list (no shell) so file names containing quotes or
    # other shell metacharacters are handed to edge-tts unchanged.
    command = ['edge-tts', '--voice', 'en-AU-NatashaNeural',
               '--file', str(text_file_name), '--write-media', audio_file_name]
    try:
        result = subprocess.run(command)
    except OSError as error:
        # e.g. edge-tts is not installed; keep going like os.system() would have
        print(f'Could not run edge-tts: {error}')
        return
    if result.returncode != 0:
        print(f'edge-tts exited with status {result.returncode} for {text_file_name}')


# make sure both output directories exist before writing into them
TEXT_OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
AUDIO_OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# open the first text file with zero-padded file count in the filename
text_file = _open_text_file(file_label)

try:
    # iterate through each page in the document
    for page in pdf.pages(start=PAGE_START, stop=PAGE_STOP):

        # every PAGES_PER_FILE pages: finish the current file, convert it, and start the next one
        if (page_count % PAGES_PER_FILE == 0 and page_count > 0):
            print(f'Closing {text_file.name}')
            text_file.close()
            _write_mp3(text_file.name, file_label)

            # increment the file counter
            file_label = file_label + 1

            # open a new text file with zero-padded file count in the filename
            print(f'Opening {TEXT_OUTPUT_PATH}/{TEXT_FILE_PREFIX}_{file_label:02}.txt')
            text_file = _open_text_file(file_label)

        # iterate through each block of text on the page
        for block in page.get_text('blocks'):
            print(f'Processing block #{block[5]} of page #{page.number}')

            # block type 0 is text, 1 is an image; only text belongs in the audiobook
            if block[6] != 0:
                continue

            # get the y coordinate of the bottom right corner of the block
            block_end_y = block[3]

            # if the block is not in the header or footer
            if not (block_end_y < HEADER_FROM_TOP or block_end_y > FOOTER_FROM_TOP):

                # remove the newline characters from the block text
                block_text = block[4].replace('\n', ' ')
                # write the block text to the text file and append a newline character
                print(f'Writing block #{block[5]} of page #{page.number} to {text_file.name}')
                text_file.write(block_text + '\n')

        # increment page counter
        print(f'Incrementing page counter from {page_count} to {page_count + 1}')
        page_count = page_count + 1
finally:
    # always release the file handle, even if extraction failed part-way
    if not text_file.closed:
        print(f'Closing {text_file.name}')
        text_file.close()

# the last (possibly partial) file still needs its mp3; skip it if no page was processed
if page_count > 0:
    _write_mp3(text_file.name, file_label)


### TODO:
* Separate the 2 loops:  text and audio file creation (this will also make it clear where there are errors in creation)
* Print some timestamps in audio file creation
* Handle audio file creation directly within Python using edge-tts so you can handle errors properly
* Use `nbdev` to create the first Python library from these notebooks
* Reconcile PAGES_PER_FILE 1-index with the 0-index of the page count
* See if there's not an even more natural sounding TTS library out there
    * [Explore the question with ChatGPT a bit](https://chat.openai.com/chat/abba5e65-897c-473d-86ca-e2a97ff106fe)

### Thinking out loud
What's an algorithm for grouping all of the characters by text size and then identifying which are useful for the audiobook?
* What types of characters do you expect?
    * Paragraphs
    * Chapter headers
    * Header
    * Footer
    * Annotations / references
* Analyze the entire document, grouping text by size
* Produce an image of each page which contains the first example of a size of text, with that text highlighted / redboxed
* Ask the user to label each of the groups of text size, using labels you predefine for audiobook layout (e.g. chapter heading, paragraph, etc ...)
* Generate an audiobook in accordance with the predefined layout

Start simply!
* Allow for chapter headers and paragraphs, only.
* Don't prompt the user -- make assumptions.

-----

## Version 05 -- Refactoring v4
Let's refactor a bit to accomplish the following from our TODO list:
1. Separate the 2 loops:  text and audio file creation (this will also make it clear where there are errors in creation)
1. Print some timestamps in audio file creation -- actually let's implement some logging more generally
1. Handle audio file creation directly within Python using edge-tts so you can handle errors properly

Let's ask ChatGPT to give us a hand in this.  [Here's a session for that](https://chat.openai.com/chat/abba5e65-897c-473d-86ca-e2a97ff106fe).

In [None]:
import fitz
from pathlib import Path
import os
import logging
import time
import edge_tts
import asyncio

# Configuration parameters
PROJECT_PATH = Path('~/projects/pdf_audiobook/').expanduser()

PDF_PATH = PROJECT_PATH / 'PDFs'
TEXT_OUTPUT_PATH = PROJECT_PATH / 'output/txt'
AUDIO_OUTPUT_PATH = PROJECT_PATH / 'output/mp3'

PDF_FILE = PDF_PATH / "Plato - Apology.pdf"
AUDIO_FILE = AUDIO_OUTPUT_PATH / "Plato - Apology.mp3"
TEXT_FILE = TEXT_OUTPUT_PATH / 'extracted_text.txt'

AUDIO_FILE_PREFIX = 'Plato - Apology'
TEXT_FILE_PREFIX = 'Plato - Apology'

# Open the PDF file to get some dynamic parameters
pdf = fitz.open(PDF_FILE)

# Bounding rectangle of the first page; load it once and reuse it below.
PAGE_DIMENSIONS = pdf.load_page(0).bound()

# PDF coordinates are measured in points: 1 inch = 72 points.
POINTS_PER_INCH = 72

HEADER_INCHES = 0.75
LEFT_MARGIN_INCHES = 1.25

PAGE_LENGTH = PAGE_DIMENSIONS[3]  # y coordinate of the bottom edge
PAGE_WIDTH = PAGE_DIMENSIONS[2]   # x coordinate of the right edge

# Define header and footer bounding boxes
HEADER_FROM_TOP = round(HEADER_INCHES * POINTS_PER_INCH)  # this can be used to filter out the header
FOOTER_FROM_TOP = PAGE_LENGTH - HEADER_FROM_TOP # this can be used to filter out the footer
LEFT_MARGIN_FROM_LEFT = LEFT_MARGIN_INCHES * POINTS_PER_INCH
RIGHT_MARGIN_FROM_LEFT = PAGE_WIDTH - LEFT_MARGIN_FROM_LEFT


# Execution parameters
PAGE_START = 2
#PAGE_STOP = 6
PAGE_STOP = pdf.page_count

PAGES_PER_FILE = 10 #TODO: reconcile this 1-index with the 0-index of the page count

Here is a refactored function generated by ChatGPT to extract text from a PDF.  We've modified it a bit below.

In [None]:
def extract_text(pdf=PDF_FILE, output_path=TEXT_OUTPUT_PATH, prefix=TEXT_FILE_PREFIX, start=PAGE_START, stop=PAGE_STOP, header=HEADER_FROM_TOP, footer=FOOTER_FROM_TOP, pages_per_file=None):
    """Write the body text of a PDF page range into numbered text files.

    Each text block is written on one line (its internal newlines become
    spaces).  Blocks whose bottom edge lies above `header` or below `footer`
    are skipped, as are image blocks.  A new file `{prefix}_NN.txt` is started
    every `pages_per_file` pages.

    Args:
        pdf: An open PyMuPDF document, or a path to a PDF file (the default
            is a path, so it is opened and closed here).
        output_path: Directory for the text files; created if missing.
        prefix: File name prefix for the numbered text files.
        start: Index of the first page to process.
        stop: Index at which to stop (passed to `pdf.pages`).
        header: y coordinate (points from the top) above which blocks are dropped.
        footer: y coordinate (points from the top) below which blocks are dropped.
        pages_per_file: Pages per text file; defaults to the global PAGES_PER_FILE.

    Returns:
        The number of text files written.
    """
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

    if pages_per_file is None:
        pages_per_file = PAGES_PER_FILE

    # The default argument is a file path rather than a document: open it
    # here and remember to close it again when we are done.
    opened_here = not hasattr(pdf, 'pages')
    if opened_here:
        pdf = fitz.open(pdf)

    os.makedirs(output_path, exist_ok=True)

    page_count = 0
    file_label = 0

    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
    text_file = open(file=f'{output_path}/{prefix}_{file_label:02}.txt', mode='w', encoding='utf-8')
    try:
        for page in pdf.pages(start=start, stop=stop):
            # roll over to the next numbered file every `pages_per_file` pages
            if page_count % pages_per_file == 0 and page_count > 0:
                logging.info(f'Closing {text_file.name}')
                text_file.close()
                file_label += 1
                text_file = open(file=f'{output_path}/{prefix}_{file_label:02}.txt', mode='w', encoding='utf-8')
                logging.info(f'Opened {text_file.name}')

            for block in page.get_text('blocks'):
                logging.info(f'Processing block #{block[5]} of page #{page.number}')
                # block[6] is the block type: 0 = text, 1 = image
                if block[6] != 0:
                    continue
                # block[3] is the bottom edge of the block
                if not (block[3] < header or block[3] > footer):
                    block_text = block[4].replace('\n', ' ')
                    logging.info(f'Writing block #{block[5]} of page #{page.number} to {text_file.name}')
                    text_file.write(block_text + '\n')

            page_count += 1
            logging.info(f'Processed page {page_count}')
    finally:
        # release the handles even if extraction fails part-way through
        text_file.close()
        if opened_here:
            pdf.close()

    return file_label + 1

Similarly, here is a refactored function for generating the audio file from ChatGPT, modified as needed.

NB -- ChatGPT utterly fabricated (lied) about this function.  It didn't know how to use edge_tts.  Have a look at the chat sessions from this evening for more curious nonsense.

In [None]:
async def create_mp3(text_path, audio_path, prefix, num_files, voice='en-AU-NatashaNeural', max_retries=3, retry_delay=5):
    """Convert numbered text files to MP3 files with the edge_tts library.

    For each i in range(num_files), reads `{text_path}/{prefix}_{i:02}.txt`
    and writes `{audio_path}/{prefix}_{i:02}.mp3`.

    Args:
        text_path: Directory containing the numbered text files.
        audio_path: Directory that receives the MP3 files.
        prefix: Shared file name prefix of the text and MP3 files.
        num_files: How many numbered files to convert, starting at 00.
        voice: Edge TTS voice name.
        max_retries: Maximum attempts per file before the error is re-raised.
        retry_delay: Seconds to wait between attempts.

    Raises:
        FileNotFoundError: If a text file is missing.
        Exception: Whatever the last failed attempt raised, once
            `max_retries` attempts have been used up.
    """
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

    attempts = max(1, max_retries)

    for i in range(num_files):
        text_file_name = f'{text_path}/{prefix}_{i:02}.txt'
        audio_file_name = f'{audio_path}/{prefix}_{i:02}.mp3'

        # Communicate expects the text itself, not an open file object.
        with open(file=text_file_name, mode='r', encoding='utf-8') as text_file:
            text = text_file.read()

        for attempt in range(1, attempts + 1):
            try:
                logging.info(f'Creating MP3 file {audio_file_name} from {text_file_name}')
                chunks = []
                async for chunk in edge_tts.Communicate(text, voice).stream():
                    # the stream mixes audio chunks with metadata; keep only the audio
                    if chunk['type'] == 'audio':
                        chunks.append(chunk['data'])
                mp3 = b''.join(chunks)
                if not mp3:
                    raise RuntimeError('No data returned from Edge TTS.')
                break
            except Exception as e:
                if attempt == attempts:
                    # a permanent error would otherwise loop forever
                    logging.error(f'Error creating MP3 file: {e} ... giving up after {attempts} attempts.')
                    raise
                logging.error(f'Error creating MP3 file: {e} ... retrying in {retry_delay} seconds.')
                # asyncio.sleep yields to the event loop; time.sleep would block it
                await asyncio.sleep(retry_delay)

        with open(audio_file_name, 'wb') as audio_file:
            audio_file.write(mp3)
        logging.info(f'Created MP3 file {audio_file_name}')

In [None]:
import shutil

# A function to move the text and audio files into a 'tmp' directory
def move_files(text_path, audio_path):
    """Move existing output files out of the way, into 'tmp' sub-directories.

    Every `*.txt` directly inside `text_path` is moved to `text_path/tmp`, and
    every `*.mp3` directly inside `audio_path` is moved to `audio_path/tmp`.
    Missing directories (including the output directories themselves) are
    created, so this is safe to call on a fresh project.

    Args:
        text_path: Directory holding the text files (str or Path).
        audio_path: Directory holding the MP3 files (str or Path).
    """
    for directory, pattern in ((Path(text_path), '*.txt'), (Path(audio_path), '*.mp3')):
        tmp_dir = directory / 'tmp'
        # parents=True: the output directory itself may not exist yet
        tmp_dir.mkdir(parents=True, exist_ok=True)

        # snapshot the matches first so the directory is not modified mid-scan
        for file in list(directory.glob(pattern)):
            shutil.move(str(file), str(tmp_dir / file.name))

In [None]:
# Execute the functions
#move_files(TEXT_OUTPUT_PATH, AUDIO_OUTPUT_PATH)
#extract_text(pdf=pdf, output_path=TEXT_OUTPUT_PATH, prefix=TEXT_FILE_PREFIX, start=PAGE_START, stop=PAGE_STOP, header=HEADER_FROM_TOP, footer=FOOTER_FROM_TOP)
await(create_mp3(text_path=TEXT_OUTPUT_PATH, audio_path=AUDIO_OUTPUT_PATH, prefix=TEXT_FILE_PREFIX, num_files=(PAGE_STOP - PAGE_START) // PAGES_PER_FILE))

This is from [this example](https://github.com/hasscc/hass-edge-tts/blob/4d5017100dca527af75a81fab959b8f343f26150/custom_components/edge_tts/tts.py#L364), and -- assuming this code actually works in the project it's from -- looks like it is informative.

In [None]:
async def async_get_tts_audio(self, message, language, options=None):
    """Synthesize `message` to MP3 bytes.

    This is a method excerpted from another project; CONF_LANG,
    SUPPORTED_VOICES, SUPPORTED_LANGUAGES, _LOGGER and EdgeCommunicate are
    defined there, not in this notebook.

    Args:
        message: The text to synthesize (passed straight to `tts.run`).
        language: A language code or -- when it is a key of
            SUPPORTED_VOICES -- a voice name.
        options: Optional per-call overrides; merged last, so they win over
            both `self._config` and the values derived from `language`.

    Returns:
        `('mp3', <bytes>)` on success, or `(None, None)` when no audio data
        was received.
    """
    opt = {CONF_LANG: language}
    # `language` may actually be a voice name: use it as the voice and look up
    # the matching language (presumably SUPPORTED_VOICES maps voice -> language).
    if language in SUPPORTED_VOICES:
        opt[CONF_LANG] = SUPPORTED_VOICES[language]
        opt['voice'] = language
    # later mappings override earlier ones: config < language-derived < per-call options
    opt = {**self._config, **opt, **(options or {})}

    # https://docs.microsoft.com/zh-CN/azure/cognitive-services/speech-service/speech-synthesis-markup?tabs=csharp#adjust-speaking-languages
    lang = opt.get(CONF_LANG) or language

    # https://docs.microsoft.com/zh-CN/azure/cognitive-services/speech-service/speech-synthesis-markup?tabs=csharp#use-multiple-voices
    # voice fallback chain: explicit option, then the default for the language, then a fixed voice
    voice = opt.get('voice') or SUPPORTED_LANGUAGES.get(lang) or 'zh-CN-XiaoxiaoNeural'

    # https://docs.microsoft.com/zh-CN/azure/cognitive-services/speech-service/speech-synthesis-markup?tabs=csharp#adjust-speaking-styles
    # warn (once) if any of the style options is still set
    for f in self._style_options:
        v = opt.get(f)
        if v is not None:
            _LOGGER.warning(
                'Edge TTS options style/styledegree/role are no longer supported, '
                'please remove them from your automation or script. '
                'See: https://github.com/hasscc/hass-edge-tts/issues/8'
            )
            break

    _LOGGER.debug('%s: %s', self.name, [message, opt])
    # accumulate the audio payload of every item yielded by the TTS stream
    mp3 = b''
    tts = EdgeCommunicate()
    async for i in tts.run(
        message,
        voice=voice,
        pitch=opt.get('pitch', '+0Hz'),
        rate=opt.get('rate', '+0%'),
        volume=opt.get('volume', '+0%'),
    ):
        # [offset, text, binary]
        if i[2] is not None:
            mp3 += i[2]
        elif i[0] is not None:
            # no binary payload but an offset: log the item as audio metadata
            _LOGGER.debug('%s: audio.metadata: %s', self.name, i)
    if not mp3:
        _LOGGER.warning('%s: failed: %s', self.name, [message, opt])
        return None, None
    return 'mp3', mp3

All of this has been a hassle to try to get working.  Let's simplify ...

We'll revert back to using the `edge-tts` command for audio file creation and add some error handling to it.

In [None]:
import fitz
from pathlib import Path
import os
import logging
import time
import shutil
import subprocess

# Configuration parameters
PROJECT_PATH = Path('~/projects/pdf_audiobook/').expanduser()

PDF_PATH = PROJECT_PATH / 'PDFs'
TEXT_OUTPUT_PATH = PROJECT_PATH / 'output/txt'
AUDIO_OUTPUT_PATH = PROJECT_PATH / 'output/mp3'

PDF_FILE = PDF_PATH / "Plato - Apology.pdf"
AUDIO_FILE = AUDIO_OUTPUT_PATH / "Plato - Apology.mp3"
TEXT_FILE = TEXT_OUTPUT_PATH / 'extracted_text.txt'

AUDIO_FILE_PREFIX = 'Plato - Apology'
TEXT_FILE_PREFIX = 'Plato - Apology'

# Open the PDF file to get some dynamic parameters
pdf = fitz.open(PDF_FILE)

# Bounding rectangle of the first page; load it once and reuse it below.
PAGE_DIMENSIONS = pdf.load_page(0).bound()

# PDF coordinates are measured in points: 1 inch = 72 points.
POINTS_PER_INCH = 72

HEADER_INCHES = 0.75
LEFT_MARGIN_INCHES = 1.25

PAGE_LENGTH = PAGE_DIMENSIONS[3]  # y coordinate of the bottom edge
PAGE_WIDTH = PAGE_DIMENSIONS[2]   # x coordinate of the right edge

# Define header and footer bounding boxes
HEADER_FROM_TOP = round(HEADER_INCHES * POINTS_PER_INCH)  # this can be used to filter out the header
FOOTER_FROM_TOP = PAGE_LENGTH - HEADER_FROM_TOP # this can be used to filter out the footer
LEFT_MARGIN_FROM_LEFT = LEFT_MARGIN_INCHES * POINTS_PER_INCH
RIGHT_MARGIN_FROM_LEFT = PAGE_WIDTH - LEFT_MARGIN_FROM_LEFT


# Execution parameters
PAGE_START = 2
#PAGE_STOP = 6
PAGE_STOP = pdf.page_count

PAGES_PER_FILE = 10 #TODO: reconcile this 1-index with the 0-index of the page count

In [None]:
def extract_text(pdf=PDF_FILE, output_path=TEXT_OUTPUT_PATH, prefix=TEXT_FILE_PREFIX, start=PAGE_START, stop=PAGE_STOP, header=HEADER_FROM_TOP, footer=FOOTER_FROM_TOP, pages_per_file=None):
    """Write the body text of a PDF page range into numbered text files.

    Each text block is written on one line (its internal newlines become
    spaces).  Blocks whose bottom edge lies above `header` or below `footer`
    are skipped, as are image blocks.  A new file `{prefix}_NN.txt` is started
    every `pages_per_file` pages.

    Args:
        pdf: An open PyMuPDF document, or a path to a PDF file (the default
            is a path, so it is opened and closed here).
        output_path: Directory for the text files; created if missing.
        prefix: File name prefix for the numbered text files.
        start: Index of the first page to process.
        stop: Index at which to stop (passed to `pdf.pages`).
        header: y coordinate (points from the top) above which blocks are dropped.
        footer: y coordinate (points from the top) below which blocks are dropped.
        pages_per_file: Pages per text file; defaults to the global PAGES_PER_FILE.

    Returns:
        The number of text files written.
    """
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

    if pages_per_file is None:
        pages_per_file = PAGES_PER_FILE

    # The default argument is a file path rather than a document: open it
    # here and remember to close it again when we are done.
    opened_here = not hasattr(pdf, 'pages')
    if opened_here:
        pdf = fitz.open(pdf)

    os.makedirs(output_path, exist_ok=True)

    page_count = 0
    file_label = 0

    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
    text_file = open(file=f'{output_path}/{prefix}_{file_label:02}.txt', mode='w', encoding='utf-8')
    try:
        for page in pdf.pages(start=start, stop=stop):
            # roll over to the next numbered file every `pages_per_file` pages
            if page_count % pages_per_file == 0 and page_count > 0:
                logging.info(f'Closing {text_file.name}')
                text_file.close()
                file_label += 1
                text_file = open(file=f'{output_path}/{prefix}_{file_label:02}.txt', mode='w', encoding='utf-8')
                logging.info(f'Opened {text_file.name}')

            for block in page.get_text('blocks'):
                logging.info(f'Processing block #{block[5]} of page #{page.number}')
                # block[6] is the block type: 0 = text, 1 = image
                if block[6] != 0:
                    continue
                # block[3] is the bottom edge of the block
                if not (block[3] < header or block[3] > footer):
                    block_text = block[4].replace('\n', ' ')
                    logging.info(f'Writing block #{block[5]} of page #{page.number} to {text_file.name}')
                    text_file.write(block_text + '\n')

            page_count += 1
            logging.info(f'Processed page {page_count}')
    finally:
        # release the handles even if extraction fails part-way through
        text_file.close()
        if opened_here:
            pdf.close()

    return file_label + 1

In [None]:
def create_mp3(text_path, audio_path, prefix, num_files, max_retries=3, retry_delay=5):
    """Convert numbered text files to MP3 files with the `edge-tts` command.

    For each i in range(num_files), converts `{text_path}/{prefix}_{i:02}.txt`
    into `{audio_path}/{prefix}_{i:02}.mp3`.  A failing conversion is retried
    up to `max_retries` times; if it still fails, the error is logged and the
    next file is processed.

    Args:
        text_path: Directory containing the numbered text files.
        audio_path: Directory that receives the MP3 files.
        prefix: Shared file name prefix of the text and MP3 files.
        num_files: How many numbered files to convert, starting at 00.
        max_retries: Maximum attempts per file.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The list of MP3 file names that were created successfully.

    Raises:
        FileNotFoundError: If a text file is missing.
    """
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

    attempts = max(1, max_retries)
    created = []

    for i in range(num_files):
        text_file_name = f'{text_path}/{prefix}_{i:02}.txt'
        audio_file_name = f'{audio_path}/{prefix}_{i:02}.mp3'

        # Open and immediately close the input: this keeps the early failure on a
        # missing text file without leaving a file handle open.
        with open(file=text_file_name, mode='r'):
            pass

        for attempt in range(1, attempts + 1):
            logging.info(f'Creating MP3 file {audio_file_name} from {text_file_name}')
            try:
                subprocess.check_output(["edge-tts", "--voice",  "en-AU-NatashaNeural", "--file",  text_file_name, "--write-media", audio_file_name])
            except subprocess.CalledProcessError as e:
                if attempt < attempts:
                    logging.error(f'Error creating MP3 file: {e} ... retrying in {retry_delay} seconds.')
                    time.sleep(retry_delay)  # pause before retrying
                else:
                    logging.error(f'Error creating MP3 file: {e} ... giving up after {attempts} attempts.')
            else:
                # only report success when the command actually succeeded
                logging.info(f'Successfully created MP3 file {audio_file_name}!')
                created.append(audio_file_name)
                break

    return created

In [None]:
# A function to move the text and audio files into a 'tmp' directory
def move_files(text_path, audio_path):
    """Move existing output files out of the way, into 'tmp' sub-directories.

    Every `*.txt` directly inside `text_path` is moved to `text_path/tmp`, and
    every `*.mp3` directly inside `audio_path` is moved to `audio_path/tmp`.
    Missing directories (including the output directories themselves) are
    created, so this is safe to call on a fresh project.

    Args:
        text_path: Directory holding the text files (str or Path).
        audio_path: Directory holding the MP3 files (str or Path).
    """
    for directory, pattern in ((Path(text_path), '*.txt'), (Path(audio_path), '*.mp3')):
        tmp_dir = directory / 'tmp'
        # parents=True: the output directory itself may not exist yet
        tmp_dir.mkdir(parents=True, exist_ok=True)

        # snapshot the matches first so the directory is not modified mid-scan
        for file in list(directory.glob(pattern)):
            shutil.move(str(file), str(tmp_dir / file.name))

In [None]:
# Execute the functions
move_files(TEXT_OUTPUT_PATH, AUDIO_OUTPUT_PATH)
extract_text(pdf=pdf, output_path=TEXT_OUTPUT_PATH, prefix=TEXT_FILE_PREFIX, start=PAGE_START, stop=PAGE_STOP, header=HEADER_FROM_TOP, footer=FOOTER_FROM_TOP)

# Ceiling division gives the exact number of text files written.  `// PAGES_PER_FILE + 1`
# is one too many when the page count is an exact multiple of PAGES_PER_FILE, and
# create_mp3 then fails on a text file that was never created.
NUM_FILES = (PAGE_STOP - PAGE_START + PAGES_PER_FILE - 1) // PAGES_PER_FILE
create_mp3(text_path=TEXT_OUTPUT_PATH, audio_path=AUDIO_OUTPUT_PATH, prefix=TEXT_FILE_PREFIX, num_files=NUM_FILES)