In [14]:
import PyPDF2
import os

1. Extract text from a PDF file using the extract_text_from_pdf function
2. Receive an error message when trying to extract text from an invalid PDF file
3. Receive an error message when trying to extract text from a non-existent PDF file
4. Receive an empty string when trying to extract text from a PDF file with no text
5. Extract text from large PDF files without any performance issues
6. Extract text from PDF files with different encodings (e.g., UTF-8, Latin-1)

In [15]:
def extract_text_from_pdf(file_path):
    """
    Extract the text of every page in a PDF file and return it as one string.

    :param file_path: Path to the PDF file (``str`` or ``os.PathLike``).
    :return: Text of all pages concatenated in page order. An empty string is
        returned when the PDF has no pages or no page yields any text.
    :raises FileNotFoundError: If nothing exists at ``file_path``.
    :raises ValueError: If the path does not end in ``.pdf``, is not a regular
        file, or PyPDF2 reports a read error while parsing it.
    :raises RuntimeError: If a ``MemoryError`` occurs while processing the file.
    """
    # Normalise once so pathlib.Path arguments work with the string checks below.
    path = os.fspath(file_path)

    if not os.path.exists(path):
        raise FileNotFoundError(f"The file at path '{file_path}' does not exist.")

    # Besides the extension, require a regular file: a directory named
    # "something.pdf" would otherwise escape as an undocumented OSError
    # from open() instead of the documented ValueError.
    if not path.lower().endswith('.pdf') or not os.path.isfile(path):
        raise ValueError(f"The file '{file_path}' is not a valid PDF.")

    try:
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            # A PDF without pages naturally joins to "". The `or ""` guards
            # against a page whose extraction yields None or an empty value.
            # join() builds the result in one pass rather than re-copying a
            # growing string on every page.
            return "".join(page.extract_text() or "" for page in reader.pages)

    except PyPDF2.errors.PdfReadError as exc:
        # Chain the original error so the parser's diagnostic is not lost.
        raise ValueError(f"The file '{file_path}' is not a readable PDF.") from exc

    except MemoryError as exc:
        raise RuntimeError("The file is too large to process with the available memory.") from exc

In [None]:
import os

# Resolve the sample document relative to the current working directory.
pdf_path = os.path.abspath('../data/Wipro-Jan20_2025.pdf')

# Report a missing sample file up front; otherwise extract and show its text.
if not os.path.exists(pdf_path):
    print(f"File not found: {pdf_path}")
else:
    text = extract_text_from_pdf(pdf_path)
    print(text)