In [1]:
!pip install PyPDF2 pdf2image Pillow pytesseract

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
     |████████████████████████████████| 232 kB 810 kB/s            
[?25hCollecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyPDF2, pdf2image
Successfully installed PyPDF2-3.0.1 pdf2image-1.17.0 pytesseract-0.3.13
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


### Extract text using OCR

In [3]:
import PyPDF2
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import json
import os

def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file using OCR.

    Each page is rendered to an in-memory PIL image by ``convert_from_path``
    and handed directly to Tesseract. No temporary files are written, so
    nothing is left behind in the working directory if OCR fails part-way,
    and the page image is not degraded by a lossy JPEG round-trip first.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: OCR text of all pages concatenated in page order, with no
            separator inserted between pages. Empty string when the PDF
            yields no page images.
    """
    # Convert PDF to images (one image per page)
    images = convert_from_path(file_path)

    # Run OCR on every page image; image_to_string accepts PIL images as-is
    page_texts = [pytesseract.image_to_string(image) for image in images]

    # Join once at the end instead of growing a string inside the loop
    return "".join(page_texts)

def write_to_json(file_path, data):
    """
    Write data to a JSON file, creating the parent directory if needed.

    Args:
        file_path (str): Path to the JSON file. Any missing parent
            directories are created before the file is opened.
        data (dict): Data to write. Must be JSON-serializable.

    Raises:
        TypeError: If ``data`` contains objects ``json.dump`` cannot encode.
        OSError: If the directory or file cannot be created or written.
    """
    # A bare filename has an empty dirname; only create a directory when
    # the path actually names one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding so the file does not depend on the platform default
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

def main(pdf_file_path="../data/books/Grade4A.pdf",
         json_file_path="../data/processed/extracted_text.json"):
    """
    OCR a PDF and store the recognized text in a JSON file.

    Args:
        pdf_file_path (str): Path of the PDF to read. Defaults to the
            Grade4A book used by this notebook.
        json_file_path (str): Path of the JSON file to write. Defaults to
            the notebook's processed-data location.

    The output file holds a single object of the form
    ``{"extracted_text": "<all OCR text>"}``.
    """
    # Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_file_path)

    # Create data dictionary
    data = {"extracted_text": extracted_text}

    # Write data to JSON file
    write_to_json(json_file_path, data)

    print("Text extracted and written to JSON file.")

if __name__ == "__main__":
    main()

Text extracted and written to JSON file.


### Extract text using the conversion of PDF to text

In [8]:
import PyPDF2
import json
import re

def clean_text(text):
    """
    Normalize raw page text into a single line of letters and periods.

    Line feeds become spaces, carriage returns are dropped, every character
    other than an ASCII letter, whitespace, or '.' is removed (digits
    included), runs of spaces are squeezed to one, and the result is stripped.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned text.
    """
    # Flatten line breaks first so words on adjacent lines stay separated
    flattened = text.replace('\n', ' ').replace('\r', '')

    # Keep only ASCII letters, whitespace and periods
    letters_only = re.sub(r'[^a-zA-Z\s\.]', '', flattened)

    # Squeeze repeated spaces, then trim both ends
    return re.sub(' +', ' ', letters_only).strip()


# Per-page cleaned text, in page order
text_list = []

# Read the PDF; pages must be consumed while the file handle is still open
with open('../data/books/Grade4A.pdf', 'rb') as f:
    pdf = PyPDF2.PdfReader(f)

    # Extract and clean each page's text, appending as we go
    text_list.extend(clean_text(page.extract_text()) for page in pdf.pages)

# Wrap the page strings under a single 'text' key
json_data = {'text': text_list}

# Persist the result as indented JSON
with open('../data/processed/extracted_text.json', 'w') as f:
    json.dump(json_data, f, indent=4)

In [9]:
import PyPDF2
import json
import re
import os

def clean_text(text):
    """
    Reduce extracted page text to letters, periods and single spaces.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: Text with line feeds turned into spaces, carriage returns
            removed, all characters except ASCII letters, whitespace and
            '.' deleted, repeated spaces collapsed, and outer whitespace
            stripped.
    """
    # Line feeds become spaces; carriage returns are dropped outright
    cleaned = text.replace('\n', ' ').replace('\r', '')

    # Regex passes, applied in order: drop disallowed characters (digits and
    # punctuation other than '.'), then collapse runs of spaces
    for pattern, replacement in ((r'[^a-zA-Z\s\.]', ''), (' +', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Define the input and output directories
input_dir = '../data/books'
output_dir = '../data/processed'

# Create the output directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Loop over each file in the input directory. os.listdir returns entries in
# arbitrary order, so sort them to make the run (and its log) deterministic.
for filename in sorted(os.listdir(input_dir)):
    input_path = os.path.join(input_dir, filename)

    # Only regular files with a .pdf extension are processed; a directory
    # that happens to be named "*.pdf" is skipped instead of crashing open().
    if not filename.endswith('.pdf') or not os.path.isfile(input_path):
        continue

    # Open the PDF file; pages are read while the handle is still open
    with open(input_path, 'rb') as f:
        pdf = PyPDF2.PdfReader(f)

        # Extract and clean the text of every page, in page order
        text_list = [clean_text(page.extract_text()) for page in pdf.pages]

    # Create a JSON object to store the extracted text
    json_data = {'text': text_list}

    # Write the JSON object to a file named after the PDF (Book.pdf -> Book.json)
    output_filename = os.path.splitext(filename)[0] + '.json'
    with open(os.path.join(output_dir, output_filename), 'w') as f:
        json.dump(json_data, f, indent=4)
    print(f"Processed {filename} and saved to {output_filename}")

Processed Grade4A.pdf and saved to Grade4A.json
Processed Grade4B.pdf and saved to Grade4B.json
Processed Grade5A.pdf and saved to Grade5A.json
Processed Grade5B.pdf and saved to Grade5B.json
