In [None]:
import io
import os
from urllib.parse import urljoin, urlparse

import fitz  # PyMuPDF
import requests
from google.cloud import vision
from pdf2image import convert_from_path
from PIL import Image, ImageDraw

In [None]:
# Cloud Vision client; the OCR cell further down calls client.text_detection on it.
# NOTE(review): presumably authenticates through ambient Google Cloud credentials — confirm setup.
client = vision.ImageAnnotatorClient()

In [None]:
import fitz  # PyMuPDF

def get_image_metadata_from_pdf(pdf_path):
    """
    Collect metadata for every image embedded in each page of a PDF.

    For each image the enclosing page's size is reported in points and inches,
    together with a DPI value. If the extracted image dict carries no "dpi"
    entry, the DPI is estimated as pixel size divided by page size in inches,
    i.e. as if the image spanned the whole page.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list with one dict per (page, image) pair, in page order, with keys
        "page" and "image_index" (both 1-based), "width_px", "height_px",
        "format", "dpi_x", "dpi_y", "page_width_pts", "page_height_pts",
        "page_width_in" and "page_height_in".

    The document is closed before returning, including when extraction raises.
    """
    doc = fitz.open(pdf_path)
    try:
        image_metadata = []

        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)

            # Page dimensions: PDF points are 1/72 inch
            page_width_pts, page_height_pts = page.rect.width, page.rect.height
            page_width_in = page_width_pts / 72
            page_height_in = page_height_pts / 72

            # Every image referenced by this page
            images = page.get_images(full=True)

            for img_index, img in enumerate(images):
                xref = img[0]  # Image reference number
                base_image = doc.extract_image(xref)

                width_px, height_px = base_image["width"], base_image["height"]
                # NOTE(review): PyMuPDF documents "xres"/"yres" rather than "dpi" on this
                # dict — confirm whether "dpi" is ever present; if not, the estimate below
                # is always used.
                dpi_x, dpi_y = base_image.get("dpi", (None, None))

                # Estimate both values from the page size when either one is missing
                if dpi_x is None or dpi_y is None:
                    dpi_x = width_px / page_width_in
                    dpi_y = height_px / page_height_in

                image_metadata.append({
                    "page": page_num + 1,
                    "image_index": img_index + 1,
                    "width_px": width_px,
                    "height_px": height_px,
                    "format": base_image["ext"],
                    "dpi_x": dpi_x,
                    "dpi_y": dpi_y,
                    "page_width_pts": page_width_pts,
                    "page_height_pts": page_height_pts,
                    "page_width_in": page_width_in,
                    "page_height_in": page_height_in,
                })

        return image_metadata
    finally:
        # Release the file handle even if a page or image failed to load
        doc.close()

In [None]:
# Test PDF read by the metadata cell and the image-extraction cell below.
pdf_path = '../books/TEST-phat-giao-viet-nam-1956-05-06.pdf'

In [None]:
# Show the per-image metadata for the test PDF (the returned list is the cell output).
get_image_metadata_from_pdf(pdf_path)

In [None]:
# Load the PDF and extract the bytes of the first image on its first page
doc = fitz.open(pdf_path)
try:
    page = doc.load_page(0)  # Load the first page
    images = page.get_images(full=True)
    if not images:
        # Fail with a clear message instead of a bare IndexError on images[0]
        raise ValueError(f"No embedded images found on the first page of {pdf_path!r}")
    xref = images[0][0]  # Get the first image reference

    base_image = doc.extract_image(xref)
    # Encoded image bytes; the actual format is reported in base_image["ext"]
    image_bytes = base_image["image"]
finally:
    # Only image_bytes is used by the following cells, so the file handle can be released here
    doc.close()

In [None]:
# Convert to PIL Image for further processing if needed.
# The bytes are wrapped in an in-memory buffer, so nothing is written to disk.
pil_image = Image.open(io.BytesIO(image_bytes))


In [None]:
# Display the extracted image as the cell output
pil_image

In [None]:
# Annotate the extracted image with Cloud Vision text detection
image = vision.Image(content=image_bytes)
response = client.text_detection(image=image)

# Per-image failures are reported inside the response rather than raised,
# so surface them here instead of continuing with empty annotations.
if response.error.message:
    raise RuntimeError(f"Vision API text detection failed: {response.error.message}")

In [None]:
# Text annotations from the Vision response; the next cell prints the first entry's description.
text = response.text_annotations

In [None]:
# Print the description of the first annotation.
# An image with no detected text yields no annotations, so guard the index
# instead of letting text[0] raise IndexError.
if text:
    print(text[0].description)
else:
    print("No text detected.")

In [None]:
# Drawing context bound to the image extracted from the PDF.
# (`test_image` is not defined anywhere in this notebook and raised NameError on a fresh kernel.)
# Note: anything drawn through `draw` modifies `pil_image` in place.
draw = ImageDraw.Draw(pil_image)

In [None]:
# Scrape the target page for links to .pdf files

import requests
from bs4 import BeautifulSoup
import re

# Target URL
url = "https://thuvienhoasen.org/a26248/tap-chi-phat-giao-viet-nam"

# Request page content.
# Stored as `page_response` so the Vision `response` from the OCR cell is not overwritten.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() stops on
# HTTP errors instead of parsing an error page.
page_response = requests.get(url, timeout=30)
page_response.raise_for_status()
soup = BeautifulSoup(page_response.content, "html.parser")

# Collect every href ending with .pdf (case-insensitive, so ".PDF" links are kept too)
pdf_links = [
    link["href"]
    for link in soup.find_all("a", href=True)
    if link["href"].lower().endswith(".pdf")
]

# Display results
for pdf_link in pdf_links:
    print(pdf_link)

In [None]:
# Specify the directory where PDFs will be saved
pdf_download_directory = "../PDF/Phat_Giao_journals"  # Update this path to your desired directory
# exist_ok=True keeps this cell re-runnable when the directory already exists
os.makedirs(pdf_download_directory, exist_ok=True)

In [None]:
# Download every PDF listed in pdf_links into pdf_download_directory.
# Base URL against which site-relative PDF links are resolved
base_url = "https://thuvienhoasen.org"

for pdf_link in pdf_links:
    # urljoin resolves root-relative, relative and protocol-relative hrefs and
    # leaves already-absolute URLs untouched
    pdf_url = urljoin(base_url, pdf_link)
    # Take the file name from the URL path only, so a query string or fragment never ends up in it
    pdf_name = os.path.basename(urlparse(pdf_url).path)
    # Distinct name on purpose: `pdf_path` is the test PDF used by earlier cells
    pdf_save_path = os.path.join(pdf_download_directory, pdf_name)

    try:
        # The context manager releases the streamed connection once the body is consumed
        with requests.get(pdf_url, stream=True, timeout=60) as pdf_response:
            # Check the status before opening the file so an error page is never saved as a .pdf
            pdf_response.raise_for_status()
            with open(pdf_save_path, "wb") as pdf_file:
                for chunk in pdf_response.iter_content(chunk_size=1024):
                    pdf_file.write(chunk)
    except requests.RequestException as exc:
        # Report the failure and keep going so one bad link does not abort the whole batch
        print(f"Failed to download {pdf_url}: {exc}")
        continue

    print(f"Downloaded {pdf_name} to {pdf_save_path}")