## Extract Images from PDF Documents that Span Across Multiple Pages
This notebook contains a series of cells to process images extracted from PDF files. The notebook performs tasks such as installing dependencies, extracting and combining images, and removing duplicate images both locally and in an Azure storage account.

# **Install Dependencies**

### 1. Install Dependencies
This cell installs all the necessary dependencies using the `requirements.txt` file.

In [None]:
pip install -r requirements.txt

# Example - Extract and Combine Images Locally

This cell defines three Python functions to:

1. Extract images from a PDF.
2. Combine extracted images.
3. Extract and combine images from a PDF.

In [None]:
import fitz  # PyMuPDF
from PIL import Image
import io
import os

# Function to extract images from PDF
def extract_images_from_pdf(pdf_path):
    """Extract every embedded image from a PDF file.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of ``(page_num, img_index, image)`` tuples in page order.
        ``page_num`` is the zero-based page index, ``img_index`` is the
        position of the image in that page's image list, and ``image`` is
        the PIL image opened from the extracted bytes.
    """
    images = []
    # The context manager closes the document even if extraction raises;
    # previously the document (and its file handle) was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                image_bytes = doc.extract_image(xref)["image"]
                # Each PIL image reads from its own in-memory buffer, so it
                # stays usable after the document has been closed.
                image = Image.open(io.BytesIO(image_bytes))
                images.append((page_num, img_index, image))
    return images

# Function to combine images vertically
def combine_images(images):
    """Stack images top-to-bottom into a single RGB image.

    Args:
        images: Iterable of PIL images; must contain at least one image.

    Returns:
        A new RGB image as wide as the widest input and as tall as the sum
        of the input heights. Each input is pasted at the left edge, directly
        below the previous one.

    Raises:
        ValueError: If ``images`` is empty.
    """
    # Materialise once so one-shot iterators can be traversed several times.
    images = list(images)
    if not images:
        # The original failed here with an opaque tuple-unpacking ValueError.
        raise ValueError("combine_images requires at least one image")

    max_width = max(img.width for img in images)
    total_height = sum(img.height for img in images)

    combined_image = Image.new('RGB', (max_width, total_height))
    y_offset = 0
    for img in images:
        combined_image.paste(img, (0, y_offset))
        y_offset += img.height

    return combined_image

# Main function to extract and combine images from PDF
def extract_and_combine_images(pdf_path, output_folder):
    """Extract images from a PDF and save runs of same-sized images as one JPEG.

    Consecutive extracted images with identical ``size`` are assumed to be
    pieces of one picture that continues across pages. Each such run is
    stacked vertically and written to
    ``<output_folder>/combined_image_<n>.jpg``, with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF file to process.
        output_folder: Directory that receives the combined JPEG files; it
            is created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_folder, exist_ok=True)

    def save_sequence(sequence, number):
        # Stack one run of images and write it under its sequence number.
        target = os.path.join(output_folder, f"combined_image_{number}.jpg")
        combine_images(sequence).save(target)

    images = extract_images_from_pdf(pdf_path)
    print("Number of images:" + str(len(images)))
    current_sequence = []
    sequence_count = 1

    for page_num, _img_index, image in images:
        print("Page Num" + str(page_num))
        # A size change marks the start of a new picture: flush the run so far.
        if current_sequence and image.size != current_sequence[-1].size:
            save_sequence(current_sequence, sequence_count)
            sequence_count += 1
            current_sequence = []
        current_sequence.append(image)

    # Flush the final run, which the loop never closes on its own.
    if current_sequence:
        save_sequence(current_sequence, sequence_count)

    print(f"Images saved to {output_folder}")


# Run the local pipeline on the sample PDF; the combined JPEGs are written
# to output_folder (created by extract_and_combine_images if missing).
pdf_path = "input_folder/sample_pdf_with_continued_images_10_pages.pdf"
output_folder = "output_folder"
extract_and_combine_images(pdf_path, output_folder)



## Remove Duplicate Images Locally

This cell removes duplicate images from the local output folder.

In [None]:
#Remove duplicate Image Files

import os
import hashlib

def calculate_hash(image_path):
    """Return the MD5 hex digest of the file at ``image_path``.

    The file is read in fixed-size chunks so that a large image is never
    held in memory in full. MD5 serves only as a content fingerprint for
    duplicate detection here, not as a security measure.

    Args:
        image_path: Path of the file to hash.

    Returns:
        The hexadecimal MD5 digest of the file's bytes.
    """
    chunk_size = 65536
    hasher = hashlib.md5()
    with open(image_path, 'rb') as img_file:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: img_file.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def remove_duplicate_images(folder_path):
    """Remove duplicate images from a folder and its subfolders.

    Files are compared by the hash returned from ``calculate_hash``. Folders
    and file names are visited in sorted order, so the copy that is kept is
    always the first one in that order rather than depending on the
    arbitrary order in which the file system lists entries.

    Args:
        folder_path: Root directory to scan. Only files ending in
            .png, .jpg, .jpeg, .gif or .bmp (case-insensitive) are considered.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    images_hashes = {}
    duplicates = []

    for root, dirs, files in os.walk(folder_path):
        # Sorting dirs in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(image_extensions):
                continue
            image_path = os.path.join(root, file)
            image_hash = calculate_hash(image_path)

            if image_hash in images_hashes:
                duplicates.append(image_path)
            else:
                images_hashes[image_hash] = image_path

    # Delete only after the scan so the walk never sees a changing directory.
    for duplicate in duplicates:
        os.remove(duplicate)
        print(f"Removed duplicate image: {duplicate}")

# Deduplicate the image files in the local output folder.
folder_path = "output_folder"
remove_duplicate_images(folder_path)

# Example - Reading and Writing to Storage Containers

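This cell performs the same tasks as the local extract-and-combine example above, but reads PDFs from and writes images to an Azure storage account container. The code connects to the storage account using a connection string stored as a secret in Azure Key Vault. Ensure the identity running the notebook is allowed to read secrets from that Key Vault (for example, the Key Vault Secrets User role, or an access policy with secret Get permission); the Key Vault Contributor role manages the vault but does not by itself grant access to secret values.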

## Notes
- Ensure you have the necessary permissions to access Azure Key Vault and Storage account.
- Modify the paths and storage account details as needed.

In [None]:
import fitz  # PyMuPDF
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from PIL import Image
import io
import os
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.identity import DefaultAzureCredential
from azure.identity import ManagedIdentityCredential


# Define the Key Vault URL and the secret name.
# Both values are placeholders and must be replaced before running the cell.
vault_url = "https://<VAULT_NAME>.vault.azure.net/"
secret_name = "<SECRET_NAME>"

# Create a DefaultAzureCredential object used to authenticate to Key Vault
credential = DefaultAzureCredential()

# Create a SecretClient object using the credential
client = SecretClient(vault_url=vault_url, credential=credential)

# Retrieve the secret; its value is used below as the storage connection string
connection_string = client.get_secret(secret_name).value

# Create a BlobServiceClient object using the connection string
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Container client shared by the blob helper functions defined below
container_name = "data"
container_client = blob_service_client.get_container_client(container_name)

# Function to extract images from PDF in blob storage
def extract_images_from_pdf(blob_name):
    """Extract every embedded image from a PDF stored in the blob container.

    Args:
        blob_name: Name of the PDF blob relative to ``input_folder/`` in the
            container behind the module-level ``container_client``.

    Returns:
        A list of ``(page_num, img_index, image)`` tuples in page order.
        ``page_num`` is the zero-based page index, ``img_index`` is the
        position of the image in that page's image list, and ``image`` is
        the PIL image opened from the extracted bytes.
    """
    blob_client = container_client.get_blob_client(f"input_folder/{blob_name}")
    # readall() returns the whole blob as bytes, which PyMuPDF accepts as stream.
    pdf_bytes = blob_client.download_blob().readall()

    images = []
    # The context manager closes the document even if extraction raises;
    # previously the document was never closed.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                image_bytes = doc.extract_image(xref)["image"]
                # Each PIL image reads from its own in-memory buffer, so it
                # stays usable after the document has been closed.
                image = Image.open(io.BytesIO(image_bytes))
                images.append((page_num, img_index, image))
    return images

# Function to combine images vertically
def combine_images(images):
    """Stack images top-to-bottom into a single RGB image.

    Args:
        images: Iterable of PIL images; must contain at least one image.

    Returns:
        A new RGB image as wide as the widest input and as tall as the sum
        of the input heights. Each input is pasted at the left edge, directly
        below the previous one.

    Raises:
        ValueError: If ``images`` is empty.
    """
    # Materialise once so one-shot iterators can be traversed several times.
    images = list(images)
    if not images:
        # The original failed here with an opaque tuple-unpacking ValueError.
        raise ValueError("combine_images requires at least one image")

    max_width = max(img.width for img in images)
    total_height = sum(img.height for img in images)

    combined_image = Image.new('RGB', (max_width, total_height))
    y_offset = 0
    for img in images:
        combined_image.paste(img, (0, y_offset))
        y_offset += img.height

    return combined_image

# Function to upload image to blob storage
def upload_image_to_blob(image, blob_name):
    """Encode ``image`` as JPEG and upload it to ``output_folder/<blob_name>``.

    The blob is written through the module-level ``container_client`` and
    replaces any existing blob with the same name.

    Args:
        image: PIL image to upload.
        blob_name: Blob name relative to ``output_folder/``.
    """
    jpeg_buffer = io.BytesIO()
    image.save(jpeg_buffer, format='JPEG')
    jpeg_bytes = jpeg_buffer.getvalue()

    destination = container_client.get_blob_client(f"output_folder/{blob_name}")
    destination.upload_blob(jpeg_bytes, overwrite=True)

# Main function to extract and combine images from PDF in blob storage
def extract_and_combine_images(blob_name):
    """Extract images from a PDF blob and upload runs of same-sized images.

    Consecutive extracted images with identical ``size`` are assumed to be
    pieces of one picture that continues across pages. Each run is stacked
    vertically and uploaded as ``combined_image_<n>.jpg``, with ``n``
    starting at 1.

    Args:
        blob_name: Name of the PDF blob, passed to ``extract_images_from_pdf``.
    """
    def upload_run(pictures, number):
        # Stack one run of images and upload it under its sequence number.
        stacked = combine_images(pictures)
        upload_image_to_blob(stacked, f"combined_image_{number}.jpg")

    run = []
    run_number = 1

    for page_num, _, picture in extract_images_from_pdf(blob_name):
        print("Page Num:", page_num)
        # An empty run, or a picture matching the previous size, extends the run.
        continues_run = not run or picture.size == run[-1].size
        if not continues_run:
            upload_run(run, run_number)
            run_number += 1
            run = []
        run.append(picture)

    # Upload the trailing run, which the loop never closes on its own.
    if run:
        upload_run(run, run_number)

    print(f"Images saved to blob storage container '{container_name}/output_folder'")


# Example usage
# The value is a blob name, not a local path: extract_images_from_pdf
# prefixes it with "input_folder/" inside the container.
pdf_path = "sample_pdf_with_continued_images_10_pages.pdf"
extract_and_combine_images(pdf_path)

## Example - Remove Duplicated Images from Output Folder in Storage Container

This cell removes duplicate images from the `output_folder` stored in the Azure storage account.

In [None]:
import hashlib
from azure.storage.blob import BlobServiceClient

folder_name = "output_folder"

# Maps each content hash to the name of the first blob seen with that content
hashes = {}

# List only the blobs inside the virtual folder. The trailing slash keeps the
# prefix from also matching sibling prefixes such as "output_folder_old/...",
# whose blobs would otherwise be compared and possibly deleted as well.
blobs_list = container_client.list_blobs(name_starts_with=f"{folder_name}/")

for blob in blobs_list:
    blob_client = container_client.get_blob_client(blob)
    blob_data = blob_client.download_blob().readall()

    # MD5 is used only as a content fingerprint, not for security
    blob_hash = hashlib.md5(blob_data).hexdigest()

    if blob_hash in hashes:
        # Same content as an earlier blob: delete this later copy
        print(f"Deleting duplicate blob: {blob.name}")
        blob_client.delete_blob()
    else:
        # First blob with this content: remember it and keep it
        hashes[blob_hash] = blob.name

print("Duplicate images removed.")
