# Extract images from PDFs

- [The Python Code: Extract PDF images in Python](https://www.thepythoncode.com/article/extract-pdf-images-in-python)
- Two ways of organizing the extracted images for the image generation tests: one folder per paper (Method 1) or a single shared folder (Method 2)

# Import libraries

In [None]:
import io
import os
import shutil

# https://www.thepythoncode.com/article/extract-pdf-images-in-python
import fitz # PyMuPDF
import pandas as pd
from PIL import Image

# Import bik_df

In [None]:
%store -r bik_df

# Method 1: Save images to individual folders (by paper)

In [None]:
def extract_images(df, file, indx):
    """Save every image embedded in one PDF into a folder named after the paper.

    The target folder is
    ``../image_generation/images/extracted_images/<title>-images``, where
    ``<title>`` is ``df["Title"][indx]`` truncated to 240 characters with
    ``/`` replaced by ``-`` so the title cannot introduce extra path levels.
    Each image is written as ``image<page>_<n>.<ext>`` using 1-based page and
    per-page image numbers.

    Args:
        df: Table with a ``"Title"`` column that can be indexed by ``indx``.
        file: Path of the PDF to open with PyMuPDF.
        indx: Label of the paper's row in ``df``; also printed as progress.

    Images that cannot be extracted, decoded or saved are reported and
    skipped. Errors raised while opening the PDF, reading the title or
    creating the folder propagate to the caller.
    """
    print("------------------------------")
    print("INDEX: ", indx)

    # Open file
    pdf_file = fitz.open(file)
    try:
        # Create folder for images (including missing parent directories)
        folder_location = "../image_generation/images/extracted_images/" + df["Title"][indx][:240].replace("/","-") + "-images"
        os.makedirs(folder_location, exist_ok=True)

        # Iterate over PDF pages
        for page_index, page in enumerate(pdf_file):
            image_list = page.get_images()
            # Print number of images found on page
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for image_index, img in enumerate(image_list, start=1):
                try:
                    # Get image XREF
                    xref = img[0]
                    # Extract image bytes and the extension PyMuPDF reports
                    base_image = pdf_file.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    # Load to PIL
                    image = Image.open(io.BytesIO(image_bytes))
                    # Save to local disk; the context manager closes the
                    # handle even when saving fails
                    target = f"{folder_location}/image{page_index+1}_{image_index}.{image_ext}"
                    with open(target, "wb") as out_file:
                        image.save(out_file)
                except Exception as exc:
                    # Best effort: one bad image must not abort the whole PDF,
                    # but the failure should be visible.
                    print(f"[!] Skipping image {image_index} on page {page_index}: {type(exc).__name__}: {exc}")
                    continue
    finally:
        # Release the document even if folder creation or a page fails
        pdf_file.close()

In [None]:
# Call extract_images for each PDF
# PDFs named by index
for i in range(0,214):
    pdf_file = f"../data_preprocess/PDFS/{i}.pdf"
    try:
        extract_images(bik_df,pdf_file,i)
    except Exception as exc:
        # Keep going with the remaining PDFs, but report why this one was
        # skipped. Catching Exception (not a bare except) leaves
        # KeyboardInterrupt free to stop the loop.
        print(f"[!] Skipping PDF {i}: {type(exc).__name__}: {exc}")

# Method 2: Save all images to one folder

In [None]:
def extract_images_new(df, file, indx):
    """Save every image embedded in one PDF into a single shared folder.

    All PDFs write into ``../image_generation/images/extracted_images_new``.
    Each image is written as ``image<indx>_<page>_<n>.<ext>`` using 1-based
    page and per-page image numbers, so ``indx`` keeps files from different
    PDFs apart.

    Args:
        df: Not used by this function; kept so the signature matches
            ``extract_images``.
        file: Path of the PDF to open with PyMuPDF.
        indx: Identifier of the PDF; used in the file names and printed as
            progress.

    Images that cannot be extracted, decoded or saved are reported and
    skipped. Errors raised while opening the PDF or creating the folder
    propagate to the caller.
    """
    print("------------------------------")
    print("INDEX: ", indx)

    # Open file
    pdf_file = fitz.open(file)
    try:
        # Create folder for images (including missing parent directories)
        folder_location = "../image_generation/images/extracted_images_new"
        os.makedirs(folder_location, exist_ok=True)

        # Iterate over PDF pages
        for page_index, page in enumerate(pdf_file):
            image_list = page.get_images()
            # Print number of images found on page
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for image_index, img in enumerate(image_list, start=1):
                try:
                    # Get image XREF
                    xref = img[0]
                    # Extract image bytes and the extension PyMuPDF reports
                    base_image = pdf_file.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    # Load to PIL
                    image = Image.open(io.BytesIO(image_bytes))
                    # Save to local disk; the context manager closes the
                    # handle even when saving fails
                    target = f"{folder_location}/image{indx}_{page_index+1}_{image_index}.{image_ext}"
                    with open(target, "wb") as out_file:
                        image.save(out_file)
                except Exception as exc:
                    # Best effort: one bad image must not abort the whole PDF,
                    # but the failure should be visible.
                    print(f"[!] Skipping image {image_index} on page {page_index}: {type(exc).__name__}: {exc}")
                    continue
    finally:
        # Release the document even if folder creation or a page fails
        pdf_file.close()

In [None]:
# Call extract_images_new for each PDF
# PDFs named by index
for i in range(0,214):
    pdf_file = f"../data_preprocess/PDFS/{i}.pdf"
    try:
        extract_images_new(bik_df,pdf_file,i)
    except Exception as exc:
        # Keep going with the remaining PDFs, but report why this one was
        # skipped. Catching Exception (not a bare except) leaves
        # KeyboardInterrupt free to stop the loop.
        print(f"[!] Skipping PDF {i}: {type(exc).__name__}: {exc}")