In [214]:
%pip install python-docx Pillow wand opencv-python

Note: you may need to restart the kernel to use updated packages.


In [215]:
import zipfile
import xml.etree.ElementTree as ET
from docx import Document
import os
import cv2
import numpy as np
import subprocess
from pathlib import Path

In [216]:
def get_images_and_positions_with_dimensions(docx_path):
    """Locate embedded pictures in the paragraphs of a DOCX document.

    Every element inside a paragraph whose tag ends in ``}drawing`` is
    inspected; drawings that reference an embedded relationship id are
    reported together with the extent attributes found inside them.

    Args:
        docx_path: Value passed straight to ``Document``.

    Returns:
        A list of dicts, one per drawing that has an embed id, with the keys
        ``paragraph_index``, ``embed_id``, ``width``, ``height`` and
        ``paragraph_text``. ``width``/``height`` are the raw ``cx``/``cy``
        attribute values of the last extent element seen (``None`` if the
        drawing has none).
    """
    found = []

    for index, para in enumerate(Document(docx_path).paragraphs):
        # Lazily pick out the drawing elements of this paragraph
        drawing_nodes = (
            node for node in para._element.iter()
            if node.tag.endswith('}drawing')
        )

        for drawing_node in drawing_nodes:
            cx = cy = rel_id = None

            for child in drawing_node.iter():
                tag = child.tag
                if tag.endswith('}extent'):
                    # Dimensions of the drawing
                    cx, cy = child.get('cx'), child.get('cy')
                elif tag.endswith('}blip'):
                    # First attribute named "...}embed" holds the relationship ID
                    candidate = next(
                        (value for key, value in child.attrib.items()
                         if key.endswith('}embed')),
                        None,
                    )
                    if candidate is not None:
                        rel_id = candidate

            # Drawings without an embedded picture are not reported
            if not rel_id:
                continue

            found.append({
                'paragraph_index': index,
                'embed_id': rel_id,
                'width': cx,
                'height': cy,
                'paragraph_text': para.text
            })

    return found

In [217]:
def extract_image_files(docx_path, output_dir='images'):
    """Copy every file stored under ``word/media/`` in a DOCX to ``output_dir``.

    Args:
        docx_path: Path of the ``.docx`` file; it is opened as a ZIP archive.
        output_dir: Directory that receives the files. It is created
            (including parents) when it does not exist yet.

    Returns:
        dict mapping each extracted archive member name
        (e.g. ``word/media/image1.png``) to the path of the file written.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    image_files = {}

    with zipfile.ZipFile(docx_path, 'r') as docx_zip:
        for file_info in docx_zip.infolist():
            if not file_info.filename.startswith('word/media/'):
                continue
            # A directory entry such as "word/media/" has an empty basename;
            # writing it would try to open output_dir itself as a file.
            if file_info.is_dir():
                continue

            image_name = os.path.basename(file_info.filename)
            image_path = os.path.join(output_dir, image_name)

            with open(image_path, 'wb') as f:
                f.write(docx_zip.read(file_info))

            image_files[file_info.filename] = image_path

    return image_files

In [218]:
def cleanbackground(file_path):
    """Flatten an image's transparency onto a white background, in place.

    The file is read with its alpha channel intact. A four-channel (BGR +
    alpha) image has each colour channel alpha-blended over white and the
    three-channel result overwrites the original file. Any other image
    (including 2-D greyscale) is written back with unchanged pixel data.

    Args:
        file_path: Path of the image file to rewrite.

    Raises:
        ValueError: If OpenCV cannot read the file.
        OSError: If OpenCV reports that the result could not be written.
    """
    file_path = str(file_path)
    image = cv2.imread(file_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"Could not read image: {file_path}")

    # Greyscale images decode to a 2-D array, so check ndim before shape[2]
    if image.ndim == 3 and image.shape[2] == 4:
        colour = image[:, :, :3].astype(float)
        alpha = image[:, :, 3:4].astype(float) / 255

        # colour * alpha + white * (1 - alpha); astype truncates to uint8
        result = (colour * alpha + 255.0 * (1 - alpha)).astype(np.uint8)
    else:
        result = image

    if not cv2.imwrite(file_path, result):
        raise OSError(f"Could not write image: {file_path}")



In [219]:
def convert_to_png(file_path, output_dir):
    """Convert an EMF/WMF file to PNG with Inkscape, then delete the source.

    Files whose extension is not ``.emf`` or ``.wmf`` (case-insensitive) are
    left alone. For the others, Inkscape renders ``<stem>.png`` into
    ``output_dir``, the PNG's transparency is flattened with
    ``cleanbackground`` and the original vector file is removed.

    Args:
        file_path: Path of the candidate image file.
        output_dir: Directory in which the PNG is written.

    Returns:
        The path of the PNG that was written, or ``None`` if the file was
        skipped.

    Raises:
        subprocess.CalledProcessError: If Inkscape exits with a non-zero status.
    """
    # splitext keeps dots inside the stem ("fig.v2.emf" -> "fig.v2")
    stem, ext = os.path.splitext(os.path.basename(file_path))
    if ext.lower() not in ('.emf', '.wmf'):
        return None

    png_path = os.path.join(output_dir, f"{stem}.png")
    subprocess.run([
        "inkscape",
        "--actions=page-fit-to-selection",
        "-o", png_path,
        file_path
        ], check=True)

    cleanbackground(png_path)
    # Delete the source last so it survives a failed conversion or clean-up;
    # os.remove also avoids depending on an external "rm" binary.
    os.remove(file_path)
    return png_path

In [220]:
docx_file = "files/38413-h20 (1).docx"  # Replace with your file path

# Name the output folder after the document (file name without its extension)
folder_name = os.path.splitext(os.path.basename(docx_file))[0]

# Create the full path: ./Images/<folder_name>
output_dir = Path("Images") / folder_name
output_dir.mkdir(parents=True, exist_ok=True)

# Get image positions
images = get_images_and_positions_with_dimensions(docx_file)

# Extract image files
image_files = extract_image_files(docx_file, output_dir)

# Display results
print(f"Found {len(images)} images:")
for number, img in enumerate(images, start=1):
    print(f"Image {number}:")
    print(f"  - Paragraph: {img['paragraph_index']}")
    # Labelled "Text" so it is not confused with the paragraph index above
    print(f"  - Text: {img['paragraph_text']}")
    print(f"  - Embed ID: {img['embed_id']}")
    if img['width'] and img['height']:
        print(f"  - Dimensions: {img['width']} x {img['height']} EMUs")
    print()

# Report the folder that was actually written to, not a hard-coded name
print(f"Extracted {len(image_files)} image files to '{output_dir}' folder")
print(image_files)

Found 2 images:
Image 1:
  - Paragraph: 7
  - Paragraph: 	
  - Embed ID: rId9
  - Dimensions: 1212215 x 1212215 EMUs

Image 2:
  - Paragraph: 7
  - Paragraph: 	
  - Embed ID: rId10
  - Dimensions: 1625600 x 948055 EMUs

Extracted 95 image files to 'images/' folder
{'word/media/image90.emf': 'Images/38413-h20 (1)/image90.emf', 'word/media/image1.jpeg': 'Images/38413-h20 (1)/image1.jpeg', 'word/media/image2.png': 'Images/38413-h20 (1)/image2.png', 'word/media/image3.emf': 'Images/38413-h20 (1)/image3.emf', 'word/media/image4.emf': 'Images/38413-h20 (1)/image4.emf', 'word/media/image5.emf': 'Images/38413-h20 (1)/image5.emf', 'word/media/image6.emf': 'Images/38413-h20 (1)/image6.emf', 'word/media/image7.emf': 'Images/38413-h20 (1)/image7.emf', 'word/media/image8.emf': 'Images/38413-h20 (1)/image8.emf', 'word/media/image9.emf': 'Images/38413-h20 (1)/image9.emf', 'word/media/image10.emf': 'Images/38413-h20 (1)/image10.emf', 'word/media/image11.emf': 'Images/38413-h20 (1)/image11.emf', 'word/

In [221]:
# Show the folder the media files were extracted into
print(output_dir)

Images/38413-h20 (1)


In [222]:
# convert_to_png() writes PNGs into and deletes EMF/WMF files from this same
# folder, and pathlib leaves it unspecified whether such changes show up in a
# live iterdir() iterator, so work from a sorted snapshot of the listing.
for file_path in sorted(output_dir.iterdir()):
    if file_path.is_file():
        convert_to_png(file_path, output_dir)
