In [1]:
import os
import json
import uuid
from docx import Document

class DocElement:
    """One node of an extracted document: a piece of text or an image.

    Instances are plain data holders; all fields are public and set in
    ``__init__``.
    """

    def __init__(self, type: str, content: str = None, level: int = None):
        """Create an element.

        Args:
            type: Kind of element, 'text' or 'image'.
            content: Payload of the element (stored as given).
            level: Optional level number for the element.
        """
        # Caller-supplied fields, stored verbatim.
        self.type, self.content, self.level = type, content, level
        # Tree links: no parent and no children until someone attaches them.
        self.parent, self.sections = None, []

class Doc:
    """Top-level container for an extracted document."""

    def __init__(self, name: str):
        """Create an empty document.

        Args:
            name: Name of the document; stored on the ``document`` attribute.
        """
        # `sections` starts empty and is filled by whoever builds the document.
        self.document, self.sections = name, []

class SerializableDocElement(json.JSONEncoder):
    """JSON encoder that can serialize ``Doc`` and ``DocElement`` objects.

    Pass it as ``cls=SerializableDocElement`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert ``obj`` into a structure the json module can encode.

        Args:
            obj: The object the json module could not serialize on its own.

        Returns:
            dict: For a text ``DocElement``: ``type``, ``level`` and ``text``.
                For an image ``DocElement``: ``type`` and ``image``. Either
                gains a ``sections`` list when the element has children.
                For a ``Doc``: ``name`` and ``sections``.

        Raises:
            TypeError: If ``obj`` is a ``DocElement`` whose ``type`` is neither
                'text' nor 'image', or if ``obj`` is any other object that the
                base encoder cannot serialize.
        """
        if isinstance(obj, DocElement):
            if obj.type == 'text':
                serialized = {
                    "type": "text",
                    "level": obj.level,
                    "text": obj.content
                }
            elif obj.type == 'image':
                serialized = {
                    "type": "image",
                    "image": obj.content
                }
            else:
                # Without this branch `serialized` would be unbound below and
                # the encoder would die with an UnboundLocalError; report the
                # real problem using the exception type json expects.
                raise TypeError(
                    f"Cannot serialize DocElement with unknown type {obj.type!r}"
                )

            # Children are only emitted when present, keeping leaf nodes compact.
            if obj.sections:
                serialized["sections"] = [self.default(section) for section in obj.sections]

            return serialized

        if isinstance(obj, Doc):
            return {
                "name": obj.document,
                "sections": [self.default(section) for section in obj.sections]
            }

        # Anything else: let the base class raise the standard TypeError.
        return super().default(obj)

def extract_images(docx_file, base_path):
    """
    Extract images from a docx file and save them to a specified directory.

    Every relationship of the main document part whose target reference
    contains "image" is written to ``base_path`` as ``image<N><ext>``, where
    ``N`` counts saved images starting at 1 and ``<ext>`` is derived from the
    part's content type ('.png' when the content type is not recognized).

    External (linked, not embedded) relationships are skipped: they have no
    target part inside the package, so there are no bytes to save.

    Args:
        docx_file (Document): Loaded docx document.
        base_path (str): Base path for saving images; created if missing.

    Returns:
        list: List of image file paths, in the order they were saved.
    """
    # Content type -> file extension; built once instead of per relationship.
    extension_by_content_type = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'image/bmp': '.bmp',
        'image/tiff': '.tiff',
        'image/svg+xml': '.svg'
    }

    image_paths = []
    os.makedirs(base_path, exist_ok=True)

    for rel in docx_file.part.rels.values():
        # Accessing `target_part` on an external relationship raises, which
        # previously aborted the whole extraction on documents with linked
        # images; skip those relationships instead.
        if rel.is_external:
            continue
        if "image" not in rel.target_ref:
            continue

        image_part = rel.target_part
        ext = extension_by_content_type.get(image_part.content_type, '.png')

        # Number files by how many images have been saved so far (1-based).
        unique_filename = f"image{len(image_paths) + 1}{ext}"
        image_save_path = os.path.join(base_path, unique_filename)

        with open(image_save_path, 'wb') as f:
            f.write(image_part.blob)

        image_paths.append(image_save_path)

    return image_paths


def list_paragraphs(file_path):
    """Read a .docx file into a flat ``Doc`` of text and image elements.

    Paragraphs are visited in document order. Each paragraph with
    non-whitespace text becomes a 'text' element; every embedded image found
    in the paragraph's runs is saved to an ``extracted_images`` directory next
    to the input file and becomes an 'image' element placed after that
    paragraph's text element.

    Text levels are assigned per paragraph style name, in order of first
    appearance: the first style seen gets level 0, the next new style 1, etc.

    Args:
        file_path (str): Path of the .docx file to read.

    Returns:
        Doc: Document named after the file's basename, with all elements
            appended directly to ``sections`` (no nesting is built here).
    """
    print(f"Processing file: {file_path}")

    result = Doc(os.path.basename(file_path))

    # Style name -> level number; the next free level is the mapping's size.
    level_by_style = {}

    document = Document(file_path)

    # Images are written alongside the source document.
    extracted_images_dir = os.path.join(os.path.dirname(file_path), 'extracted_images')
    os.makedirs(extracted_images_dir, exist_ok=True)

    # Lookup constants used while scanning runs for pictures.
    drawing_namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attribute = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
    extension_by_content_type = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'image/bmp': '.bmp',
        'image/tiff': '.tiff',
        'image/svg+xml': '.svg'
    }

    # Number of images written so far; also drives the file names.
    images_saved = 0

    for paragraph in document.paragraphs:
        style_name = paragraph.style.name
        # Non-breaking spaces are normalized to ordinary spaces.
        text = paragraph.text.replace('\u00a0', ' ')

        if text.strip():
            # A style seen for the first time claims the next level number.
            level = level_by_style.setdefault(style_name, len(level_by_style))
            result.sections.append(DocElement(type='text', content=text, level=level))

        # Pictures are located through the <a:blip> elements inside each run.
        for run in paragraph.runs:
            for blip in run._element.findall('.//a:blip', namespaces=drawing_namespaces):
                rId = blip.get(embed_attribute)
                if not rId:
                    continue

                image_part = document.part.rels[rId].target_part
                ext = extension_by_content_type.get(image_part.content_type, '.png')

                images_saved += 1
                image_save_path = os.path.join(extracted_images_dir, f"image{images_saved}{ext}")

                with open(image_save_path, 'wb') as f:
                    f.write(image_part.blob)

                # The image element follows the text of the paragraph it sits in.
                result.sections.append(DocElement(type='image', content=image_save_path))

    return result


def export_to_json(doc_obj, output_path):
    """Write ``doc_obj`` to ``output_path`` as indented UTF-8 JSON.

    Args:
        doc_obj: Object to serialize; encoded with ``SerializableDocElement``.
        output_path (str): Destination file; overwritten if it exists.
    """
    # Non-ASCII characters are written as-is rather than as \u escapes.
    dump_options = dict(cls=SerializableDocElement, indent=4, ensure_ascii=False)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(doc_obj, out_file, **dump_options)

# Example usage
def main(file_path, output_path='output.json'):
    """Extract a .docx file, save the result as JSON and print it.

    Args:
        file_path (str): Path of the .docx file to process.
        output_path (str): Where to write the JSON file. Defaults to
            'output.json' in the current working directory.
    """
    doc = list_paragraphs(file_path)
    export_to_json(doc, output_path)
    # Echo the same JSON to stdout so the result is visible in the notebook.
    print(json.dumps(doc, cls=SerializableDocElement, indent=4, ensure_ascii=False))





In [2]:
# NOTE(review): absolute, machine-specific path — this cell only runs where
# this exact file exists; consider making it configurable.
file_path = '/home/anushkas/Downloads/US/ProdDoc1/Generate Budget from DJC.docx'

# Processes the document: prints the resulting JSON, writes it to a JSON file,
# and saves embedded images to an 'extracted_images' directory next to the .docx.
main(file_path)

Processing file: /home/anushkas/Downloads/US/ProdDoc1/Generate Budget from DJC.docx
{
    "name": "Generate Budget from DJC.docx",
    "sections": [
        {
            "type": "text",
            "level": 0,
            "text": "Generate Budget from DJC"
        },
        {
            "type": "text",
            "level": 1,
            "text": "The “Generate Budget from DJC” function allows user to create new estimate with selected line items or all line items and generate budget from the “Cost Total” of the Line items."
        },
        {
            "type": "text",
            "level": 1,
            "text": "Generate Budget from DJC"
        },
        {
            "type": "image",
            "image": "/home/anushkas/Downloads/US/ProdDoc1/extracted_images/image1.png"
        },
        {
            "type": "text",
            "level": 1,
            "text": "Step 1. Select Line Item or All Line Items First step to select for which line items generate budget from DJC. There