# 0. Install Packages
pip install comtypes

# 1. Import Modules

In [1]:
from tkinter.filedialog import askopenfilename
import comtypes.client
import os

# 2. Word Constants

In [2]:
# Numeric values of the Word / Office enumeration members used below.

# WdSaveFormat -- target formats for Document.SaveAs
wdFormatDocumentDefault = 16    # Word default document format (DOCX)
wdFormatFilteredHTML = 10       # filtered HTML
wdFormatWebArchive = 9          # web archive

# WdPasteDataType / WdOLEPlacement -- arguments for Selection.PasteSpecial
# Alternatives that were tried and rejected:
#   wdPasteMetafilePicture = 3          -> gives not as good results as EMF
#   wdPasteDeviceIndependentBitmap = 5  -> does not work
wdPasteEnhancedMetafile = 9
wdInLine = 0

# WdLineSpacing
wdLineSpaceAtLeast = 3

# Shape type codes: inline shapes (Word) and floating shapes (Office)
wdInlineShapePicture = 3
msoPicture = 13

# WdInformation -- selectors for Range.Information
wdActiveEndPageNumber = 3
wdNumberOfPagesInDocument = 4
wdHorizontalPositionRelativeToPage = 5
wdVerticalPositionRelativeToPage = 6

# 3. Clean Word Document

In [3]:
# Ask the user which Word document to clean. The dialog returns an empty
# value when it is cancelled; stop here instead of starting Word and asking
# it to open the current directory.
filename = askopenfilename()
if not filename:
    raise SystemExit('No input file selected.')

in_file = os.path.abspath(filename)
print('Input file: ' + in_file)

# The output path carries no extension; the same base name is reused for
# every SaveAs call below (DOCX, filtered HTML, web archive).
out_file = os.path.splitext(in_file)[0] + '_clean'
print('Output file: ' + out_file)

# WdSaveOptions.wdDoNotSaveChanges: close without prompting. Everything that
# should be kept has been saved explicitly by the time Close is called.
wdDoNotSaveChanges = 0

word = comtypes.client.CreateObject('Word.Application')
doc = None
try:
    doc = word.Documents.Open(in_file)
    # Save under the new name first so all later edits go to the copy,
    # not to the input file.
    doc.SaveAs(out_file, FileFormat=wdFormatDocumentDefault)

    # Walk the tables from last to first: replacing a table with a picture
    # removes it from doc.Tables, which would shift the indices of any
    # tables after it.
    for tableIndex in range(doc.Tables.Count, 0, -1):
        table = doc.Tables.Item(tableIndex)
        table.Select()
        selection = word.Selection

        # Ranges adjacent to the selection, used only to report where the
        # table begins and ends. They must be fetched before the cut below.
        prevRange = selection.Previous()
        nextRange = selection.Next()

        containsInlineImages = False
        containsInlineShapes = False
        containsOverlayImages = False
        containsOverlayShapes = False

        for inlineShape in selection.Range.InlineShapes:
            if inlineShape.Type == wdInlineShapePicture:
                containsInlineImages = True
            else:
                containsInlineShapes = True

        for shape in selection.Range.ShapeRange:
            if shape.Type == msoPicture:
                containsOverlayImages = True
            else:
                containsOverlayShapes = True

        print('')
        print('Checking table {}'.format(tableIndex))
        # Position is printed as (vertical/horizontal) relative to the page.
        for verb, boundary in (('starts', prevRange), ('ends', nextRange)):
            print('The selection {} on page {} of {} ({}/{})'.format(
                verb,
                boundary.Information(wdActiveEndPageNumber),
                boundary.Information(wdNumberOfPagesInDocument),
                boundary.Information(wdVerticalPositionRelativeToPage),
                boundary.Information(wdHorizontalPositionRelativeToPage)))
        print('The selection contains')
        if containsInlineImages:
            print('* inline images')
        if containsInlineShapes:
            # Previously mislabelled as "inline images".
            print('* inline shapes')
        if containsOverlayImages:
            print('* overlay images')
        if containsOverlayShapes:
            print('* overlay shapes')

        if (containsInlineImages or containsInlineShapes
                or containsOverlayImages or containsOverlayShapes):
            # Replace the whole table by an inline enhanced-metafile picture
            # of itself.
            selection.Cut()
            selection.PasteSpecial(
                Link=False,
                DataType=wdPasteEnhancedMetafile,
                Placement=wdInLine,
                DisplayAsIcon=False)
            selection.ParagraphFormat.LineSpacingRule = wdLineSpaceAtLeast
finally:
    # Always shut Word down, even if opening, processing or saving failed;
    # otherwise a Word process is left running.
    try:
        if doc is not None:
            try:
                # As before, whatever was processed is saved even when the
                # loop above raised; the exception still propagates.
                doc.Save()

                doc.WebOptions.AllowPNG = True

                doc.SaveAs(out_file, FileFormat=wdFormatFilteredHTML)
                doc.SaveAs(out_file, FileFormat=wdFormatWebArchive)
            finally:
                doc.Close(wdDoNotSaveChanges)
    finally:
        word.Quit()

Input file: D:\_Customers\EMA\EPI\code\data\WordPIs\Abasaglar-h-2835-en.docx
Output file: D:\_Customers\EMA\EPI\code\data\WordPIs\Abasaglar-h-2835-en_clean

Checking table 20
The selection starts on page 105 of 106 (69.69999694824219/70.8499984741211)
The selection ends on page 1 of 106 (384.75/70.8499984741211)
The selection contains
* overlay images

Checking table 19
The selection starts on page 103 of 106 (579.4500122070312/70.8499984741211)
The selection ends on page 105 of 106 (56.45000076293945/70.8499984741211)
The selection contains
* overlay images

Checking table 18
The selection starts on page 102 of 105 (565.0499877929688/214.85000610351562)
The selection ends on page 103 of 105 (421.6499938964844/70.8499984741211)
The selection contains
* overlay images
* overlay shapes

Checking table 17
The selection starts on page 102 of 105 (56.45000076293945/70.8499984741211)
The selection ends on page 102 of 105 (475.79998779296875/70.8499984741211)
The selection contains
* overlay 