# End-Word
Combine an Excel data table with Word text and images
- Format must be easily controlled


In [15]:
# Standard imports
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os
import io
import datetime
from pprint import pprint

# Docx imports
import docx  # To read docx and extract data
from docxtpl import DocxTemplate, InlineImage  # To pass images to new doc
from docx.shared import Mm, Inches, Pt  # To preserve image sizes
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK, WD_BREAK_TYPE  # To get paragraph justification types
from docx.enum.section import WD_SECTION  # To get word sections
from docx.enum.dml import MSO_THEME_COLOR # Get theme colors
from docx.oxml import OxmlElement  # For checking borders
from docx.oxml.ns import qn  # For checking borders
from docx.text.paragraph import Paragraph

# Openpy imports
from openpyxl import load_workbook
from openpyxl import styles

# ---
# Path setup
# Get paths to sample content and data
sample_path = os.path.join(os.curdir,'samples')

for dirname, dirnames, filenames in os.walk(sample_path):
    for fname in filenames:
        if 'docx' in fname:
            word_content = os.path.join(dirname,fname)
        elif 'xlsx' in fname:
            excel_content = os.path.join(dirname,fname)

# Create empty output file as placeholder
output = docx.Document()
output_path = os.path.join(sample_path,'output.docx')
output.save(output_path)

# Initiate template path to title_page
title_page = os.path.join(sample_path,'1_title_template.docx')

# ---
# Helper functions
def get_para_data(output_doc_name, paragraph):
    """
    Write the run to the new file and then set its font, bold, alignment, color etc. data.
    """
    output_para = output_doc_name.add_paragraph(style=paragraph.style.name)
    
    for run in paragraph.runs:
        output_run = output_para.add_run(run.text)
        # Run's bold data
        output_run.bold = run.bold
        # Run's italic data
        output_run.italic = run.italic
        # Run's underline data
        output_run.underline = run.underline
        # Run's color data
        output_run.font.color.rgb = run.font.color.rgb
        # Run's font data
        output_run.style.name = run.style.name
    
    # Paragraph's alignment data
    output_para.alignment = paragraph.alignment

def style_tbl(table, xls_formats):
    # Helpers
    def borders(xls_format):
        tcPr = cell.tcPr # get tcPr element, in which we can define table styles
        tcBorders = OxmlElement('w:tcBorders')
        
        for position in xls_format['border'].keys():
            # Map xls border format to xml format
            if xls_format['border'][position]:
                val = 'single'
            else:
                val = 'nil'
            
            # Set border formats on obj
            # More options at http://officeopenxml.com/WPtableBorders.php
            side = OxmlElement(f'w:{position}')
            side.set(qn('w:val'), val)
            side.set(qn('w:sz'), '8')  # sz 2 = 1/4 pt
            side.set(qn('w:space'), '0')
            side.set(qn('w:color'), 'auto')
            side.set(qn('w:shadow'), 'false')
            
            tcBorders.append(side)
        tcPr.append(tcBorders)
        

    def fill_align(xls_format):
        # https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.shading?view=openxml-2.8.1
        tcPr = cell.tcPr # get tcPr element, in which we can define table styles
        
        # Set cell fill 
        fillshade = OxmlElement('w:shd')
        fillshade.set(qn('w:fill'), xls_format['fillColor'])
        tcPr.append(fillshade)
        
        # Set alignment
        vAlign = OxmlElement('w:vAlign')
        try:
            vAlign.set(qn('w:val'), xls_format['vertical'])
            tcPr.append(vAlign)
        except TypeError:
            print(f'No vertical alignment for a table @ cell {coord} - skipping...')
            pass
        
        
    def fonts(xls_format):
        # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/text/font-color.html
        run = cell.p_lst[0].r_lst[0]
        rPr = run._add_rPr()
        
        # Set font color
        if xls_format['fontColor']:
            fontColor = OxmlElement('w:color')
            fontColor.set(qn('w:val'), xls_format['fontColor'])
            rPr.append(fontColor)
        
        # Set bold
        if xls_format['bold']:
            fontBold = OxmlElement('w:b')
            rPr.append(fontBold)
        
    # Main
    tbl = table._tbl # get xml element of the table
    for cell in tbl.iter_tcs():
        coord = (cell.bottom-1, cell._grid_col)
        
        borders(xls_formats[coord])
        fill_align(xls_formats[coord])
        fonts(xls_formats[coord])
        

In [16]:
# Convert openpyxl theme colours to rgb...
from colorsys import rgb_to_hls, hls_to_rgb

RGBMAX = 0xff  # Corresponds to 255
HLSMAX = 240  # MS excel's tint function expects that HLS is base 240. see:
# https://social.msdn.microsoft.com/Forums/en-US/e9d8c136-6d62-4098-9b1b-dac786149f43/excel-color-tint-algorithm-incorrect?forum=os_binaryfile#d3c2ac95-52e0-476b-86f1-e2a697f24969

def rgb_to_ms_hls(red, green=None, blue=None):
    """Converts rgb values in range (0,1) or a hex string of the form '[#aa]rrggbb' to HLSMAX based HLS, (alpha values are ignored)"""
    if green is None:
        if isinstance(red, str):
            if len(red) > 6:
                red = red[-6:]  # Ignore preceding '#' and alpha values
            blue = int(red[4:], 16) / RGBMAX
            green = int(red[2:4], 16) / RGBMAX
            red = int(red[0:2], 16) / RGBMAX
        else:
            red, green, blue = red
    h, l, s = rgb_to_hls(red, green, blue)
    return (int(round(h * HLSMAX)), int(round(l * HLSMAX)), int(round(s * HLSMAX)))

def ms_hls_to_rgb(hue, lightness=None, saturation=None):
    """Converts HLSMAX based HLS values to rgb values in the range (0,1)"""
    if lightness is None:
        hue, lightness, saturation = hue
    return hls_to_rgb(hue / HLSMAX, lightness / HLSMAX, saturation / HLSMAX)

def rgb_to_hex(red, green=None, blue=None):
    """Converts (0,1) based RGB values to a hex string 'rrggbb'"""
    if green is None:
        red, green, blue = red
    return ('%02x%02x%02x' % (int(round(red * RGBMAX)), int(round(green * RGBMAX)), int(round(blue * RGBMAX)))).upper()


def get_theme_colors(wb):
    """Gets theme colors from the workbook"""
    # see: https://groups.google.com/forum/#!topic/openpyxl-users/I0k3TfqNLrc
    from openpyxl.xml.functions import QName, fromstring
    xlmns = 'http://schemas.openxmlformats.org/drawingml/2006/main'
    root = fromstring(wb.loaded_theme)
    themeEl = root.find(QName(xlmns, 'themeElements').text)
    colorSchemes = themeEl.findall(QName(xlmns, 'clrScheme').text)
    firstColorScheme = colorSchemes[0]

    colors = []

    for c in ['lt1', 'dk1', 'lt2', 'dk2', 'accent1', 'accent2', 'accent3', 'accent4', 'accent5', 'accent6']:
        accent = firstColorScheme.find(QName(xlmns, c).text)

        if 'window' in accent.getchildren()[0].attrib['val']:
            colors.append(accent.getchildren()[0].attrib['lastClr'])
        else:
            colors.append(accent.getchildren()[0].attrib['val'])

    return colors

def tint_luminance(tint, lum):
    """Tints a HLSMAX based luminance"""
    # See: http://ciintelligence.blogspot.co.uk/2012/02/converting-excel-theme-color-and-tint.html
    if tint < 0:
        return int(round(lum * (1.0 + tint)))
    else:
        return int(round(lum * (1.0 - tint) + (HLSMAX - HLSMAX * (1.0 - tint))))

def theme_and_tint_to_rgb(wb, theme, tint):
    """Given a workbook, a theme number and a tint return a hex based rgb"""
    rgb = get_theme_colors(wb)[theme]
    h, l, s = rgb_to_ms_hls(rgb)
    return rgb_to_hex(ms_hls_to_rgb(h, tint_luminance(tint, l), s))

In [17]:
# Main functions
def append_word(dest, data, two_columns=False):
    '''Appends content from the Word source to the destination Word doc - supports both text and images
    
    Parameters
    ----------
    dest : str
        The file location of the destination word doc
    source: str
        The file location of the target Word source
    '''
    source = docx.Document(data)
    ims = [im for im in source.inline_shapes]

    # Persistent indexes
    im_addresses = []
    im_heights = []
    im_widths = []

    # Temp variables
    im_streams = []
    
    # Set section to have two columns if noted
    if two_columns:
        dest.paragraphs[-1].add_run().add_break(WD_BREAK.SECTION_CONTINUOUS)
        section = dest.section[-1]
        print(dir(section._sectPr))

    
    # Get image binary and metadata
    for im_idx, im in enumerate(ims):
        # Binary
        blip = im._inline.graphic.graphicData.pic.blipFill.blip
        rId = blip.embed
        doc_part = source.part
        image_part = doc_part.related_parts[rId]
        byte_data = image_part._blob
        image_stream = io.BytesIO(byte_data)
        im_streams.append(image_stream)

        # Metadata
        image_name = f'img_{im_idx}.jpeg'
        im_heights.append(im.height.mm)
        im_widths.append(im.width.mm)
        im_addresses.append(image_name)

        with open(image_name, "wb") as fh:
            fh.write(byte_data)
        fh.close()


    # Populate and save output
    paras = source.paragraphs
    im_idx = 0
    
    for para_idx, para in enumerate(paras):
        if(para.text):
            get_para_data(dest, para)

        root = ET.fromstring(para._p.xml)
        namespace = {'wp':"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"}

        inlines = root.findall('.//wp:inline',namespace)

        if(len(inlines) > 0):
            uid = f'img_{im_idx}'

            img = dest.add_paragraph()
            img.add_run().add_text("{{ " + uid + " }}")
            img.alignment = WD_ALIGN_PARAGRAPH.CENTER

            context[uid] = InlineImage(
                dest,
                im_addresses[im_idx],
                width=Mm(im_widths[im_idx]),
                height=Mm(im_heights[im_idx]),
            )
            im_idx += 1

    
def append_excel(dest, source, heading=None):
    '''Appends Excel data source to the destination Word doc as a Table
    
    Parameters
    ----------
    dest : str
        The file location of the destination word doc
    source: str
        The file location of the target Excel source
    heading: str
        A string that will be printed in the style of Heading 1 above the table in word (default is None)
    '''
    # Sub-functions
    def check_index(value):
        '''Returns index attribute value if it exists - otherwise returns None

        '''
        try:
            return value.index
        except AttributeError:
            return None
    
    
    def xl2doc_color(color_meta):
        if color_meta is None:
            return
        elif color_meta.type == 'theme':
            theme = color_meta.theme
            tint = color_meta.tint
            fillcolor = theme_and_tint_to_rgb(wb, theme, tint)
        elif color_meta.type == 'rgb':
            fillcolor = color_meta.rgb
        else:
            raise TypeError(f'Unrecognised {color_meta.type}. Check classes')
        return fillcolor
    
    
    # Main---
    if heading:
        dest.add_paragraph(style='Heading 1').add_run().add_text(heading)
    
    # Note: openpyxl cannot read/copy charts; it needs to recreate them from source data
    # Read-only and data-only increases the speed of reading data from workbooks
    wb = load_workbook(filename=source, data_only=True)
    ws = wb.active
    Colors = styles.colors.COLOR_INDEX
    
    # Get dimensions of table in Excel
    data_vals = np.asarray(tuple(ws.values))
    table_dim = np.shape(data_vals)
    
    # Store dict of formats
    src_fmts = {}
    
    for r,row in enumerate(ws.rows):
        for c,cell in enumerate(row):
            
            src_fmts[(r,c)] = {
                'bold': cell.font.b,
                'italic': cell.font.i,
                'name': cell.font.name,
                'size': cell.font.size,
                'fillColor': xl2doc_color(cell.fill.start_color), # 00000000 = no fill
                'fontColor': xl2doc_color(cell.font.color), # only supports theme colours
                'horizontal': cell.alignment.horizontal,
                'vertical': cell.alignment.vertical,
                'border': {
                    'top': cell.border.top.style,
                    'topColor': check_index(cell.border.top.color),
                    'bottom': cell.border.bottom.style,
                    'bottomColor': check_index(cell.border.bottom.color),
                    'left': cell.border.left.style,
                    'leftColor': check_index(cell.border.left.color),
                    'right': cell.border.right.style,
                    'rightColor': check_index(cell.border.right.color),
                }
            }
    
    # Create, fill, style table in Word
    table = dest.add_table(rows=table_dim[0], cols=table_dim[1])
    for r, row in enumerate(table.rows):
        for c, cell in enumerate(row.cells):
            cell.text = data_vals[r][c]
    style_tbl(table, src_fmts)
    
    
def publish():
    try:
        dest.render(context)
    except Exception as e:
        print(e)

    # 
    dest.save(output_path)
    print(f'Saved at {output_path}')

In [18]:
# Setup template and go to new page
dest = DocxTemplate(title_page)
dest.add_paragraph().add_run().add_break(WD_BREAK.PAGE)

# Instantiate connection to word content
context = {
    'title': 'Prototyping with Bob',
    'subtitle': 'Prepared by Yemeng Bob Jin for Yeqin Jim Jin',
    'date': datetime.date.today()
}

# Instantiate connection to excel content


# Append all content to destination
append_excel(dest,excel_content,heading='Jim and Bob profit split')
dest.add_paragraph().paragraph_format.space_after = Pt(36)  # Space between Excel table and Word doc
append_word(dest,word_content, two_columns=True)
publish()

KeyError: 3