bash0 · AnEnglishmanInNorway · Jun 9, 2025 · Jun 3, 2025 · Jun 3, 2025 · Jun 3, 2025
diff --git a/.gitignore b/.gitignore
@@ -65,3 +65,5 @@ nosetests.xml
 /tests/*/*.mcf.*.pdf
 /tests/*/*.mcfx.*.pdf
 /tests/temp
+/junk*.pdf
+/tests/testIndex/test_index.mcf.S.idx.png
diff --git a/README.md b/README.md
@@ -140,6 +140,49 @@ noShadows = False
 #	root:            ERROR[2], WARNING[4], INFO[38]
 ```
 
+#### Indexing an album
+It is possible to ask cewe2pdf to generate an index for the album, where index terms are selected using a combination of the font and font size used in a text area. The index is initially generated as a separate pdf file with black text on a white background. The index pdf is used to create an index image file, a png in which the background is transparent. That png image is then merged into the album pdf, being placed on any page containing an index marker identifier.
+
+This feature may be useful in, for example, an album which represents a day-by-day record of some period of time. The headings for each day in the album can be specified in a font/fontsize combination which is not used for any other purpose in the album, and the index will then present a short day-by-day summary with page number references.
+
+It is normal to allow cewe2pdf to delete the index pdf but to retain the index png. That allows you to manually insert the index png onto the index page in the album editor, and thus have it as part of the album which is sent for quality printing (if you do that!). If you rerun the album pdf generation, creating a new index png to be merged into the album, the merge process will remove any old index png from the index page before adding the new one (based on best-effort recognition of the image in the pdf!)
+
+The page on which the index is to be placed is recognised by the presence of a text on the page. The text is identified with a regular expression defined in the .ini file, and would often be a visible text such as "Contents". If you don't want a visible text, you can always set the colour of the text to "None". Other things on the index page (photos, clip-art, text, etc) are left undisturbed and should be visible since the background of the index image is transparent.
+
+There are a host of index configuration options which can be specified in a separate section of the .ini file. No indexing will take place unless there is an __INDEX__ section and its __indexing__ value is __True__.
+```
+[INDEX]
+indexing = False
+indexEntryFonts =
+	Arial Rounded MT Bold, 15
+indexFont = Helvetica
+indexFontSize = 12
+lineSpacing = 1.1
+pageWidth = 210
+pageHeight = 291 # A4 is 297. 291 is the size of the paper in a 30x30 album
+indexMarkerRegex = ^Contents$
+topMargin = 5
+bottomMargin = 0
+leftMargin = 7
+rightMargin = 7
+deleteIndexPdf = True
+deleteIndexPng = False
+```
+__indexEntryFonts__ specifies one or more font / font size combinations which will be used to recognise index terms in the album.
+
+__indexFont, indexFontSize, lineSpacing, pageWidth, pageHeight__ determine how the index entries are formatted on the index pdf page
+
+__indexMarkerRegex__ specifies the regular expression against which all text items in the album are tested. Any page with a matching text will be used for insertion of the index png
+
+__topMargin__ etc determine the placement of the index png on the index page. The image is scaled appropriately to fit.
+
+__deleteIndexPdf__ etc determine whether or not the generated files are deleted after the album pdf has been updated.
+
+There are also margin settings for the creation of the index pdf, __pdfTopMargin__ etc. These may be useful if you intend to keep and use the generated index pdf, but default to 1 so that the pdf page is filled and the image margins are the most important.
+
+#### Large index limitations
+The current code only handles a single index page. If there are more index terms than fit on a single page, the index pdf will be correct, but the index image will only take the first page.
+
 ### additional_fonts.txt
 The code knows where to find the fonts delivered with the Cewe software, but if you use non-Cewe fonts then you must specify the location of those fonts. For historical reasons configuration of fonts is done with a separate (optional) configuration file, ``additional_fonts.txt``. The file should contain one line per font file or font directory to be added. Both `.ttf` or `.otf` files are read.
 

diff --git a/cewe2pdf.py b/cewe2pdf.py
@@ -102,7 +102,9 @@
 from pageNumbering import getPageNumberXy, PageNumberingInfo, PageNumberPosition
 from passepartout import Passepartout
 from pathutils import findFileInDirs
-from text import AppendItemTextInStyle, AppendSpanEnd, AppendSpanStart, AppendText, CollectFontInfo, CreateParagraphStyle, Dequote, noteFontSubstitution
+from text import AppendItemTextInStyle, AppendSpanEnd, AppendSpanStart, AppendText
+from text import CollectFontInfo, CollectItemFontFamily, CreateParagraphStyle, Dequote, noteFontSubstitution
+from index import Index
 from textart import handleTextArt
 
 
@@ -170,6 +172,7 @@ def __str__(self):
 # pdf_styleN = pdf_styles['Normal']
 pdf_flowableList = []
 
+albumIndex = None # set after we have got the configuration information
 clipartDict = dict[int, str]()    # a dictionary for clipart element IDs to file name
 clipartPathList = tuple[str]()
 passepartoutDict = None    # will be dict[int, str] for passepartout designElementIDs to file name
@@ -583,6 +586,7 @@ def processDecorationShadow(decoration, areaHeight, areaWidth, pdf):
         frm_table.wrapOn(pdf, shadowWidth, shadowHeight)
         frm_table.drawOn(pdf, shadowBottomLeft_x, shadowBottomLeft_y)
 
+
 def warnAndIgnoreEnabledDecorationShadow(decoration):
     if getConfigurationBool(defaultConfigSection, "noShadows", "False"):
         return
@@ -594,7 +598,7 @@ def warnAndIgnoreEnabledDecorationShadow(decoration):
                 continue
 
 
-def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy): # noqa: C901 (too complex)
+def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy, pgno): # noqa: C901 (too complex)
     # note: it would be better to use proper html processing here
     htmlxml = etree.XML(textTag.text)
     body = htmlxml.find('.//body')
@@ -675,6 +679,10 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
     # unset by CreateParagraphStyle
     # pdf_styleN.backColor = reportlab.lib.colors.HexColor("0xFFFF00")
 
+    # There may be multiple "index entry" paragraphs in the text area.
+    # Concatenating them to just one index entry seems to work in practice
+    indexEntryText = None
+
     htmlparas = body.findall(".//p")
     for p in htmlparas:
         maxfs = 0  # cannot use the bodyfs as a default, there may not actually be any text at body size
@@ -710,6 +718,9 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
             usefs = maxfs if maxfs > 0 else bodyfs
             pdf_styleN.leading = usefs * finalLeadingFactor # line spacing (text + leading)
             pdf_flowableList.append(Paragraph(paragraphText, pdf_styleN))
+            originalFont = CollectItemFontFamily(p, family)
+            if albumIndex.CheckForIndexEntry(originalFont, bodyfs):
+                indexEntryText = Index.AppendIndexText(indexEntryText, p.text)
 
         else:
             paragraphText = '<para autoLeading="max">'
@@ -748,6 +759,9 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
 
                     if span.text is not None:
                         paragraphText = AppendText(paragraphText, html.escape(span.text))
+                        originalFont = CollectItemFontFamily(span, family)
+                        if albumIndex.CheckForIndexEntry(originalFont, spanfs):
+                            indexEntryText = Index.AppendIndexText(indexEntryText, span.text)
 
                     # there might be (one or more, or only one?) line break within the span.
                     brs = span.findall(".//br")
@@ -783,6 +797,9 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
             except Exception:
                 logging.exception('Exception')
 
+    if indexEntryText:
+        albumIndex.AddIndexEntry(pgno, indexEntryText)
+
     # Add a frame object that can contain multiple paragraphs. Margins (padding) are specified in
     # the editor in mm, arriving in the mcf in 1/10 mm, but appearing in the html with the unit "px".
     # This is a bit strange, but ignoring the "px" and using mcf2rl seems to work ok.
@@ -951,7 +968,7 @@ def processElements(additional_fonts, fotobook, imagedir,
 
         # process text
         for textTag in area.findall('text'):
-            processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy)
+            processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy, pageNumber)
 
         # Clip-Art
         # In the clipartarea there are two similar elements, the <designElementIDs> and the <clipart>.
@@ -1007,6 +1024,7 @@ def convertMcf(albumname, keepDoublePages: bool, pageNumbers=None, mcfxTmpDir=No
     global bg_res  # pylint: disable=global-statement
     global defaultConfigSection  # pylint: disable=global-statement
     global pageNumberingInfo  # pylint: disable=global-statement
+    global albumIndex  # pylint: disable=global-statement
 
     clipartDict = {}    # a dictionary for clipart element IDs to file name
     clipartPathList = tuple()
@@ -1140,6 +1158,11 @@ def convertMcf(albumname, keepDoublePages: bool, pageNumbers=None, mcfxTmpDir=No
 
     bg_notFoundDirList = set([]) # keep a list of background folders that are not found, to prevent multiple errors for the same cause.
 
+    try:
+        albumIndex = Index(configuration['INDEX'])
+    except KeyError:
+        albumIndex = Index(None)
+
     # Load fonts
     availableFonts = findAndRegisterFonts(defaultConfigSection, appDataDir, albumBaseFolder, cewe_folder)
 
@@ -1201,6 +1224,19 @@ def convertMcf(albumname, keepDoublePages: bool, pageNumbers=None, mcfxTmpDir=No
 
     pdf = []
 
+    if albumIndex.indexing:
+        # At this point we have an index of items (selected on the basis of their font characteristics)
+        #   albumIndex.ShowIndex()
+        indexPdfFileName = albumIndex.SaveIndexPdf(outputFileName, albumTitle, pagesize)
+        indexPngFileName = albumIndex.SaveIndexPng(indexPdfFileName)
+        albumIndex.MergeAlbumAndIndexPng(outputFileName, indexPngFileName)
+        # most usual is to delete the index pdf, but leave the index png which could be added
+        # to the original with the cewe editor, and then you get it in the printed edition as well
+        if albumIndex.deleteIndexPdf and os.path.exists(indexPdfFileName):
+            os.remove(indexPdfFileName)
+        if albumIndex.deleteIndexPng and os.path.exists(indexPngFileName):
+            os.remove(indexPngFileName)
+
     # force the release of objects which might be holding on to picture file references
     # so that they will not prevent the removal of the files as we clean up and exit
     objectscollected = gc.collect()

diff --git a/cewe2pdf.pyproj b/cewe2pdf.pyproj
@@ -144,6 +144,17 @@
     <Content Include="tests\testFontSubstitution\previous_result_pdfs\testfontsubstitution.mcf.20250411S.pdf" />
     <Content Include="tests\testfontsubstitution\testfontsubstitution.mcf" />
     <Content Include="tests\testfontsubstitution\testfontsubstitution_mcf-Dateien\folderid.xml" />
+    <Content Include="tests\testIndexLarge\additional_fonts.txt" />
+    <Content Include="tests\testIndexLarge\cewe2pdf.ini" />
+    <Content Include="tests\testIndexLarge\test_indexLarge.mcf" />
+    <Content Include="tests\testIndexLarge\test_indexLarge_mcf-Dateien\6ds3xtbb_1_20200306_111748.jpg" />
+    <Content Include="tests\testIndexLarge\test_indexLarge_mcf-Dateien\folderid.xml" />
+    <Content Include="tests\testIndexLarge\test_indexLarge_mcf-Dateien\folderid.xml~" />
+    <Content Include="tests\testIndexLarge\test_indexLarge_mcf-Dateien\ud3dqwdw_1_test_index.mcf.s.idx.png" />
+    <Content Include="tests\testIndex\additional_fonts.txt" />
+    <Content Include="tests\testIndex\cewe2pdf.ini" />
+    <Content Include="tests\testIndex\test_index.mcf" />
+    <Content Include="tests\testIndex\test_index_mcf-Dateien\folderid.xml" />
     <Content Include="tests\testPageNumbers\additional_fonts.txt" />
     <Content Include="tests\testPageNumbers\cewe2pdf.ini" />
     <Content Include="tests\testPageNumbers\test_pagenumbers.mcf" />
@@ -269,6 +280,7 @@
     <Compile Include="imageUtils.py">
       <SubType>Code</SubType>
     </Compile>
+    <Compile Include="index.py" />
     <Compile Include="lineScales.py">
       <SubType>Code</SubType>
     </Compile>
@@ -290,6 +302,8 @@
     <Compile Include="tests\testEmptyPageOne\test_emptyPageOne.py" />
     <Compile Include="tests\testFontDoesNotExist\test_fontDoesNotExist.py" />
     <Compile Include="tests\testFontSubstitution\test_fontSubstitution.py" />
+    <Compile Include="tests\testIndexLarge\test_indexLarge.py" />
+    <Compile Include="tests\testIndex\test_index.py" />
     <Compile Include="tests\testMcfxExtraction\test_McfxExtraction.py" />
     <Compile Include="tests\testPageNumbers\test_pagenumbers.py" />
     <Compile Include="tests\testTextArt\test_textart.py" />
@@ -345,6 +359,11 @@
     <Folder Include="tests\testfontsubstitution\" />
     <Folder Include="tests\testFontSubstitution\previous_result_pdfs\" />
     <Folder Include="tests\testfontsubstitution\testfontsubstitution_mcf-Dateien\" />
+    <Folder Include="tests\testIndexLarge\" />
+    <Folder Include="tests\testIndexLarge\previous_result_pdfs\" />
+    <Folder Include="tests\testIndexLarge\test_indexLarge_mcf-Dateien\" />
+    <Folder Include="tests\testIndex\" />
+    <Folder Include="tests\testIndex\test_index_mcf-Dateien\" />
     <Folder Include="tests\testPageNumbers\" />
     <Folder Include="tests\testPageNumbers\test_pagenumbers_mcf-Dateien\" />
     <Folder Include="tests\testTextArt\" />

diff --git a/configUtils.py b/configUtils.py
@@ -14,11 +14,25 @@ def getConfigurationInt(configSection, itemName, defaultValue, minimumValue):
             returnValue = minimumValue
     return returnValue
 
+def getConfigurationFloat(configSection, itemName, defaultValue, minimumValue):
+    returnValue = minimumValue
+    if configSection is not None:
+        try:
+            # eg getConfigurationFloat(defaultConfigSection, 'pdfImageResolution', '1.15', 1.0); note that a None configSection yields minimumValue, not defaultValue
+            returnValue = float(configSection.get(itemName, defaultValue))
+        except ValueError:
+            logging.error(f'Invalid configuration value supplied for {itemName}')
+            returnValue = float(defaultValue)
+        if returnValue < minimumValue:
+            logging.error(f'Configuration value supplied for {itemName} is less than {minimumValue}, using {minimumValue}')
+            returnValue = minimumValue
+    return returnValue
+
 def getConfigurationBool(configSection, itemName, defaultValue):
     returnValue = defaultValue
     if configSection is not None:
         try:
-            # eg getConfigurationBool(defaultConfigSection, 'insideCoverWhite', False)
+            # eg getConfigurationBool(defaultConfigSection, 'insideCoverWhite', 'False')
             bv = configSection.get(itemName, defaultValue)
             returnValue = bv.lower() == "true"
         except ValueError: