In [367]:
import os
import collections
import re
import pprint
from itertools import chain

from unicodedata import name as uname

from IPython.display import display, HTML, Image

import fitz

# Shared pretty-printer (4-space indent) used for debug dumps of nested data, e.g. in measureText().
PP = pprint.PrettyPrinter(indent=4)

In [2]:
# Style sheet for the HTML produced by trim()/toHtml():
#   .r / p.r : right-to-left runs and paragraphs
#   .l / p.l : left-to-right runs and paragraphs
#   .p       : highlighted placeholder for a private-use code point
# The <style> element is explicitly closed so the emitted fragment is well-formed HTML.
CSS = """
<style>
.r {
    font-family: normal, sans-serif;
    font-size: 24pt;
    direction: rtl;
}
p.r {
    text-align: right;
    direction: rtl;
}
.l {
    font-family: normal, sans-serif;
    font-size: 16pt;
    direction: ltr;
}
p.l {
    text-align: left;
    direction: ltr;
}
.p {
    font-family: monospace;
    font-size: 12pt;
    font-weight: bold;
    background-color: yellow;
    direction: ltr;
}
</style>
"""

display(HTML(CSS))

In [3]:
# Name of the work being processed; the paths below are derived from it.
NAME = "Lakhnawi"
# PDF containing the text itself (opened as `doc` in a later cell).
SOURCE = f"../_local/source/{NAME}/{NAME.lower()}.pdf"
# PDF containing the font report, parsed by getCharInfo().
FONT = f"../_local/source/{NAME}/Font report {NAME}.pdf"
# Path of a .txt file next to the source PDF; not used by the cells visible here.
DEST = f"../_local/source/{NAME}/{NAME.lower()}.txt"

In [240]:
# A font-report line: "U+" + four hex digits, optionally followed by more hex digits and spaces.
# Group 1 is the main code point, group 2 the (possibly empty) remainder.
U_LINE_RE = re.compile(r"""^U\+([0-9a-f]{4})([0-9a-f ]*)$""", re.I)
# Exactly four hex digits, i.e. one additional code point.
HEX_RE = re.compile(r"""^[0-9a-f]{4}$""", re.I)

# Filled by getCharInfo(): pairs of code points listed on one report line, stored as {lower: higher}.
DOUBLES = {}
# Filled by getCharInfo(): private-use code points listed in the report.
PRIVATES = set()


def getCharInfo():
    """Read the font report PDF (FONT) and fill the module-level DOUBLES and PRIVATES.

    Every text line of the report that starts with "U+" and matches U_LINE_RE
    is inspected:

    *   a main code point that is in PUAS is added to PRIVATES;
    *   a main code point of 0 is skipped;
    *   otherwise, if the rest of the line (spaces removed) is exactly one more
        4-digit hex code, the pair is stored in DOUBLES as ``{lower: higher}``.

    The function relies on the module-level PUAS, which this notebook defines
    in a later cell: that cell must have been executed before calling this.

    Returns
    -------
    None
        The results are the side effects on DOUBLES and PRIVATES.
    """
    report = fitz.open(FONT)
    try:
        for page in report:
            data = page.getTextPage().extractText()

            for line in data.split("\n"):
                if not line.startswith("U+"):
                    continue
                match = U_LINE_RE.match(line)
                if not match:
                    continue

                (main, rest) = match.group(1, 2)
                nMain = int(main, base=16)

                if nMain in PUAS:
                    PRIVATES.add(nMain)
                    continue
                if nMain == 0:
                    continue

                # NB: no Unicode name lookup here. The name was never used, and
                # looking it up raises ValueError for unnamed code points.
                rest = rest.replace(" ", "")
                if not (rest and HEX_RE.match(rest)):
                    continue

                nSecond = int(rest, base=16)
                # always key on the lower code point of the pair
                if nSecond > nMain:
                    DOUBLES[nMain] = nSecond
                else:
                    DOUBLES[nSecond] = nMain
    finally:
        # release the file handle even if parsing fails halfway
        report.close()

# Populate DOUBLES and PRIVATES from the font report.
# NOTE: getCharInfo() reads PUAS, which is only defined in a later cell of this
# notebook; on a fresh kernel that cell has to be run before this call.
getCharInfo()

In [241]:
# Spot check: is U+FEAC known to DOUBLES, either as a key or as a value?
x = 0xFEAC
print(x, x in DOUBLES, x in DOUBLES.values(), sep="\n")
DOUBLES

65196
False
False


{1575: 65165,
 1589: 65209,
 1608: 65261,
 1607: 65257,
 1617: 65148,
 1587: 65201,
 1635: 1779,
 1640: 1784,
 1578: 65173,
 1633: 1777,
 1634: 1778,
 1632: 1776,
 1605: 65249,
 1585: 65197,
 1583: 65193,
 1602: 65237,
 1591: 65217,
 1580: 65181,
 1584: 65195,
 8208: 8209,
 1641: 1785,
 1571: 65155,
 1576: 65167,
 1606: 65253,
 1570: 65153,
 1573: 65159,
 1618: 65150,
 1614: 65142,
 1639: 1783,
 1615: 65144,
 1616: 65146,
 1586: 65199,
 1604: 65245,
 1569: 65152,
 1613: 65140,
 1610: 65265,
 1601: 65233,
 1581: 65185,
 1594: 65229,
 1577: 65171,
 1603: 65241,
 1649: 64336,
 1611: 65136,
 1612: 65138,
 1609: 65263,
 1593: 65225,
 1590: 65213,
 1579: 65177,
 1572: 65157,
 1582: 65189,
 1588: 65205,
 1722: 64414,
 1592: 65221,
 1711: 64402,
 1740: 64508,
 1574: 65161,
 1548: 1644}

In [242]:
# Number of distinct private-use code points that getCharInfo() found in the font report.
len(PRIVATES)

71

In [243]:
# Open the PDF with the actual text; its pages are passed to getText(), draw() and measureText().
doc = fitz.open(SOURCE)

In [244]:
def keyCharV(char):
    """Vertical sort key of a character: the rounded mean of its items 1 and 3.

    `char` is indexable; items 1 and 3 are the two vertical box coordinates
    in the tuples built by postRAWDICT().
    """
    (first, second) = (char[1], char[3])
    middle = (second + first) / 2
    return int(round(middle))

def keyCharH(char):
    """Horizontal sort key of a character: its item 2, returned unchanged.

    An earlier variant used the rounded horizontal midpoint of items 0 and 2.
    """
    horizontalKey = char[2]
    return horizontalKey

In [245]:
def clusterVert(data, debug=False):
    """Cluster the vertical positions of characters into lines.

    The vertical key of every character (see keyCharV) is counted. Keys that
    lie within a threshold of an already chosen cluster centre are merged into
    that cluster; the most frequent keys are considered first, so they become
    the centres. The threshold is one third of the average distance between
    consecutive distinct keys.

    With fewer than two distinct keys there is no distance to average; the
    threshold is then 0, so an empty page or a single-line page is handled
    instead of failing on an unbound variable.

    Parameters
    ----------
    data: iterable
        Characters, each accepted by keyCharV.
    debug: boolean, optional False
        If True, print the raw peaks and the clustered peaks with their counts.

    Returns
    -------
    function
        Maps a character to the centre key of the cluster it belongs to.
        It raises KeyError for a character whose vertical key was not in `data`.
    """
    keys = collections.Counter(keyCharV(char) for char in data)
    peaks = sorted(keys)

    if len(peaks) > 1:
        nDistances = len(peaks) - 1
        # the consecutive gaps of a sorted list add up to (last - first)
        avPeakDist = int(round((peaks[-1] - peaks[0]) / nDistances))
        peakThreshold = int(round(avPeakDist / 3))
    else:
        peakThreshold = 0

    clusteredPeaks = {}
    # most frequent keys first, ties broken by position
    for (k, n) in sorted(keys.items(), key=lambda x: (-x[1], x[0])):
        for kc in clusteredPeaks:
            if abs(k - kc) <= peakThreshold:
                clusteredPeaks[kc].add(k)
                break
        else:
            # not close to any existing centre: start a new cluster
            clusteredPeaks[k] = {k}

    toCluster = {k: kc for (kc, ks) in clusteredPeaks.items() for k in ks}

    def clusterKeyCharV(char):
        k = keyCharV(char)
        return toCluster[k]

    if debug:
        print("PEAKS")
        for k in peaks:
            print(f"{k:>4} : {keys[k]:>4}")
        print("CLUSTERED_PEAKS")
        for kc in sorted(clusteredPeaks):
            peak = ", ".join(f"{k:>4}" for k in sorted(clusteredPeaks[kc]))
            print(f"{peak} : {sum(keys[k] for k in clusteredPeaks[kc]):>4}")

    return clusterKeyCharV

In [453]:
def trim(chars, info, spaceGap=25):
    """Turn the characters of one line into an HTML fragment of directional spans.

    The work is done in two passes.

    Pass 1 normalizes the characters:

    *   code points in IGNORES are dropped;
    *   code points in REPLACE_SINGLE are replaced by their substitute;
    *   a pair (previous, current) listed in REPLACE_DOUBLE is replaced: the
        first replacement code point takes the place of the previous character,
        the remaining ones (zero or more) take the place of the current one.

    Pass 2 builds the HTML:

    *   a space is inserted when the gap between the reference position of the
        preceding text and the right edge of the current character is at least
        `spaceGap`; code points in NOSPACINGS do not move the reference position;
    *   text is grouped in ``<span class="r">`` / ``<span class="l">`` runs;
        code points in NEUTRALS continue the current run, those in RLS are "r",
        all others are "l";
    *   code points in PUAS are shown as ``[xxxx]`` in a ``<span class="p">``
        and counted in ``info["puas"]``; all other text is HTML-escaped, so that
        ``<``, ``>`` and ``&`` in the source text cannot break the markup.

    Parameters
    ----------
    chars: iterable of sequences
        Item 0 is the left edge, item 2 the right edge, the last item is the
        character itself (a string of length 1).
    info: dict
        ``info["page"]`` is the key under which private-use occurrences are
        counted in ``info["puas"][hexcode]``.
    spaceGap: integer, optional 25
        Minimal gap, in the units of the character coordinates, that is
        rendered as a space.

    Returns
    -------
    string
        The HTML of the line.
    """
    from html import escape  # stdlib; imported here to keep the function self-contained

    page = info["page"]
    puas = info["puas"]

    # pass 1: drop, replace and merge characters

    preresult = []

    for char in chars:
        char = list(char)
        uc = ord(char[-1])

        if uc in IGNORES:
            continue

        if uc in REPLACE_SINGLE:
            uc = REPLACE_SINGLE[uc]
            char[-1] = chr(uc)

        if preresult:
            prevChar = preresult[-1]
            pc = prevChar[-1]
            if pc != "":
                # an earlier replacement may have left several code points:
                # the pair is formed with the last one, and only that one is replaced
                puc = ord(pc[-1])
                repl = REPLACE_DOUBLE.get(puc, {}).get(uc)
                if repl is not None:
                    (puc, *ucs) = repl
                    prevChar[-1] = pc[:-1] + chr(puc)
                    # may be empty, one code point, or several
                    char[-1] = "".join(chr(u) for u in ucs)

        preresult.append(char)

    # pass 2: insert spaces, split in directional runs, mark private use

    result = []
    text = []
    prevLeft = None
    prevDir = "r"

    for char in preresult:
        left = int(round(char[0]))
        right = int(round(char[2]))

        if prevLeft is not None and prevLeft - right >= spaceGap:
            text.append(" ")

        c = char[-1]
        if c == "":
            # the character has been merged into its predecessor
            prevLeft = left
            continue

        # c holds several code points after a multi-code-point replacement
        for ch in c:
            uc = ord(ch)

            if uc not in NOSPACINGS:
                prevLeft = left

            thisDir = prevDir if uc in NEUTRALS else "r" if uc in RLS else "l"

            if prevDir != thisDir:
                joined = "".join(text)
                result.append(f'<span class="{prevDir}">{joined}</span>')
                text = []
                prevDir = thisDir

            if uc in PUAS:
                xc = f"{uc:>04x}"
                puas[xc][page] += 1
                text.append(f'<span class="p">[{xc}]</span>')
            else:
                text.append(escape(ch, quote=False))

    if text:
        joined = "".join(text)
        result.append(f'<span class="{prevDir}">{joined}</span>')

    return "".join(result)

In [454]:
def postRAWDICT(data):
    """Flatten the nested raw text structure of a page into lines of characters.

    The structure (nested dicts and lists) is walked recursively. Every dict
    with a "c" key is a character; the most recently seen "font" and "size"
    values on the path to it are attached to it.

    Two successive characters that form a pair in DOUBLES (in either order)
    are collapsed into one: the code point that is the key in DOUBLES is kept.

    Each character becomes a tuple

        (x0, y0, x1, y1, font, size, hexcode, name, character)

    where the box values are the original ones multiplied by 10 and rounded,
    and name is "PRIVATE" for code points in PUAS, the Unicode name otherwise,
    or "NO NAME" if the code point has no Unicode name.

    Parameters
    ----------
    data: dict | list
        Nested structure as produced by the ``extractRAWDICT()`` call in getText().

    Returns
    -------
    tuple of lists
        One list of character tuples per line; lines ordered by their vertical
        cluster key, characters within a line by descending keyCharH.
        An empty tuple if there are no characters at all.
    """
    chars = []
    prevChar = None
    prevFont = None
    prevSize = None

    def addChar():
        """Append the pending character (prevChar) to chars."""
        box = tuple(int(round(x * 10)) for x in prevChar["bbox"])
        c = prevChar["c"]
        uc = ord(c)
        chars.append(
            (
                *box,
                prevFont,
                prevSize,
                f"{uc:>04x}",
                # the default avoids the ValueError for unnamed code points
                "PRIVATE" if uc in PUAS else uname(c, "NO NAME"),
                c,
            )
        )

    def collectChars(data, font, size):
        nonlocal prevChar
        nonlocal prevFont
        nonlocal prevSize

        if isinstance(data, list):
            for elem in data:
                collectChars(elem, font, size)

        elif isinstance(data, dict):
            if "font" in data:
                font = data["font"]
            if "size" in data:
                size = data["size"]
            if "c" in data:
                uc = ord(data["c"])
                skip = False

                if prevChar is not None:
                    puc = ord(prevChar["c"])
                    if puc in DOUBLES and DOUBLES[puc] == uc:
                        # the pending character is the one to keep: drop this one
                        skip = True
                    if uc in DOUBLES and DOUBLES[uc] == puc:
                        # this character is the one to keep: it replaces the
                        # pending one, together with its own font and size
                        prevChar = data
                        prevFont = font
                        prevSize = size
                        skip = True

                if not skip:
                    if prevChar is not None:
                        addChar()
                    prevChar = data
                    prevFont = font
                    prevSize = size

            for v in data.values():
                if isinstance(v, (list, dict)):
                    collectChars(v, font, size)

    collectChars(data, None, None)
    if prevChar is not None:
        addChar()

    if not chars:
        # nothing to cluster on an empty page
        return ()

    clusterKeyCharV = clusterVert(chars)
    lines = {}
    for char in sorted(chars, key=lambda c: (clusterKeyCharV(c), -keyCharH(c))):
        lines.setdefault(clusterKeyCharV(char), []).append(char)
    return tuple(lines.values())


def toHtml(lines, info, show):
    """Render every line as a right-to-left paragraph; display the result if `show`.

    trim() is called for every line regardless of `show`, so the counting it
    does in `info` always takes place. Nothing is returned.
    """
    paragraphs = [f"""<p class="r">{trim(line, info)}</p>\n""" for line in lines]
    if not show:
        return
    display(HTML("".join(paragraphs)))

In [455]:
def showInfo(info):
    """Print a report of the private-use characters counted in ``info["puas"]``.

    ``info["puas"]`` maps a hex code to a mapping from page to number of
    occurrences. The report has a summary line, then per hex code (sorted) a
    total line followed by one line per page (sorted).
    """
    puas = info["puas"]
    if not puas:
        print("No private use characters in text")
        return

    occsPerChar = {xc: sum(pages.values()) for (xc, pages) in puas.items()}
    nOccs = sum(occsPerChar.values())
    distinctPages = set()
    for pages in puas.values():
        distinctPages.update(pages)
    nPages = len(distinctPages)

    print(
        f"{len(puas)} private use characters in text"
        f" in {nOccs} occurrences on {nPages} pages"
    )
    for xc in sorted(puas):
        pages = puas[xc]
        print(f"{xc}: {occsPerChar[xc]} x on {len(pages)}")
        for page in sorted(pages):
            print(f"\tpage {page:>3}: {pages[page]:>3} x")

In [456]:
def getText(page, info, result, show):
    """Extract the lines of one page, collect them in `result`, and render them.

    ``page.number`` is stored in ``info["page"]`` first, so that the counting
    further down the pipeline is attributed to this page. The lines produced
    by postRAWDICT() are appended to `result` as one item and passed to toHtml().
    """
    info["page"] = page.number
    rawData = page.getTextPage().extractRAWDICT()
    pageLines = postRAWDICT(rawData)
    result.append(pageLines)
    toHtml(pageLines, info, show)

In [457]:
def getSetFromRanges(rngs):
    """Expand (begin, end) pairs of hex strings into the set of all integers they cover.

    Both ends of every range are included.
    """
    codePoints = set()
    for (first, last) in rngs:
        codePoints.update(range(int(first, 16), int(last, 16) + 1))
    return codePoints


def getDictFromDef(defs):
    """Parse single-character replacement rules into a dict of integers.

    Every non-blank line of `defs` has the form

        value replacement [comment]

    where value and replacement are hex code points and the optional comment
    is free text that is ignored. Blank lines, and hence an empty definition,
    are skipped.

    Returns
    -------
    dict
        ``{value: replacement}``, both as integers.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two fields or a field is not hex.
    """
    result = {}
    for line in defs.split("\n"):
        fields = line.split(maxsplit=2)
        if not fields:
            continue
        # a third field, if present, is the comment
        (val, repl) = fields[:2]
        result[int(val, base=16)] = int(repl, base=16)
    return result


def getDictFromDef2(defs):
    """Parse two-character replacement rules into a nested dict of integers.

    Every non-blank line of `defs` has the form

        first+second replacement[+replacement...] [comment]

    where all code points are in hex and the optional comment is free text
    that is ignored. Blank lines, and hence an empty definition, are skipped.

    Returns
    -------
    dict
        ``{first: {second: [replacement, ...]}}``, all as integers.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two fields, the first field is not
        a pair joined by "+", or a code point is not hex.
    """
    result = {}
    for line in defs.split("\n"):
        fields = line.split(maxsplit=2)
        if not fields:
            continue
        # a third field, if present, is the comment
        (vals, repls) = fields[:2]
        (val1, val2) = vals.split("+")
        result.setdefault(int(val1, base=16), {})[int(val2, base=16)] = [
            int(repl, base=16) for repl in repls.split("+")
        ]
    return result

In [470]:
# All ranges below are inclusive (begin, end) pairs of hex code points;
# they are expanded into sets of integers by getSetFromRanges().

# Code points treated as private use (expanded into PUAS).
PUA_RANGES = (("e000", "f8ff"),)

# Code points expanded into SEMIS; together with PUAS they form RLS,
# the code points that trim() puts in right-to-left runs.
SEMITIC_RANGES = (
    ("0600", "06ff"),
    ("0750", "077f"),
    ("08a0", "08ff"),
    ("206c", "206d"),
    ("fb50", "fdfd"),
    ("fe70", "fefc"),
    ("0591", "05f4"),
    ("fb1d", "fb4f"),
)

# Code points that do not move the reference position trim() uses to detect gaps
# (expanded into NOSPACINGS).
NO_SPACING_RANGES = (
    ("064b", "0652"),
)

# Code points that trim() drops altogether (expanded into IGNORES).
IGNORE_RANGES = (
    ("e821", "e821"),
)

# Code points without a direction of their own: trim() keeps them in the
# current directional run (expanded into NEUTRALS).
NEUTRAL_DIRECTION_RANGES = (
    ("0020", "002f"),
    ("003a", "0040"),
    ("005b", "0060"),
    ("007b", "00b1"),
    ("00b4", "00b8"),
    ("00ba", "00bf"),
    ("00f7", "00f7"),
    ("02d8", "02df"),
    ("02e5", "0362"),
    ("2000", "206f"),
)

# Single-character replacements, one rule per line: "value replacement comment".
# Parsed by getDictFromDef(); the comment part is ignored.
REPLACE_SINGLE_DEF = """
e825 064e FATHA (high)
e826 064f DAMMA (high)
e828 0652 SUKUN (high)
e830 fcf2 SHADDA+FATHA (medial form)
e8e8 064e FATHA (mid)
e864 0650 KASRA (low)
e8df 0650 KASRA (low)
e8e9 064f DAMMA
"""

# Two-character replacements, one rule per line:
# "first+second replacement[+replacement...] comment".
# Parsed by getDictFromDef2(); the comment part is ignored.
REPLACE_DOUBLE_DEF = """
e80e+e807 fefc      LAM + ALEF LIGATURE
0627+e815 0623+064e ALIF-HAMZA + FATA
"""

In [471]:
# Lookup tables derived from the definitions above; all code points are integers.
# PUAS is also read by getCharInfo(), trim() and postRAWDICT().
PUAS = getSetFromRanges(PUA_RANGES)
SEMIS = getSetFromRanges(SEMITIC_RANGES)
NEUTRALS = getSetFromRanges(NEUTRAL_DIRECTION_RANGES)
# everything that trim() renders right-to-left
RLS = PUAS | SEMIS
NOSPACINGS = getSetFromRanges(NO_SPACING_RANGES)
IGNORES = getSetFromRanges(IGNORE_RANGES)
# {value: replacement}
REPLACE_SINGLE = getDictFromDef(REPLACE_SINGLE_DEF)
# {first: {second: [replacement, ...]}}
REPLACE_DOUBLE = getDictFromDef2(REPLACE_DOUBLE_DEF)

In [472]:
# Extract and display one page (index 100), then report its private-use characters.
result = []
info = {"puas": collections.defaultdict(collections.Counter)}

getText(doc[100], info, result, True)

showInfo(info)

10 private use characters in text in 49 occurrences on 1 pages
e806: 2 x on 1
	page 100:   2 x
e80a: 2 x on 1
	page 100:   2 x
e823: 6 x on 1
	page 100:   6 x
e827: 11 x on 1
	page 100:  11 x
e82b: 2 x on 1
	page 100:   2 x
e831: 3 x on 1
	page 100:   3 x
e833: 10 x on 1
	page 100:  10 x
e845: 2 x on 1
	page 100:   2 x
e8d4: 10 x on 1
	page 100:  10 x
e8eb: 1 x on 1
	page 100:   1 x


In [473]:
# Extract all pages without displaying them, then report the private-use characters.
resultAll = []
infoAll = dict(
    puas=collections.defaultdict(collections.Counter),
)

for page in doc:
    # collect into resultAll, not into the `result` list of the single-page cell
    getText(page, infoAll, resultAll, False)

showInfo(infoAll)

62 private use characters in text in 23289 occurrences on 436 pages
e800: 5 x on 5
	page  11:   1 x
	page  47:   1 x
	page 109:   1 x
	page 128:   1 x
	page 253:   1 x
e806: 1540 x on 368
	page   7:   2 x
	page   9:   1 x
	page  10:   4 x
	page  11:   2 x
	page  12:   1 x
	page  13:   3 x
	page  14:   2 x
	page  15:   7 x
	page  16:   2 x
	page  17:   7 x
	page  18:   4 x
	page  19:   8 x
	page  20:   3 x
	page  21:   2 x
	page  22:   8 x
	page  23:   6 x
	page  24:   4 x
	page  25:   9 x
	page  26:   5 x
	page  27:   3 x
	page  28:   3 x
	page  30:   1 x
	page  31:   3 x
	page  32:   6 x
	page  33:   7 x
	page  34:   3 x
	page  35:   8 x
	page  36:  11 x
	page  37:  13 x
	page  38:   2 x
	page  39:   3 x
	page  40:   9 x
	page  41:   2 x
	page  42:   2 x
	page  43:   8 x
	page  44:   5 x
	page  45:   8 x
	page  46:   4 x
	page  47:   9 x
	page  48:   2 x
	page  49:   7 x
	page  50:   2 x
	page  51:   9 x
	page  52:   2 x
	page  54:  10 x
	page  56:   4 x
	page  57:   4 x
	page  58:   

In [254]:
def draw(page):
    """Display `page` as a PNG image, rendered through ``fitz.Matrix(4, 4)`` without alpha."""
    scaling = fitz.Matrix(4, 4)
    pixmap = page.getPixmap(matrix=scaling, alpha=False)
    png = pixmap.getPNGData()
    display(Image(data=png, format="png"))

In [None]:
# Show the page image of the same page (index 100) whose extracted text is rendered above.
draw(doc[100])

In [201]:
# Nested counters filled by measureText(): widths[hexcode][font][width] = number of occurrences.
widths = collections.defaultdict(lambda: collections.defaultdict(collections.Counter))


def measureText(page, result):
    """Count the relative widths of the characters on a page.

    The first line of the first block of the raw data is pretty-printed as a
    sample. Then, for every character delivered by postRAWDICT(), its box
    width divided by its font size is rounded and counted in
    ``result[hexcode][font][width]``.
    """
    data = page.getTextPage().extractRAWDICT()
    sample = data["blocks"][0]["lines"][0:1]
    PP.pprint(sample)

    for chars in postRAWDICT(data):
        for char in chars:
            (left, right) = (char[0], char[2])
            (font, size, hx) = (char[4], char[5], char[6])
            w = int(round((right - left) / size))
            result[hx][font][w] += 1
            
# Measure the character widths on one page (index 100) and dump the collected counts.
measureText(doc[100], widths)

PP.pprint(widths)

[   {   'bbox': (   357.4787902832031,
                    86.697998046875,
                    401.9999694824219,
                    108.128662109375),
        'dir': (1.0, 0.0),
        'spans': [   {   'bbox': (   357.4787902832031,
                                     86.697998046875,
                                     401.9999694824219,
                                     108.128662109375),
                         'chars': [   {   'bbox': (   398.8486022949219,
                                                      86.697998046875,
                                                      401.9999694824219,
                                                      108.128662109375),
                                          'c': 'ا',
                                          'origin': (   398.8486022949219,
                                                        102.215576171875)},
                                      {   'bbox': (   387.9999694824219,
                               