In [2]:
import os
import re
import pprint
from unicodedata import name as uname

from IPython.display import display, HTML, Image

import fitz

# Shared pretty-printer (4-space indent) used to dump nested dicts in later cells.
PP = pprint.PrettyPrinter(indent=4)

In [3]:
# PLACE selects the sub-directory under ../_local/source; NAME is the PDF's
# file name without extension; SOURCE is the resulting relative path to the PDF.
PLACE = "Lakhnawi"
NAME = f"Font report {PLACE}"
SOURCE = f"../_local/source/{PLACE}/{NAME}.pdf"

In [4]:
# Matches a single character in the range U+E000..U+F8FF.
PRIVATE_RE = re.compile(r"""[\ue000-\uf8ff]""")


def postText(data):
    """Print `data` with every character matched by PRIVATE_RE removed.

    Parameters
    ----------
    data : str
        The text to clean and print.

    Returns
    -------
    None
        The cleaned text is written to standard output only.
    """
    cleaned = PRIVATE_RE.sub("", data)
    print(cleaned)

In [5]:
# A hexadecimal character entity with 1 to 4 lowercase hex digits, e.g. `&#x41;`;
# group 1 holds the digits.
ENTITY_RE = re.compile(r"""&#x([0-9a-f]{1,4});""", re.S)

# Inclusive bounds of the code point range that entityRepl keeps in bracket notation.
PUA_LOW = 0xE000
PUA_HIGH = 0xF8FF


def entityRepl(match):
    """Replacement callback for ENTITY_RE.

    Parameters
    ----------
    match : re.Match
        A match whose group 1 contains hexadecimal digits.

    Returns
    -------
    str
        `[hex]` (the digits as they were matched) when the code point lies in
        PUA_LOW..PUA_HIGH, otherwise the character with that code point.
    """
    hexDigits = match.group(1)
    codePoint = int(hexDigits, base=16)
    if PUA_LOW <= codePoint <= PUA_HIGH:
        return f"[{hexDigits}]"
    return chr(codePoint)
    

# Matches a hexadecimal character entity `&#x....;` whose four lowercase hex
# digits are either `e` followed by three hex digits (e000-efff) or `f`, one of
# 0-8, and two hex digits (f000-f8ff) - together the range e000-f8ff.
# re.X permits the spread-out layout of the pattern (hence the escaped `\#`);
# the pattern contains no `.`, so re.S does not affect what it matches.
PRIVATE_ENTITY_RE = re.compile(
    r"""
    &\#x
    (?:
        (?:
            e
            [0-9a-f]{3}
        )
        |
        (?:
            f
            [0-8]
            [0-9a-f]{2}
        )
    )
    ;
    """,
    re.S | re.X,
)

In [6]:
# Patterns over XML-like markup; none of them is used in the cells visible here.
# LINES_RE: captures, lazily and across newlines, the text between a `<line ...>`
# start tag and the next `</line>`.
LINES_RE = re.compile(r"""<line\b[^>]*>(.*?)</line>""", re.S)
# CHARS_RE: after `<char` and the first `>` that follows it, captures lazily up
# to the next `/>`.
# NOTE(review): if the `<char .../>` elements are self-closing, `[^>]*>` already
# consumes the whole tag and group 1 then runs into whatever follows - confirm
# that this is what the consumer of this pattern expects.
CHARS_RE = re.compile(r"""<char\b[^>]*>(.*?)/>""", re.S)
# CHAR_RE: captures the value of a `c="..."` attribute.
CHAR_RE = re.compile(r'''\bc="([^"]*)"''', re.S)


# Style sheet for table cells of class `ar`, which display the characters at a
# large size. The <style> element is closed explicitly: left open, it can
# swallow the markup that follows it when the notebook is exported to static HTML.
CSS = """
<style>
.ar {
    font-family: normal, sans-serif;
    font-size: 30pt;
}
</style>
"""

display(HTML(CSS))

In [7]:
# Open the PDF at SOURCE; the resulting document is iterated page by page below.
doc = fitz.open(SOURCE)

In [8]:
def process(page, result):
    """Extract the plain text of one page and append it to `result`.

    Parameters
    ----------
    page
        A page of the opened document. It must provide `get_textpage()`
        (the current PyMuPDF name) or `getTextPage()` (the legacy name).
    result : list
        Accumulator that is modified in place: the text of the page is
        appended as one string.

    Returns
    -------
    None
    """
    # PyMuPDF renamed its camelCase page methods to snake_case and dropped the
    # old names from current releases; prefer the new name and fall back to the
    # legacy one so the notebook runs on either generation of the library.
    makeTextPage = getattr(page, "get_textpage", None) or page.getTextPage
    textPage = makeTextPage()
    data = textPage.extractText()
    result.append(data)

In [9]:
# Collect the extracted text of every page, in document order:
# result[i] is the text that process() obtained from the i-th page of doc.
result = []

for page in doc:
    process(page, result)

In [10]:
# singles:  main code -> list of (page index, line index) where it stands alone
# doubles:  main code -> second code -> list of (page index, line index)
# warnings: page index -> line index -> dict(text=<line>, errors=[<messages>])
singles = {}
doubles = {}
warnings = {}

# A line `U+hhhh` optionally followed by more hex digits and/or spaces.
U_LINE_RE = re.compile(r"""^U\+([0-9a-f]{4})([0-9a-f ]*)$""", re.I)
# Exactly four hex digits.
HEX_RE = re.compile(r"""^[0-9a-f]{4}$""", re.I)


def _warn(p, ln, line, error):
    """Record `error` for line `ln` of page `p` in the global `warnings`."""
    entry = warnings.setdefault(p, {}).setdefault(ln, dict(text=line, errors=[]))
    entry["errors"].append(error)


for (p, page) in enumerate(result):
    for (ln, line) in enumerate(page.split("\n")):
        # only lines that announce a code point are of interest
        if not line.startswith("U+"):
            continue

        match = U_LINE_RE.match(line)
        if match is None:
            _warn(p, ln, line, "strange uline")
            continue

        (main, rest) = match.group(1, 2)
        main = main.lower()
        rest = rest.replace(" ", "")

        # what follows the main code must be exactly one more 4-digit code;
        # anything else is reported and the line is then counted as a single
        second = None
        if rest:
            if HEX_RE.match(rest):
                second = rest.lower()
            else:
                _warn(p, ln, line, "strange second")

        if second is None:
            singles.setdefault(main, []).append((p, ln))
        else:
            doubles.setdefault(main, {}).setdefault(second, []).append((p, ln))

PP.pprint(warnings)

{}


In [11]:
# Report every main code that occurs both on its own and paired with a second
# code, listing the second codes it is paired with.
for main in singles:
    if main not in doubles:
        continue
    print(f"{main} is double and single")
    for second in doubles[main]:
        print(f"\t{second}")

06cc is double and single
	fbfc
0627 is double and single
	fe8d
0623 is double and single
	fe83
0645 is double and single
	fee1
060c is double and single
	066c
0650 is double and single
	fe7a


In [12]:
# Report every main code that is paired with more than one distinct second code.
for main in doubles:
    if len(doubles[main]) <= 1:
        continue
    print(f"{main} is double with multiple seconds")
    for second in doubles[main]:
        print(f"\t{second}")

In [13]:
# List each main code, in sorted order, with the first second code recorded for it.
for main in sorted(doubles.keys()):
    second = next(iter(doubles[main]))
    print(f"{main} {second}")

0000 007f
060c 066c
0621 fe80
0622 fe81
0623 fe83
0624 fe85
0625 fe87
0626 fe89
0627 fe8d
0628 fe8f
0629 fe93
062a fe95
062b fe99
062c fe9d
062d fea1
062e fea5
062f fea9
0630 feab
0631 fead
0632 feaf
0633 feb1
0634 feb5
0635 feb9
0636 febd
0637 fec1
0638 fec5
0639 fec9
063a fecd
0641 fed1
0642 fed5
0643 fed9
0644 fedd
0645 fee1
0646 fee5
0647 fee9
0648 feed
0649 feef
064a fef1
064b fe70
064c fe72
064d fe74
064e fe76
064f fe78
0650 fe7a
0651 fe7c
0652 fe7e
0660 06f0
0661 06f1
0662 06f2
0663 06f3
0667 06f7
0668 06f8
0669 06f9
0671 fb50
06af fb92
06ba fb9e
06cc fbfc
2010 2011


In [15]:
# Print the mapping main code -> first recorded second code as a dict literal.
print({main: next(iter(seconds)) for (main, seconds) in doubles.items()})

{'0627': 'fe8d', '0635': 'feb9', '0648': 'feed', '0647': 'fee9', '0651': 'fe7c', '0633': 'feb1', '0663': '06f3', '0668': '06f8', '062a': 'fe95', '0661': '06f1', '0662': '06f2', '0660': '06f0', '0645': 'fee1', '0631': 'fead', '062f': 'fea9', '0642': 'fed5', '0637': 'fec1', '062c': 'fe9d', '0630': 'feab', '2010': '2011', '0669': '06f9', '0623': 'fe83', '0628': 'fe8f', '0646': 'fee5', '0622': 'fe81', '0625': 'fe87', '0652': 'fe7e', '064e': 'fe76', '0667': '06f7', '064f': 'fe78', '0650': 'fe7a', '0632': 'feaf', '0644': 'fedd', '0621': 'fe80', '064d': 'fe74', '064a': 'fef1', '0641': 'fed1', '062d': 'fea1', '063a': 'fecd', '0629': 'fe93', '0643': 'fed9', '0671': 'fb50', '064b': 'fe70', '064c': 'fe72', '0649': 'feef', '0639': 'fec9', '0636': 'febd', '062b': 'fe99', '0000': '007f', '0624': 'fe85', '062e': 'fea5', '0634': 'feb5', '06ba': 'fb9e', '0638': 'fec5', '06af': 'fb92', '06cc': 'fbfc', '0626': 'fe89', '060c': '066c'}


In [20]:
# Build an alternation of two-character strings (main character followed by
# its second character) and print it laid out as a group, one alternative per line.
regex = []
for main in sorted(doubles):
    # U+0000 pairs with U+007F, both control characters, which do not belong in
    # the alternation. Skip that entry by value rather than by position, so a
    # real pair is not dropped when "0000" is absent from the data.
    if main == "0000":
        continue
    second = list(doubles[main])[0]
    regex.append(f"{chr(int(main, base=16))}{chr(int(second, base=16))}")
print("(\n\t  " + "\n\t| ".join(regex) + "\n)")

(
	  ،٬
	| ءﺀ
	| آﺁ
	| أﺃ
	| ؤﺅ
	| إﺇ
	| ئﺉ
	| اﺍ
	| بﺏ
	| ةﺓ
	| تﺕ
	| ثﺙ
	| جﺝ
	| حﺡ
	| خﺥ
	| دﺩ
	| ذﺫ
	| رﺭ
	| زﺯ
	| سﺱ
	| شﺵ
	| صﺹ
	| ضﺽ
	| طﻁ
	| ظﻅ
	| عﻉ
	| غﻍ
	| فﻑ
	| قﻕ
	| كﻙ
	| لﻝ
	| مﻡ
	| نﻥ
	| هﻩ
	| وﻭ
	| ىﻯ
	| يﻱ
	| ًﹰ
	| ٌﹲ
	| ٍﹴ
	| َﹶ
	| ُﹸ
	| ِﹺ
	| ّﹼ
	| ْﹾ
	| ٠۰
	| ١۱
	| ٢۲
	| ٣۳
	| ٧۷
	| ٨۸
	| ٩۹
	| ٱﭐ
	| گﮒ
	| ںﮞ
	| یﯼ
	| ‐‑
)


In [15]:
# Render every main/second pair as two table rows: the character itself
# (styled with class `ar`), its code, and its Unicode character name.
rows = []
rows.append("<table>")

for main in sorted(doubles):
    # U+0000 has no Unicode character name, so unicodedata.name would raise
    # ValueError for it. Skip that entry by value rather than by position, so a
    # real pair is not dropped when "0000" is absent from the data.
    if main == "0000":
        continue
    second = list(doubles[main])[0]

    n1 = int(main, base=16)
    n2 = int(second, base=16)

    c1 = chr(n1)
    c2 = chr(n2)

    un1 = uname(c1)
    un2 = uname(c2)

    rows.append(f"""\
<tr>
    <td>main</td>
    <td class="ar">{c1}</td>
    <td>{main}</td>
    <td>{un1}</td>
</tr>
<tr>
    <td>second</td>
    <td class="ar">{c2}</td>
    <td>{second}</td>
    <td>{un2}</td>
</tr>
""")
rows.append("</table>")

display(HTML("".join(rows)))

0,1,2,3
main,،,060c,ARABIC COMMA
second,٬,066c,ARABIC THOUSANDS SEPARATOR
main,ء,0621,ARABIC LETTER HAMZA
second,ﺀ,fe80,ARABIC LETTER HAMZA ISOLATED FORM
main,آ,0622,ARABIC LETTER ALEF WITH MADDA ABOVE
second,ﺁ,fe81,ARABIC LETTER ALEF WITH MADDA ABOVE ISOLATED FORM
main,أ,0623,ARABIC LETTER ALEF WITH HAMZA ABOVE
second,ﺃ,fe83,ARABIC LETTER ALEF WITH HAMZA ABOVE ISOLATED FORM
main,ؤ,0624,ARABIC LETTER WAW WITH HAMZA ABOVE
second,ﺅ,fe85,ARABIC LETTER WAW WITH HAMZA ABOVE ISOLATED FORM
