arxiv/util/tex2utf.py

"""Convert between TeX escapes and UTF8."""
import re
from typing import Pattern, Dict, Match

# Lookup table used by texch2UTF(): the key is a TeX accent command with the
# backslash and braces removed, followed by the letter it applies to
# (e.g. "'e" for \'e or \'{e}, "vc" for \v{c}); the value is the Unicode
# code point of the precomposed accented character.
accents = {
    # first accents with non-letter prefix, e.g. \'A
    "'A": 0x00c1, "'C": 0x0106, "'E": 0x00c9, "'I": 0x00cd,
    "'L": 0x0139, "'N": 0x0143, "'O": 0x00d3, "'R": 0x0154,
    "'S": 0x015a, "'U": 0x00da, "'Y": 0x00dd, "'Z": 0x0179,
    "'a": 0x00e1, "'c": 0x0107, "'e": 0x00e9, "'i": 0x00ed,
    "'l": 0x013a, "'n": 0x0144, "'o": 0x00f3, "'r": 0x0155,
    "'s": 0x015b, "'u": 0x00fa, "'y": 0x00fd, "'z": 0x017a,
    '"A': 0x00c4, '"E': 0x00cb, '"I': 0x00cf, '"O': 0x00d6,
    '"U': 0x00dc, '"Y': 0x0178, '"a': 0x00e4, '"e': 0x00eb,
    '"i': 0x00ef, '"o': 0x00f6, '"u': 0x00fc, '"y': 0x00ff,
    '.A': 0x0226, '.C': 0x010a, '.E': 0x0116, '.G': 0x0120,
    '.I': 0x0130, '.O': 0x022e, '.Z': 0x017b, '.a': 0x0227,
    '.c': 0x010b, '.e': 0x0117, '.g': 0x0121, '.o': 0x022f,
    '.z': 0x017c, '=A': 0x0100, '=E': 0x0112, '=I': 0x012a,
    '=O': 0x014c, '=U': 0x016a, '=Y': 0x0232, '=a': 0x0101,
    '=e': 0x0113, '=i': 0x012b, '=o': 0x014d, '=u': 0x016b,
    '=y': 0x0233, '^A': 0x00c2, '^C': 0x0108, '^E': 0x00ca,
    '^G': 0x011c, '^H': 0x0124, '^I': 0x00ce, '^J': 0x0134,
    '^O': 0x00d4, '^S': 0x015c, '^U': 0x00db, '^W': 0x0174,
    '^Y': 0x0176, '^a': 0x00e2, '^c': 0x0109, '^e': 0x00ea,
    '^g': 0x011d, '^h': 0x0125, '^i': 0x00ee, '^j': 0x0135,
    '^o': 0x00f4, '^s': 0x015d, '^u': 0x00fb, '^w': 0x0175,
    '^y': 0x0177, '`A': 0x00c0, '`E': 0x00c8, '`I': 0x00cc,
    '`O': 0x00d2, '`U': 0x00d9, '`a': 0x00e0, '`e': 0x00e8,
    '`i': 0x00ec, '`o': 0x00f2, '`u': 0x00f9, '~A': 0x00c3,
    '~I': 0x0128, '~N': 0x00d1, '~O': 0x00d5, '~U': 0x0168,
    '~a': 0x00e3, '~i': 0x0129, '~n': 0x00f1, '~o': 0x00f5,
    '~u': 0x0169,
    # and now ones with letter prefix \c{c} etc..
    'HO': 0x0150, 'HU': 0x0170, 'Ho': 0x0151, 'Hu': 0x0171,
    'cC': 0x00c7, 'cE': 0x0228,
    'cG': 0x0122, 'cK': 0x0136, 'cL': 0x013b, 'cN': 0x0145,
    'cR': 0x0156, 'cS': 0x015e, 'cT': 0x0162, 'cc': 0x00e7,
    'ce': 0x0229, 'cg': 0x0123, 'ck': 0x0137, 'cl': 0x013c,
    # Commented out due to ARXIVDEV-2322 (bug reported by PG)
    # 'ci' : 'i\x{0327}' = chr(0x69).chr(0x327) # i with combining cedilla
    'cn': 0x0146, 'cr': 0x0157, 'cs': 0x015f, 'ct': 0x0163,
    'kA': 0x0104, 'kE': 0x0118, 'kI': 0x012e, 'kO': 0x01ea,
    'kU': 0x0172, 'ka': 0x0105, 'ke': 0x0119, 'ki': 0x012f,
    'ko': 0x01eb, 'ku': 0x0173, 'rA': 0x00c5, 'rU': 0x016e,
    'ra': 0x00e5, 'ru': 0x016f, 'uA': 0x0102, 'uE': 0x0114,
    'uG': 0x011e, 'uI': 0x012c, 'uO': 0x014e, 'uU': 0x016c,
    'ua': 0x0103, 'ue': 0x0115, 'ug': 0x011f,
    'ui': 0x012d, 'uo': 0x014f, 'uu': 0x016d,
    'vA': 0x01cd, 'vC': 0x010c, 'vD': 0x010e,
    'vE': 0x011a, 'vG': 0x01e6, 'vH': 0x021e, 'vI': 0x01cf,
    'vK': 0x01e8, 'vL': 0x013d, 'vN': 0x0147, 'vO': 0x01d1,
    'vR': 0x0158, 'vS': 0x0160, 'vT': 0x0164, 'vU': 0x01d3,
    'vZ': 0x017d, 'va': 0x01ce, 'vc': 0x010d, 'vd': 0x010f,
    've': 0x011b, 'vg': 0x01e7, 'vh': 0x021f, 'vi': 0x01d0,
    'vk': 0x01e9, 'vl': 0x013e, 'vn': 0x0148, 'vo': 0x01d2,
    'vr': 0x0159, 'vs': 0x0161, 'vt': 0x0165, 'vu': 0x01d4,
    'vz': 0x017e
}
r"""
Hash to lookup tex markup and convert to Unicode.

macron: a line above character (overbar \={} in TeX)
caron: v-shape above character (\v{ } in TeX)
See: http://www.unicode.org/charts/

"""

# TeX special-letter / ligature macro name (without the leading backslash)
# mapped to the Unicode code point of the corresponding character.
textlet = {
    'AA': 0x00c5,   # Å
    'AE': 0x00c6,   # Æ
    'DH': 0x00d0,   # Ð
    'DJ': 0x0110,   # Đ
    'ETH': 0x00d0,  # Ð
    'L': 0x0141,    # Ł
    'NG': 0x014a,   # Ŋ
    'O': 0x00d8,    # Ø
    'oe': 0x0153,   # œ
    'OE': 0x0152,   # Œ
    'TH': 0x00de,   # Þ
    'aa': 0x00e5,   # å
    'ae': 0x00e6,   # æ
    'dh': 0x00f0,   # ð
    'dj': 0x0111,   # đ
    'eth': 0x00f0,  # ð
    'i': 0x0131,    # ı (dotless i)
    'l': 0x0142,    # ł
    'ng': 0x014b,   # ŋ
    'o': 0x00f8,    # ø
    'ss': 0x00df,   # ß
    'th': 0x00fe,   # þ
}

# TeX Greek-letter macro name (without the leading backslash) mapped to the
# Unicode code point of the corresponding Greek character.
textgreek = {
    # Greek (upper)
    'Gamma': 0x0393, 'Delta': 0x0394, 'Theta': 0x0398,
    'Lambda': 0x039b, 'Xi': 0x039E, 'Pi': 0x03a0,
    'Sigma': 0x03a3, 'Upsilon': 0x03a5, 'Phi': 0x03a6,
    'Psi': 0x03a8, 'Omega': 0x03a9,
    # Greek (lower)
    'alpha': 0x03b1, 'beta': 0x03b2, 'gamma': 0x03b3,
    'delta': 0x03b4, 'epsilon': 0x03b5, 'zeta': 0x03b6,
    'eta': 0x03b7, 'theta': 0x03b8, 'iota': 0x03b9,
    'kappa': 0x03ba, 'lambda': 0x03bb, 'mu': 0x03bc,
    'nu': 0x03bd, 'xi': 0x03be, 'omicron': 0x03bf,
    'pi': 0x03c0, 'rho': 0x03c1, 'varsigma': 0x03c2,
    'sigma': 0x03c3, 'tau': 0x03c4,
    # This key was previously misspelled 'upsion', so \upsilon never matched.
    'upsilon': 0x03c5,
    'varphi': 0x03C6,  # φ
    'phi':  0x03D5,  # ϕ
    'chi': 0x03c7, 'psi': 0x03c8, 'omega': 0x03c9,
}


def _p_to_match(tex_to_chr: Dict[str, int]) -> Pattern:
    # textsym and textlet both use the same sort of regex pattern.
    keys = r'\\(' + '|'.join(tex_to_chr.keys()) + ')'
    pstr = r'({)?' + keys + r'(\b|(?=_))(?(1)}|(\\(?= )| |{}|)?)'
    return re.compile(pstr)


# Built once at import time; tex2utf() applies these with the matching
# _textlet_sub / _textgreek_sub callbacks.
textlet_pattern = _p_to_match(textlet)
textgreek_pattern = _p_to_match(textgreek)

# TeX symbol macro name (without the leading backslash) mapped to the
# Unicode code point of the corresponding symbol.
textsym = {
    'P': 0x00b6,               # ¶
    'S': 0x00a7,               # §
    'copyright': 0x00a9,       # ©
    'guillemotleft': 0x00ab,   # «
    'guillemotright': 0x00bb,  # »
    'pounds': 0x00a3,          # £
    'dag': 0x2020,             # †
    'ddag': 0x2021,            # ‡
    'div': 0x00f7,             # ÷
    'deg': 0x00b0,             # °
}

# Built once at import time; tex2utf() applies it with _textsym_sub.
textsym_pattern = _p_to_match(textsym)


def _textlet_sub(match: Match) -> str:
    """Return the character for the special-letter macro named in group 2."""
    macro_name = match.group(2)
    code_point = textlet[macro_name]
    return chr(code_point)


def _textsym_sub(match: Match) -> str:
    """Return the character for the symbol macro named in group 2."""
    macro_name = match.group(2)
    code_point = textsym[macro_name]
    return chr(code_point)


def _textgreek_sub(match: Match) -> str:
    """Return the character for the Greek-letter macro named in group 2."""
    macro_name = match.group(2)
    code_point = textgreek[macro_name]
    return chr(code_point)


def texch2UTF(acc: str) -> str:
    """Translate one TeX accent sequence into its Unicode character.

    :param acc: accent command plus letter, with the backslash and any
        braces already removed (e.g. ``"'e"`` or ``"vc"``).

    :returns: the single character listed for *acc* in ``accents``.  When
        *acc* is not in the table, every non-word character is stripped and
        whatever remains is returned, so the result may be an empty string
        if *acc* contains no word characters at all.
    """
    if acc not in accents:
        # Unknown combination: drop the accent punctuation, keep word chars.
        return re.sub(r'[^\w]+', '', acc, flags=re.IGNORECASE)

    code_point = accents[acc]
    return chr(code_point)


def tex2utf(tex: str, greek: bool = True) -> str:
    r"""Convert some TeX accents and greek symbols to UTF-8 characters.

    The conversion runs as a fixed sequence of regex substitutions:

    1. dotless ``\i`` / ``\j`` inside an accent argument become plain
       ``i`` / ``j`` (``\'{\i}`` -> ``\'{i}``) so the accent step can
       resolve them;
    2. special letters and ligatures (``textlet``), optionally Greek letters
       (``textgreek``), and symbols (``textsym``) are replaced;
    3. any remaining ``{\j}`` or ``\j`` followed by whitespace becomes ``j``;
    4. nested braces ``{{x}}`` are collapsed to ``{x}``;
    5. accent commands (``\'e``, ``\'{e}``, ``\v{c}``, ...) are replaced via
       ``texch2UTF``;
    6. ``\t{x}`` with a single-character argument is unwrapped to ``x``.

    :param tex: Text to filter.

    :param greek: If False, do not convert greek letters.  Greek symbols
    can cause problems. Ex. \phi is not supposed to look like φ. φ looks
    like \varphi.  See ARXIVNG-1612.  The special letters and ligatures in
    ``textlet`` are converted regardless of this flag.

    :returns: string, possibly with some TeX replaced with UTF8

    """
    # Do dotless i,j -> plain i,j where they are part of an accented i or j.
    # The pattern must not start with a Perl-style "/" delimiter, and the
    # replacement must emit bare braces: Python leaves "\{" in a replacement
    # template as a literal backslash + brace.
    utf = re.sub(r"(\\['`\^\"\~\=\.uvH])\{\\([ij])\}", r"\g<1>{\g<2>}", tex)

    # Now work on the Tex sequences, first those with letters only match
    utf = textlet_pattern.sub(_textlet_sub, utf)

    if greek:
        utf = textgreek_pattern.sub(_textgreek_sub, utf)

    utf = textsym_pattern.sub(_textsym_sub, utf)

    utf = re.sub(r'\{\\j\}|\\j\s', 'j', utf)  # not in Unicode?

    # reduce {{x}}, {{{x}}}, ... down to {x}; each pass removes one level
    while re.search(r'\{\{([^\}]*)\}\}', utf):
        utf = re.sub(r'\{\{([^\}]*)\}\}', r'{\g<1>}', utf)

    # Accents which have a non-letter prefix in TeX, first \'e
    utf = re.sub(r'\\([\'`^"~=.][a-zA-Z])',
                 lambda m: texch2UTF(m.group(1)), utf)

    # then \'{e} form:
    utf = re.sub(r'\\([\'`^"~=.])\{([a-zA-Z])\}',
                 lambda m: texch2UTF(m.group(1) + m.group(2)), utf)

    # Accents which have a letter prefix in TeX
    #  \u{x} u above (breve), \v{x}   v above (caron), \H{x}   double accute...
    utf = re.sub(r'\\([Hckoruv])\{([a-zA-Z])\}',
                 lambda m: texch2UTF(m.group(1) + m.group(2)), utf)

    # Don't do \t{oo} yet,
    # NOTE(review): this only unwraps a single-character argument, so a
    # two-letter tie such as \t{oo} is left untouched.
    utf = re.sub(r'\\t{([^\}])\}', r'\g<1>', utf)

    # bdc34: commented out in original Perl
    # $utf =~ s/\{(.)\}/$1/g; #  remove { } from around {x}

    return utf