This notebook provides functions to create strings that are visually similar to an input string, using the Unicode data on confusable characters. The first cell contains the functions and the second cell demonstrates how to use them. To use the code in another notebook, copy the first cell into that notebook (make sure it is above the cells where you are going to call the functions), and either place the file `unicode_confusablesSummary.txt` in the directory of that notebook or change the path to that file in the code.

In [24]:
import itertools
import pandas as pd

def get_confusables(char, path='unicode_confusablesSummary.txt'):

    '''
    Parse the unicode confusables summary file and extract the set of
    strings that may be visually similar to an input character.

    NOTE this function parses a file every time it's called, so it
    could create a lot of overhead in a loop or recursion

    Parameters
    ----------
    char : string
        A single character
    path : string, optional
        Location of the unicode confusables summary file. Defaults to
        'unicode_confusablesSummary.txt' in the working directory.

    Returns
    -------
    pandas.Series
        The entries of the confusable set that contains `char`
        (`char` itself included). If `char` is not listed in any set,
        a one-element Series holding only `char` is returned, so the
        result can always be iterated as a sequence of strings.

    Raises
    ------
    ValueError
        If `char` is listed in more than one confusable set.
    '''

    with open(path, 'r', encoding='utf8') as f:
        # Only lines starting with '#' are used; all others are ignored.
        marked = [line for line in f if line.startswith('#')]

    # The positions below are tied to the layout of the summary file:
    # the 8th '#' line is treated as the non-printing set, the lines
    # after it as one tab-separated set each, and the final '#' line is
    # discarded. [1:-1] drops the leading '#' and the trailing newline.
    nonprinting = marked[7][1:-1].split('\t')[1:]
    sets = [line[1:-1].strip().split('\t') for line in marked[8:-1]]
    sets.append(nonprinting)

    # One row per set; shorter sets are padded with missing values.
    table = pd.DataFrame(sets)
    matches = table.loc[table.isin([char]).any(axis=1)]

    if matches.shape[0] > 1:
        raise ValueError('character found in multiple sets')

    if matches.empty:
        # A character without listed confusables is only similar to
        # itself; returning it keeps the documented Series contract.
        return pd.Series([char])

    # .iloc[0] always yields a Series, even for a single-column table.
    return matches.iloc[0].dropna()

def get_confusable_combinations(characters):
    '''
    Take a string of characters and return a list containing every
    combination of unicode confusable characters comparable to the
    input. Confusable characters retain the positions of their
    archetypes in the original string, so an input of '12' would
    generate output containing the string 'lz' but not 'zl'.

    Parameters
    ----------
    characters : string
        The characters to be replaced, position by position, with
        their unicode confusables

    Returns
    -------
    list
        A list of strings visually similar to the input string.
    '''

    # get_confusables parses the summary file on every call, so each
    # distinct character is looked up only once (in first-seen order)
    # and the result is reused for every position it occupies.
    lookup = {c: get_confusables(c) for c in dict.fromkeys(characters)}

    # Cartesian product of the per-position candidate sets.
    combinations = itertools.product(*(lookup[c] for c in characters))

    return [''.join(combo) for combo in combinations]

In [23]:
# Make a regex matching all strings that could be visually confused
# with the string '12' when they are either the last two characters in a
# string or the second and third to last characters in the string to be
# searched (i.e. optionally followed by a single trailing period). The
# match must be preceded by a non-word character.
import re

terminators = get_confusable_combinations('12')

# The terminators come from data and may contain regex metacharacters
# (for example '|'), so each one is escaped before being used as an
# alternative; otherwise it would be read as pattern syntax.
entry_terminator_regex = r'(\W({})\.?$)'.format(
    '|'.join(re.escape(t) for t in terminators)
)

638