# In-Browser Decoder Playground

This environment is hosted completely in the browser, and can be used to experiment with the _decoder_ implementation. Note that many of these programs will fail to terminate within the allotted limits on iterations, input length, etc. This is expected (especially given WASM+Python, which introduces a performance hit of a few orders of magnitude). We need far fewer than 1% of these runs to complete in order to generate reasonable inputs.

In [1]:
import random

## Status

We start by providing the status codes that we use in the decoder. These are _complete_, _incomplete_, and _incorrect_.

In [2]:
import enum

In [3]:
class Status(enum.Enum):
    """Verdict a validator reports for a candidate input string."""
    # The candidate is accepted as a whole input; `generate` returns it.
    Complete = 0
    # The candidate is an acceptable prefix; `generate` keeps extending it.
    Incomplete = 1
    # The candidate is rejected; `generate` tries a different choice.
    Incorrect = -1

## Alphabets
Our algorithm relies on iterating through all the symbols of the language's alphabet. For convenience, we define the alphabet as the *printable* subset of ASCII characters.

In [4]:
import string
# Default alphabet: every character in string.printable.
SET_OF_BYTES = set(string.printable)

In [5]:
def new_byte(choices):
    """Return one randomly selected element of the sequence *choices*."""
    return random.choice(choices)

## Logger

We provide a simple logger.

In [6]:
import sys

In [7]:
def logit(*v):
    """Print all positional arguments to standard error, formatted as ``print`` does."""
    print(*v, file=sys.stderr)

## Limits

We define a few limits to the algorithm. In particular, we do not go beyond `ITERATION_LIMIT` and we stop and discard the input if the input crosses `INPUT_LIMIT` without returning *complete*.

In [8]:
# Maximum number of candidate strings `generate` tries before raising
# IterationLimitException.
ITERATION_LIMIT=1000
# `generate` raises InputLimitException once the accepted prefix is longer
# than this many characters.
INPUT_LIMIT=100

In [9]:
import itertools

## Exceptions
We need a few exceptions first.

In [10]:
class NeedMoreException(Exception):
    """Not raised anywhere in the visible code; presumably signals that more input is required (confirm)."""
class InvalidValueException(Exception):
    """Not raised anywhere in the visible code; presumably signals a rejected value (confirm)."""
class InputLimitException(Exception):
    """Raised by `generate` when the accepted prefix grows longer than INPUT_LIMIT."""
class IterationLimitException(Exception):
    """Raised by `generate` when ITERATION_LIMIT attempts pass without a complete input."""
class BacktrackLimitException(Exception):
    """Raised by `backtrack` when there is no earlier position left to return to."""

## The Decoder

### Choices

Normally, single symbols of the alphabet are sufficient as concatenation units for checking the validity of prefixes. But sometimes, you need to produce longer concatenation units.

In [11]:
def till_n_length_choices(my_choices, rs):
    """Return every string of length 1..rs built from the symbols in *my_choices*.

    Strings are grouped by length (shortest first); each length group is
    shuffled independently with ``random.shuffle``. Returns an empty list
    when *rs* is below 1.
    """
    collected = []
    length = 1
    while length <= rs:
        batch = list(map(''.join, itertools.product(my_choices, repeat=length)))
        random.shuffle(batch)
        collected += batch
        length += 1
    return collected

In [12]:
# Demo: all one-character digit strings, in shuffled order.
till_n_length_choices(string.digits, 1)

['7', '9', '6', '3', '5', '4', '8', '0', '1', '2']

In [13]:
# Demo: the one-character digit strings followed by the two-character ones,
# each group shuffled separately.
till_n_length_choices(string.digits, 2)

['8',
 '2',
 '4',
 '3',
 '6',
 '5',
 '7',
 '0',
 '1',
 '9',
 '37',
 '51',
 '43',
 '73',
 '45',
 '66',
 '26',
 '77',
 '89',
 '64',
 '48',
 '78',
 '35',
 '83',
 '87',
 '53',
 '59',
 '13',
 '47',
 '69',
 '98',
 '81',
 '75',
 '91',
 '36',
 '62',
 '86',
 '15',
 '23',
 '40',
 '70',
 '55',
 '44',
 '67',
 '61',
 '90',
 '84',
 '68',
 '29',
 '76',
 '10',
 '54',
 '63',
 '11',
 '65',
 '82',
 '56',
 '03',
 '18',
 '32',
 '88',
 '57',
 '99',
 '21',
 '31',
 '96',
 '07',
 '12',
 '33',
 '42',
 '16',
 '92',
 '46',
 '58',
 '04',
 '52',
 '97',
 '41',
 '85',
 '19',
 '17',
 '50',
 '79',
 '28',
 '38',
 '27',
 '80',
 '71',
 '25',
 '05',
 '39',
 '01',
 '95',
 '00',
 '14',
 '94',
 '30',
 '20',
 '34',
 '02',
 '49',
 '08',
 '60',
 '93',
 '72',
 '24',
 '06',
 '22',
 '74',
 '09']

### Backtracking

Sometimes you have to backtrack.

In [14]:
def backtrack(prev_bytes, all_choices, seen_at):
    """Step back to the most recent position that still has untried choices.

    Args:
        prev_bytes: the currently accepted prefix (a string).
        all_choices: every choice that may be appended at a position.
        seen_at: list of per-position "already tried" sets; the newest entry
            belongs to the position being reopened. It is shortened in place.

    Returns:
        A tuple ``(seen, prev_bytes, choices)``: the tried-set of the reopened
        position, the shortened prefix, and the choices not yet tried there.

    Raises:
        BacktrackLimitException: when the prefix is empty or no tried-set is
            left to reopen.
    """
    # `seen_at` can be shorter than the prefix (a preset `prev_bytes`, or
    # multi-character choices), so indexing it by prefix length is unsafe.
    if not prev_bytes or not seen_at:
        raise BacktrackLimitException('Cant backtrack beyond zero index')
    # The record for the position being reopened is always the newest one.
    seen = seen_at.pop()
    last_byte = prev_bytes[-1]
    logit('backtracking %d %s' % (len(prev_bytes), last_byte))
    # NOTE(review): only one character is removed, even if the last accepted
    # choice was a multi-character unit - confirm whether that is intended.
    prev_bytes = prev_bytes[:-1]
    choices = [i for i in all_choices if i not in seen]
    if not choices:
        # Nothing left to try here either; keep unwinding.
        return backtrack(prev_bytes, all_choices, seen_at)
    return seen, prev_bytes, choices

### The Decoder algorithm

In [15]:
def generate(validate, prev_bytes=None, tokens=None):
    """Randomly grow an input string until *validate* reports it complete.

    Args:
        validate: callable taking the candidate string and returning a
            3-tuple ``(status, n, s)``. ``status`` is a ``Status``; ``n`` is
            used only for ``Status.Incorrect``, where a value other than
            ``None``/``-1`` is the index to cut the prefix back to. ``s`` is
            ignored here.
        prev_bytes: optional prefix to start from (defaults to ``''``).
        tokens: optional iterable of extra choices (e.g. multi-character
            keywords) offered alongside ``SET_OF_BYTES``.

    Returns:
        The first candidate string for which *validate* returns
        ``Status.Complete``.

    Raises:
        InputLimitException: the accepted prefix is longer than INPUT_LIMIT.
        IterationLimitException: ITERATION_LIMIT candidates were tried.
        BacktrackLimitException: propagated from ``backtrack``.
        Exception: *validate* returned an unknown status.
    """
    seen_at = []  # one "already tried" set per accepted extension
    # A fresh set per call; the default is None rather than a shared set().
    alphabet = SET_OF_BYTES | set(tokens or ())
    all_choices = alphabet
    if prev_bytes is None:
        prev_bytes = ''
    seen = set()  # choices already tried at the current position
    for _ in range(ITERATION_LIMIT):
        if len(prev_bytes) > INPUT_LIMIT:
            raise InputLimitException('Exhausted %d bytes' % INPUT_LIMIT)
        choices = [i for i in all_choices if i not in seen]
        if not choices:
            seen, prev_bytes, choices = backtrack(prev_bytes, all_choices, seen_at)

        byte = new_byte(choices)
        cur_bytes = prev_bytes + byte

        logit('%s %s' % (repr(cur_bytes), len(cur_bytes)))

        rv, n, _ = validate(cur_bytes)
        if rv == Status.Complete:
            return cur_bytes
        elif rv == Status.Incomplete:
            seen.add(byte)  # dont explore this byte again
            prev_bytes = cur_bytes
            seen_at.append(seen)
            seen = set()

            # reset this if it was modified by incorrect
            all_choices = list(alphabet)
        elif rv == Status.Incorrect:
            if n is None or n == -1:
                # No location information: just rule out this choice.
                seen.add(byte)
                continue
            else:
                logit("-%s %s" % (len(choices), len(seen)))
                if n < len(seen_at):
                    # Rewind the bookkeeping to the position the validator named.
                    seen = seen_at[n]
                    seen_at = seen_at[:n]
                seen.add(byte)
                # The rejected span may be longer than one symbol, so offer
                # units of up to two symbols from position n.
                rs = len(cur_bytes) - n
                all_choices = till_n_length_choices(list(alphabet), min(rs, 2))
                prev_bytes = prev_bytes[:n]
        else:
            raise Exception(rv)
    raise IterationLimitException('Exhausted %d loops' % ITERATION_LIMIT)


## Example Hello

In [16]:
def conforming_hello(inputstr):
    """Validate *inputstr* against the literal ``HELLO``.

    Returns ``(Status, None, '')``: Incorrect at the first mismatching
    character, Incomplete when the input ends before five characters have
    matched, Complete otherwise.
    """
    for idx, expected in enumerate('HELLO'):
        if idx >= len(inputstr):
            return Status.Incomplete, None, ''
        if inputstr[idx] != expected:
            return Status.Incorrect, None, ''
    return Status.Complete, None, ''

In [17]:
# Run the decoder against the HELLO validator; each attempt is logged to stderr.
generate(conforming_hello)

'n' 1
'{' 1
'2' 1
'h' 1
'/' 1
'm' 1
'%' 1
'`' 1
'6' 1
'<' 1
'\x0b' 1
'P' 1
'8' 1
'!' 1
'.' 1
'N' 1
'1' 1
'v' 1
'T' 1
'i' 1
'\t' 1
'p' 1
'H' 1
'HW' 2
'H^' 2
'HC' 2
'HS' 2
'H9' 2
'H\x0b' 2
'H!' 2
'Hi' 2
'H2' 2
'HN' 2
'H\n' 2
'HE' 2
'HE2' 3
'HE ' 3
"HE'" 3
'HE5' 3
'HE"' 3
'HEC' 3
'HEN' 3
'HEf' 3
'HEr' 3
'HE3' 3
'HEk' 3
'HED' 3
'HEu' 3
'HEb' 3
'HEW' 3
'HE\t' 3
'HE\x0b' 3
'HEd' 3
'HE`' 3
'HE+' 3
'HE{' 3
'HEI' 3
'HEn' 3
'HES' 3
'HE*' 3
'HE\\' 3
'HEj' 3
'HEv' 3
'HEo' 3
'HE.' 3
'HEF' 3
'HE!' 3
'HE\n' 3
'HEK' 3
'HE\r' 3
'HEz' 3
'HEi' 3
'HEp' 3
'HEH' 3
'HEM' 3
'HEt' 3
'HEx' 3
'HEB' 3
'HE#' 3
'HEh' 3
'HEY' 3
'HE8' 3
'HEq' 3
'HE1' 3
'HE}' 3
'HEO' 3
'HE(' 3
'HEe' 3
'HE>' 3
'HEg' 3
'HE\x0c' 3
'HEc' 3
'HEG' 3
'HEX' 3
'HE)' 3
'HE6' 3
'HE=' 3
'HEZ' 3
'HE$' 3
'HE&' 3
'HE~' 3
'HE^' 3
'HE4' 3
'HEm' 3
'HE0' 3
'HE_' 3
'HEV' 3
'HEE' 3
'HE[' 3
'HE%' 3
'HEl' 3
'HEA' 3
'HEP' 3
'HEQ' 3
'HEa' 3
'HEL' 3
'HELy' 4
'HEL\\' 4
'HELU' 4
'HELz' 4
'HEL1' 4
'HEL|' 4
'HELi' 4
'HELG' 4
'HELK' 4
'HELS' 4
'HELW' 4
'HEL6' 4
'HE

'HELLO'

## Example Paren

In [18]:
def parens(xs):
    """Parse a string of ``(``, ``)``, ``0`` and ``1`` into nested lists.

    Args:
        xs: the input string.

    Returns:
        The top-level list; each parenthesised group becomes a nested list
        and each ``0``/``1`` a one-character string.

    Raises:
        Exception: with a message starting ``incomplete:`` when the input is
            empty or a closing bracket is still missing, and starting
            ``error:`` for an unmatched ``)`` or any other character.
    """
    if not xs:
        # An empty input used to escape as a bare IndexError.
        raise Exception('incomplete: empty input')
    stack = [[]]
    for x in xs:
        if x == '(':
            group = []
            stack[-1].append(group)
            stack.append(group)
        elif x == ')':
            stack.pop()
            if not stack:
                # Even the top-level list was popped: there was no opener.
                raise Exception('error: opening bracket is missing')
        elif x in '01':
            stack[-1].append(x)
        else:
            raise Exception('error: Only binary numbers')
    if len(stack) > 1:
        raise Exception('incomplete: closing bracket is missing')
    return stack.pop()

In [19]:
def conforming_parens(input_str):
    """Adapt ``parens`` to the validator protocol used by ``generate``.

    Returns:
        ``(Status.Complete, -1, "")`` when ``parens`` accepts the input,
        ``(Status.Incomplete, None, "")`` when its error message starts with
        ``incomplete:``, and ``(Status.Incorrect, None, last_char)`` when it
        starts with ``error``.

    Raises:
        Exception: any other exception from ``parens`` is re-raised.
    """
    try:
        parens(input_str)
        return Status.Complete,-1,""
    except Exception as e:
        msg = str(e)
        if msg.startswith("incomplete:"):
            return Status.Incomplete, None, ""
        elif msg.startswith("error"):
            return Status.Incorrect,None, input_str[-1]
        else:
            raise

In [20]:
# Run the decoder against the parenthesis validator.
generate(conforming_parens)

'J' 1
']' 1
'R' 1
'a' 1
'+' 1
'w' 1
';' 1
'=' 1
'C' 1
'I' 1
'\t' 1
'2' 1
'\x0c' 1
'`' 1
'Y' 1
'"' 1
'8' 1
'-' 1
'!' 1
'*' 1
'4' 1
'r' 1
'S' 1
'm' 1
'b' 1
'5' 1
'}' 1
'F' 1
'T' 1
'&' 1
'p' 1
'[' 1
'i' 1
'k' 1
'g' 1
'X' 1
'j' 1
'N' 1
't' 1
'~' 1
'?' 1
'#' 1
'G' 1
'\r' 1
'D' 1
'\n' 1
'0' 1


'0'

## Example JSON

In [21]:
import json

In [22]:
# Keywords checked by prefix in conforming_json.
JSON_TOKENS = ['true', 'false', 'null']

In [23]:
# Map each keyword to its proper prefixes (shortest first, the full keyword
# itself excluded).
PREFIX = {}
for token in JSON_TOKENS:
    PREFIX[token] = [token[:end] for end in range(1, len(token))]

In [24]:
# Display the prefix table.
PREFIX

{'true': ['t', 'tr', 'tru'],
 'false': ['f', 'fa', 'fal', 'fals'],
 'null': ['n', 'nu', 'nul']}

In [25]:
def it_fits(input_str):
    """Report whether *input_str* is valid JSON or fails only at/after its end.

    Returns True when ``json.loads`` succeeds, or when it fails with an
    ``Expecting ...`` message whose reported character index is not inside
    the string. Returns False for every other failure.
    """
    try:
        json.loads(input_str)
        logit('*', repr(input_str))
        return True
    except Exception as e:
        msg = str(e)
        if not msg.startswith('Expecting'):
            return False
        # Message shape: "Expecting value: line 1 column 4 (char 3)"
        n = int(msg.rstrip(')').split()[-1])
        if n < len(input_str):
            return False
        logit('+', repr(input_str))
        return True

In [26]:
def conforming_json(input_str):
    """Classify *input_str* as complete, incomplete or incorrect JSON.

    The verdict is derived from the message of the exception raised by
    ``json.loads`` and the character index it reports.

    Returns:
        A tuple ``(Status, n, s)`` where ``n`` is the reported index (or
        ``-1``/``None``) and ``s`` is the character at that index or ``''``.

    Raises:
        Exception: any ``json.loads`` failure whose message is not one of the
            recognised kinds is re-raised.
    """
    try:
        json.loads(input_str)
        logit('*', repr(input_str))
        return Status.Complete, -1, ''
    except Exception as e:
        msg = str(e)
        if msg.startswith('Expecting'):
            # Expecting value: line 1 column 4 (char 3)
            n = int(msg.rstrip(')').split()[-1])
            # If the error is 'outside' the string, it can still be valid
            if n >= len(input_str):
                logit('+', repr(input_str))
                return Status.Incomplete, n, ''
            elif len(input_str) > 1 and input_str[-1] == '.' and input_str[-2].isdigit():
                # JSON returns incorrect for [3. rather than incomplete.
                return Status.Incomplete, n, ''
            else:
                logit('X', repr(input_str))
                remaining = input_str[n:]
                # Try every keyword before giving up; rejecting inside the
                # loop would only ever test the first keyword.
                for word in JSON_TOKENS:
                    if remaining in PREFIX[word]:
                        # check if it fits first.
                        if it_fits(input_str[:n] + word):
                            return Status.Incomplete, n, input_str[n]
                return Status.Incorrect, None, input_str[n]
        elif msg.startswith('Unterminated'):
            # Unterminated string starting at: line 1 column 1 (char 0)
            n = int(msg.rstrip(')').split()[-1])
            if n >= len(input_str):
                logit('+', repr(input_str))
                return Status.Incomplete, n, ''
            else:
                logit('+', repr(input_str))
                return Status.Incomplete, n, input_str[n]
        elif msg.startswith('Extra data'):
            n = int(msg.rstrip(')').split()[-1])
            if n >= len(input_str):
                logit('X', repr(input_str))
                return Status.Incorrect, None, ''
            else:
                logit('X', repr(input_str))
                return Status.Incorrect, None, input_str[n]
        elif msg.startswith('Invalid '):
            # Pull the index out of the "(char N)" part of the message.
            idx = msg.find('(char ')
            eidx = msg.find(')')
            s = msg[idx + 6:eidx]
            n = int(s)
            logit('X', repr(input_str))
            return Status.Incorrect, None, input_str[n]
        else:
            raise

In [27]:
# Generate one JSON input, keep it in js_ex and display it.
(js_ex := generate(conforming_json))

'X' 1
X 'X'
']' 1
X ']'
'a' 1
X 'a'
'8' 1
* '8'


'8'

In [28]:
# Round-trip the generated input through the json module and pretty-print it.
print(json.dumps(json.loads(js_ex), indent=4))

8


## Example Imprecise Hello

In [29]:
def conforming_ihello(inputstr):
    """Validate *inputstr* against ``HELLO``, judging the final ``LO`` as one unit.

    Returns ``(Status, n, '')``. For Incorrect, ``n`` is the index of the
    mismatch (3 for any mismatch in the last two characters). For
    Incomplete, ``n`` is the length of the input. For Complete, ``n`` is
    ``None``.
    """
    for pos, letter in enumerate('HEL'):
        if pos >= len(inputstr):
            return Status.Incomplete, len(inputstr), ''
        if inputstr[pos] != letter:
            return Status.Incorrect, pos, ''
    tail = inputstr[3:5]
    if len(tail) != 2:
        # Both trailing characters are needed before a verdict is given.
        return Status.Incomplete, len(inputstr), ''
    if tail != 'LO':
        return Status.Incorrect, 3, ''
    return Status.Complete, None, ''

Unfortunately WASM+Python imposes a really huge overhead. So, we limit our alphabet to have any hope to finish in time.

In [30]:
# Restrict the alphabet to the uppercase ASCII letters.
SET_OF_BYTES = set(string.ascii_uppercase)

In [31]:
# Run the decoder against the imprecise HELLO validator.
generate(conforming_ihello)

'Q' 1
-26 0
'C' 1
-25 1
'K' 1
-24 2
'W' 1
-23 3
'U' 1
-22 4
'V' 1
-21 5
'F' 1
-20 6
'M' 1
-19 7
'S' 1
-18 8
'L' 1
-17 9
'X' 1
-16 10
'J' 1
-15 11
'N' 1
-14 12
'D' 1
-13 13
'T' 1
-12 14
'I' 1
-11 15
'R' 1
-10 16
'Y' 1
-9 17
'H' 1
'HM' 2
-26 0
'HV' 2
-25 1
'HD' 2
-24 2
'HO' 2
-23 3
'HU' 2
-22 4
'HN' 2
-21 5
'HK' 2
-20 6
'HY' 2
-19 7
'HT' 2
-18 8
'HF' 2
-17 9
'HR' 2
-16 10
'HW' 2
-15 11
'HQ' 2
-14 12
'HG' 2
-13 13
'HH' 2
-12 14
'HX' 2
-11 15
'HL' 2
-10 16
'HP' 2
-9 17
'HZ' 2
-8 18
'HC' 2
-7 19
'HB' 2
-6 20
'HA' 2
-5 21
'HE' 2
'HER' 3
-26 0
'HEU' 3
-25 1
'HEQ' 3
-24 2
'HEP' 3
-23 3
'HET' 3
-22 4
'HEC' 3
-21 5
'HES' 3
-20 6
'HEV' 3
-19 7
'HEI' 3
-18 8
'HEN' 3
-17 9
'HEL' 3
'HELV' 4
'HELVT' 5
-26 0
'HELHI' 5
-700 2
'HELTA' 5
-699 3
'HELKJ' 5
-698 4
'HELJF' 5
-697 5
'HELOP' 5
-696 6
'HELAH' 5
-695 7
'HELXY' 5
-694 8
'HELPK' 5
-693 9
'HELCB' 5
-692 10
'HELEX' 5
-691 11
'HELGT' 5
-690 12
'HELPZ' 5
-689 13
'HELIJ' 5
-688 14
'HELPQ' 5
-687 15
'HELHP' 5
-686 16
'HELGX' 5
-685 17
'HELDR' 5
-684 18


'HELLO'

## Example MathExpr

In [32]:
class Parser:
    """Recursive-descent evaluator for arithmetic expressions.

    Handles ``+ - * /``, unary minus, parentheses and unsigned decimal
    numbers. ``parseValue`` rejects anything that does not start a number,
    so ``parseVariable`` is currently not reached from ``getValue``.
    """

    def __init__(self, string, vars=None):
        """Store the input and the variable table.

        Args:
            string: the expression text to parse.
            vars: optional mapping of extra variable names to values.

        Raises:
            Exception: when *vars* names a variable that already has a value
                (including the built-in ``pi`` and ``e``).
        """
        self.string = string
        self.index = 0  # position of the next unread character
        self.vars = {
            'pi': 3.141592653589793,
            'e': 2.718281828459045
        }
        if vars is not None:
            for var, value in vars.items():
                if self.getVarValue(var, None) is not None:
                    raise Exception("Cannot redefine the value of " + var)
                self.vars[var] = value

    def hasVar(self, v):
        """Return True when *v* is a known variable name."""
        return v in self.vars

    def getVarValue(self, v, default=None):
        """Return the value of variable *v*, or *default* when it is unknown."""
        if not self.hasVar(v): return default
        return self.vars[v]

    def getValue(self):
        """Evaluate the whole input and return the result.

        Raises:
            Exception: when unparsed characters other than whitespace remain
                after the expression, or when a sub-parser fails.
        """
        value = self.parseExpression()
        self.skipWhitespace()
        if self.hasNext():
            raise Exception(
                "Unexpected character found: '" +
                self.peek() +
                "' at index " +
                str(self.index))
        return value

    def peek(self):
        """Return the next character without consuming it ('' at end of input)."""
        return self.string[self.index:self.index + 1]

    def hasNext(self):
        """Return True while unread input remains."""
        return self.string[self.index:] != ''

    def skipWhitespace(self):
        """Advance past spaces, tabs, newlines and carriage returns."""
        while self.hasNext():
            if self.peek() in ' \t\n\r':
                self.index += 1
            else:
                return

    def parseExpression(self):
        """Parse a full expression (entry point of the grammar)."""
        return self.parseAddition()

    def parseAddition(self):
        """Parse terms joined by ``+`` / ``-`` and return their sum."""
        values = [self.parseMultiplication()]
        while True:
            self.skipWhitespace()
            char = self.peek()
            if char == '+':
                self.index += 1
                values.append(self.parseMultiplication())
            elif char == '-':
                self.index += 1
                values.append(-1 * self.parseMultiplication())
            else:
                break
        return sum(values)

    def parseMultiplication(self):
        """Parse factors joined by ``*`` / ``/`` and return their product.

        Raises:
            Exception: when a divisor evaluates to 0.
        """
        values = [self.parseParenthesis()]
        while True:
            self.skipWhitespace()
            char = self.peek()
            if char == '*':
                self.index += 1
                values.append(self.parseParenthesis())
            elif char == '/':
                div_index = self.index
                self.index += 1
                denominator = self.parseParenthesis()
                if denominator == 0:
                    raise Exception(
                        "Division by 0 kills baby whales (occured at index " +
                        str(div_index) +
                        ")")
                # Division is recorded as multiplication by the reciprocal.
                values.append(1.0 / denominator)
            else:
                break
        value = 1.0
        for factor in values:
            value *= factor
        return value

    def parseParenthesis(self):
        """Parse a parenthesised expression, or delegate to ``parseNegative``.

        Raises:
            Exception: ``Only numbers`` when something other than ``)``
                follows the inner expression; ``No closing parenthesis
                found`` when the input ends before the ``)``.
        """
        self.skipWhitespace()
        char = self.peek()
        if char == '(':
            self.index += 1
            value = self.parseExpression()
            self.skipWhitespace()
            c = self.peek()
            if c and c != ')':
                raise Exception('Only numbers')
            if self.peek() != ')':
                raise Exception(
                    "No closing parenthesis found at character "
                    + str(self.index))
            self.index += 1
            return value
        else:
            return self.parseNegative()

    def parseNegative(self):
        """Parse an optional unary minus followed by an operand."""
        self.skipWhitespace()
        char = self.peek()
        if char == '-':
            self.index += 1
            return -1 * self.parseParenthesis()
        else:
            return self.parseValue()

    def parseValue(self):
        """Parse a number; anything else raises ``Only numbers``.

        Note that ``peek()`` returns '' at end of input and '' is contained
        in every string, so end of input is forwarded to ``parseNumber``.
        """
        self.skipWhitespace()
        char = self.peek()
        if char in '0123456789.':
            return self.parseNumber()
        else:
            raise Exception('Only numbers')

    def parseVariable(self):
        """Parse an identifier and return its value as a float.

        Raises:
            Exception: when the identifier is not a known variable.
        """
        self.skipWhitespace()
        var = ''
        while self.hasNext():
            char = self.peek()
            if char.lower() in '_abcdefghijklmnopqrstuvwxyz0123456789':
                var += char
                self.index += 1
            else:
                break

        value = self.getVarValue(var, None)
        if value is None:
            raise Exception( "Unrecognized variable: '" + var + "'")
        return float(value)

    def parseNumber(self):
        """Parse an unsigned decimal literal and return it as a float.

        Raises:
            Exception: on a second decimal point, on end of input
                (``Unexpected end found``) or when no digit is present.
            ValueError: from ``float`` when the collected text is not a
                valid number (for example a lone ``.``).
        """
        self.skipWhitespace()
        strValue = ''
        decimal_found = False
        char = None

        while self.hasNext():
            char = self.peek()
            if char == '.':
                if decimal_found:
                    raise Exception(
                        "Found an extra period in a number at character " +
                        str(self.index) +
                        ". Are you European?")
                decimal_found = True
                strValue += '.'
            elif char in '0123456789':
                strValue += char
            else:
                break
            self.index += 1

        if len(strValue) == 0:
            if char == '' or char is None:
                raise Exception("Unexpected end found")
            else:
                raise Exception(
                    "I was expecting to find a number at character " +
                    str(self.index) +
                    " but instead I found a '" +
                    str(char) +
                    "'. What's up with that?")

        return float(strValue)

In [33]:
def conforming_mathexpr(s):
    """Classify *s* by running it through ``Parser`` and reading the error text.

    Returns ``(Status, None, '')``. Parse errors that only concern the value
    of the expression (bad float text, redefinition, division by zero) are
    reported as Complete. Unrecognised exceptions are re-raised.
    """
    needs_more = ('Unexpected end', 'No closing parenthesis found')
    rejected = (
        'Unrecognized variable:',
        'Unexpected character found',
        'Only numbers',
    )
    # semantics
    value_only = (
        'could not convert string to float:',
        'Cannot redefine the value of ',
        'Division by 0',
    )
    try:
        Parser(s).getValue()
        return Status.Complete, None, ''
    except Exception as e:
        msg = str(e)
        if msg.startswith(needs_more):
            return Status.Incomplete, None, ''
        if msg.startswith(rejected):
            return Status.Incorrect, None, ''
        if msg.startswith(value_only):
            return Status.Complete, None, ''
        raise e

We reset the alphabet first.

In [34]:
# Back to the full printable alphabet.
SET_OF_BYTES = set(string.printable)

In [35]:
# Generate one arithmetic-expression input, keep it in v and display it.
(v := generate(conforming_mathexpr))

'`' 1
'y' 1
']' 1
't' 1
',' 1
'N' 1
's' 1
'!' 1
'$' 1
'A' 1
'b' 1
'_' 1
'<' 1
'\\' 1
'~' 1
'9' 1


'9'

## PyParsing

In [36]:
import pyparsing
from pyparsing import *

Pyparsing can have numerous alternative parses. So, we limit our iteration, so that our browser does not hang.

In [37]:
# Cap on the number of candidates `generate` tries.
ITERATION_LIMIT=1000

In [38]:
# Cap on the length of the accepted prefix in `generate`.
INPUT_LIMIT=100

With pyparsing, the parser provides the required information directly. We do not have to do much work.

In [39]:
def conforming_pyparse(expr, s):
    """Classify *s* against the pyparsing expression *expr*.

    Returns:
        ``(Status.Complete, None, '')`` when ``expr.parseString(s)``
        succeeds; ``(Status.Incorrect, None, '')`` when it fails at a
        location inside the string; ``(Status.Incomplete, None, '')`` when
        the failure location is at or past the end of the string.
    """
    try:
        expr.parseString(s)
    except ParseException as e:
        # A failure located inside the input means what was typed is wrong;
        # a failure at the end means more input is expected.
        if e.loc < len(s):
            return Status.Incorrect, None, ''
        return Status.Incomplete, None, ''
    return Status.Complete, None, ''

Pyparsing returns incorrect results when escape sequences are involved, so we filter them out.

In [40]:
# Small grammar for the demonstration below: a word of letters followed by "!".
aword = Word(alphas) + "!"

In [41]:
# Print the failure location and the input as pyparsing reports them, for
# strings containing a tab, a newline and a backslash.
for my_string in ['ab', 'a\tb', 'a\nb', 'a\\b']:
    try:
        aword.parseString(my_string)
    except ParseException as e:
        print(e.loc, repr(e.pstr))

2 'ab'
8 'a       b'
2 'a\nb'
1 'a\\b'


In [62]:
# Printable alphabet without newline, tab, CR, VT, FF and backslash.
SET_OF_BYTES = set(string.printable) - set("\n\t\r\x0b\x0c\\")

### Hello World

In [63]:
# Grammar: word "," word "!"
greet = Word(alphas) + "," + Word(alphas) + "!"

In [64]:
def conforming_greet(s):
    """Classify *s* against the ``greet`` grammar via ``conforming_pyparse``."""
    verdict = conforming_pyparse(greet, s)
    return verdict

In [65]:
# Run the decoder against the greeting grammar.
generate(conforming_greet)

'S' 1
'S?' 2
'SF' 2
'SFe' 3
'SFeu' 4
'SFeux' 5
'SFeux6' 6
'SFeux!' 6
'SFeuxp' 6
'SFeuxpf' 7
'SFeuxpf]' 8
'SFeuxpfa' 8
'SFeuxpfa+' 9
'SFeuxpfa|' 9
'SFeuxpfa2' 9
'SFeuxpfaN' 9
'SFeuxpfaNR' 10
'SFeuxpfaNR,' 11
'SFeuxpfaNR,]' 12
'SFeuxpfaNR,n' 12
'SFeuxpfaNR,n%' 13
'SFeuxpfaNR,nn' 13
'SFeuxpfaNR,nnN' 14
'SFeuxpfaNR,nnNn' 15
'SFeuxpfaNR,nnNnv' 16
'SFeuxpfaNR,nnNnvv' 17
'SFeuxpfaNR,nnNnvvQ' 18
'SFeuxpfaNR,nnNnvvQ/' 19
'SFeuxpfaNR,nnNnvvQW' 19
'SFeuxpfaNR,nnNnvvQW(' 20
'SFeuxpfaNR,nnNnvvQWp' 20
'SFeuxpfaNR,nnNnvvQWpY' 21
'SFeuxpfaNR,nnNnvvQWpY^' 22
'SFeuxpfaNR,nnNnvvQWpYT' 22
'SFeuxpfaNR,nnNnvvQWpYTk' 23
'SFeuxpfaNR,nnNnvvQWpYTk]' 24
'SFeuxpfaNR,nnNnvvQWpYTkw' 24
'SFeuxpfaNR,nnNnvvQWpYTkwV' 25
'SFeuxpfaNR,nnNnvvQWpYTkwV4' 26
'SFeuxpfaNR,nnNnvvQWpYTkwVh' 26
'SFeuxpfaNR,nnNnvvQWpYTkwVh9' 27
'SFeuxpfaNR,nnNnvvQWpYTkwVhH' 27
'SFeuxpfaNR,nnNnvvQWpYTkwVhH2' 28
'SFeuxpfaNR,nnNnvvQWpYTkwVhHi' 28
'SFeuxpfaNR,nnNnvvQWpYTkwVhHi?' 29
'SFeuxpfaNR,nnNnvvQWpYTkwVhHih' 29
'SFeuxpfaNR,nnNnvvQWpYTkwVhHih:' 30


'SFeuxpfaNR,nnNnvvQWpYTkwVhHihxJrAXmRdPVlp!'

### IP Address

For IP addresses, we limit ourselves to IPv4.

In [66]:
# Sample addresses with remarks; not referenced by any other visible code.
tests="""#
127.0.0.1                       # The "localhost" IPv4 address
127.0.0.1:80                    # The "localhost" IPv4 address, with a specified port (80)
192.168.0.1                     # private
256.0.0.0                       # invalid, octet > 255 (currently not detected)
"""

def join(args):
    """Collapse *args* in place to a single element: the concatenation of its items."""
    merged = "".join(args)
    args[0] = merged
    del args[1:]

def replace(val):
    """Build a function that collapses its argument list in place to ``[val]``."""

    def lambda_replace(args):
        """Overwrite the first item with the captured value and drop the rest."""
        args[0] = val
        del args[1:]

    return lambda_replace
 
def atoi(args):
    """Replace the first item of *args*, in place, with its ``int`` value."""
    args[0] = int(args[0])
def itohex2(args):
    """Replace the first item of *args*, in place, with its zero-padded 2-digit hex text."""
    args[0] = "%02x" % args[0]
 
def hextoi(args):
    """Replace the first item of *args*, in place, with its value parsed as base-16."""
    args[0] = int(args[0], 16)
def itohex4(args):
    """Replace the first item of *args*, in place, with its zero-padded 4-digit hex text."""
    args[0] = "%04x" % args[0]
 
def assert_in_range(lwb, upb):
    """Build a parse action intended to check ``lwb <= args[0] <= upb``.

    The check is currently disabled: the returned function returns
    immediately, so the two ``ValueError`` raises below never run.
    """
    def range_check(args):
        return # turn range checking off
        # Unreachable while the early return above is in place.
        if args[0] < lwb:
            raise ValueError("value %d < %d"%(args[0], lwb))
        if args[0] > upb:
            raise ValueError("value %d > %d"%(args[0], upb))
    return range_check
 
# Separators; both are suppressed, so they are left out of the parse results.
dot = Literal(".").suppress()("dot"); colon = Literal(":").suppress()("colon")
# Octet: digits -> int -> range check (currently a no-op) -> 2-digit hex text.
octet = Word(nums).setParseAction(atoi,assert_in_range(0,255),itohex2)("octet");
 
# Port: digits -> int -> range check (currently a no-op).
port = Word(nums).setParseAction(atoi,assert_in_range(0,256*256-1))("port")
# Four dot-separated octets; `join` concatenates their hex text into one token.
ipv4 = (octet + (dot+octet)*3)("addr")
ipv4.setParseAction(join) #,hextoi)
 
ipv4_port = ipv4+colon.suppress()+port
 
# Full line: an address with or without a port, followed by end of line.
ip_fmt = (
           (ipv4_port|ipv4)("ipv4")
         ) + LineEnd()


In [67]:
def conforming_ipaddress(s):
    """Classify *s* against the ``ip_fmt`` grammar via ``conforming_pyparse``."""
    verdict = conforming_pyparse(ip_fmt, s)
    return verdict

In [68]:
# Run the decoder against the IP address grammar.
generate(conforming_ipaddress)

'q' 1
'A' 1
']' 1
'z' 1
'W' 1
'}' 1
'R' 1
':' 1
'c' 1
'_' 1
'f' 1
'T' 1
'D' 1
'u' 1
'e' 1
'a' 1
'j' 1
'!' 1
'I' 1
'b' 1
'|' 1
'M' 1
'/' 1
')' 1
'#' 1
'1' 1
'1^' 2
'1~' 2
'1a' 2
'1`' 2
'1)' 2
'1c' 2
'1w' 2
'1?' 2
'1+' 2
'1H' 2
'1n' 2
'1*' 2
'1y' 2
'1<' 2
'1t' 2
'1Y' 2
'1o' 2
'1u' 2
'14' 2
'14U' 3
'14N' 3
'14,' 3
'14A' 3
'14 ' 3
'14 i' 4
'14 K' 4
'14 c' 4
'14 %' 4
'14 =' 4
'14 o' 4
'14 A' 4
'14 L' 4
'14 @' 4
'14 l' 4
'14 W' 4
'14 >' 4
'14 ,' 4
'14 M' 4
'14 +' 4
'14 f' 4
'14 J' 4
'14 C' 4
'14 g' 4
'14 h' 4
'14 .' 4
'14 .@' 5
'14 .-' 5
'14 .$' 5
'14 .a' 5
'14 .f' 5
'14 .g' 5
'14 .4' 5
'14 .4&' 6
"14 .4'" 6
'14 .4<' 6
'14 .4]' 6
'14 .4 ' 6
'14 .4 &' 7
'14 .4 %' 7
'14 .4 {' 7
'14 .4 J' 7
'14 .4 K' 7
'14 .4 5' 7
'14 .4 !' 7
'14 .4 h' 7
'14 .4 u' 7
'14 .4 .' 7
'14 .4 .7' 8
'14 .4 .7Y' 9
'14 .4 .7e' 9
'14 .4 .7V' 9
'14 .4 .7p' 9
'14 .4 .7{' 9
'14 .4 .74' 9
'14 .4 .74T' 10
'14 .4 .74!' 10
'14 .4 .74#' 10
'14 .4 .74c' 10
'14 .4 .74n' 10
'14 .4 .74P' 10
'14 .4 .745' 10
'14 .4 .745(' 11
'14 .4 .745

'14 .4 .745   .1'

### SSN

In [69]:
dash = '-'

# Three digits, dash, two digits, dash, four digits, wrapped in Combine.
ssn_parser = Combine(
  Word(nums, exact=3)
  + dash
  + Word(nums, exact=2)
  + dash
  + Word(nums, exact=4)
)

In [70]:
def conforming_ssn(s):
    """Classify *s* against the ``ssn_parser`` grammar via ``conforming_pyparse``."""
    verdict = conforming_pyparse(ssn_parser, s)
    return verdict

In [71]:
# Run the decoder against the SSN grammar.
generate(conforming_ssn)

';' 1
'3' 1
'3C' 2
'3=' 2
'3A' 2
'3o' 2
'3h' 2
'3!' 2
'3N' 2
'3$' 2
'3Y' 2
'3.' 2
'3v' 2
'3b' 2
'3s' 2
'3k' 2
'3*' 2
'3n' 2
'3E' 2
'3Z' 2
'38' 2
'38O' 3
'38p' 3
'38g' 3
'38_' 3
'389' 3
'389 ' 4
'389_' 4
'389J' 4
'389>' 4
'389$' 4
'389V' 4
'389-' 4
'389-&' 5
'389-I' 5
'389- ' 5
'389-f' 5
'389-l' 5
'389-g' 5
'389-)' 5
'389-,' 5
'389-(' 5
'389-K' 5
'389-o' 5
'389->' 5
'389-O' 5
'389-u' 5
'389-R' 5
'389-x' 5
'389-5' 5
'389-5y' 6
'389-59' 6
'389-59y' 7
'389-59[' 7
'389-59*' 7
'389-597' 7
'389-59V' 7
'389-59X' 7
'389-599' 7
'389-59^' 7
'389-59"' 7
'389-59.' 7
'389-59 ' 7
'389-59c' 7
'389-59n' 7
'389-59R' 7
'389-59}' 7
'389-59u' 7
"389-59'" 7
'389-59/' 7
'389-59@' 7
'389-59k' 7
'389-592' 7
'389-59F' 7
'389-59!' 7
'389-594' 7
'389-59s' 7
'389-59A' 7
'389-596' 7
'389-59>' 7
'389-59+' 7
'389-59C' 7
'389-59-' 7
'389-59-U' 8
'389-59-f' 8
'389-59-g' 8
'389-59-u' 8
'389-59->' 8
'389-59-7' 8
'389-59-76' 9
'389-59-76u' 10
'389-59-76$' 10
'389-59-76=' 10
'389-59-76Y' 10
'389-59-76D' 10
'389-59-76*' 10


'389-59-7670'

### BNF

In [72]:
import math
import operator

# Module-level stack that the parse actions below append tokens to.
exprStack = []


def push_first(toks):
    """Parse action: append the first token of *toks* to ``exprStack``."""
    head = toks[0]
    exprStack.append(head)


def push_unary_minus(toks):
    """Parse action: append one ``"unary -"`` marker per leading ``"-"`` token.

    Scanning stops at the first token that is not ``"-"``.
    """
    for tok in toks:
        if tok != "-":
            return
        exprStack.append("unary -")


# Cache for the grammar built by BNF(); filled on the first call.
bnf = None


def BNF():
    """
    Build the arithmetic-expression grammar on first use and return it.

    The grammar is stored in the module-level ``bnf`` and reused by later
    calls. Its parse actions append tokens to the module-level ``exprStack``.

    expop   :: '^'
    multop  :: '*' | '/'
    addop   :: '+' | '-'
    integer :: ['+' | '-'] '0'..'9'+
    atom    :: PI | E | real | fn '(' expr ')' | '(' expr ')'
    factor  :: atom [ expop factor ]*
    term    :: factor [ multop factor ]*
    expr    :: term [ addop term ]*
    """
    global bnf
    if not bnf:
        # use CaselessKeyword for e and pi, to avoid accidentally matching
        # functions that start with 'e' or 'pi' (such as 'exp'); Keyword
        # and CaselessKeyword only match whole words
        e = CaselessKeyword("E")
        pi = CaselessKeyword("PI")
        # fnumber = Combine(Word("+-"+nums, nums) +
        #                    Optional("." + Optional(Word(nums))) +
        #                    Optional(e + Word("+-"+nums, nums)))
        # or use provided pyparsing_common.number, but convert back to str:
        # fnumber = ppc.number().addParseAction(lambda t: str(t[0]))
        # Optional sign, digits, optional fraction, optional exponent.
        fnumber = Regex(r"[+-]?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?")
        ident = Word(alphas, alphanums + "_$")

        plus, minus, mult, div = map(Literal, "+-*/")
        lpar, rpar = map(Suppress, "()")
        addop = plus | minus
        multop = mult | div
        expop = Literal("^")

        # Forward declaration: expr is referenced before it is defined below.
        expr = Forward()
        expr_list = delimitedList(Group(expr))
        # add parse action that replaces the function identifier with a (name, number of args) tuple
        def insert_fn_argcount_tuple(t):
            fn = t.pop(0)
            num_args = len(t[0])
            t.insert(0, (fn, num_args))

        fn_call = (ident + lpar - Group(expr_list) + rpar).setParseAction(
            insert_fn_argcount_tuple
        )
        # Any number of leading signs, then an operand or a parenthesised expr.
        atom = (
            addop[...]
            + (
                (fn_call | pi | e | fnumber | ident).setParseAction(push_first)
                | Group(lpar + expr + rpar)
            )
        ).setParseAction(push_unary_minus)

        # by defining exponentiation as "atom [ ^ factor ]..." instead of "atom [ ^ atom ]...", we get right-to-left
        # exponents, instead of left-to-right that is, 2^3^2 = 2^(3^2), not (2^3)^2.
        factor = Forward()
        factor <<= atom + (expop + factor).setParseAction(push_first)[...]
        term = factor + (multop + factor).setParseAction(push_first)[...]
        expr <<= term + (addop + term).setParseAction(push_first)[...]
        bnf = expr
    return bnf


In [73]:
def conforming_bnf(s):
    """Classify *s* against the grammar returned by ``BNF()`` via ``conforming_pyparse``."""
    grammar = BNF()
    return conforming_pyparse(grammar, s)

In [74]:
# Run the decoder against the arithmetic-expression grammar.
generate(conforming_bnf)

'8' 1


'8'

### URL Parser

In [75]:
# Characters allowed in the word-like parts of a URL in this grammar.
url_chars = alphanums + '-_.~%+'

# "#" followed by a word; the "#" itself is suppressed.
fragment  = Combine((Suppress('#') + Word(url_chars)))('fragment')

scheme = oneOf(['http', 'https', 'ftp', 'file'])('scheme')
# One or more words separated by ".", combined into a single token.
host = Combine(delimitedList(Word(url_chars), '.'))('host')
# NOTE: this rebinds the name `port` used by the IP address grammar above.
port = Suppress(':') + Word(nums)('port')
# username ":" password "@"
user_info = (
  Word(url_chars)('username')
  + Suppress(':')
  + Word(url_chars)('password')
  + Suppress('@')
)

# key "=" value pairs, separated by "&", introduced by "?".
query_pair = Group(Word(url_chars) + Suppress('=') + Word(url_chars))
query = Group(Suppress('?') + delimitedList(query_pair, '&'))('query')

# "/" followed by one or more path words; the negative lookahead stops the
# path where a query begins.
path = Combine(
  Suppress('/')
  + OneOrMore(~query + Word(url_chars + '/'))
)('path')

# scheme "://" [user_info] host [port] [path] [query] [fragment]
url_parser = (
  scheme
  + Suppress('://')
  + Optional(user_info)
  + host
  + Optional(port)
  + Optional(path)
  + Optional(query)
  + Optional(fragment)
)

In [76]:
def conforming_urls(s):
    """Classify *s* against the ``url_parser`` grammar via ``conforming_pyparse``."""
    verdict = conforming_pyparse(url_parser, s)
    return verdict

Pyparsing is bad at correctly accounting for spaces.

In [77]:
# Demonstration: with a space after the scheme, the failure is reported at
# the end of the text (char 8) rather than at the space itself.
try:
    url_parser.parseString('http ://')
except ParseException as e:
    print(e.loc)
    print(str(e))

8
Expected W:(%+--.0-9A-Z_a-z~), found end of text  (at char 8), (line:1, col:9)


In [79]:
# Printable alphabet without whitespace (space included) and backslash.
SET_OF_BYTES = set(string.printable) - set("\n\t\r\x0b\x0c\\ ")

In [80]:
# Generate a URL, offering the scheme names and "://" as extra multi-character choices.
(v := generate(conforming_urls, tokens={'http', 'https', 'ftp', 'file', '://'}))

'!' 1
'n' 1
'k' 1
'r' 1
'G' 1
'D' 1
'Y' 1
'$' 1
'U' 1
';' 1
'file' 4
'fileS' 5
'fileQ' 5
'filek' 5
'file+' 5
'fileY' 5
'fileB' 5
'fileI' 5
'file`' 5
'fileN' 5
'file*' 5
'filex' 5
'file:' 5
'filei' 5
'file8' 5
'file(' 5
'fileo' 5
'filef' 5
'filez' 5
'filej' 5
'filea' 5
'fileU' 5
'filec' 5
'file9' 5
'fileK' 5
'file6' 5
'file[' 5
'fileW' 5
'file@' 5
'filel' 5
'filey' 5
'file&' 5
'fileF' 5
'fileH' 5
'filem' 5
'fileD' 5
'filen' 5
'fileh' 5
'filer' 5
'files' 5
'file1' 5
'fileg' 5
'filee' 5
'fileE' 5
'file?' 5
'file://' 7
'file://file' 11


'file://file'

In [81]:
# Continue from the previous result v, using it as the starting prefix.
(v1 := generate(conforming_urls, prev_bytes=v, tokens={'http', 'https', 'ftp', 'file', '://'}))

'file://filey' 12


'file://filey'

In [82]:
# Continue once more, starting from v1.
(v2 := generate(conforming_urls,  prev_bytes=v1, tokens={'http', 'https', 'ftp', 'file', '://'}))

'file://fileyhttp' 16


'file://fileyhttp'

## End