In [39]:
def format_regex(regex):
    """
    Phase 1: Expand [...] into group (a|b|c)
    Phase 2: Insert explicit concatenation symbol '&' 
    """
    all_operators = ['|', '?', '+', '*', '^']
    binary_operators = ['^', '|']
    res = ""
    
    # --- Part 1: Handle [...] expansion ---
    # We will rebuild the string replacing [abc] with (a|b|c)

    i = 0
    while i < len(regex):
        c = regex[i]
        if c == '[':
            res += '('
            i += 1
            first = True
            while i < len(regex) and regex[i] != ']':
                if not first:
                    res += '|'
                res += regex[i]
                first = False
                i += 1
            res += ')'
        else:
            res += c
        i += 1
        
    # --- Part 2: Insert Explicit Concatenation '&' ---
    # We need '&' between:
    # 1. Literal & Literal (a b -> a&b)
    # 2. Literal & Open Paren (a ( -> a&()
    # 3. Close Paren & Literal () a -> )&a)
    # 4. Star/Plus/? & Literal (* a -> *&a)
    # Note: '.' (wildcard) is treated strictly as a Literal here.
    
    formatted_regex = ""
    for i in range(len(res)):
        c1 = res[i]
        formatted_regex += c1
        
        if i + 1 < len(res):
            c2 = res[i+1]
            
            # Define what counts as a "character" (including wildcard .)
            is_c1_op = c1 in all_operators or c1 == '('
            is_c2_op = c2 in all_operators or c2 == ')'
            
            # Logic: If C1 is NOT a standard operator (or is a closing paren/unary op)
            # AND C2 is NOT a standard operator (or is an opening paren)
            # THEN add concatenation.
            
            # Left side can be: Literal, '.', ')', '*', '+', '?'
            left_allows_concat = (c1 not in binary_operators and c1 != '(')
            
            # Right side can be: Literal, '.', '('
            right_allows_concat = (c2 not in all_operators and c2 != ')' and c2 != '|') or c2 == '('
            
            if left_allows_concat and right_allows_concat:
                formatted_regex += '&'
                
    return formatted_regex


In [58]:
def preprocess_regex(regex):
    ## Add | between characters in square brackets i.e Convert [ab] to (a|b)
    i = 0
    result =""
    while i < len(regex):
        c = regex[i]
    
        if c == '[':
            result += '(' 
            i += 1
            first = True
            
            # Loop until we find ']' or reach the end 
            while i < len(regex) and regex[i] != ']':
                if not first:
                    result += '|'
                result += regex[i]
                first = False
                i += 1
            
            # If the loop without finding ] , it's an error.
            if i >= len(regex):
                print("Invalid regex: Unbalanced parentheses (Unclosed square bracket)")
                return False
                
            result += ')'
            
        else:
            result += c
        
        i += 1
    
    if ']' in result:
        print("Invalid regex: Unbalanced parentheses (Closed square bracket)")
        return False
    
    special_operators = ['+' ,'*', '&', '?']
    literals = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.")
    
    """ 
    Preprocess the regex to add concatenation character & for NFA construction using postfix between the 2 characters in these cases:
    - literal followed by literal or open bracket
    - closed bracket followed by literal or open bracket
    - special operator followed by literal or open bracket
    """
    formatted_regex = ""
    for i, char in enumerate(result):
        formatted_regex += char
        
        if i + 1 < len(result):
            if ( (char in literals or char ==")") and (result[i+1] in literals or result[i+1] == "(") ) or (char in special_operators and (result[i+1] in literals or result[i+1] == "(") ):
                formatted_regex += "&"
    
    return formatted_regex
            

In [59]:
def shunting_yard(infix):
    preprocessed_infix = preprocess_regex(infix) 
    if preprocessed_infix:
        print(f"Preprocessed: {preprocessed_infix}")
    else:
        return False

    precedence = {
        '*': 5, 
        '+': 4, 
        '?': 3, 
        '&': 2, 
        '|': 1,
        '(': 0,
    }
    
    postfix = ""
    stack = []
    
    for char in preprocessed_infix:
        # if opening bracket, push to the stack, and if square bracket set the flag to true to add  |
        if char =='(':
            stack.append(char)
        
        # if closing square bracket, keep removing from the stack until finding an opening square bracket. If not found, the regex is invalid
        # elif char ==']' :
        #     while stack and stack[-1] != "[":
        #         postfix += stack.pop()
        #     if stack: 
        #         stack.pop()
        #     else:
        #         print("Invalid regex: Unbalanced parentheses (Extra closed square bracket)")
        #         return False
        #     flag = False
            
        # if closing bracket, keep removing from the stack until finding an opening square bracket. If not found, the regex is invalid
        elif char ==')':
            while stack and stack[-1] != "(":
                postfix += stack.pop()
            if stack: 
                stack.pop()
            else: 
                print("Invalid regex: Unbalanced parentheses (Extra closed bracket)")
                return False
                
        ## Check this
        # if another operator was added, keep removing from the stack until the remaining characters have less precedence
        elif char in precedence:
            while (stack and stack[-1] != '(' and 
                   precedence[stack[-1]] >= precedence[char]):
                postfix += stack.pop()
            stack.append(char)
            
        ## See if ? or & need any special handling
        
        else:
            postfix += char
        
    while stack:
        if stack[-1] == '(':
            print("Invalid regex: Unbalanced parentheses (Extra open bracket or square bracket)")
            return False
        postfix += stack.pop()
                
    return postfix


In [74]:
test_infix="(a?|b)*.[01]+"
print(shunting_yard(test_infix))

Preprocessed: (a?|b)*&.&(0|1)+
a?b|*.&01|+&


In [76]:
def run_stress_tests():
    test_cases = [
        ("(a|b|c)+",      "ab|c|+"),
        ("[ab]*|[cd]+",   "ab|*cd|+|"),
        ("(a.b|c.d)?",    "a.&b&c.&d&|?"),
        ("[ab](c|d)[ef]", "ab|cd|&ef|&"),
        ("a*b+c?",        "a*b+&c?&"),
        (".*.|.+",        ".*.&.+|"),
        ("[abc]+[xyz]*",  "ab|c|+xy|z|*&"),
        ("((a|b)|(c|d))", "ab|cd||"),
        ("a(b(c|d))e",    "abcd|&&e&"),
        ("(a*|b+)?(c|d)", "a*b+|?cd|&")
    ]

    print(f"{'INPUT':<20} | {'EXPECTED':<20} | {'ACTUAL':<20} | {'STATUS'}")
    print("-" * 75)
    
    passed = 0
    for regex, expected in test_cases:
        try:
            actual = shunting_yard(regex) 
            
            if actual == expected:
                status = "✅ PASS"
                passed += 1
            else:
                status = "❌ FAIL"
                
            print(f"{regex:<20} | {expected:<20} | {actual:<20} | {status}")
        except Exception as e:
            print(f"{regex:<20} | {expected:<20} | {'ERROR':<20} | ❌ CRASH: {e}")

    print("-" * 75)
    print(f"Total Score: {passed}/{len(test_cases)}")

run_stress_tests()

INPUT                | EXPECTED             | ACTUAL               | STATUS
---------------------------------------------------------------------------
Preprocessed: (a|b|c)+
(a|b|c)+             | ab|c|+               | ab|c|+               | ✅ PASS
Preprocessed: (a|b)*|(c|d)+
[ab]*|[cd]+          | ab|*cd|+|            | ab|*cd|+|            | ✅ PASS
Preprocessed: (a&.&b|c&.&d)?
(a.b|c.d)?           | a.&b&c.&d&|?         | a.&b&c.&d&|?         | ✅ PASS
Preprocessed: (a|b)&(c|d)&(e|f)
[ab](c|d)[ef]        | ab|cd|&ef|&          | ab|cd|&ef|&          | ✅ PASS
Preprocessed: a*&b+&c?
a*b+c?               | a*b+&c?&             | a*b+&c?&             | ✅ PASS
Preprocessed: .*&.|.+
.*.|.+               | .*.&.+|              | .*.&.+|              | ✅ PASS
Preprocessed: (a|b|c)+&(x|y|z)*
[abc]+[xyz]*         | ab|c|+xy|z|*&        | ab|c|+xy|z|*&        | ✅ PASS
Preprocessed: ((a|b)|(c|d))
((a|b)|(c|d))        | ab|cd||              | ab|cd||              | ✅ PASS
Preprocessed: a&(b&(c|d

In [77]:
def run_complex_tests():
    test_cases = [
        ("a.b|c",       "a.&b&c|"),
        ("a+|b*",       "a+b*|"),
        ("[01]+",       "01|+"),
        ("(ab)*",       "ab&*"),
        ("((a|b)c)*",   "ab|c&*"),
        ("a?b?c?",      "a?b?&c?&"),
        (".*|a.+",      ".*a.+&|"),
        ("[01].[01]",   "01|.&01|&"),
        ("[ab](c|d)",   "ab|cd|&"),
        ("(a?|b)*.[01]+", "a?b|*.&01|+&")
    ]

    print(f"{'INPUT':<20} | {'EXPECTED':<20} | {'ACTUAL':<20} | {'STATUS'}")
    print("-" * 75)
    
    passed = 0
    for regex, expected in test_cases:
        try:
            # Assuming your function is named 'shunting_yard' 
            # and it calls 'preprocess_regex' internally
            actual = shunting_yard(regex) 
            
            if actual == expected:
                status = "✅ PASS"
                passed += 1
            else:
                status = "❌ FAIL"
                
            print(f"{regex:<20} | {expected:<20} | {actual:<20} | {status}")
        except Exception as e:
            print(f"{regex:<20} | {expected:<20} | {'ERROR':<20} | ❌ CRASH: {e}")

    print("-" * 75)
    print(f"Total Score: {passed}/{len(test_cases)}")

# Run it
run_complex_tests()

INPUT                | EXPECTED             | ACTUAL               | STATUS
---------------------------------------------------------------------------
Preprocessed: a&.&b|c
a.b|c                | a.&b&c|              | a.&b&c|              | ✅ PASS
Preprocessed: a+|b*
a+|b*                | a+b*|                | a+b*|                | ✅ PASS
Preprocessed: (0|1)+
[01]+                | 01|+                 | 01|+                 | ✅ PASS
Preprocessed: (a&b)*
(ab)*                | ab&*                 | ab&*                 | ✅ PASS
Preprocessed: ((a|b)&c)*
((a|b)c)*            | ab|c&*               | ab|c&*               | ✅ PASS
Preprocessed: a?&b?&c?
a?b?c?               | a?b?&c?&             | a?b?&c?&             | ✅ PASS
Preprocessed: .*|a&.+
.*|a.+               | .*a.+&|              | .*a.+&|              | ✅ PASS
Preprocessed: (0|1)&.&(0|1)
[01].[01]            | 01|.&01|&            | 01|.&01|&            | ✅ PASS
Preprocessed: (a|b)&(c|d)
[ab](c|d)            | ab|cd|&  

In [78]:
def run_nightmare_tests():
    test_cases = [
        ("(a|b)*c+(d|e)?",  "ab|*c+&de|?&"),
        ("[abc]+|[def]*",   "ab|c|+de|f|*|"), # Note: expanded [a-c] manually to [abc] for the test string
        ("(.a)*|(b|.)+",    ".a&*b.|+|"),
        ("a(b(c(d|e)*)?)",  "bcde|*&?&&a&"), # Wait! Let's double check this one below.
        ("((((a))))",       "a"),
        ("a|b|c|d|e",       "ab|c|d|e|"),
        ("a?b+c*d",         "a?b+&c*&d&"),
        ("([ab]c)*",        "ab|c&*"),
        ("a.|.b",           "a.&.b&|"),
        ("(a+|b?)*",        "a+b?|*")
    ]

    print(f"{'INPUT':<20} | {'EXPECTED':<20} | {'ACTUAL':<20} | {'STATUS'}")
    print("-" * 75)
    
    passed = 0
    for regex, expected in test_cases:
        try:
            actual = shunting_yard(regex) 
            
            if actual == expected:
                status = "✅ PASS"
                passed += 1
            else:
                status = "❌ FAIL"
                
            print(f"{regex:<20} | {expected:<20} | {actual:<20} | {status}")
        except Exception as e:
            print(f"{regex:<20} | {expected:<20} | {'ERROR':<20} | ❌ CRASH: {e}")

    print("-" * 75)
    print(f"Total Score: {passed}/{len(test_cases)}")

run_nightmare_tests()

INPUT                | EXPECTED             | ACTUAL               | STATUS
---------------------------------------------------------------------------
Preprocessed: (a|b)*&c+&(d|e)?
(a|b)*c+(d|e)?       | ab|*c+&de|?&         | ab|*c+&de|?&         | ✅ PASS
Preprocessed: (a|b|c)+|(d|e|f)*
[abc]+|[def]*        | ab|c|+de|f|*|        | ab|c|+de|f|*|        | ✅ PASS
Preprocessed: (.&a)*|(b|.)+
(.a)*|(b|.)+         | .a&*b.|+|            | .a&*b.|+|            | ✅ PASS
Preprocessed: a&(b&(c&(d|e)*)?)
a(b(c(d|e)*)?)       | bcde|*&?&&a&         | abcde|*&?&&          | ❌ FAIL
Preprocessed: ((((a))))
((((a))))            | a                    | a                    | ✅ PASS
Preprocessed: a|b|c|d|e
a|b|c|d|e            | ab|c|d|e|            | ab|c|d|e|            | ✅ PASS
Preprocessed: a?&b+&c*&d
a?b+c*d              | a?b+&c*&d&           | a?b+&c*&d&           | ✅ PASS
Preprocessed: ((a|b)&c)*
([ab]c)*             | ab|c&*               | ab|c&*               | ✅ PASS
Preprocessed: a&.|.