<a href="https://colab.research.google.com/github/amrashraf15/RegularExpression-To-DFA/blob/main/AssPart1(REtoDFA).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install graphviz



In [1]:
import json
from graphviz import Digraph
from itertools import count

Classes


In [2]:
class State:
    c = count(0)
    def __init__(self):
        self.name = f"S{next(State.c)}"
        self.transitions = {}
        self.is_EndState = False
        self.is_StartState = False


In [3]:
class NFA:
    def __init__(self, start, end):
        self.start = start
        self.end = end


In [36]:
def create_tokens(regex):
    tokens = []
    i = 0

    while i < len(regex):
        char = regex[i]

        # Character class [ ... ]
        if char == '[':
            j = i + 1
            while j < len(regex) and regex[j] != ']':
                j += 1
            if j == len(regex):
                raise ValueError("Unclosed [ in regex")
            tokens.append(regex[i:j+1])
            i = j + 1
            continue


        if char == '.':
            tokens.append('.')
            i += 1
            continue


        if char in "*+?|()":
            tokens.append(char)
            i += 1
            continue


        if char.isalnum():
            tokens.append(char)
            i += 1
            continue

        raise ValueError(f"Invalid character: {char}")

    return tokens


Regex to Postfix Conversion (Shunting Yard Algorithm)

In [39]:
precedence = {'*': 3, '+': 3, '?': 3, '&': 2, '|': 1}

def addConcat(tokens):
    result = []

    for i in range(len(tokens)):
        result.append(tokens[i])

        if i + 1 < len(tokens):
            left = tokens[i]
            right = tokens[i+1]


            if (left not in "|(" and left != '&') and (right not in "|)*+?"):
                result.append('&')

    return result

def toPostfix(regex):
    tokens = create_tokens(regex)
    tokens = addConcat(tokens)

    output = []
    stack = []

    for token in tokens:


        if token not in precedence and token not in "()|":
            output.append(token)

        elif token == '(':
            stack.append(token)

        elif token == ')':
            while stack and stack[-1] != '(':
                output.append(stack.pop())
            stack.pop()

        elif token in precedence:
            while stack and stack[-1] != '(' and precedence.get(stack[-1], 0) >= precedence[token]:
                output.append(stack.pop())
            stack.append(token)

    while stack:
        output.append(stack.pop())

    return output


Thompson’s Construction Algorithm

In [41]:
def thompson(postfix):
    stack = []

    for token in postfix:

        # Literal or class or wildcard
        if token not in precedence and token not in ('&', '|', '*', '+', '?'):
            s0 = State()
            s1 = State()
            s0.transitions[token] = {s1}
            stack.append(NFA(s0, s1))

        elif token == '&':
            nfa2 = stack.pop()
            nfa1 = stack.pop()
            nfa1.end.transitions['ε'] = {nfa2.start}
            stack.append(NFA(nfa1.start, nfa2.end))

        elif token == '|':
            nfa2 = stack.pop()
            nfa1 = stack.pop()
            s0 = State()
            s1 = State()
            s0.transitions['ε'] = {nfa1.start, nfa2.start}
            nfa1.end.transitions['ε'] = {s1}
            nfa2.end.transitions['ε'] = {s1}
            stack.append(NFA(s0, s1))

        elif token == '*':
            nfa1 = stack.pop()
            s0 = State()
            s1 = State()
            s0.transitions['ε'] = {nfa1.start, s1}
            nfa1.end.transitions['ε'] = {nfa1.start, s1}
            stack.append(NFA(s0, s1))

        elif token == '+':
            nfa1 = stack.pop()
            s0 = State()
            s1 = State()
            s0.transitions['e'] = {nfa1.start}
            nfa1.end.transitions['e'] = {nfa1.start, s1}
            stack.append(NFA(s0, s1))

        elif token == '?':
            nfa1 = stack.pop()
            s0 = State()
            s1 = State()
            s0.transitions['ε'] = {nfa1.start, s1}
            nfa1.end.transitions['ε'] = {s1}
            stack.append(NFA(s0, s1))

    nfa = stack.pop()
    nfa.end.is_EndState = True
    return nfa


In [42]:
def nfaTojson(nfa, filename):
    states = {}
    visited = set()

    def dfs(state):
        if state.name in visited:
            return
        visited.add(state.name)
        transitions = {"isTerminatingState": state.is_EndState}
        for symbol, next_states in state.transitions.items():
            transitions[symbol] = [s.name for s in next_states]
            for s in next_states:
                dfs(s)
        states[state.name] = transitions

    dfs(nfa.start)
    data = {"startingState": nfa.start.name}
    data.update(states)

    with open(filename, "w") as f:
        json.dump(data, f, indent=2)
    return data


In [43]:
def draw_nfa(nfa, filename="nfa_graph"):
    dot = Digraph(format="png")
    dot.attr(rankdir="LR")

    visited = set()

    def dfs(state):
        if state.name in visited:
            return
        visited.add(state.name)


        if state.is_EndState:
            dot.node(state.name, shape="doublecircle")
        else:
            dot.node(state.name, shape="circle")

        for symbol, next_states in state.transitions.items():
            for s in next_states:
                label = "ε" if symbol == "ε" else symbol
                dot.edge(state.name, s.name, label=label)
                dfs(s)


    dot.node("start", shape="plaintext", label="")
    dot.edge("start", nfa.start.name)

    dfs(nfa.start)

    dot.render(filename, cleanup=True)
    print(f"Graph saved as {filename}.png")


In [44]:
def reset_state_counter():
    State.c = count(0)

In [45]:
def main():
    reset_state_counter()
    regex = "a[b-d]*c|e"

    postfix = toPostfix(regex)
    print(f"Regex: {regex}")
    print(f"Postfix: {postfix}")


    nfa = thompson(postfix)


    filename = "nfa.json"
    nfa_json = nfaTojson(nfa, filename)

    print(f"NFA saved to {filename}")
    print("NFA JSON structure:")
    print(json.dumps(nfa_json, indent=2))
    draw_nfa(nfa, "nfa_graph")


if __name__ == "__main__":
    main()


Regex: a[b-d]*c|e
Postfix: ['a', '[b-d]', '*', '&', 'c', '&', 'e', '|']
NFA saved to nfa.json
NFA JSON structure:
{
  "startingState": "S10",
  "S11": {
    "isTerminatingState": true
  },
  "S9": {
    "isTerminatingState": false,
    "\u03b5": [
      "S11"
    ]
  },
  "S8": {
    "isTerminatingState": false,
    "e": [
      "S9"
    ]
  },
  "S7": {
    "isTerminatingState": false,
    "\u03b5": [
      "S11"
    ]
  },
  "S6": {
    "isTerminatingState": false,
    "c": [
      "S7"
    ]
  },
  "S5": {
    "isTerminatingState": false,
    "\u03b5": [
      "S6"
    ]
  },
  "S3": {
    "isTerminatingState": false,
    "\u03b5": [
      "S5",
      "S2"
    ]
  },
  "S2": {
    "isTerminatingState": false,
    "[b-d]": [
      "S3"
    ]
  },
  "S4": {
    "isTerminatingState": false,
    "\u03b5": [
      "S5",
      "S2"
    ]
  },
  "S1": {
    "isTerminatingState": false,
    "\u03b5": [
      "S4"
    ]
  },
  "S0": {
    "isTerminatingState": false,
    "a": [
      "S1"
  