In [1]:
import re

# Compiled once at import time so repeated calls share one pattern object.
# Alternatives are tried left to right, so the specific number forms have to
# appear before the generic identifier / catch-all forms.
_TOKEN_RE = re.compile(
    r"""
    ;.*                       |  # Comment: semicolon to end of line
    0[xX][0-9a-fA-F]+         |  # Hex literal with 0x prefix
    \d[0-9a-fA-F]*[hH]\b      |  # Hex literal with h suffix, e.g. 0FFh
    \b\d+\b                   |  # Decimal literal
    [a-zA-Z_][a-zA-Z0-9_]*    |  # Mnemonic, label or register
    [\[\],:.]                 |  # Brackets, comma, colon, dot
    [+\-*/()]                 |  # Operators and parentheses
    [^\s\[\],:.;+\-*/()]+        # Anything else, up to the next delimiter
    """,
    re.VERBOSE,
)


def tokenize_x86(instruction: str) -> list:
    """Split x86 assembly text into a flat list of token strings.

    Tokens are comments-stripped pieces of the input: hexadecimal literals
    (``0x1F`` or ``1Fh``), decimal literals, identifiers (mnemonics, labels,
    registers), single-character symbols (``[ ] , : .``), operators
    (``+ - * / ( )``) and, as a fallback, any other run of characters up to
    the next whitespace or delimiter (for example ``%eax`` or ``$5``).

    Args:
        instruction: Assembly source text. Normally one instruction, but
            several newline-separated lines are accepted; each ``;`` comment
            is stripped up to the end of its own line.

    Returns:
        The tokens in source order, with whitespace and comments removed.
        An empty string yields an empty list.
    """
    matches = _TOKEN_RE.findall(instruction)

    # The fallback alternative excludes ';', so the only matches that can
    # begin with a semicolon are whole comments.
    return [token for token in matches if not token.startswith(";")]


# Demo: run the tokenizer over a few hand-written instructions and print
# each input line followed by the tokens extracted from it.
if __name__ == "__main__":
    demo_lines = (
        "mov eax, 0x1 ; Load 1 into eax",
        "add eax, ebx ; Add ebx to eax",
        "jmp 0x00400000 ; Jump to address",
        "lea ecx, [ebx+4] ; Load effective address",
    )

    # map() is lazy, so each line is tokenized right before it is printed.
    for instr, tokens in zip(demo_lines, map(tokenize_x86, demo_lines)):
        print(f"Instruction: {instr}")
        print(f"Tokens: {tokens}\n")


Instruction: mov eax, 0x1 ; Load 1 into eax
Tokens: ['mov', 'eax', ',', '0x1']

Instruction: add eax, ebx ; Add ebx to eax
Tokens: ['add', 'eax', ',', 'ebx']

Instruction: jmp 0x00400000 ; Jump to address
Tokens: ['jmp', '0x00400000']

Instruction: lea ecx, [ebx+4] ; Load effective address
Tokens: ['lea', 'ecx', ',', '[', 'ebx', '+', '4', ']']

