In [1]:
# http://devernay.free.fr/hacks/chip8/C8TECH10.HTM#Fx29

import re

# Address assigned to the first instruction; label addresses are computed as
# offsets from it (presumably the CHIP-8 program load address -- confirm
# against the reference linked above).
START_ADDRESS = 0x200

# Sample program used by the cells below. Lines starting with ':' define
# labels, ';' starts a comment, and label names may be used as operands.
assembler_code = '''
:start
    CLS ; comment
    LD V0, 0xFF
    LD V1, 0x1
    LD V2, 0x00
:loop
    SUB V0, V1
    SE V0, V2
    JP loop
    CALL sub
    JP start
:sub
    LD VF, 0x42
    RET
'''


def is_label(line):
    """Return True when `line` is a label definition, i.e. its first character is ':'."""
    return line[:1] == ':'

def count_commands(label, classified_lines):
    """Count the commands that appear before the definition of `label`.

    Args:
        label: Label name without the leading ':'.
        classified_lines: Iterable of ``(line, islabel)`` pairs in source order.

    Returns:
        The number of non-label lines preceding the first definition of
        `label`, or None when the label is never defined.
    """
    commands_seen = 0
    for text, flagged_as_label in classified_lines:
        if flagged_as_label:
            if get_label(text) == label:
                return commands_seen
        else:
            commands_seen += 1
    return None
        
def get_label(value):
    """Extract the label name from a label line such as ':loop'.

    The leading ':' is dropped and surrounding whitespace is removed, so a
    label line that had a trailing comment stripped (leaving ':loop ') or that
    ends in a carriage return still yields the bare name 'loop'.

    Args:
        value: A line for which `is_label` is true.

    Returns:
        The label name without the ':' marker or surrounding whitespace.
    """
    return value[1:].strip()

def get_command(value):
    """Return the command text of a source line without surrounding whitespace."""
    return str.strip(value)

def replace_label(value, labels_to_addresses):
    """Replace a single token with its address literal if it names a label.

    The address is rendered as '0x' followed by three uppercase hex digits
    (e.g. '0x20A', '0x01E'). `hex()` would emit lowercase, unpadded digits,
    which the uppercase three-digit ADDRESS operand pattern used when parsing
    instructions does not accept.

    Args:
        value: One whitespace-separated token of a command.
        labels_to_addresses: Mapping from label name to integer address.

    Returns:
        The address literal when `value` is a known label, otherwise `value`
        unchanged.
    """
    if value in labels_to_addresses:
        return '0x{:03X}'.format(labels_to_addresses[value])
    return value

def replace_labels(command, labels_to_addresses):
    """Substitute label names in `command` with their address literals.

    The command is split on single spaces, each token is passed through
    `replace_label`, and the tokens are joined back with single spaces.
    """
    resolved_tokens = []
    for token in command.split(' '):
        resolved_tokens.append(replace_label(token, labels_to_addresses))
    return ' '.join(resolved_tokens)
    
def replace_labels_from_code(start_address, code):
    """Resolve labels to addresses and return the label-free command list.

    The lines are walked once. Every non-label line is a command occupying two
    bytes, so a label's address is `start_address` plus twice the number of
    commands seen before it. When a label is defined more than once the first
    definition wins.

    Args:
        start_address: Address of the first command.
        code: List of source lines (labels and commands) in program order.

    Returns:
        The commands, stripped of surrounding whitespace, with every label
        operand replaced by its address literal.
    """
    labels_to_addresses = {}
    commands = []

    for line in code:
        if is_label(line):
            # setdefault keeps the address of the first definition of a label.
            labels_to_addresses.setdefault(get_label(line), start_address + 2 * len(commands))
        else:
            commands.append(get_command(line))

    return [replace_labels(command, labels_to_addresses) for command in commands]

def prepare_code(start_address, assembler_code):
    """Turn raw assembler text into a list of commands with labels resolved.

    Comments (everything from ';' to the end of the line) are removed first,
    and only then are blank lines discarded. Doing it in this order ensures
    that comment-only and whitespace-only lines are not mistaken for commands,
    which would otherwise shift every following label address by two bytes.

    Args:
        start_address: Address of the first command.
        assembler_code: The program source as a single string.

    Returns:
        The list of commands produced by `replace_labels_from_code`.
    """
    without_comments = [re.sub(r';.*', '', line) for line in assembler_code.splitlines()]
    meaningful_lines = [line for line in without_comments if line.strip()]

    return replace_labels_from_code(start_address, meaningful_lines)

# Preprocess the sample program (strip comments, resolve labels) and show the
# source next to the resulting command list.
cleaned_code = prepare_code(START_ADDRESS, assembler_code)

print('Original:\n', assembler_code)
print('Cleaned:\n', cleaned_code)

Original:
 
:start
    CLS ; comment
    LD V0, 0xFF
    LD V1, 0x1
    LD V2, 0x00
:loop
    SUB V0, V1
    SE V0, V2
    JP loop
    CALL sub
    JP start
:sub
    LD VF, 0x42
    RET

Cleaned:
 ['CLS', 'LD V0, 0xFF', 'LD V1, 0x1', 'LD V2, 0x00', 'SUB V0, V1', 'SE V0, V2', 'JP 0x208', 'CALL 0x212', 'JP 0x200', 'LD VF, 0x42', 'RET']


In [13]:
# Operand fragments. Each contains exactly one capture group. Hex literals
# accept both upper- and lowercase digits; addresses may have one to three
# digits so that values below 0x100 can be written without padding.
ADDRESS = '(0x[0-9A-Fa-f]{1,3})'
REGISTER = 'V([0-9A-F])'
BYTE = '(0x[0-9A-Fa-f]{1,2})'
NIBBLE = '(0x[0-9A-Fa-f])'

# One compiled pattern per instruction form. The capture groups are the
# operands in the order they appear in the mnemonic.
system_call_pattern = re.compile('SYS {}'.format(ADDRESS))
clear_screen_pattern = re.compile('CLS')
return_pattern = re.compile('RET')
jump_address_pattern = re.compile('JP {}'.format(ADDRESS))
call_pattern = re.compile('CALL {}'.format(ADDRESS))
skip_equal_register_byte_pattern = re.compile('SE {}, {}'.format(REGISTER, BYTE))
skip_not_equal_register_byte_pattern = re.compile('SNE {}, {}'.format(REGISTER, BYTE))
skip_equal_register_register_pattern = re.compile('SE {}, {}'.format(REGISTER, REGISTER))
load_register_byte_pattern = re.compile('LD {}, {}'.format(REGISTER, BYTE))
add_register_byte_pattern = re.compile('ADD {}, {}'.format(REGISTER, BYTE))
# The mnemonic is LD (not 'LF'): register-to-register load.
load_register_register_pattern = re.compile('LD {}, {}'.format(REGISTER, REGISTER))
or_register_register_pattern = re.compile('OR {}, {}'.format(REGISTER, REGISTER))
and_register_register_pattern = re.compile('AND {}, {}'.format(REGISTER, REGISTER))
xor_register_register_pattern = re.compile('XOR {}, {}'.format(REGISTER, REGISTER))
add_register_register_pattern = re.compile('ADD {}, {}'.format(REGISTER, REGISTER))
subtract_register_register_pattern = re.compile('SUB {}, {}'.format(REGISTER, REGISTER))
# The second register of SHR/SHL is optional; its group is None when omitted.
shift_right_register_pattern = re.compile('SHR {}(?:, {})?'.format(REGISTER, REGISTER))
subtract_reversed_register_register_pattern = re.compile('SUBN {}, {}'.format(REGISTER, REGISTER))
shift_left_register_pattern = re.compile('SHL {}(?:, {})?'.format(REGISTER, REGISTER))
skip_not_equal_register_register_pattern = re.compile('SNE {}, {}'.format(REGISTER, REGISTER))
load_i_register_address_pattern = re.compile('LD I, {}'.format(ADDRESS))
jump_register_address_pattern = re.compile('JP V0, {}'.format(ADDRESS))
random_register_byte_pattern = re.compile('RND {}, {}'.format(REGISTER, BYTE))
display_register_register_nibble = re.compile('DRW {}, {}, {}'.format(REGISTER, REGISTER, NIBBLE))
skip_key_pressed_register_pattern = re.compile('SKP {}'.format(REGISTER))
skip_key_not_pressed_register_pattern = re.compile('SKNP {}'.format(REGISTER))
load_register_delay_timer_pattern = re.compile('LD {}, DT'.format(REGISTER))
load_register_key_pattern = re.compile('LD {}, K'.format(REGISTER))
load_delay_timer_register_pattern = re.compile('LD DT, {}'.format(REGISTER))
load_sound_timer_register_pattern = re.compile('LD ST, {}'.format(REGISTER))
add_i_register_register_pattern = re.compile('ADD I, {}'.format(REGISTER))
load_digit_register_pattern = re.compile('LD F, {}'.format(REGISTER)) # The data should be stored in the interpreter area of Chip-8 memory (0x000 to 0x1FF)
load_bcd_register_pattern = re.compile('LD B, {}'.format(REGISTER))
# Raw strings keep the escaped brackets from being treated as (invalid)
# string escape sequences.
load_i_register_register_pattern = re.compile(r'LD \[I\], {}'.format(REGISTER))
load_register_i_register_pattern = re.compile(r'LD {}, \[I\]'.format(REGISTER))

def extract_variables(regex, string):
    """Match `string` against `regex` and return its captured operands.

    The whole string must match. Anchoring only the start would let trailing
    garbage through: 'LD V0, 0x123' would be accepted as a load of 0x12 and
    'CLSX' as 'CLS'.

    Args:
        regex: A compiled regular expression.
        string: The command text to test.

    Returns:
        ``(True, groups)`` with the tuple of captured groups when the command
        matches, otherwise ``(False, None)``.
    """
    match = regex.fullmatch(string)
    if match is None:
        return (False, None)
    return (True, match.groups())
    
def parse_address(address):
    """Split a hexadecimal address literal into its opcode parts.

    Args:
        address: Hexadecimal string such as '0x208'.

    Returns:
        ``[high_nibble, low_byte]``: bits 8-11 and bits 0-7 of the value.
    """
    value = int(address, 16)
    return [(value >> 8) & 0x0F, value & 0xFF]

def parse_register(register):
    """Convert a hexadecimal register index string (e.g. 'A') to its 4-bit integer value."""
    return int(register, 16) & 0xF

def parse_byte(byte):
    """Convert a hexadecimal literal (e.g. '0xFF') to an integer truncated to 8 bits."""
    return int(byte, 16) & 0xFF

def parse_nibble(nibble):
    """Convert a hexadecimal literal (e.g. '0x5') to an integer truncated to 4 bits."""
    return int(nibble, 16) & 0xF
    
def _fixed_opcode(high_byte, low_byte):
    """Build an encoder for an instruction that takes no operands."""
    def encode(variables):
        return [high_byte, low_byte]
    return encode


def _address_opcode(prefix):
    """Build an encoder for instructions made of an opcode nibble and an address."""
    def encode(variables):
        [high_nibble, low_byte] = parse_address(variables[0])
        return [prefix | high_nibble, low_byte]
    return encode


def _register_byte_opcode(prefix):
    """Build an encoder for instructions taking a register and an immediate byte."""
    def encode(variables):
        return [prefix | parse_register(variables[0]), parse_byte(variables[1])]
    return encode


def _register_pair_opcode(prefix, suffix):
    """Build an encoder for instructions taking two registers plus a low nibble."""
    def encode(variables):
        register1 = parse_register(variables[0])
        register2 = parse_register(variables[1])
        return [prefix | register1, register2 << 4 | suffix]
    return encode


def _shift_opcode(suffix):
    """Build an encoder for SHR/SHL, whose second register is optional."""
    def encode(variables):
        register1 = parse_register(variables[0])
        # The optional second register is always the last capture group; it is
        # None when the operand was omitted, and is then encoded as 0.
        optional = variables[-1] if len(variables) > 1 else None
        register2 = parse_register(optional) if optional is not None else 0
        return [0x80 | register1, register2 << 4 | suffix]
    return encode


def _register_opcode(prefix, low_byte):
    """Build an encoder for instructions taking one register and a fixed low byte."""
    def encode(variables):
        return [prefix | parse_register(variables[0]), low_byte]
    return encode


def _draw_opcode(variables):
    """Encode DRW Vx, Vy, nibble."""
    register1 = parse_register(variables[0])
    register2 = parse_register(variables[1])
    nibble = parse_nibble(variables[2])
    return [0xD0 | register1, register2 << 4 | nibble]


def _instruction_table():
    """Return the (pattern, encoder) pairs in the order they are tried.

    The table is built on demand so that the module-level pattern names are
    looked up when a command is parsed rather than when this code is defined.
    """
    return (
        (system_call_pattern, _address_opcode(0x00)),
        (clear_screen_pattern, _fixed_opcode(0x00, 0xE0)),
        (return_pattern, _fixed_opcode(0x00, 0xEE)),
        (jump_address_pattern, _address_opcode(0x10)),
        (call_pattern, _address_opcode(0x20)),
        (skip_equal_register_byte_pattern, _register_byte_opcode(0x30)),
        (skip_not_equal_register_byte_pattern, _register_byte_opcode(0x40)),
        (skip_equal_register_register_pattern, _register_pair_opcode(0x50, 0x00)),
        (load_register_byte_pattern, _register_byte_opcode(0x60)),
        (add_register_byte_pattern, _register_byte_opcode(0x70)),
        (load_register_register_pattern, _register_pair_opcode(0x80, 0x00)),
        (or_register_register_pattern, _register_pair_opcode(0x80, 0x01)),
        (and_register_register_pattern, _register_pair_opcode(0x80, 0x02)),
        (xor_register_register_pattern, _register_pair_opcode(0x80, 0x03)),
        (add_register_register_pattern, _register_pair_opcode(0x80, 0x04)),
        (subtract_register_register_pattern, _register_pair_opcode(0x80, 0x05)),
        (shift_right_register_pattern, _shift_opcode(0x06)),
        (subtract_reversed_register_register_pattern, _register_pair_opcode(0x80, 0x07)),
        (shift_left_register_pattern, _shift_opcode(0x0E)),
        (skip_not_equal_register_register_pattern, _register_pair_opcode(0x90, 0x00)),
        (load_i_register_address_pattern, _address_opcode(0xA0)),
        (jump_register_address_pattern, _address_opcode(0xB0)),
        (random_register_byte_pattern, _register_byte_opcode(0xC0)),
        (display_register_register_nibble, _draw_opcode),
        (skip_key_pressed_register_pattern, _register_opcode(0xE0, 0x9E)),
        (skip_key_not_pressed_register_pattern, _register_opcode(0xE0, 0xA1)),
        (load_register_delay_timer_pattern, _register_opcode(0xF0, 0x07)),
        (load_register_key_pattern, _register_opcode(0xF0, 0x0A)),
        (load_delay_timer_register_pattern, _register_opcode(0xF0, 0x15)),
        (load_sound_timer_register_pattern, _register_opcode(0xF0, 0x18)),
        (add_i_register_register_pattern, _register_opcode(0xF0, 0x1E)),
        (load_digit_register_pattern, _register_opcode(0xF0, 0x29)),
        (load_bcd_register_pattern, _register_opcode(0xF0, 0x33)),
        (load_i_register_register_pattern, _register_opcode(0xF0, 0x55)),
        (load_register_i_register_pattern, _register_opcode(0xF0, 0x65)),
    )


def parse_command(command):
    """Assemble one command into its two-byte machine code.

    The command is tested against each instruction pattern in turn and the
    first one that matches determines the encoding.

    Args:
        command: A single command with labels already replaced by addresses,
            e.g. 'LD V0, 0xFF'.

    Returns:
        ``[high_byte, low_byte]`` as integers, or ``['Failure', command]``
        when no instruction pattern matches.
    """
    for pattern, encode in _instruction_table():
        (match, variables) = extract_variables(pattern, command)
        if match:
            return encode(variables)

    return ['Failure', command]

def parse_cleaned_code(cleaned_code):
    """Assemble a list of commands into a flat list of byte values.

    Args:
        cleaned_code: Commands with comments removed and labels resolved.

    Returns:
        The concatenated output of `parse_command` for every command, or False
        (after printing the partial result) when any command failed to parse.
    """
    assembled = []
    for command in cleaned_code:
        assembled.extend(parse_command(command))

    if 'Failure' not in assembled:
        return assembled

    print('Failing while parsing the code:', assembled)
    return False

# Assemble the cleaned sample program into a flat list of byte values.
# NOTE: parse_cleaned_code returns False when a command cannot be parsed, in
# which case the hex listing below raises TypeError.
parsed = parse_cleaned_code(cleaned_code)
    
# Show the result both as plain integers and as hex literals.
print('Parsed:\n', parsed)
print('Parsed hex:\n', '{ ' + ', '.join([hex(p) for p in parsed]) + ' }')

Parsed:
 [0, 224, 96, 255, 97, 1, 98, 0, 128, 21, 80, 32, 18, 8, 34, 18, 18, 0, 111, 66, 0, 238]
Parsed hex:
 { 0x0, 0xe0, 0x60, 0xff, 0x61, 0x1, 0x62, 0x0, 0x80, 0x15, 0x50, 0x20, 0x12, 0x8, 0x22, 0x12, 0x12, 0x0, 0x6f, 0x42, 0x0, 0xee }


In [8]:
# Write the assembled byte values to disk as a raw binary file.
with open('bytes.bin', mode='wb') as rom_file:
    rom_file.write(bytearray(parsed))