In [None]:
# learning about compression

# Read the corpus as raw bytes and keep only its first 10,000 bytes in `buff`.
# The whole file is read first and then sliced.
buff = None
with open('names.txt', 'rb') as f: 
  buff = f.read()[0:10000]


In [None]:

def bitgen(buff):
  """Yield every bit of `buff` as an int 0/1, most significant bit of each byte first.

  `buff` is iterated element by element, so it is expected to produce
  integers (e.g. a `bytes` or `bytearray` object).
  """
  for byte in buff:
    # walk the bit positions from 7 (mask 0x80) down to 0 (mask 0x01)
    for shift in range(7, -1, -1):
      yield int((byte >> shift) & 1)

# Bit stream over the bytes loaded into `buff`.
g = bitgen(buff)

from collections import defaultdict

# NOTE(review): `ctx` is not read anywhere in the visible cells. Every missing key
# starts as a fresh two-element list [0, 0] — presumably a pair of counters per
# context; confirm the intended use.
ctx = defaultdict(lambda: [0, 0])

# Print the first eight bits of the stream, i.e. the first byte of `buff`,
# most significant bit first.
for i in range(8): 
  print(next(g))




In [73]:
def decode(inst):
  """Disassemble `inst` as 64-bit code with iced_x86 and print the listing.

  The raw buffer is printed first, followed by one line per decoded
  instruction: its address (decoding starts at address 0), its encoded bytes
  as upper-case hex, and its text in MASM syntax. Nothing is returned.
  """
  from iced_x86 import Decoder, Formatter,  FormatterSyntax

  EXAMPLE_CODE = inst
  EXAMPLE_CODE_RIP = 0x0 #0x0000_7FFA_C46A_CDA4

  formatter = Formatter(FormatterSyntax.MASM)
  formatter.digit_separator = "`"
  formatter.first_operand_char_index = 10

  print()
  print(f'EXAMPLE_CODE={EXAMPLE_CODE}')
  print()

  for instr in Decoder(64, EXAMPLE_CODE, ip=EXAMPLE_CODE_RIP):
      disasm = formatter.format(instr)
      # instr.ip is an address; subtract the start address to index the buffer
      offset = instr.ip - EXAMPLE_CODE_RIP
      raw = EXAMPLE_CODE[offset:offset + instr.len]
      bytes_str = raw.hex().upper()
      print(f"{instr.ip:016X} {bytes_str:20} {disasm}")


In [75]:
from enum import IntEnum, Enum

# NOTE(review): despite the name, this is 2**32 rather than 2**31 - 1. Address only
# uses it as an exclusive upper bound when validating a displacement.
INT32_MAX = 2**32

# Register table. The ORDER of the names is load-bearing: IntEnum numbers the
# members from 1, and get_reg() treats the list as rows of eight, deriving
#   - the register number from the position inside the row: (value - 1) % 8
#   - the operand width and the REX flag from the row:       (value - 1) // 8
# Do not insert, remove or reorder names without updating get_reg().
Register = IntEnum(
  "Register", [
  # Rows 0-4: get_reg() reports REX flag 0 for these.
  # REX.r = 0
  'AL',  'CL',  'DL',  'BL',  'AH',  'CH',  'DH',  'BH',
  # The four underscore names are placeholders that only pad this row so that
  # SPL, BPL, SIL and DIL land on positions 4-7 (register numbers 4-7).
  '_',   '__',  '___', '____', 'SPL', 'BPL', 'SIL', 'DIL',
  'AX',  'CX',  'DX',  'BX',  'SP',  'BP',  'SI',  'DI',
  'EAX', 'ECX', 'EDX', 'EBX', 'ESP', 'EBP', 'ESI', 'EDI',
  'RAX', 'RCX', 'RDX', 'RBX', 'RSP', 'RBP', 'RSI', 'RDI',
  # Rows 5-8: get_reg() reports REX flag 1 for these (r8-r15 families).
  # REX.r = 1
  'R8B', 'R9B', 'R10B', 'R11B', 'R12B', 'R13B', 'R14B', 'R15B',
  'R8W', 'R9W', 'R10W', 'R11W', 'R12W', 'R13W', 'R14W', 'R15W',
  'R8D', 'R9D', 'R10D', 'R11D', 'R12D', 'R13D', 'R14D', 'R15D',
  'R8',  'R9',  'R10',  'R11',  'R12',  'R13',  'R14',  'R15'
])

def get_reg(x):
  """Split a Register member into (register number, width in bits, extended flag).

  The member's value is read as a 1-based position in a table with eight
  entries per row:
    - register number: position inside the row, 0-7
    - width in bits:   8, 16, 32 or 64, looked up from the row index
    - extended flag:   1 for rows above 4 (the r8-r15 families), otherwise 0
  """
  row, reg = divmod(x.value - 1, 8)
  # log2(width in bytes) for each row of the table
  width_log2 = [0, 0, 1, 2, 3, 0, 1, 2, 3, 3][row]
  width_bits = 2**width_log2 * 8
  is_extended = int(row > 4)
  return reg, width_bits, is_extended

class Address: 
  """Memory operand of the form base + index * 2**scale + disp.

  Attributes:
    base:  Register or None.
    index: Register or None.
    scale: int stored as the exponent (0-3), i.e. the index is multiplied by
           2**scale; None passed to the constructor is stored as 0.
    disp:  int displacement.
  """
  def __init__(self, base=None, disp=0, index=None, scale=None):
    """Validate and store the address components.

    Raises AssertionError when base/index are not Register members (or None),
    when disp is not an int below INT32_MAX, when scale is 4 or more, or when
    base and index are both given but have different widths.
    """
    assert (isinstance(base, Register) or base is None) and (isinstance(index, Register) or index is None)
    assert type(disp) == int and disp < INT32_MAX
    assert scale is None or scale < 4, f'Scale must at most 3 but got {scale}'

    if base is not None and index is not None: 
      _, bitness1, _ = get_reg(base)
      _, bitness2, _ = get_reg(index)

      assert bitness1 == bitness2, f'Registers ({base.name}, {index.name}) must be of the same size ({bitness1} != {bitness2})'

    self.disp, self.index, self.base = disp, index, base 
    self.scale = scale if scale is not None else 0 

  def __repr__(self):
    """Render the address as e.g. '(RCX+RAX*8+0x100)'.

    Only the components that are present are printed: a missing base or
    index is skipped, a multiplier of 1 is omitted, and a zero displacement
    is shown only when there is nothing else to print.
    """
    terms = []
    if self.base is not None:
      terms.append(self.base.name)
    if self.index is not None:
      factor = 2**self.scale # scale is stored as an exponent
      terms.append(self.index.name if factor == 1 else f'{self.index.name}*{factor}')
    if self.disp != 0 or not terms:
      terms.append(hex(self.disp))
    # a negative displacement already carries its own sign
    return '(' + '+'.join(terms).replace('+-', '-') + ')'


# Bits OR-ed into the base opcode by the encoder in this notebook:
#   OPCODE_W - added by write_instruction for every operand size other than 8 bit
#   OPCODE_S - added by Builder.add for the `register, [memory]` form and for the
#              short (one byte) immediate form
#   OPCODE_D - added by Builder.add for the accumulator-with-immediate form
OPCODE_W = 0b001
OPCODE_S = 0b010
OPCODE_D = 0b100

def to_byte(x):
  """Encode the integer `x` as exactly one byte."""
  return int.to_bytes(x, 1, 'little')

def to_bytes(x, l=0):
  """Encode the integer `x` as `l` little-endian bytes.

  With the default `l=0` only the value 0 can be encoded (as b'').
  """
  return int.to_bytes(x, l, 'little')

# rex       | opcode     | mod reg r/m 
# 0b100wrxb | 0b00000dsw | 11  rrr bbb

def _address_mod_rm(address):
  """Return the (mod, r_m) ModRM fields for the memory operand `address`.

  mod is chosen from the displacement:
    - 0b00 when there is neither base nor index, or when disp == 0
    - 0b01 when 0 < disp < 256   (one displacement byte)
    - 0b10 when disp >= 256      (four displacement bytes)
  r_m is the base register number, or 0b100 whenever a SIB byte is used
  (an index is present, the base is RSP, or there is no base and no index).

  Raises ValueError for a negative displacement combined with a base or
  index; no encoding is defined for that case here.

  NOTE(review): the displacement sizes chosen here must match the lengths
  write_instruction emits. Cases that look wrong against the x86 encoding
  rules but cannot be corrected in this function alone — confirm and fix
  both places together:
    - disp in 128..255 takes the one-byte form, which x86 sign-extends;
    - a base with register number 0b101 (RBP/R13) and disp == 0 gets mod 0b00;
    - an index without a base gets mod 0b01/0b10 when disp != 0.
  """
  base, index, disp = address.base, address.index, address.disp

  r_m = get_reg(base)[0] if base else 0

  if base is None and index is None: mod = 0b00
  elif disp >= 256:                  mod = 0b10 # [reg] + disp32
  elif 0 < disp and disp < 256:      mod = 0b01 # [reg] + disp8
  elif disp == 0:                    mod = 0b00 # [reg]
  else:
    raise ValueError(f'Negative displacement {disp} is not supported')

  if index is not None or base == Register.RSP or (base is None and index is None):
    r_m = 0b100 # r_m == 0b100 announces that a SIB byte follows

  return mod, r_m

def get_mod_rm(operand1, operand2):
  """Build the one-byte ModRM field (mod << 6 | reg << 3 | r_m) for an operand pair.

  Supported pairs:
    - Register, Register:        mod=0b11, r_m=operand1, reg=operand2
    - Register, int or bytes:    mod=0b11, r_m=operand1, reg=0
    - Address,  bytes:           reg=0, mod/r_m taken from the address
    - Register, Address (either order): reg=the register, mod/r_m from the address

  Returns a bytes object of length 1.
  Raises TypeError for any other operand combination and ValueError for a
  negative displacement on an address that has a base or an index.
  """
  # register to register
  if isinstance(operand1, Register) and isinstance(operand2, Register):
    mod = 0b11
    r_m = get_reg(operand1)[0]
    reg = get_reg(operand2)[0]

  # immediate to register e.g. add al, 0FFh
  elif isinstance(operand1, Register) and isinstance(operand2, (int, bytes)):
    mod, reg = 0b11, 0
    r_m = get_reg(operand1)[0]

  # immediate to memory
  elif isinstance(operand1, Address) and isinstance(operand2, bytes):
    reg = 0
    mod, r_m = _address_mod_rm(operand1)

  # memory to register e.g. add eax, [rbp+0x100]
  elif isinstance(operand1, Register) and isinstance(operand2, Address):
    reg = get_reg(operand1)[0]
    mod, r_m = _address_mod_rm(operand2)

  # register to memory e.g. add [rbp+0x100], eax
  elif isinstance(operand1, Address) and isinstance(operand2, Register):
    reg = get_reg(operand2)[0]
    mod, r_m = _address_mod_rm(operand1)

  else:
    raise TypeError(
      f'Unsupported operand combination: {type(operand1).__name__}, {type(operand2).__name__}')

  return to_byte(mod << 6 | reg << 3 | r_m << 0)

def get_sib(address):
  """Return the SIB byte for `address`, or b'' when none is needed.

  A SIB byte (scale << 6 | index << 3 | base) is produced when
    - the address has an index register, or
    - the address has no base register, or
    - the base's register number is 0b100. That number in the ModRM r_m field
      means "a SIB byte follows", so it applies to every base with that
      number (RSP, ESP, R12, R12D), not only to RSP.
  A missing index is encoded as 0b100 and a missing base as 0b101.
  """
  base, index, scale = address.base, address.index, address.scale

  base_number = get_reg(base)[0] if base else None

  # if you don't need to use the sib byte, it will not return a sib byte
  needs_sib = index is not None or base is None or base_number == 0b100
  if not needs_sib:
    return b''

  # remove index using 0b100 value
  index_field = get_reg(index)[0] if index else 0b100
  # remove base using 0b101 value
  base_field = base_number if base else 0b101

  return to_byte((scale << 6) | (index_field << 3) | base_field)

def write_instruction(opcode, operand1, operand2, mod_rm=b'', sib=b'', constant=b''):
  """Assemble one instruction from its already-encoded pieces.

  The result is laid out as
    prefixes | REX | opcode | mod_rm | sib | displacement | constant

  Args:
    opcode:   int base opcode; OPCODE_W is OR-ed in unless the operand size is 8 bit.
    operand1: Register or Address.
    operand2: Register, Address, int or bytes. For `Address, bytes` the bytes
              are the immediate: they replace `constant` and their length
              sets the operand size.
    mod_rm:   pre-built ModRM byte, or b''.
    sib:      pre-built SIB byte, or b''.
    constant: pre-built immediate bytes, or b''.

  Returns the encoded instruction as bytes.

  Raises:
    AssertionError: register-to-register operands of different widths.
    ValueError: negative displacement on an address with a base or index, or
                AH/CH/DH/BH used in an instruction that needs a REX prefix.
    Exception:  unsupported operand combination.

  NOTE(review): the displacement lengths below mirror the mod values chosen
  in get_mod_rm; the two must be changed together.
  """
  rex_b, rex_r, rex_x = 0, 0, 0 # default to 0
  # length stays 0 (no displacement bytes) for the register-only forms
  address_bitness, disp, length = 0, 0, 0

  # register to register
  if isinstance(operand1, Register) and isinstance(operand2, Register):
    _, bitness_to,   rex_b = get_reg(operand1)
    _, bitness_from, rex_r = get_reg(operand2)
    assert bitness_to == bitness_from, \
    f'Registers {operand1.name}, {operand2.name} are not of the same size ({bitness_to} != {bitness_from})'

  # immediate to register e.g. add al, 0FFh
  elif isinstance(operand1, Register) and isinstance(operand2, (int, bytes)):
    _, bitness_to, rex_b = get_reg(operand1)

  else: # memory operations

    # immediate to memory
    if isinstance(operand1, Address) and isinstance(operand2, bytes):
      address, constant = operand1, operand2
      bitness_to = len(constant) * 8
    # memory to register e.g. add eax, [rbp+0x100]
    elif isinstance(operand1, Register) and isinstance(operand2, Address):
      address = operand2
      _, bitness_to, rex_r = get_reg(operand1)
    # register to memory, operands given in the opposite order
    elif isinstance(operand1, Address) and isinstance(operand2, Register):
      address = operand1
      _, bitness_to, rex_r = get_reg(operand2)
    else:
      raise Exception("Bug?")

    # the address part is handled the same way for all memory forms
    base, index, disp = address.base, address.index, address.disp

    if base:
      _, address_bitness, rex_b = get_reg(base)

    if base is None and index is None: length = 4 # absolute address: always disp32
    elif disp >= 256:                  length = 4 # [reg] + disp32
    elif 0 < disp and disp < 256:      length = 1 # [reg] + disp8
    elif disp == 0:                    length = 0 # [reg]
    else:
      raise ValueError(f'Negative displacement {disp} is not supported')

    if index is not None:
      _, _, rex_x = get_reg(index)

  REX_B = 1 << 0
  REX_X = 1 << 1
  REX_R = 1 << 2
  REX_W = 1 << 3
  REX =   1 << 6

  register_operands = [op for op in (operand1, operand2) if isinstance(op, Register)]
  # SPL/BPL/SIL/DIL share their register numbers with AH/CH/DH/BH and are only
  # selected when a REX prefix is present, even one with no flag bits set.
  rex_only_regs = (Register.SPL, Register.BPL, Register.SIL, Register.DIL)
  high_byte_regs = (Register.AH, Register.CH, Register.DH, Register.BH)
  forces_rex = any(op in rex_only_regs for op in register_operands)
  uses_high_byte = any(op in high_byte_regs for op in register_operands)

  is_inst_64bit = bitness_to == 64
  needs_rex = bool(rex_b or rex_r or rex_x or is_inst_64bit or forces_rex)
  if needs_rex and uses_high_byte:
    raise ValueError('AH/CH/DH/BH cannot be encoded in an instruction that needs a REX prefix')

  # The two legacy prefixes are independent: 0x66 switches the operand size to
  # 16 bit, 0x67 switches the address size to 32 bit. Both may be needed.
  operand_size_prefix = to_byte(0x66) if bitness_to == 16 else b''
  address_size_prefix = to_byte(0x67) if address_bitness == 32 else b''
  prefix_bin  = operand_size_prefix + address_size_prefix
  rex_bin     = to_byte(REX | is_inst_64bit*REX_W | rex_b*REX_B | rex_r*REX_R | rex_x*REX_X) if needs_rex else b''
  opcode_bin  = to_byte(opcode) if bitness_to == 8 else to_byte(opcode| OPCODE_W)
  operand_bin = mod_rm
  sib_bin     = sib
  constant_bin = constant
  # keyed on the length, not the value: an absolute address needs its four
  # displacement bytes even when the displacement is 0
  displace_bin = to_bytes(disp, l=length) if length else b''

  return prefix_bin + rex_bin + opcode_bin + operand_bin + sib_bin + displace_bin + constant_bin 

class Builder: 
  """Collects encoded instructions in a single bytearray."""
  def __init__(self): 
    self.buff = bytearray()

  def __call__(self):
    """Return the accumulated code (the live bytearray, not a copy)."""
    return self.buff

  def add(self, operand1, operand2): 
    """Encode `add operand1, operand2`, append it to the buffer and return its bytes.

    Supported forms:
      - Register, Register
      - Register, int      (immediate)
      - Address,  bytes    (immediate; the length of the bytes gives the operand size)
      - Register, Address  and  Address, Register

    Raises:
      AssertionError: operand1 is neither Register nor Address, the immediate
                      does not fit the register, or an immediate for a memory
                      destination is given as int instead of bytes.
      Exception:      any other operand combination.
    """
    opcode = 0x00

    assert isinstance(operand1, Register) or isinstance(operand1, Address), \
      f'First operand {operand1}, must be a Register or Address'
    
    # register to register
    if isinstance(operand1, Register) and isinstance(operand2, Register):
      result = write_instruction(opcode, operand1, operand2, 
                                mod_rm=get_mod_rm(operand1, operand2))

    # immediate to register e.g. add al, 0FFh
    elif isinstance(operand1, Register) and isinstance(operand2, int):
      reg, bitness, rex_b = get_reg(operand1)

      assert operand2 < 2**bitness, \
        f'Constant 0x{operand2:x} too big for {bitness}bit register {operand1.name} '
      # full-width immediate; at most 4 bytes, also for 64 bit registers
      constant = int.to_bytes(operand2, length=min(bitness//8, 4), byteorder='little')
        
      if reg == 0 and rex_b == 0: # special case for {al, ax, eax, rax} regs
        opcode |= OPCODE_D
        result = write_instruction(opcode, operand1, operand2, 
                                constant=constant)
      else: # regular case 
        if bitness == 8:
          # 8 bit destination: the immediate is already a single byte
          opcode = 0x80
        elif operand2 < 2**7:
          # Short form: one immediate byte that the CPU sign-extends to the
          # operand size, so it may only be used for values up to 0x7F.
          opcode = 0x80 | OPCODE_S
          constant = to_byte(operand2)
        else:
          # full-width immediate (write_instruction adds OPCODE_W)
          opcode = 0x80
        result = write_instruction(opcode, operand1, operand2, 
                                    mod_rm=get_mod_rm(operand1, operand2), 
                                    constant=constant)
     
    # immediate to memory e.g. add [rax], 0FFh
    elif isinstance(operand1, Address) and ((isinstance(operand2, int) or isinstance(operand2, bytes)) and not isinstance(operand2, Register)):

      assert isinstance(operand2, bytes), \
        f'Got int {operand2} with unknown size, please give the number in bytes type'

      address, constant = operand1, operand2
      opcode = 0x80 if len(constant) == 1 else 0x80 | OPCODE_W
      # TODO: the short form (imm8 sign-extended to r16/r32/r64), opcode
      # (0x80 | OPCODE_W | OPCODE_S), is not used for memory destinations yet.
      result = write_instruction(opcode, address, constant,
                                  mod_rm=get_mod_rm(address, constant), 
                                  sib=get_sib(address),
                                  constant=constant)

    # memory to register e.g. add eax, [rbp+0x100]
    elif isinstance(operand1, Register) and isinstance(operand2, Address):
      opcode |= OPCODE_S # selects the `register, [memory]` direction
      register, address = operand1, operand2
      result = write_instruction(opcode, register, address, 
                                mod_rm=get_mod_rm(operand1, operand2), 
                                sib=get_sib(address))

    # register to memory e.g. add [rbp+0x100], eax
    elif isinstance(operand1, Address) and isinstance(operand2, Register):
      register, address = operand2, operand1
      # write_instruction is always handed (register, address) in that order
      result = write_instruction(opcode, register, address, 
                                mod_rm=get_mod_rm(operand1, operand2), 
                                sib=get_sib(address))
    else: 
      raise Exception("Bug? or not implemented")

    self.buff += result 
    return result


# Smoke test: encode one `add` for each supported operand form into a single
# buffer, then disassemble the buffer with decode() to compare the listing
# against the instructions written here.
builder = Builder()
# register, register (8, 32 and 64 bit, with and without r8-r15 registers)
builder.add(Register.CL, Register.AL)
builder.add(Register.DL, Register.R9B)
builder.add(Register.EAX, Register.R9D)
builder.add(Register.RDI, Register.R15)
# register, immediate
builder.add(Register.CL, 0x70)
builder.add(Register.EAX, 0x70)
builder.add(Register.R9, 0x700)
# register, memory: base only, displacement only, base + displacement
builder.add(Register.EAX, Address(Register.RAX))
builder.add(Register.EAX, Address(disp=0x400))
builder.add(Register.RAX, Address(Register.RCX, disp=0x100))
# register, memory with an index register and scale
builder.add(Register.RAX, Address(Register.RCX, index=Register.RAX, scale=3, disp=0x100))
builder.add(Register.EAX, Address(Register.R9, index=Register.RBP, scale=1, disp=0x400))
# register, memory with RSP as the base
builder.add(Register.EAX, Address(Register.RSP))
builder.add(Register.R10, Address(Register.RSP, disp=100))
builder.add(Register.RAX, Address(Register.RSP, index=Register.RBP))
builder.add(Register.EAX, Address(Register.RSP, index=Register.R15, disp=100))
# memory, register
builder.add(Address(Register.RSP, index=Register.R15, disp=100), Register.EAX)
# memory, immediate (the immediate is passed as bytes so that its size is explicit)
builder.add(Address(Register.RAX, disp=0x100, index=Register.RAX, scale=1), to_bytes(0x70, l=1))
builder.add(Address(Register.R15, disp=0x100, index=Register.R9), to_bytes(0x70, l=4))

# inst = add(Register.EAX, Address(Register.RSP, index=Register.R15, disp=0x100))

# builder() returns the accumulated bytearray
decode(builder())




EXAMPLE_CODE=bytearray(b'\x00\xc1D\x00\xcaD\x01\xc8L\x01\xff\x80\xc1p\x05p\x00\x00\x00I\x81\xc1\x00\x07\x00\x00\x03\x00\x03\x04%\x00\x04\x00\x00H\x03\x81\x00\x01\x00\x00H\x03\x84\xc1\x00\x01\x00\x00A\x03\x84i\x00\x04\x00\x00\x03\x04$L\x03T$dH\x03\x04,B\x03D<dB\x01D<d\x80\x84@\x00\x01\x00\x00pC\x81\x84\x0f\x00\x01\x00\x00p\x00\x00\x00')

0000000000000000 00C1                 add       cl,al
0000000000000002 4400CA               add       dl,r9b
0000000000000005 4401C8               add       eax,r9d
0000000000000008 4C01FF               add       rdi,r15
000000000000000B 80C170               add       cl,70h
000000000000000E 0570000000           add       eax,70h
0000000000000013 4981C100070000       add       r9,700h
000000000000001A 0300                 add       eax,[rax]
000000000000001C 03042500040000       add       eax,[400h]
0000000000000023 48038100010000       add       rax,[rcx+100h]
000000000000002A 480384C100010000     add       rax,[rcx+rax*8+100h]
0000000000000032 410384