In [1]:
#!/usr/bin/env python3

import sys,os
from elftools.elf.elffile import ELFFile
from elftools.elf.segments import Segment



# Path of the ELF binary that every cell in this notebook inspects.
filePath = './../../binaries/array'

with open(filePath, 'rb') as f:
    elffile = ELFFile(f)

    # Report only the segments whose size on disk differs from their size
    # in memory; segments where both sizes match are skipped.
    for segment in elffile.iter_segments():
        if segment.header.p_filesz == segment.header.p_memsz:
            continue
        seg_head = segment.header
        print(f"Type: {seg_head.p_type}\nOffset: {hex(seg_head.p_offset)}\nSize in file:{hex(seg_head.p_filesz)}\nSize in memory:{hex(seg_head.p_memsz)}")

    # List every compile unit found in the DWARF data.
    dwarfinfo = elffile.get_dwarf_info()
    for CU in dwarfinfo.iter_CUs():
        cu_summary = (CU.cu_offset, CU['unit_length'])
        print('  Found a compile unit at offset %s, length %s' % cu_summary)

        # Every compilation unit in the DWARF information may or may not
        # have a corresponding line program in .debug_line.
        # line_program = dwarfinfo.line_program_for_CU(CU)
        # if line_program is None:
        #     print('  DWARF info is missing a line program for this CU')
        #     continue

Type: PT_LOAD
Offset: 0x2da8
Size in file:0x268
Size in memory:0x270
  Found a compile unit at offset 0, length 325


In [2]:
def _format_hex( addr, fieldsize=None, fullhex=False, lead0x=True):
    """ Format an address into a hexadecimal string.
        fieldsize:
            Size of the hexadecimal field (with leading zeros to fit the
            address into. For example with fieldsize=8, the format will
            be %08x
            If None, the minimal required field size will be used.
        fullhex:
            If True, override fieldsize to set it to the maximal size
            needed for the elfclass
        lead0x:
            If True, leading 0x is added
    """
    s = '0x' if lead0x else ''
    if fullhex:
        fieldsize = 8 if elffile.elfclass == 32 else 16
    if fieldsize is None:
        field = '%x'
    else:
        field = '%' + '0%sx' % fieldsize
    return s + field % addr

from elftools.elf.descriptions import (
    describe_ei_class, describe_ei_data, describe_ei_version,
    describe_ei_osabi, describe_e_type, describe_e_machine,
    describe_e_version_numeric, describe_p_type, describe_p_flags,
    describe_sh_type, describe_sh_flags,
    describe_symbol_type, describe_symbol_bind, describe_symbol_visibility,
    describe_symbol_shndx, describe_reloc_type, describe_dyn_tag,
    )

In [3]:
from capstone import *

# Maps hex(address) -> capstone instruction object for each instruction
# decoded from the .text section.
address_inst = {}

with open(filePath, 'rb') as f:
    elf = ELFFile(f)
    # Decoder is fixed to x86 in 64-bit mode.
    md = Cs(CS_ARCH_X86, CS_MODE_64)

    code = elf.get_section_by_name('.text')
    ops, addr = code.data(), code['sh_addr']
    print('code data size: ',code.data_size)

    # Decode starting at the section's load address so the printed
    # addresses are based on sh_addr rather than on zero.
    for i in md.disasm(ops, addr):
        address_inst[hex(i.address)] = i
        print(f'0x{i.address:x}:\t{i.mnemonic}\t{i.op_str}')

code data size:  542
0x10a0:	endbr64	
0x10a4:	xor	ebp, ebp
0x10a6:	mov	r9, rdx
0x10a9:	pop	rsi
0x10aa:	mov	rdx, rsp
0x10ad:	and	rsp, 0xfffffffffffffff0
0x10b1:	push	rax
0x10b2:	push	rsp
0x10b3:	xor	r8d, r8d
0x10b6:	xor	ecx, ecx
0x10b8:	lea	rdi, [rip + 0xca]
0x10bf:	call	qword ptr [rip + 0x2f13]
0x10c5:	hlt	
0x10c6:	nop	word ptr cs:[rax + rax]
0x10d0:	lea	rdi, [rip + 0x2f39]
0x10d7:	lea	rax, [rip + 0x2f32]
0x10de:	cmp	rax, rdi
0x10e1:	je	0x10f8
0x10e3:	mov	rax, qword ptr [rip + 0x2ef6]
0x10ea:	test	rax, rax
0x10ed:	je	0x10f8
0x10ef:	jmp	rax
0x10f1:	nop	dword ptr [rax]
0x10f8:	ret	
0x10f9:	nop	dword ptr [rax]
0x1100:	lea	rdi, [rip + 0x2f09]
0x1107:	lea	rsi, [rip + 0x2f02]
0x110e:	sub	rsi, rdi
0x1111:	mov	rax, rsi
0x1114:	shr	rsi, 0x3f
0x1118:	sar	rax, 3
0x111c:	add	rsi, rax
0x111f:	sar	rsi, 1
0x1122:	je	0x1138
0x1124:	mov	rax, qword ptr [rip + 0x2ec5]
0x112b:	test	rax, rax
0x112e:	je	0x1138
0x1130:	jmp	rax
0x1132:	nop	word ptr [rax + rax]
0x1138:	ret	
0x1139:	nop	dword ptr [rax]
0x1140:	

In [4]:
# address_inst

In [5]:
#-------------------------------------------------------------------------------
# elftools example: dwarf_lineprogram_filenames.py
#
# In the .debug_line section, the Dwarf line program generates a matrix
# of address-source references. This example demonstrates accessing the state
# of each line program entry to retrieve the underlying filenames.
#
# William Woodruff (william@yossarian.net)
# This code is in the public domain
#-------------------------------------------------------------------------------
from __future__ import print_function
from collections import defaultdict
import os
import sys
import posixpath

# If pyelftools is not installed, the example can also run from the root or
# examples/ dir of the source distribution.
# sys.path[0:0] = ['.', '..']

from elftools.elf.elffile import ELFFile






def line_entry_mapping(line_program):
    """Count line program entries per source filename.

    Walks the decoded entries of ``line_program``, resolves each entry's
    file index to a filename through ``lpe_filename`` and tallies how many
    entries refer to each filename. The tally is printed and returned.

    Entries without a state, or whose state has file index 0, are ignored:
    they have no associated source file.

    Returns:
        A ``defaultdict(int)`` mapping filename -> number of entries.
    """
    counts = defaultdict(int)

    for entry in line_program.get_entries():
        state = entry.state
        if state and state.file != 0:
            counts[lpe_filename(line_program, state.file)] += 1

    for name, n_entries in counts.items():
        print("    filename=%s -> %d entries" % (name, n_entries))
    return counts

def lpe_filename(line_program, file_index):
    """Resolve a line program file index to a filename string.

    The file index selects an entry in the header's ``file_entry`` table;
    that entry's ``dir_index`` in turn selects an entry in the header's
    ``include_directory`` table. Both indices are treated as 1-based.

    Returns:
        The decoded basename when ``dir_index`` is 0 (no directory was
        recorded), otherwise the decoded directory joined with the basename.
    """
    header = line_program.header
    entry = header["file_entry"][file_index - 1]
    dir_slot = entry["dir_index"]

    if dir_slot != 0:
        parent = header["include_directory"][dir_slot - 1]
        return posixpath.join(parent, entry.name).decode()

    # No directory recorded for this file: only the basename is available.
    return entry.name.decode()



# hex(address) -> line program entry whose state carries that address.
addr_lineProgram = {}
# hex(address) -> filename map (filename -> entry count) of the compile unit
# the address belongs to. Note this is the whole per-CU map, not the single
# file of the entry.
addr_sourceFile = {}

with open(filePath, 'rb') as f:
    elffile = ELFFile(f)

    # Do not call exit() here: inside a notebook it asks the kernel to shut
    # down. Without DWARF data both maps are simply left empty.
    if not elffile.has_dwarf_info():
        print('  file has no DWARF info')
    else:
        dwarfinfo = elffile.get_dwarf_info()
        for CU in dwarfinfo.iter_CUs():
            print('  Found a compile unit at offset %s, length %s' % (
                CU.cu_offset, CU['unit_length']))

            # Every compilation unit in the DWARF information may or may not
            # have a corresponding line program in .debug_line.
            line_program = dwarfinfo.line_program_for_CU(CU)
            if line_program is None:
                print('  DWARF info is missing a line program for this CU')
                continue

            # Print a reverse mapping of filename -> #entries
            filename_map = line_entry_mapping(line_program)
            for line_entry in line_program.get_entries():
                # Entries without a state carry no address to index by.
                if line_entry.state is not None:
                    entry_address = hex(line_entry.state.address)
                    # A later entry at the same address overwrites an earlier one.
                    addr_lineProgram[entry_address] = line_entry
                    addr_sourceFile[entry_address] = filename_map
            print("_____________________________________________________")
        
        


  Found a compile unit at offset 0, length 325
    filename=array.c -> 27 entries
_____________________________________________________


In [6]:
# line_entry

In [7]:


def getSource(sourceFileName, row, col, basePath="/home/nahid/reverse/binaries/"):
    """Return one line of a source file with the given column marked by "@".

    Parameters:
        sourceFileName:
            File name, resolved relative to ``basePath``.
        row:
            1-based line number. A value below 1 yields an empty string
            instead of wrapping around to the end of the file.
        col:
            1-based column. The character at that column is replaced by
            "@". A value below 1 returns the line unmarked.
        basePath:
            Directory containing the source files. Defaults to the
            location this notebook was originally written against.

    Returns:
        The selected line (including its trailing newline, if any) with
        the marker applied, or "" when ``row`` is below 1.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if ``row`` is past the end of the file.
    """
    if row < 1:
        return ""

    sourceFilePath = os.path.join(basePath, sourceFileName)
    # Context manager so the handle is closed even if reading fails.
    with open(sourceFilePath, "r") as sourceFile:
        fileContent = sourceFile.readlines()

    row_content = fileContent[row - 1]
    if col < 1:
        # Slicing with col - 1 == -1 would duplicate the line around the marker.
        return row_content

    return row_content[:col - 1] + "@" + row_content[col:]


In [8]:
# Defaults so both names exist even when the binary carries no DWARF data.
arangesInfo = None
pubTypes = None

with open(filePath, 'rb') as f:
    elffile = ELFFile(f)

    # Do not call exit() here: inside a notebook it asks the kernel to shut down.
    if elffile.has_dwarf_info():
        dwarfinfo = elffile.get_dwarf_info()
        # Section descriptors for .debug_aranges and .debug_pubtypes as
        # exposed by the DWARFInfo object.
        arangesInfo = dwarfinfo.debug_aranges_sec
        pubTypes = dwarfinfo.debug_pubtypes_sec
    else:
        print('  file has no DWARF info')
    


In [9]:
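# address_inst     : hex(address) -> disassembled instruction
# addr_lineProgram : hex(address) -> line program entry (its state holds line and column)
# addr_sourceFile  : hex(address) -> filename map of the compile unit (filename -> entry count)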

In [10]:
def get_filename_lineinfo(die): # TODO find actual path, currently just returns filename
    """Return the declaration location recorded on a DIE.

    Reads ``DW_AT_decl_file``, ``DW_AT_decl_line`` and ``DW_AT_decl_column``
    from ``die`` and resolves the file index through the line program header
    of the DIE's compile unit. File and directory indices are treated as
    1-based.
    NOTE(review): DWARF 5 line tables are believed to use 0-based indices;
    confirm against the pyelftools version in use.

    Parameters:
        die: a DIE that carries a ``DW_AT_decl_file`` attribute.

    Returns:
        ``(filename, path, lineno, columno)``. ``path`` equals ``filename``
        when the file entry has ``dir_index`` 0 (no directory recorded).
        ``lineno`` / ``columno`` are None when the matching attribute is
        absent from the DIE.

    Raises:
        KeyError: if the DIE has no ``DW_AT_decl_file`` attribute.
    """
    lp_header = die.dwarfinfo.line_program_for_CU(die.cu).header
    files = lp_header["file_entry"]
    includes = lp_header["include_directory"]
    attrs = die.attributes

    fileinfo = files[attrs["DW_AT_decl_file"].value - 1]
    filename = fileinfo.name.decode("utf-8")

    if fileinfo.dir_index == 0:
        # Indexing includes[-1] here would pick an unrelated directory, or
        # fail outright when no include directories exist.
        path = filename
    else:
        filedir = includes[fileinfo.dir_index - 1].decode("utf-8")
        path = os.path.join(filedir, filename)

    # Not every DIE with a decl_file also records a line and a column.
    lineno = attrs["DW_AT_decl_line"].value if "DW_AT_decl_line" in attrs else None
    columno = attrs["DW_AT_decl_column"].value if "DW_AT_decl_column" in attrs else None
    return filename, path, lineno, columno


In [11]:


# Lookup table filled by process_file(): "<source file>#<line>" -> {column: DIE}.
sourceLocation_die = {} # dictionary of DIEs keyed by C source line and column

import sys

# If pyelftools is not installed, the example can also run from the root or
# examples/ dir of the source distribution.

from elftools.elf.elffile import ELFFile
from elftools.dwarf.descriptions import (
    describe_DWARF_expr, set_global_machine_arch)
from elftools.dwarf.locationlists import (
    LocationEntry, LocationExpr, LocationParser)




def process_file(filename):
    """Index the DIEs of an ELF file by their declared source location.

    Opens ``filename``, walks every DIE of every compile unit, and for each
    DIE that has a ``DW_AT_decl_file`` attribute stores it in the
    module-level ``sourceLocation_die`` dictionary as
    ``sourceLocation_die["<file>#<line>"][column] = DIE``. A later DIE with
    the same file, line and column replaces the earlier one.

    Prints a short progress line per compile unit. Returns None; when the
    file has no DWARF information a message is printed and nothing is
    indexed.

    Parameters:
        filename: path of the ELF file to read.
    """
    print('Processing file:', filename)
    with open(filename, 'rb') as f:
        elffile = ELFFile(f)

        if not elffile.has_dwarf_info():
            print('  file has no DWARF info')
            return

        # get_dwarf_info returns a DWARFInfo context object, which is the
        # starting point for all DWARF-based processing in pyelftools.
        dwarfinfo = elffile.get_dwarf_info()

        # This is required for the descriptions module to correctly decode
        # register names contained in DWARF expressions.
        set_global_machine_arch(elffile.get_machine_arch())

        for CU in dwarfinfo.iter_CUs():
            print('  Found a compile unit at offset %s, length %s' % (
                CU.cu_offset, CU['unit_length']))

            for DIE in CU.iter_DIEs():
                # Only DIEs that record a declaration file can be indexed.
                if "DW_AT_decl_file" not in DIE.attributes:
                    continue

                src_file, _, line, column = get_filename_lineinfo(DIE)
                dict_key = src_file + "#" + str(line) #TODO column should be in key
                sourceLocation_die.setdefault(dict_key, {})[column] = DIE



# Fill sourceLocation_die from the binary configured in filePath.
process_file(filePath)

Processing file: ./../../binaries/array
  Found a compile unit at offset 0, length 325


















































































































































In [14]:

# Write the disassembly to array.s, one instruction per line, and append the
# matching source line to every instruction whose address appears in the
# line program. The same text is echoed to the cell output.
with open('array.s', 'w') as outFile:
    lastSource = ""
    for address, inst in address_inst.items():
        # Left column: address and instruction padded to a fixed width.
        instrctionCode = (address+":\t"+ inst.mnemonic+" "+inst.op_str).ljust(50)+ '#'+' '*20

        if address not in addr_lineProgram:
            outFile.write(instrctionCode + '\n')
            print(instrctionCode)
            continue

        line = addr_lineProgram[address]

        srcFileName = list(addr_sourceFile[address].keys())[0] #TODO not single file always

        # Emit a banner whenever the source file changes.
        if srcFileName != lastSource:
            outFile.write("\n"+ '#'*100+"\n"+ srcFileName.rjust(45) +'\n'+'#'*100+ "\n\n")
            lastSource = srcFileName

        sourceCode = getSource(srcFileName, line.state.line, line.state.column)
        src_key = srcFileName+"#"+str(line.state.line)

        # DIEs declared on this source line, or None when there are none.
        # .get() avoids carrying over a stale value from an earlier address.
        die_info = sourceLocation_die.get(src_key)

        # Only terminate the line; appending sourceCode to itself would
        # write the source text twice.
        if '\n' not in sourceCode:
            sourceCode += "\n"

        outFile.write(instrctionCode+"#"+ sourceCode)
        print(instrctionCode+"#"+ sourceCode)
    

0x10a0:	endbr64                                   #                    
0x10a4:	xor ebp, ebp                              #                    
0x10a6:	mov r9, rdx                               #                    
0x10a9:	pop rsi                                   #                    
0x10aa:	mov rdx, rsp                              #                    
0x10ad:	and rsp, 0xfffffffffffffff0               #                    
0x10b1:	push rax                                  #                    
0x10b2:	push rsp                                  #                    
0x10b3:	xor r8d, r8d                              #                    
0x10b6:	xor ecx, ecx                              #                    
0x10b8:	lea rdi, [rip + 0xca]                     #                    
0x10bf:	call qword ptr [rip + 0x2f13]             #                    
0x10c5:	hlt                                       #                    
0x10c6:	nop word ptr cs:[rax + rax]               #             

In [13]:
# sourceLocation_die