In [1]:
from accelerate import Accelerator
import torch
import multiprocessing as mp
accelerator = Accelerator()
accelerator.state.num_processes = 3  # For 3 GPUs
device = accelerator.device
# device = 'cuda:0' if torch.cuda.is_available() else 'cpu'


In [2]:
import os,sys, json,re, pickle ,threading
import magic, hashlib,  traceback ,ntpath, collections ,lief
from capstone import *
from capstone.x86 import *
import torch.nn as nn
import lief
from elftools.elf.elffile import ELFFile
from transformers import AdamW,AutoTokenizer
from tqdm import tqdm  # for our progress bar
from sklearn.metrics import precision_recall_fscore_support , accuracy_score,f1_score, confusion_matrix,mean_squared_error, mean_absolute_error, r2_score
from numpy import *
from num2words import num2words
import pandas as pd
from capstone import Cs, CS_ARCH_X86, CS_MODE_64

In [3]:
BIN_FILE_TYPE = 'PE' #or ELF
bin_path = '/home/raisul/DATA/x86_pe_msvc_O2_static/' #/home/raisul/DATA/temp/x86_pe_msvc_O2_static/'
bin_files = [os.path.join(bin_path, f) for f in os.listdir(bin_path) if f.endswith(".exe")]#[:2]
ground_truth_path ='/home/raisul/ANALYSED_DATA/ghidra_x86_pe_msvc_O2_static/'#'/home/raisul/DATA/temp/ghidra_x86_pe_msvc_O2_debug/'  
MODEL_SAVE_PATH= '/home/raisul/probabilistic_disassembly/models/'
EXPERIMENT_NAME = 'prototype_pe_small'

MAX_TOKEN_SIZE = 120
MAX_SEQUENCE_LENGTH = 10
VOCAB_SIZE = 500
BATCH_SIZE = 15000
VALIDATION_DISPLAY_SIZE = 100000
MAX_FILE_TO_USE = 30000
MODEL_NAME= "microsoft/MiniLM-L12-H384-uncased"# #bert-base-uncased
pkl_data_save_path = MODEL_SAVE_PATH+'training_data_pe'+str(MAX_FILE_TO_USE)+'.ignore.pkl'

In [4]:

def make_train_test_split(bin_path):
    bin_files = [ f for f in os.listdir(bin_path) ][0:MAX_FILE_TO_USE] #if f.endswith(".exe")
    temp_dict = {key: sum(ord(c) for c in key) % 10 for key in bin_files}
    # temp_dict = dict(sorted(temp_dict.items(), key=lambda item: item[1]))
    dict_train = {k: v for k, v in temp_dict.items() if 0 <= v <= 7}
    dict_test = {k: v for k, v in temp_dict.items() if 8 <= v <= 9}
    print(len(list(dict_train.items())) ,len(list(dict_test.items())) )
    return list(dict_train.keys()) , list(dict_test.keys())
train_bins , test_bins = make_train_test_split(bin_path)

train_bins = train_bins
test_bins  = test_bins

23993 6007


  temp_dict = {key: sum(ord(c) for c in key) % 10 for key in bin_files}


In [5]:
def replace_num_with_word(input_string , replace_dict):
    def num_to_word(match):
        number = int( match.group(0))
        return num2words(replace_dict[number]).replace(' ','').replace('-',"")
    result_string = re.sub(r'\b\d+\b', num_to_word, input_string)
    return result_string



def replace_hex_with_decimal(input_string):
    # Regular expression to find hexadecimal numbers prefixed with "0x" or "0X"
    hex_pattern = r'0[xX][0-9a-fA-F]+'
    
    # Function to convert each found hex number to decimal
    def hex_to_decimal(match):
        hex_value = match.group(0)  # Extract the matched hex number
        decimal_value = str(int(hex_value, 16))  # Convert hex to decimal
        return decimal_value
    # Substitute all hex numbers in the string with their decimal equivalents
    result_string = re.sub(hex_pattern, hex_to_decimal, input_string)
    return result_string



In [6]:

def get_ground_truth_ghidra(exe_path, text_section_offset , text_section_len):

    text_sextion_end = text_section_offset + text_section_len
    
    elf_file_name = os.path.basename(exe_path)
    ghidra_file_path = os.path.join(ground_truth_path, elf_file_name.split('.')[0]) + '.json'
    
    with open(ghidra_file_path, "r") as file:
        ghidra_data = json.load(file)

    ground_truth_offsets = list(ghidra_data.keys())

    ground_truth_offsets = [int(i) for i in ground_truth_offsets]
    ground_truth_offsets = [x for x in ground_truth_offsets if text_section_offset <= x <= text_sextion_end]
    ground_truth_offsets.sort()
    return ground_truth_offsets



def find_data_in_textsection(ground_truth_offsets , text_section_offset , text_section_len, offset_inst_dict):
    data_offsets = []
    for i in range(1, len(ground_truth_offsets)-1):
        distance = ground_truth_offsets[i+1] - ground_truth_offsets[i]

        inst_len = offset_inst_dict[ground_truth_offsets[i]].size 
        
        if distance!=inst_len:
            # print('offset_ranges[i]: ',ground_truth_offsets[i] , 'offset_ranges[i-1]: ',ground_truth_offsets[i-1], ' inst_len: ',inst_len  )
            # print(ground_truth_offsets[i],' ' ,hex(ground_truth_offsets[i]) , offset_inst_dict[ground_truth_offsets[i]], ' len',offset_inst_dict[ground_truth_offsets[i]].size )
            # print("\nByte GAP ###### ",distance ,' Missing bytes: ', distance - inst_len)
            
            for j in range( ground_truth_offsets[i] +inst_len , ground_truth_offsets[i+1]  ):
                data_offsets.append(j)
                # if offset_inst_dict[j]:
                #     print("# ",j, offset_inst_dict[j].mnemonic, offset_inst_dict[j].op_str , 'inst len:',offset_inst_dict[j].size )
                # else:
                #     print("# ",j, " invalid ")
            # print('\n')
        else:
            # print(ground_truth_offsets[i],' ', hex(ground_truth_offsets[i]) , offset_inst_dict[ground_truth_offsets[i]].mnemonic,offset_inst_dict[ground_truth_offsets[i]].op_str ,' len',offset_inst_dict[ground_truth_offsets[i]].size)
            pass
    return data_offsets
    

def linear_sweep(offset_inst , target_offset):
    inst_sequence = ''
    address_list = []
    
    current_offset = target_offset
    for q in range(MAX_SEQUENCE_LENGTH):

        if current_offset in offset_inst: #if end of text section
            current_instruction = offset_inst[current_offset]
            if current_instruction is None:
                return  None
                
            current_offset = current_offset + current_instruction.size
            inst_sequence+= str( hex(current_instruction.address)) +" "+ current_instruction.mnemonic +' '+ current_instruction.op_str+ ' ; ' 
            address_list.append(current_instruction.address)
            
            if current_instruction.mnemonic in ["ret", "jmp"]: #break linear sweep
                break
                

    return inst_sequence, address_list
    

In [7]:

# with open(pkl_data_save_path , 'wb') as f:
#     pickle.dump([train_sequences,train_labels,validation_sequences, validation_labels], f)
with open(pkl_data_save_path, "rb") as f:
    train_sequences,train_labels,validation_sequences, validation_labels = pickle.load(f)


In [8]:


for j in range(100):
    if True:#'int3' in SEQUENCES[j]: #LABELS[j] :
        print(train_labels[j] , ' > ' , train_sequences[j] ,'\n' )

0.0  >  zero int3  ; one int3  ; two int3  ; three int3  ; four int3  ; five int3  ; six int3  ; seven int3  ; eight int3  ; nine int3  ;  

1.0  >  two jbe ten ; three cmp edx, eleven ; four je ten ; five mov rbx, qword ptr [rsp + one] ; six xor eax, eax ; seven add rsp, zero ; eight pop rdi ; nine ret  ;  

1.0  >  one sub rax, r8 ; two cmp rdx, rax ; three ja eight ; four mov rax, qword ptr [rip + ten] ; five test rax, rax ; six je eight ; seven call rax ; eight add rsp, zero ; nine ret  ;  

1.0  >  seven mov dword ptr [rbp - five], ebx ; eight mov dword ptr [rbp - four], ecx ; nine mov dword ptr [rbp - two], edx ; ten mov rax, qword ptr [rip + twenty] ; eleven mov ebx, three ; twelve and rax, twentyone ; thirteen mov dword ptr [rip + nineteen], zero ; fourteen mov dword ptr [rip + eighteen], one ; fifteen mov qword ptr [rip + seventeen], rax ; sixteen bt edi, six ;  

0.0  >  zero int3  ; one int3  ; two int3  ; three int3  ; four int3  ; five int3  ; six int3  ; seven int3  ; eig

In [9]:
import sys,os

from transformers import BertTokenizer,BertForSequenceClassification

# If using a character-level tokenizer for sequences like DNA/Protein:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)#BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer = tokenizer.train_new_from_iterator(train_sequences, VOCAB_SIZE)

TOKENIZER_SAVE_PATH = MODEL_SAVE_PATH + EXPERIMENT_NAME+"/tokenizer"+str(MAX_FILE_TO_USE)
tokenizer.save_pretrained(TOKENIZER_SAVE_PATH)







('/home/raisul/probabilistic_disassembly/models/prototype_pe_small/tokenizer30000/tokenizer_config.json',
 '/home/raisul/probabilistic_disassembly/models/prototype_pe_small/tokenizer30000/special_tokens_map.json',
 '/home/raisul/probabilistic_disassembly/models/prototype_pe_small/tokenizer30000/vocab.txt',
 '/home/raisul/probabilistic_disassembly/models/prototype_pe_small/tokenizer30000/added_tokens.json',
 '/home/raisul/probabilistic_disassembly/models/prototype_pe_small/tokenizer30000/tokenizer.json')

In [10]:
#jupyter nbconvert --to script data_pipe.ipynb
# accelerate launch data_pipe.py > log.txt

In [11]:
print('done')

done
