Clean Alignment Data

In [20]:
import os
import json
from collections import defaultdict

folder_path = "/home/tommy/Projects/pcodeFcg/document/copied_binaries_Os_output/results"
output_path = "/home/tommy/Projects/pcodeFcg/embedding/Word2Vec/data"
archs = ["arm_32", "mips_32"]

# Failures that mean "this JSON file is unusable": I/O errors, invalid JSON or
# bad encoding (both are ValueError subclasses), and entries that do not have
# the dict/list shape the extraction code expects.
_UNUSABLE_FILE_ERRORS = (OSError, ValueError, AttributeError, TypeError)


def detect_arch(subfolder, archs):
    """Return the first entry of *archs* contained in *subfolder*, or None."""
    return next((name for name in archs if name in subfolder), None)


def load_function_opcodes(json_path):
    """Read one result JSON file and map function names to opcode strings.

    The file is expected to hold an object whose values are objects with a
    'function_name' string and an 'instructions' list of objects carrying an
    'opcode' string.  Entries with an empty or missing function name are
    skipped.  A missing opcode contributes an empty token, so the joined
    string may contain consecutive spaces.

    Raises OSError, ValueError, AttributeError or TypeError when the file
    cannot be read, is not valid JSON, or does not have the expected shape.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    functions = {}
    for func_data in data.values():
        function_name = func_data.get('function_name', '')
        if not function_name:
            continue
        instructions = func_data.get('instructions', [])
        functions[function_name] = ' '.join(
            instr.get('opcode', '') for instr in instructions
        )
    return functions


def collect_arch_data(folder_path, archs):
    """Walk *folder_path* and gather opcode strings per architecture.

    Returns ``{arch: {"<fileName>_<function_name>": "op1 op2 ..."}}`` where
    the architecture is detected from the name of the directory holding each
    ``.json`` file and ``fileName`` is the last "_"-separated chunk of that
    directory name.

    A file that fails to load is reported on stdout and skipped as a whole,
    so a broken file never contributes a partial set of functions.
    """
    arch_data = {arch: {} for arch in archs}

    for root, dirs, files in os.walk(folder_path):
        # Sort in place so the traversal order -- and therefore which entry
        # wins when two files produce the same key -- does not depend on the
        # filesystem's directory listing order.
        dirs.sort()

        subfolder = os.path.basename(root)
        arch = detect_arch(subfolder, archs)
        if arch is None:
            continue

        # NOTE(review): only the last "_"-separated chunk is kept, so two
        # directories that differ only in earlier chunks (or a binary whose
        # own name contains "_") map to the same key prefix and the later one
        # overwrites the earlier -- confirm this is intended for the dataset.
        fileName = subfolder.split("_")[-1]

        for json_file in sorted(files):
            if not json_file.endswith('.json'):
                continue

            json_path = os.path.join(root, json_file)
            try:
                functions = load_function_opcodes(json_path)
            except _UNUSABLE_FILE_ERRORS as e:
                print(f"Error reading {json_path}: {e}")
                continue

            for function_name, opcodes_str in functions.items():
                arch_data[arch][f"{fileName}_{function_name}"] = opcodes_str

    return arch_data


def write_opcode_lines(path, arch_functions, keys):
    """Write one opcode string per line to *path*, ordered by sorted *keys*."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{arch_functions[key]}\n" for key in sorted(keys))


def clean_alignment_data(folder_path, output_path, archs):
    """Build the per-architecture and aligned opcode corpora.

    Writes ``<arch>_opcodes.txt`` for every architecture and, when at least
    one function key is shared by all architectures,
    ``<arch>_opcodes_aligned.txt`` files in which line *i* refers to the same
    key in every file.  Functions without opcodes are written as empty lines
    so the aligned files keep matching line numbers.

    Returns ``(arch_data, common_keys)``.
    """
    arch_data = collect_arch_data(folder_path, archs)

    # Make sure the output directory exists.
    os.makedirs(output_path, exist_ok=True)

    # One corpus file per architecture, sorted by key for reproducibility.
    for arch in archs:
        output_file = os.path.join(output_path, f"{arch}_opcodes.txt")
        write_opcode_lines(output_file, arch_data[arch], arch_data[arch])
        print(f"Generated {output_file} with {len(arch_data[arch])} functions")

    # Aligned corpora contain only the keys present in every architecture.
    key_sets = [set(arch_data[arch]) for arch in archs]
    common_keys = set.intersection(*key_sets) if key_sets else set()

    if common_keys:
        for arch in archs:
            aligned_output_file = os.path.join(output_path, f"{arch}_opcodes_aligned.txt")
            write_opcode_lines(aligned_output_file, arch_data[arch], common_keys)
            print(f"Generated aligned {aligned_output_file} with {len(common_keys)} common functions")

    return arch_data, common_keys


# True both in a notebook cell and when run as a script, so the same globals
# are produced there; importing the definitions no longer touches the disk.
if __name__ == "__main__":
    arch_data, common_keys = clean_alignment_data(folder_path, output_path, archs)


Error reading /home/tommy/Projects/pcodeFcg/document/copied_binaries_Os_output/results/glpk-5.0_gcc-9.4.0_mips_32_Os_libglpk.so.40.3.1/glpk-5.0_gcc-9.4.0_mips_32_Os_libglpk.so.40.3.1.json: Expecting property name enclosed in double quotes: line 1193205 column 4 (char 49979392)
Generated /home/tommy/Projects/pcodeFcg/embedding/Word2Vec/data/arm_32_opcodes.txt with 82881 functions
Generated /home/tommy/Projects/pcodeFcg/embedding/Word2Vec/data/mips_32_opcodes.txt with 82252 functions
Generated aligned /home/tommy/Projects/pcodeFcg/embedding/Word2Vec/data/arm_32_opcodes_aligned.txt with 67647 common functions
Generated aligned /home/tommy/Projects/pcodeFcg/embedding/Word2Vec/data/mips_32_opcodes_aligned.txt with 67647 common functions
