Compare two files

In [4]:
import re
from typing import List

#Regex pattern preprocessing
#1)  opcode_pattern: Extract P-Code
#2)  opcode_pattern: Extract Calculation

_opcode_pat = re.compile(r"(?:\)\s+|---\s+)([A-Z_]+)")
_operand_pattern = re.compile(r"\(([^ ,]+)\s*,\s*[^,]*,\s*([0-9]+)\)")

def _map_operand(op_type:str, size:str | None) -> str:
    op_type_l = op_type.lower()
    if op_type_l == 'register':
        return f"REG{size}" if size else "REG"
    if op_type_l == 'ram':
        return f"MEM{size}" if size else "MEM"
    if op_type_l in {'const', 'unique'}:
        return f"CONST{size}" if size else "CONST"
    return op_type.upper()

def _tokenize_line(line:str) -> List[str]:
    """Turn one operation string into a single combined token.

    The token is the opcode matched by _opcode_pat followed by the mapped
    form of at most the first five operands found by _operand_pattern, all
    joined with "-". Returns an empty list when no opcode is found, otherwise
    a one-element list holding the token.
    """
    opcode_match = _opcode_pat.search(line)
    if opcode_match is None:
        return []

    pieces = [opcode_match.group(1)]
    # Only the first five operand triples contribute to the token.
    pieces.extend(
        _map_operand(kind, width)
        for kind, width in _operand_pattern.findall(line)[:5]
    )
    return ["-".join(pieces)]

In [9]:
import json

# NOTE(review): absolute, machine-specific paths; consider deriving them from
# a configurable data directory.
file_path = "/home/tommy/Projects/pcodeFcg/document/BinKit_normal_ouptut/a2ps/results/a2ps-4.14_clang-11.0_mips_64_O3_fixnt/a2ps-4.14_clang-11.0_mips_64_O3_fixnt.json"
file_path2 = "/home/tommy/Projects/pcodeFcg/document/BinKit_normal_ouptut/a2ps/results/a2ps-4.14_clang-11.0_mips_32_O3_fixnt/a2ps-4.14_clang-11.0_mips_32_O3_fixnt.json"

# Name the encoding explicitly so parsing does not depend on the locale's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data1 = json.load(file)

with open(file_path2, 'r', encoding='utf-8') as file2:
    data2 = json.load(file2)

# Distinct function names recorded in each file.
function1 = {entry["function_name"] for entry in data1.values()}
function2 = {entry["function_name"] for entry in data2.values()}

print(len(function1))
print(len(function2))

function1_only = function1 - function2
function2_only = function2 - function1

# Sets of strings iterate in a hash-dependent order that changes between
# interpreter runs; sorting keeps the printed listing reproducible.
print("Functions only in first file:")
for func in sorted(function1_only):
    print(func)
print("\nFunctions only in second file:")
for func in sorted(function2_only):
    print(func)

39
39
Functions only in first file:
frame_dummy

Functions only in second file:
_PROCEDURE_LINKAGE_TABLE_


Instruction

In [10]:
def calculate_similarity(func1_data, func2_data):
    """Jaccard similarity between the token sets of two functions.

    Each entry of ``func*_data['instructions']`` contributes the tokens that
    _tokenize_line produces for its 'operation' string; duplicates collapse
    because the tokens are gathered into sets.

    NOTE: a later cell defines another function with this same name (based on
    the 'opcode' field); whichever cell ran last is the one in effect.

    Returns 1.0 when neither function yields any token, otherwise
    len(intersection) / len(union).
    """
    def token_set(func_data):
        return {
            token
            for instruction in func_data['instructions']
            for token in _tokenize_line(instruction['operation'])
        }

    tokens_a = token_set(func1_data)
    tokens_b = token_set(func2_data)

    if not (tokens_a or tokens_b):
        return 1.0

    # At least one set is non-empty here, so the union is never empty.
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

# 比較相同函數名的相似度
common_functions = function1 & function2
similarities = []

for func_name in common_functions:
   # 找到對應的函數資料
   func1_data = None
   func2_data = None
   
   for key, value in data1.items():
       if value['function_name'] == func_name:
           func1_data = value
           break
   
   for key, value in data2.items():
       if value['function_name'] == func_name:
           func2_data = value
           break
   
   if func1_data and func2_data:
       similarity = calculate_similarity(func1_data, func2_data)
       similarities.append((func_name, similarity))

similarities.sort(key=lambda x: x[1], reverse=True)

for func_name, similarity in similarities:
    print(f"Function: {func_name}, Similarity: {similarity:.4f}")
       

Function: yywrap, Similarity: 0.3333
Function: __start, Similarity: 0.1333
Function: yylex, Similarity: 0.0992
Function: __do_global_ctors_aux, Similarity: 0.0952
Function: main, Similarity: 0.0938
Function: yy_create_buffer, Similarity: 0.0778
Function: yy_scan_buffer, Similarity: 0.0702
Function: yy_scan_bytes, Similarity: 0.0588
Function: __do_global_dtors_aux, Similarity: 0.0526
Function: _init, Similarity: 0.0500
Function: yylex_destroy, Similarity: 0.0492
Function: yyrestart, Similarity: 0.0435
Function: yypop_buffer_state, Similarity: 0.0312
Function: yy_delete_buffer, Similarity: 0.0270
Function: reassemble, Similarity: 0.0208
Function: yypush_buffer_state, Similarity: 0.0206
Function: yy_switch_to_buffer, Similarity: 0.0204
Function: yy_flush_buffer, Similarity: 0.0200
Function: register_tm_clones, Similarity: 0.0000
Function: yy_fatal_error, Similarity: 0.0000
Function: yyget_lineno, Similarity: 0.0000
Function: yyget_in, Similarity: 0.0000
Function: yyget_out, Similarity: 0.

Opcode

In [11]:
def calculate_similarity(func1_data, func2_data):
    """Jaccard similarity between the opcode sets of two functions.

    Only the 'opcode' field of each entry in ``func*_data['instructions']``
    is used; repeated opcodes collapse because they are gathered into sets.

    NOTE: an earlier cell defines a function with this same name (based on
    tokenized 'operation' strings); running this cell rebinds the name.

    Returns 1.0 when neither function has any opcode, otherwise
    len(intersection) / len(union).
    """
    opcodes_a = {instruction['opcode'] for instruction in func1_data['instructions']}
    opcodes_b = {instruction['opcode'] for instruction in func2_data['instructions']}

    if not (opcodes_a or opcodes_b):
        return 1.0

    # At least one set is non-empty here, so the union is never empty.
    return len(opcodes_a & opcodes_b) / len(opcodes_a | opcodes_b)

# 比較相同函數名的相似度
common_functions = function1 & function2
similarities = []

for func_name in common_functions:
   # 找到對應的函數資料
   func1_data = None
   func2_data = None
   
   for key, value in data1.items():
       if value['function_name'] == func_name:
           func1_data = value
           break
   
   for key, value in data2.items():
       if value['function_name'] == func_name:
           func2_data = value
           break
   
   if func1_data and func2_data:
       similarity = calculate_similarity(func1_data, func2_data)
       similarities.append((func_name, similarity))

similarities.sort(key=lambda x: x[1], reverse=True)

for func_name, similarity in similarities:
    print(f"Function: {func_name}, Similarity: {similarity:.4f}")

Function: __libc_csu_fini, Similarity: 1.0000
Function: yywrap, Similarity: 1.0000
Function: yylex, Similarity: 0.9355
Function: yy_create_buffer, Similarity: 0.8947
Function: _init, Similarity: 0.8889
Function: __start, Similarity: 0.8333
Function: __libc_csu_init, Similarity: 0.8333
Function: _fini, Similarity: 0.8333
Function: yyrestart, Similarity: 0.8000
Function: yy_flush_buffer, Similarity: 0.7857
Function: __do_global_ctors_aux, Similarity: 0.7500
Function: yylex_destroy, Similarity: 0.7059
Function: yypush_buffer_state, Similarity: 0.6957
Function: yy_switch_to_buffer, Similarity: 0.6957
Function: yy_delete_buffer, Similarity: 0.6875
Function: reassemble, Similarity: 0.6154
Function: yypop_buffer_state, Similarity: 0.5882
Function: yy_scan_buffer, Similarity: 0.5417
Function: main, Similarity: 0.5385
Function: yyget_lineno, Similarity: 0.5000
Function: yyget_in, Similarity: 0.5000
Function: yyget_out, Similarity: 0.5000
Function: yyget_text, Similarity: 0.5000
Function: yyget_

Compare O3 vs. O0 and different architectures at O3

In [None]:
import json
import os
from itertools import combinations

# Root directory containing one result folder per build configuration.
base_path = "/home/tommy/Projects/pcodeFcg/document/BinKit_normal_ouptut/a2ps/results"

# Architecture tags and optimization levels to compare.
architectures = [
    'x86_32', 'x86_64',
    'arm_32', 'arm_64',
    'mips_32', 'mips_64',
    'mipseb_32', 'mipseb_64',
]
optimizations = [
    'O0', 'O1', 'O2', 'O3',
    'Os', 'Ofast',
]

def get_file_path(arch, opt, base=None, prefix="a2ps-4.14_clang-11.0"):
    """Build the path of the result JSON for one build configuration.

    The file lives at ``<base>/<folder>/<folder>.json`` where ``<folder>`` is
    ``<prefix>_<arch>_<opt>_fixnt``.

    Args:
        arch: Architecture tag, e.g. 'mips_32'.
        opt: Optimization level, e.g. 'O3'.
        base: Root results directory. Defaults to the module-level
            ``base_path`` when omitted.
        prefix: Leading part of the folder name. The default reproduces the
            previously hard-coded value, so two-argument calls are unchanged.

    Returns:
        The joined path as a string.
    """
    if base is None:
        # Resolved at call time so a later change to base_path is honoured.
        base = base_path
    folder_name = f"{prefix}_{arch}_{opt}_fixnt"
    return os.path.join(base, folder_name, f"{folder_name}.json")

def load_functions(file_path):
    """Load a result JSON and index its entries by function name.

    Args:
        file_path: Path of the JSON file. Its top level is read as a mapping
            whose values each carry a 'function_name' key.

    Returns:
        A dict mapping function name -> entry. If several entries share a
        name, the last one wins. An empty dict is returned when the file
        does not exist.
    """
    # Open directly and handle the missing-file case, rather than checking
    # for existence first and racing against the file being removed.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    return {entry['function_name']: entry for entry in data.values()}

# Cache of parsed result files for this run of the cell, keyed by (arch, opt),
# so each JSON file is read at most once across the four report sections.
_loaded_functions = {}


def _functions_for(arch, opt):
    """Return the name -> entry map for one build configuration (cached)."""
    key = (arch, opt)
    if key not in _loaded_functions:
        _loaded_functions[key] = load_functions(get_file_path(arch, opt))
    return _loaded_functions[key]


def _mean_similarity(funcs_a, funcs_b):
    """Average calculate_similarity over the function names both maps share.

    Returns a (mean, shared_count) tuple, or None when the maps have no
    function name in common (including when either map is empty).
    """
    # Sorted so the summation order, and hence the float result, is the same
    # on every run.
    shared_names = sorted(funcs_a.keys() & funcs_b.keys())
    if not shared_names:
        return None
    scores = [
        calculate_similarity(funcs_a[name], funcs_b[name])
        for name in shared_names
    ]
    return sum(scores) / len(scores), len(shared_names)


# 1. Similarity between different architectures at O3
print("=== 不同架構O3相似程度 ===")
o3_data = {arch: _functions_for(arch, 'O3') for arch in architectures}

for arch1, arch2 in combinations(architectures, 2):
    result = _mean_similarity(o3_data[arch1], o3_data[arch2])
    if result is not None:
        avg_similarity, shared_count = result
        print(f"{arch1} vs {arch2}: {avg_similarity:.3f} ({shared_count} functions)")

# 2. O3 vs O0 within the same architecture
print("\n=== 同架構O3 vs O0差異 ===")
for arch in architectures:
    result = _mean_similarity(_functions_for(arch, 'O3'), _functions_for(arch, 'O0'))
    if result is not None:
        avg_similarity, shared_count = result
        print(f"{arch} O3 vs O0: {avg_similarity:.3f} ({shared_count} functions)")

# 3. Every pair of optimization levels within the same architecture
print("\n=== 同架構不同優化等級差異 ===")
for arch in architectures:
    arch_data = {opt: _functions_for(arch, opt) for opt in optimizations}

    print(f"\n{arch}:")
    for opt1, opt2 in combinations(optimizations, 2):
        result = _mean_similarity(arch_data[opt1], arch_data[opt2])
        if result is not None:
            avg_similarity, shared_count = result
            print(f"  {opt1} vs {opt2}: {avg_similarity:.3f} ({shared_count} functions)")


# 4. Function count for every architecture + optimization combination
print("\n=== 函數數量統計 ===")
for arch in architectures:
    print(f"\n{arch}:")
    for opt in optimizations:
        funcs = _functions_for(arch, opt)
        if funcs:
            print(f"  {opt}: {len(funcs)} functions")
        else:
            # An empty map is reported as a missing file.
            print(f"  {opt}: 檔案不存在")