Check the structure of Gpickle

In [1]:
import networkx as nx
import pickle

# Path of the gpickle file to inspect
gpickle_path = "/home/tommy/Projects/cross-architecture/Gpickle/20250509_new_train_450_token/00/00a6f39a8f7b14f223fa51a9a23aa110112a524799e910e321b162847a875593.gpickle"

# NOTE: unpickling runs code embedded in the file; only load files you trust.
with open(gpickle_path, 'rb') as fh:
    G = pickle.load(fh)

# Print basic information: graph type, node count, edge count
for label, value in (
    ("圖類型：", type(G)),
    ("節點數量：", G.number_of_nodes()),
    ("邊數量：", G.number_of_edges()),
):
    print(label, value)

# Preview the first five nodes together with their attribute dicts
for shown, (node, data) in enumerate(G.nodes(data=True), start=1):
    print(f"節點 {node}：{data}")
    if shown == 5:
        break

# Report whether the loaded object is a directed graph
print(
    "✅ 是有向圖 (DiGraph)"
    if isinstance(G, nx.DiGraph)
    else "⚠️ 不是 DiGraph，可能結構錯誤"
)


圖類型： <class 'networkx.classes.digraph.DiGraph'>
節點數量： 341
邊數量： 650
節點 0x80d4L：{'pcode': ['RETURN', 'CAST']}
節點 0x80f0L：{'pcode': ['CALLIND', 'RETURN']}
節點 0x80f4L：{'pcode': ['CALLIND', 'RETURN']}
節點 0x80f8L：{'pcode': ['CALLIND', 'RETURN']}
節點 0x80fcL：{'pcode': ['CALLIND', 'RETURN']}
✅ 是有向圖 (DiGraph)


Check for nodes without attributes

In [None]:
# Nodes whose attribute dict is empty (e.g. no 'pcode' was attached)
empty_attr_nodes = [
    name
    for name, attrs in G.nodes(data=True)
    if not attrs
]

print(f"沒有屬性的節點數量：{len(empty_attr_nodes)}")
if len(empty_attr_nodes) > 0:
    # List the offending nodes only when there is at least one
    print("⚠️ 以下節點沒有屬性：", empty_attr_nodes)

Test code for cleaning graphs and saving them as gpickle files

In [None]:
from pathlib import Path
import pandas as pd
from networkx import DiGraph    
import networkx as nx

def read_csv(csv_file_path: str | Path):
    df = pd.read_csv(csv_file_path)
    file_names = df['file_name'].tolist()
    return file_names

def clean_data(json_data, G_raw: nx.DiGraph) -> nx.DiGraph:
    """Build a new directed graph that keeps only nodes with pcode.

    For every node of ``G_raw`` the node is converted to ``str`` and looked
    up in ``json_data``. The ``pcode`` entries of all dict-typed items in the
    matching entry's ``"instructions"`` list are concatenated; the node is
    kept (with that list stored as its ``pcode`` attribute) only when the
    list is non-empty. Edges are kept only when both endpoints were kept.

    Args:
        json_data: Mapping from node name (string) to an entry that may
            contain an ``"instructions"`` list.
        G_raw: Source graph providing the nodes and edges.

    Returns:
        A new ``nx.DiGraph`` whose node names are strings.
    """
    cleaned = nx.DiGraph()

    # Pass 1: keep nodes that have a JSON entry with at least one pcode op.
    for raw_node in G_raw.nodes():
        key = str(raw_node)
        entry = json_data.get(key)
        if entry:
            ops = []
            for instr in entry.get("instructions", []):
                # Non-dict items in the instruction list are skipped.
                if isinstance(instr, dict):
                    ops.extend(instr.get("pcode", []))
            if ops:
                cleaned.add_node(key, pcode=ops)

    # Pass 2: keep edges whose two endpoints both survived pass 1.
    for head, tail in G_raw.edges():
        head, tail = str(head), str(tail)
        if cleaned.has_node(head) and cleaned.has_node(tail):
            cleaned.add_edge(head, tail)

    return cleaned

def process_single_file_data(file_info, output_base_path):
    """Build the cleaned graph for one sample and save it as a gpickle file.

    The JSON file is loaded, the DOT file is read with
    ``nx.drawing.nx_pydot.read_dot``, both are combined by ``clean_data`` and
    the result is pickled to
    ``<output_base_path>/<first two chars of file_name>/<file_name>.gpickle``.

    Args:
        file_info: ``(json_path, dot_path, file_name)`` tuple.
        output_base_path: Base output directory (``str`` or ``Path``).

    Returns:
        str: ``"Successfully processed <file_name>"`` on success, otherwise
        ``"Error processing <file_name>: <reason>"``. Exceptions raised while
        processing are reported through the return value rather than raised.
    """
    # Imported locally so the function does not depend on imports made in
    # other notebook cells: `json` was previously never imported, which made
    # every call fail with a NameError hidden by the broad `except` below.
    import json
    import pickle
    from pathlib import Path

    json_path, dot_path, file_name = file_info

    try:
        with open(json_path, 'r') as f:
            json_data = json.load(f)
        G_raw = nx.drawing.nx_pydot.read_dot(dot_path)
        G = clean_data(json_data, G_raw)

        # Shard outputs into sub-directories named after the first two
        # characters of the file name.
        prefix = file_name[:2]
        output_dir = Path(output_base_path) / prefix
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"{file_name}.gpickle"

        try:
            with open(output_file, 'wb') as f:
                pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            # Log the save failure here, then let the outer handler turn it
            # into the returned error message.
            print(f"Error saving graph to {output_file}: {str(e)}")
            raise

        return f"Successfully processed {file_name}"
    except Exception as e:
        return f"Error processing {file_name}: {str(e)}"



In [None]:
from tqdm import tqdm
from functools import partial
import multiprocessing as mp


# --- Configuration ---------------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; adjust before
# running on another machine.
csv_file_path = "/home/tommy/Projects/cross-architecture/Experiment3.1/dataset/cleaned_20250509_test_600.csv"
root_dir = "/home/tommy/Projects/cross-architecture/reverse/output_new/results"
output_base_dir = "/home/tommy/Projects/cross-architecture/Gpickle/20250509_new_test_600"

root_dir = Path(root_dir)
output_base_path = Path(output_base_dir)
num_processes = None  # None -> use every CPU reported by mp.cpu_count()

# --- Collect inputs --------------------------------------------------------
# Each sample is expected at <root_dir>/<file_name>/<file_name>.{json,dot};
# samples missing either file are reported and skipped.
file_info_list = []
file_names = read_csv(csv_file_path)
for file_name in tqdm(file_names, desc="Collecting file paths"):
    json_path = root_dir / file_name / f"{file_name}.json"
    dot_path = root_dir / file_name / f"{file_name}.dot"

    missing = [
        label
        for label, path in (("JSON", json_path), ("DOT", dot_path))
        if not path.exists()
    ]
    if missing:
        print(f"Missing {', '.join(missing)} file(s) for: {file_name}")
    else:
        file_info_list.append((json_path, dot_path, file_name))

# --- Process in parallel ---------------------------------------------------
if num_processes is None:
    num_processes = mp.cpu_count()

print(f"Processing {len(file_info_list)} files using {num_processes} processes...")

# Bind the output directory so the pool only has to pass the per-file tuple
process_func = partial(process_single_file_data, output_base_path=output_base_path)

with mp.Pool(processes=num_processes) as pool:
    results = list(tqdm(
        pool.imap(process_func, file_info_list),
        total=len(file_info_list),
        desc="Processing files"
    ))

# --- Report ----------------------------------------------------------------
# process_single_file_data reports failures through its return string instead
# of raising, so the results must be inspected or errors pass unnoticed.
failures = [message for message in results if message.startswith("Error processing")]
print(f"Finished: {len(results) - len(failures)} succeeded, {len(failures)} failed")
for message in failures:
    print(message)
    
