TSNE Check Embeddings Performance

In [None]:
import pandas as pd
import pickle
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from typing import List, Union
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances

# 忽略 tqdm 的 FutureWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def extract_vectors_from_gpickle(csv_file_path: Union[str, Path], root_dir: Union[str, Path], cpu_filter: List[str]):
    """
    從 CSV 讀取檔案資訊，並從 .gpickle 檔案中提取向量
    """
    root_path = Path(root_dir)
    df = pd.read_csv(csv_file_path)
    
    filtered_df = df[df['CPU'].isin(cpu_filter)]
    
    print(f"原始資料筆數：{len(df)}")
    print(f"篩選後資料筆數：{len(filtered_df)}")
    
    vectors = []
    cpus = []
    families = []
    
    for _, row in tqdm(filtered_df.iterrows(), total=len(filtered_df), desc="Processing Gpickle files"):
        file_name = row['file_name']
        cpu = row['CPU']
        family = row['family']
        
        prefix = file_name[:2]
        path = root_path / prefix / f"{file_name}.gpickle"
        
        if path.exists():
            try:
                with open(path, "rb") as fp:
                    data = pickle.load(fp)
                
                if 'node_embeddings' in data and data['node_embeddings']:
                    node_embeddings = data['node_embeddings']
                    node_vectors = [np.array(emb) for emb in node_embeddings.values() if len(emb) == 256]
                    
                    if node_vectors:
                        avg_vector = np.mean(node_vectors, axis=0)
                        vectors.append(avg_vector)
                        cpus.append(cpu)
                        families.append(family)
            except Exception as e:
                tqdm.write(f"[Error] Load Gpickle Failed {path}: {e}")
            
    return np.array(vectors), cpus, families

def visualize_and_analyze_tsne(vectors, cpus, families):
    """
    執行 t-SNE，視覺化並輸出數值化分析報告。
    """
    # t-SNE 降維
    tsne = TSNE(n_components=2, random_state=42, perplexity=15)
    vectors_2d = tsne.fit_transform(vectors)
    
    # 視覺化
    cpu_markers = {'ARM-32': 's', 'AMD X86-64': 'o'}
    family_colors = plt.cm.get_cmap('tab10', len(set(families)))
    family_color_map = {family: family_colors(i) for i, family in enumerate(sorted(list(set(families))))}
    
    fig, ax = plt.subplots(figsize=(12, 8))
    
    unique_cpus = sorted(list(set(cpus)))
    unique_families = sorted(list(set(families)))

    for cpu in unique_cpus:
        for family in unique_families:
            mask = [(c == cpu and f == family) for c, f in zip(cpus, families)]
            if any(mask):
                current_vectors = vectors_2d[mask]
                ax.scatter(current_vectors[:, 0], current_vectors[:, 1],
                            c=[family_color_map.get(family)], marker=cpu_markers.get(cpu, 'o'),
                            alpha=0.8, s=80, label=f'{cpu} - {family}')
    
    ax.set_title('t-SNE Visualization by CPU (Shape) and Family (Color)')
    ax.set_xlabel('t-SNE 1')
    ax.set_ylabel('t-SNE 2')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # 數值化分析
    print("\n" + "="*50)
    print("Numerical Analysis: Cluster Centroids & Distances")
    print("="*50)
    
    cluster_centroids = {}
    labels = []
    
    for cpu in unique_cpus:
        for family in unique_families:
            mask = [(c == cpu and f == family) for c, f in zip(cpus, families)]
            if any(mask):
                current_vectors = vectors_2d[mask]
                centroid = np.mean(current_vectors, axis=0)
                cluster_label = f'{cpu}-{family}'
                cluster_centroids[cluster_label] = centroid
                labels.append(cluster_label)
    
    centroids_array = np.array(list(cluster_centroids.values()))
    
    # 計算歐式距離矩陣
    distances = euclidean_distances(centroids_array)
    
    # 建立 DataFrame 進行視覺化輸出
    distance_df = pd.DataFrame(distances, index=labels, columns=labels).round(2)
    
    print("\nCluster Centroid Coordinates (t-SNE 2D):")
    centroids_df = pd.DataFrame(centroids_array, index=labels, columns=['t-SNE 1', 't-SNE 2']).round(2)
    print(centroids_df)
    
    print("\nEuclidean Distance Between Clusters:")
    print(distance_df)

# 主程式
def main():
    csv_file_path = "/home/tommy/Project/PcodeBERT/dataset/csv/base_dataset_filtered_v2.csv"
    gpickle_dir = "/home/tommy/Project/PcodeBERT/outputs/embeddings_adapted"
    
    target_cpus = ['AMD X86-64', 'ARM-32']

    print("提取向量和家族資訊...")
    vectors, cpus, families = extract_vectors_from_gpickle(
        csv_file_path, gpickle_dir, target_cpus
    )
    
    print(f"總共載入 {len(vectors)} 個向量")
    if len(vectors) > 0:
        if len(vectors) > 5000:
            print("數據量較大，進行抽樣以加速 t-SNE 運算。")
            import random
            sample_size = 5000
            indices = random.sample(range(len(vectors)), sample_size)
            vectors_sampled = vectors[indices]
            cpus_sampled = [cpus[i] for i in indices]
            families_sampled = [families[i] for i in indices]
        else:
            vectors_sampled = vectors
            cpus_sampled = cpus
            families_sampled = families
        
        print("執行 t-SNE 並創建視覺化...")
        visualize_and_analyze_tsne(vectors_sampled, cpus_sampled, families_sampled)
    else:
        print("沒有找到符合條件的數據。")

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import torch
import os
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data
import warnings

# 忽略 scikit-learn 的 FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

def get_gnn_config():
    BASE_PATH = "/home/tommy/Project/PcodeBERT"
    
    config = {
        "source_cpus": ["AMD X86-64"],     
        "target_cpus": ["ARM-32"],        
        
        "csv_path": f"{BASE_PATH}/dataset/csv/base_dataset_filtered_v2.csv",
        "graph_dir": f"{BASE_PATH}/outputs/embeddings",
        "cache_file": f"{BASE_PATH}/outputs/cache/gnn_data_by_family_label.pkl",
        "model_output_dir": f"{BASE_PATH}/outputs/models/gnn",
        
        "batch_size": 32,
        "hidden_channels": 64,
        "learning_rate": 0.01,
        "epochs": 200,
        "patience": 20,
        
        "seeds": [42, 123, 2025, 31415, 8888],
        "device": "cuda"
    }
    
    return config

def load_graphs_from_df(df, graph_dir, label_col='label'):
    """
    從 DataFrame 載入圖資料。
    新增了 label_col 參數來決定使用哪一欄作為標籤。
    """
    graphs = []
    labels = []
    
    for _, row in df.iterrows():
        file_name = row['file_name']
        prefix = file_name[:2]
        label = row[label_col]
        graph_path = Path(graph_dir) / prefix / f"{file_name}.gpickle"

        if not graph_path.exists():
            continue
            
        with open(graph_path, 'rb') as f:
            data = pickle.load(f)
        
        node_embeddings = data['node_embeddings']
        if not node_embeddings:
            continue
        
        embeddings = [list(emb) for emb in node_embeddings.values()]
        x = torch.tensor(embeddings, dtype=torch.float)
        
        num_nodes = len(embeddings)
        edge_list = []
        for i in range(num_nodes - 1):
            edge_list.extend([[i, i+1], [i+1, i]])
        
        if edge_list:
            edge_index = torch.tensor(edge_list, dtype=torch.long).t()
        else:
            edge_index = torch.tensor([[0], [0]], dtype=torch.long)
        
        graph_data = Data(x=x, edge_index=edge_index)
        graphs.append(graph_data)
        labels.append(label)
    
    return graphs, labels

def load_cross_arch_data_with_family_label(csv_path, graph_dir, source_cpus, target_cpus, cache_file, val_size=0.2, random_state=42, force_reload=False):
    """
    載入跨架構的圖資料，並以 'family' 作為標籤。
    """
    if force_reload and os.path.exists(cache_file):
        os.remove(cache_file)
    
    if os.path.exists(cache_file):
        print(f"Loading cached data from: {cache_file}")
        with open(cache_file, 'rb') as f:
            cached_data = pickle.load(f)
        return (cached_data['train_graphs'], 
                cached_data['val_graphs'],
                cached_data['test_graphs'], 
                cached_data['label_encoder'], 
                cached_data['num_classes'])
    
    print("Loading CSV data...")
    df = pd.read_csv(csv_path)
    
    train_df = df[df['CPU'].isin(source_cpus)]
    test_df = df[df['CPU'].isin(target_cpus)]
    
    print(f"Training data: {len(train_df)} samples (architectures: {source_cpus})")
    print(f"Test data: {len(test_df)} samples (architectures: {target_cpus})")
    
    train_graphs, train_labels = load_graphs_from_df(train_df, graph_dir, label_col='family')
    test_graphs, test_labels = load_graphs_from_df(test_df, graph_dir, label_col='family')
    
    train_graphs, val_graphs, train_labels, val_labels = train_test_split(
        train_graphs, train_labels, test_size=val_size, 
        stratify=train_labels, random_state=random_state
    )
    
    label_encoder = LabelEncoder()
    all_labels = train_labels + val_labels + test_labels
    label_encoder.fit(all_labels)
    
    encoded_train_labels = label_encoder.transform(train_labels)
    encoded_val_labels = label_encoder.transform(val_labels)
    encoded_test_labels = label_encoder.transform(test_labels)
    
    num_classes = len(label_encoder.classes_)
    
    for i, data in enumerate(train_graphs):
        data.y = torch.tensor(encoded_train_labels[i], dtype=torch.long)
        
    for i, data in enumerate(val_graphs):
        data.y = torch.tensor(encoded_val_labels[i], dtype=torch.long)
        
    for i, data in enumerate(test_graphs):
        data.y = torch.tensor(encoded_test_labels[i], dtype=torch.long)

    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    cache_data = {
        'train_graphs': train_graphs,
        'val_graphs': val_graphs,
        'test_graphs': test_graphs,
        'label_encoder': label_encoder,
        'num_classes': num_classes
    }
    
    with open(cache_file, 'wb') as f:
        pickle.dump(cache_data, f)
    
    print(f"Data has been cached to: {cache_file}")
    return train_graphs, val_graphs, test_graphs, label_encoder, num_classes

def analyze_dataset_by_family(graphs, label_encoder):
    """
    Analyzes the node distribution for each family and returns a dictionary of stats.
    """
    family_stats = {}
    if not graphs:
        return family_stats

    # Group graphs by family
    family_groups = defaultdict(list)
    for g in graphs:
        family = label_encoder.inverse_transform([int(g.y)])[0]
        family_groups[family].append(g.num_nodes)

    for family, node_counts in family_groups.items():
        family_stats[family] = {
            "Total Graphs": len(node_counts),
            "Avg Nodes": np.mean(node_counts),
            "Median Nodes": np.median(node_counts),
            "Max Nodes": np.max(node_counts),
            "Min Nodes": np.min(node_counts)
        }
    return family_stats

def compare_datasets(train_graphs, val_graphs, test_graphs, label_encoder):
    """
    Compares key statistics of training, validation, and test datasets.
    """
    # First, print the overall comparison report
    train_stats = analyze_dataset_by_family(train_graphs, label_encoder)
    test_stats = analyze_dataset_by_family(test_graphs, label_encoder)

    all_families = sorted(list(set(train_stats.keys()) | set(test_stats.keys())))

    print("\n" + "="*50)
    print("Family-wise Node Count Comparison")
    print("="*50)

    for family in all_families:
        train_data = train_stats.get(family, {"Total Graphs": 0, "Avg Nodes": 0, "Median Nodes": 0, "Max Nodes": 0, "Min Nodes": 0})
        test_data = test_stats.get(family, {"Total Graphs": 0, "Avg Nodes": 0, "Median Nodes": 0, "Max Nodes": 0, "Min Nodes": 0})
        
        print(f"\n--- Family: {family} ---")
        
        comparison_df = pd.DataFrame({
            "Metric": ["Total Graphs", "Avg Nodes", "Median Nodes", "Max Nodes", "Min Nodes"],
            "Training Set": [train_data["Total Graphs"], f"{train_data['Avg Nodes']:.2f}", train_data["Median Nodes"], train_data["Max Nodes"], train_data["Min Nodes"]],
            "Test Set": [test_data["Total Graphs"], f"{test_data['Avg Nodes']:.2f}", test_data["Median Nodes"], test_data["Max Nodes"], test_data["Min Nodes"]]
        })
        print(comparison_df.to_string(index=False))

# Main execution
if __name__ == "__main__":
    gnn_config = get_gnn_config()
    
    train_graphs, val_graphs, test_graphs, label_encoder, num_classes = load_cross_arch_data_with_family_label(
        csv_path=gnn_config["csv_path"],
        graph_dir=gnn_config["graph_dir"],
        source_cpus=gnn_config["source_cpus"],
        target_cpus=gnn_config["target_cpus"],
        cache_file=gnn_config["cache_file"],
        force_reload=False
    )

    compare_datasets(train_graphs, val_graphs, test_graphs, label_encoder)

Adapters

In [1]:
import pickle

file_path = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_embeddings.pickle"

with open(file_path, "rb") as fp:
    arch_data = pickle.load(fp)


#check label distribution
#data format is list of (vec1, vec2, label)
label_count = {}
for item in arch_data:
    label = item[2]
    if label not in label_count:
        label_count[label] = 0
    label_count[label] += 1

# Print label distribution
for label, count in label_count.items():
    print(f"Label: {label}, Count: {count}")


Label: 1, Count: 68688


In [5]:
from tqdm import tqdm
import numpy as np
import torch
import pickle

path = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_embeddings.pickle"

with open(path, "rb") as f:
    data = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === 定義 similarity 函式 ===
def similarity_score(x1, x2):
    dist = torch.norm(x1 - x2, dim=1)  
    return 1 / (1 + dist)             

# === 批次計算平均 similarity ===
scores = []
for v1, v2, _ in tqdm(data, desc="計算中"):
    t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
    t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)

    score = similarity_score(t1, t2)
    scores.append(score.item())

avg_score = np.mean(scores)
print(f"平均 similarity score: {avg_score:.4f}")

計算中: 100%|██████████| 68688/68688 [06:22<00:00, 179.71it/s] 

平均 similarity score: 0.1724





In [None]:
import pickle
import torch
import numpy as np
import random
from tqdm import tqdm

input_path = "/home/tommy/Project/PcodeBERT/outputs/alignment_vector/train_arm_vector_mix_bert.pickle"
output_path = "/home/tommy/Project/PcodeBERT/outputs/alignment_vector/train_arm_vector_contrastive_bert.pickle"

with open(input_path, "rb") as f:
    data = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === similarity 函式 ===
def similarity_score(x1, x2):
    dist = torch.norm(x1 - x2, dim=1)  # L2 distance
    return 1 / (1 + dist)

# === 計算所有 similarity score ===
scored_data = []
for v1, v2, label in tqdm(data, desc="計算相似度"):
    t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
    t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)
    score = similarity_score(t1, t2).item()
    scored_data.append((v1, v2, label, score))

# === 依照相似度排序 ===
scored_data.sort(key=lambda x: x[3], reverse=True)
 

# === 分成兩半 ===
half = len(scored_data) // 2
positive_samples = [(v1, v2, 1) for v1, v2, _, _ in scored_data[:half]]
negative_candidates = [ (v1, v2) for v1, v2, _, _ in scored_data[half:] ]

print(f"[INFO] 正樣本數: {len(positive_samples)}")
print(f"[INFO] 負樣本候選數: {len(negative_candidates)}")

# === 打亂後配對建立負樣本 ===
all_vec1 = [v1 for v1, _ in negative_candidates]
all_vec2 = [v2 for _, v2 in negative_candidates]
random.shuffle(all_vec2)

negative_samples = [(v1, v2, 0) for v1, v2 in zip(all_vec1, all_vec2)]

#print sample of negative samples
print("\n[INFO] 負樣本範例:")
for v1, v2, _ in negative_samples[:5]:
    print(f"  - {v1} <-> {v2}")

# === 合併正負樣本 ===
final_dataset = positive_samples + negative_samples
random.shuffle(final_dataset)

# === 儲存 ===
with open(output_path, "wb") as f:
    pickle.dump(final_dataset, f)

print(f"[INFO] 原始樣本數: {len(data)}")
print(f"[INFO] 新資料集樣本數: {len(final_dataset)}")
print(f"[INFO] 已儲存至: {output_path}")

In [None]:
#check label 1 and label 0 simiarity score
import pickle 

with open("/home/tommy/Project/PcodeBERT/outputs/alignment_vector/train_arm_vector_contrastive_bert.pickle", "rb") as f:
    data = pickle.load(f)

#check label distribution
from collections import Counter
labels = [label for _, _, label in data]
label_counts = Counter(labels)
print("Label distribution:", label_counts)

for v1, v2, label in data:
    if label == 1:
        t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
        t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)
        pos_score = similarity_score(t1, t2)
        print(f"Positive pair similarity score: {pos_score.item():.4f}")
        break
for v1, v2, label in data:
    if label == 0:
        t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
        t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)
        neg_score = similarity_score(t1, t2)
        print(f"Negative pair similarity score: {neg_score.item():.4f}")
        break

Check family embedding across architectures

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import euclidean_distances

# --- 配置路徑 ---
csv_path = "/home/tommy/Project/PcodeBERT/dataset/csv/base_dataset_filtered.csv"
embedding_path = "/home/tommy/Project/PcodeBERT/outputs/embeddings"
# 相似度比較的樣本數量
NUM_SAMPLES_TO_COMPARE = 5
# 每個樣本找出的最相似函式數量
TOP_K = 5

# --- 輔助函式 ---

def load_embeddings(func_id: str, arch: str) -> np.ndarray | None:
    """載入特定函式ID和架構的嵌入向量。"""
    # 假設嵌入檔案的命名格式為 {func_id}_{arch}.npy
    # 範例: 0000_AMD X86-64.npy
    safe_arch = arch.replace(' ', '_')  # 替換空格以適應檔案名
    embedding_file = f"{func_id}_{safe_arch}.npy"
    full_path = os.path.join(embedding_path, safe_arch, embedding_file)
    
    # 檢查檔案是否存在
    if not os.path.exists(full_path):
        # print(f"Warning: Embedding file not found for {func_id} ({arch}) at {full_path}")
        return None
        
    try:
        # 嵌入向量檔案通常儲存為 numpy array
        return np.load(full_path)
    except Exception as e:
        # print(f"Error loading embedding for {func_id} ({arch}): {e}")
        return None

def find_most_similar(query_embed: np.ndarray, target_df: pd.DataFrame, target_arch: str, k: int = 5) -> list[tuple[str, float]]:
    """
    計算查詢向量與目標資料集中所有函式的歐式距離，並返回最相似的 K 個結果。
    """
    target_embeddings = []
    target_func_ids = []
    
    # 1. 批次載入目標架構的嵌入向量
    for _, row in target_df.iterrows():
        func_id = str(row['FunctionID'])
        embed = load_embeddings(func_id, target_arch)
        if embed is not None:
            # PcodeBERT 輸出的嵌入向量通常是 (1, N) 或 (N,) 維度，確保是 (N,)
            target_embeddings.append(embed.flatten())
            target_func_ids.append(func_id)

    if not target_embeddings:
        return []

    target_embeddings_matrix = np.array(target_embeddings)
    
    # 2. 計算歐式距離 (Euclidean Distance)
    # distance matrix D[i][j] 是 query_embed[i] 和 target_embeddings_matrix[j] 的距離
    # 這裡只有一個查詢向量，所以結果是一個 (1, M) 的矩陣
    distances = euclidean_distances(query_embed.reshape(1, -1), target_embeddings_matrix)[0]

    # 3. 排序並取出前 K 個最小距離（最相似）
    # argsort 返回排序後的索引
    sorted_indices = np.argsort(distances)
    
    results = []
    for i in sorted_indices[:k]:
        func_id = target_func_ids[i]
        distance = distances[i]
        results.append((func_id, distance))
        
    return results

# --- 主要邏輯 ---

def run_similarity_analysis():
    """執行跨架構相似性分析"""
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_path}")
        return
    
    # 定義要比較的兩個群組
    # 假設 ARM 群組使用 'ARM' 架構
    ARCH_A = 'AMD X86-64'
    ARCH_B = 'ARM-32' # 或其他 ARM 架構名稱，請依據您的 CSV 調整
    FAMILY = 'mirai'
    
    # 過濾 Mirai 樣本
    df_mirai = df[df['family'] == FAMILY].copy()
    
    # 1. 篩選 X86 Mirai 樣本作為查詢集 (Query Set)
    query_df = df_mirai[df_mirai['CPU'] == ARCH_A].head(NUM_SAMPLES_TO_COMPARE)
    print(f"--- 查詢集 ({ARCH_A} {FAMILY}) 樣本數: {len(query_df)} ---")
    # print(query_df[['FunctionID', 'CPU', 'filename']].to_string(index=False))

    # 2. 篩選 ARM Mirai 樣本作為目標集 (Target Set)
    target_df = df_mirai[df_mirai['CPU'] == ARCH_B]
    print(f"--- 目標集 ({ARCH_B} {FAMILY}) 樣本數: {len(target_df)} ---")
    if target_df.empty:
        print(f"Error: No samples found for {ARCH_B} {FAMILY}. Please check the CPU name.")
        return

    # 3. 執行相似度比較
    print("\n--- 跨架構相似度比較結果 (X86 -> ARM) ---")
    results = []

    for _, query_row in query_df.iterrows():
        query_func_id = str(query_row['FunctionID'])
        query_arch = query_row['CPU']
        
        # 載入查詢函式的嵌入向量
        query_embed = load_embeddings(query_func_id, query_arch)
        
        if query_embed is None:
            continue
        
        # 查找最相似的 ARM 函式
        similar_functions = find_most_similar(
            query_embed.flatten(), 
            target_df, 
            target_arch=ARCH_B, 
            k=TOP_K
        )

        results.append({
            'Query_ID': query_func_id,
            'Query_Arch': query_arch,
            'Target_Arch': ARCH_B,
            'Top_Similar': similar_functions
        })
        
        print(f"Query {query_func_id} ({query_arch}):")
        if similar_functions:
            for target_id, dist in similar_functions:
                print(f"  -> Target {target_id} (Dist: {dist:.4f})")
        else:
            print("  -> No similar functions found (可能是目標集嵌入向量載入失敗)")

    # 4. (可選) 後續分析共同點
    # 這裡只輸出相似函式ID和距離，要分析共同點，
    # 您需要進一步處理這些相似函式對的原始 P-code/彙編程式碼。
    
    # print("\n--- 完整結果摘要 ---")
    # for res in results:
    #     print(res)

if __name__ == '__main__':
    run_similarity_analysis()

Chech Prtrain Model 

In [None]:
from transformers import AutoTokenizer, RobertaForMaskedLM
import torch
import pickle

file = "/home/tommy/Project/PcodeBERT/outputs/gpickle_merged_adjusted_filtered/0e/0e0ebdff7ac27afdcf1d7e555e29002cbf3647cf408e5830ceb699c2ead5cd35.gpickle"


def load_pretrained_model():
    """載入預訓練的模型和tokenizer"""
    model_path = "/home/tommy/Project/PcodeBERT/checkpoints/model_epoch_50"

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = RobertaForMaskedLM.from_pretrained(model_path)
    # print(model.config)
    # 設定device

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f"模型總參數數量 (Total parameters): {total_params:,}")
    print(f"可訓練參數數量 (Trainable parameters): {trainable_params:,}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    print(f"Model loaded successfully on device: {device}")
    return model, tokenizer, device

model, tokenizer, device = load_pretrained_model()

# with open(file, "rb") as fp:
#     data = pickle.load(fp)
#     input = tokenizer(data.nodes["0x1001a4b8L"]['sentence'], return_tensors="pt", truncation=True, max_length=512, padding=True)
# print("0x1001a4b8L node sentence:", data.nodes["0x1001a4b8L"]['sentence'])
# print("Input IDs:", input['input_ids'])
# print("Token Count:", len(input['input_ids'][0]))

# for node, node_data in data.nodes(data=True):
#     sentence = node_data.get("sentence")
#     input = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)
#     inputs = {k: v.to(device) for k, v in input.items()}
#     #check sentence length
#     input_length = inputs['input_ids'].shape[1]
#     # print(f"Node: {node}, Sentence length: {input_length}")
#     with torch.no_grad():
#         outputs = model.roberta(**inputs)
#         embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
#     print(f"Node: {node}, Embedding shape: {embedding.shape}")



: 

In [None]:
import pickle

file = "/home/tommy/Project/PcodeBERT/outputs/gpickle_merged_adjusted_filtered/0e/0e0ebdff7ac27afdcf1d7e555e29002cbf3647cf408e5830ceb699c2ead5cd35.gpickle"

with open(file, "rb") as fp:
    data = pickle.load(fp)

print(data)

for node, node_data in data.nodes(data=True):
    print(f"Node {node}: {node_data}")
    embedding = node_data.get('sentence', None)
    