t-SNE Check of Embedding Performance

In [None]:
import pandas as pd
import pickle
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from typing import List, Union
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances

# 忽略 tqdm 的 FutureWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def extract_vectors_from_gpickle(csv_file_path: Union[str, Path], root_dir: Union[str, Path], cpu_filter: List[str], embedding_dim: int = 256):
    """
    Read sample metadata from a CSV and build one mean-pooled vector per sample.

    For every CSV row whose 'CPU' value is in `cpu_filter`, the file
    `<root_dir>/<first two chars of file_name>/<file_name>.gpickle` is
    unpickled, its 'node_embeddings' values of length `embedding_dim` are
    averaged, and the result is collected together with the row's CPU and
    family.

    Args:
        csv_file_path: CSV with at least the columns 'file_name', 'CPU', 'family'.
        root_dir: Directory containing the two-character prefix sub-directories.
        cpu_filter: CPU names to keep.
        embedding_dim: Required length of a node embedding; others are ignored.

    Returns:
        (vectors, cpus, families): an array with one averaged vector per
        usable sample, plus the parallel lists of CPU and family labels.

    Note:
        pickle.load can execute arbitrary code; only point this at trusted files.
    """
    root_path = Path(root_dir)
    df = pd.read_csv(csv_file_path)

    filtered_df = df[df['CPU'].isin(cpu_filter)]

    print(f"原始資料筆數：{len(df)}")
    print(f"篩選後資料筆數：{len(filtered_df)}")

    vectors = []
    cpus = []
    families = []

    for _, row in tqdm(filtered_df.iterrows(), total=len(filtered_df), desc="Processing Gpickle files"):
        file_name = row['file_name']
        path = root_path / file_name[:2] / f"{file_name}.gpickle"

        if not path.exists():
            continue

        # Best effort: a broken file is reported and skipped, never fatal.
        try:
            with open(path, "rb") as fp:
                data = pickle.load(fp)

            if 'node_embeddings' in data and data['node_embeddings']:
                node_vectors = [
                    np.array(emb)
                    for emb in data['node_embeddings'].values()
                    if len(emb) == embedding_dim
                ]

                if node_vectors:
                    vectors.append(np.mean(node_vectors, axis=0))
                    cpus.append(row['CPU'])
                    families.append(row['family'])
        except Exception as e:
            tqdm.write(f"[Error] Load Gpickle Failed {path}: {e}")

    # Make silent drops visible so the counts printed above are not misread.
    skipped = len(filtered_df) - len(vectors)
    if skipped:
        print(f"[Info] {skipped} of {len(filtered_df)} filtered samples produced no vector "
              f"(missing file, load failure, or no {embedding_dim}-dim node embeddings)")

    return np.array(vectors), cpus, families

def visualize_and_analyze_tsne(vectors, cpus, families):
    """
    Run t-SNE, plot the 2-D projection and print a numerical cluster report.

    Points are grouped by (cpu, family): the marker shape encodes the CPU and
    the colour encodes the family. After plotting, the centroid of every
    non-empty group and the pairwise Euclidean distances between centroids
    (both in t-SNE space, rounded to 2 decimals) are printed.

    Args:
        vectors: Array-like of shape (n_samples, n_features).
        cpus: CPU label per sample, parallel to `vectors`.
        families: Family label per sample, parallel to `vectors`.

    Raises:
        ValueError: If fewer than two samples are supplied.
    """
    n_samples = len(vectors)
    if n_samples < 2:
        raise ValueError(f"t-SNE needs at least 2 samples, got {n_samples}")

    # scikit-learn requires perplexity < n_samples; keep 15 whenever possible.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(15, n_samples - 1))
    vectors_2d = tsne.fit_transform(vectors)

    unique_cpus = sorted(set(cpus))
    unique_families = sorted(set(families))

    cpu_markers = {'ARM-32': 's', 'AMD X86-64': 'o'}
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which Matplotlib 3.9 removed.
    family_colors = plt.get_cmap('tab10', len(unique_families))
    family_color_map = {family: family_colors(i) for i, family in enumerate(unique_families)}

    # Resolve each non-empty (cpu, family) group once; reused for plot and report.
    clusters = []
    for cpu in unique_cpus:
        for family in unique_families:
            mask = np.array([(c == cpu and f == family) for c, f in zip(cpus, families)], dtype=bool)
            if mask.any():
                clusters.append((cpu, family, vectors_2d[mask]))

    fig, ax = plt.subplots(figsize=(12, 8))
    for cpu, family, points in clusters:
        ax.scatter(points[:, 0], points[:, 1],
                   c=[family_color_map.get(family)], marker=cpu_markers.get(cpu, 'o'),
                   alpha=0.8, s=80, label=f'{cpu} - {family}')

    ax.set_title('t-SNE Visualization by CPU (Shape) and Family (Color)')
    ax.set_xlabel('t-SNE 1')
    ax.set_ylabel('t-SNE 2')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\n" + "="*50)
    print("Numerical Analysis: Cluster Centroids & Distances")
    print("="*50)

    # Parallel lists keep labels and centroids aligned even if two groups
    # happen to produce the same label string.
    labels = [f'{cpu}-{family}' for cpu, family, _ in clusters]
    centroids_array = np.array([points.mean(axis=0) for _, _, points in clusters])

    distances = euclidean_distances(centroids_array)
    distance_df = pd.DataFrame(distances, index=labels, columns=labels).round(2)

    print("\nCluster Centroid Coordinates (t-SNE 2D):")
    centroids_df = pd.DataFrame(centroids_array, index=labels, columns=['t-SNE 1', 't-SNE 2']).round(2)
    print(centroids_df)

    print("\nEuclidean Distance Between Clusters:")
    print(distance_df)

# 主程式
def main():
    """
    Load mean-pooled embeddings for the target CPUs and run the t-SNE report.

    More than 5000 vectors are subsampled to 5000 before t-SNE. The subsample
    uses a fixed seed so the figure and centroid table are reproducible, in
    line with the fixed `random_state` used for t-SNE itself.
    """
    csv_file_path = "/home/tommy/Project/PcodeBERT/dataset/csv/base_dataset_filtered_v2.csv"
    gpickle_dir = "/home/tommy/Project/PcodeBERT/outputs/embeddings"

    target_cpus = ['AMD X86-64', 'ARM-32']
    sample_size = 5000
    sampling_seed = 42

    print("提取向量和家族資訊...")
    vectors, cpus, families = extract_vectors_from_gpickle(
        csv_file_path, gpickle_dir, target_cpus
    )

    print(f"總共載入 {len(vectors)} 個向量")
    if len(vectors) == 0:
        print("沒有找到符合條件的數據。")
        return

    if len(vectors) > sample_size:
        print("數據量較大，進行抽樣以加速 t-SNE 運算。")
        import random
        # A private, seeded generator: deterministic without touching global RNG state.
        indices = random.Random(sampling_seed).sample(range(len(vectors)), sample_size)
        vectors_sampled = vectors[indices]
        cpus_sampled = [cpus[i] for i in indices]
        families_sampled = [families[i] for i in indices]
    else:
        vectors_sampled = vectors
        cpus_sampled = cpus
        families_sampled = families

    print("執行 t-SNE 並創建視覺化...")
    visualize_and_analyze_tsne(vectors_sampled, cpus_sampled, families_sampled)

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import torch
import os
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data
import warnings

# 忽略 scikit-learn 的 FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

def get_gnn_config(base_path="/home/tommy/Project/PcodeBERT"):
    """
    Build the configuration dictionary for the cross-architecture GNN run.

    Args:
        base_path: Project root that all data, cache and model paths are
            derived from. Defaults to the original hard-coded location, so
            calling with no arguments returns the same config as before.

    Returns:
        dict: CPU split, file locations, training hyper-parameters, seeds
        and device name.
    """
    # Drop trailing slashes so the joined paths never contain '//'.
    root = str(base_path).rstrip("/")

    config = {
        "source_cpus": ["AMD X86-64"],
        "target_cpus": ["ARM-32"],

        "csv_path": f"{root}/dataset/csv/base_dataset_filtered_v2.csv",
        "graph_dir": f"{root}/outputs/embeddings",
        "cache_file": f"{root}/outputs/cache/gnn_data_by_family_label.pkl",
        "model_output_dir": f"{root}/outputs/models/gnn",

        "batch_size": 32,
        "hidden_channels": 64,
        "learning_rate": 0.01,
        "epochs": 200,
        "patience": 20,

        "seeds": [42, 123, 2025, 31415, 8888],
        "device": "cuda"
    }

    return config

def _chain_edge_index(num_nodes):
    """Return a (2, E) long tensor linking node i and i+1 in both directions.

    Edge order is (0,1), (1,0), (1,2), (2,1), ...; a single-node graph gets
    one self-loop (0, 0).
    """
    if num_nodes < 2:
        return torch.tensor([[0], [0]], dtype=torch.long)
    src = torch.arange(num_nodes - 1, dtype=torch.long)
    dst = src + 1
    forward = torch.stack([src, dst])
    backward = torch.stack([dst, src])
    # Interleave forward/backward edges and flatten to a contiguous (2, E) tensor.
    return torch.stack([forward, backward], dim=2).reshape(2, -1)


def load_graphs_from_df(df, graph_dir, label_col='label'):
    """
    Load one PyG `Data` graph per DataFrame row from its .gpickle file.

    Each row's file is `<graph_dir>/<first two chars of file_name>/<file_name>.gpickle`.
    Node features are the values of the pickled 'node_embeddings' mapping, in
    iteration order; consecutive nodes are linked in both directions.
    Rows whose file is missing, or whose pickle has no (or empty)
    'node_embeddings', are skipped.

    Args:
        df: DataFrame with a 'file_name' column and the column named `label_col`.
        graph_dir: Root directory of the .gpickle files.
        label_col: Column whose value becomes the graph's label.

    Returns:
        (graphs, labels): parallel lists of `Data` objects and raw labels.

    Note:
        pickle.load can execute arbitrary code; only load trusted files.
    """
    graphs = []
    labels = []
    graph_root = Path(graph_dir)

    for _, row in df.iterrows():
        file_name = row['file_name']
        label = row[label_col]
        graph_path = graph_root / file_name[:2] / f"{file_name}.gpickle"

        if not graph_path.exists():
            continue

        with open(graph_path, 'rb') as f:
            data = pickle.load(f)

        # A pickle without the key is skipped like an empty one instead of
        # aborting the whole load with a KeyError.
        node_embeddings = data['node_embeddings'] if 'node_embeddings' in data else None
        if not node_embeddings:
            continue

        embeddings = [list(emb) for emb in node_embeddings.values()]
        x = torch.tensor(embeddings, dtype=torch.float)
        edge_index = _chain_edge_index(len(embeddings))

        graphs.append(Data(x=x, edge_index=edge_index))
        labels.append(label)

    return graphs, labels

def load_cross_arch_data_with_family_label(csv_path, graph_dir, source_cpus, target_cpus, cache_file, val_size=0.2, random_state=42, force_reload=False):
    """
    Load cross-architecture graph data labelled by the 'family' column.

    Rows whose CPU is in `source_cpus` become a stratified train/validation
    split; rows whose CPU is in `target_cpus` become the test set. Labels are
    integer-encoded with one LabelEncoder fitted on all three splits and
    stored on each graph as `y`. The result is pickled to `cache_file`.

    NOTE(review): a cache hit returns immediately, before any other argument
    is consulted, so a cache built with different CPUs or `val_size` is
    reused as-is unless `force_reload=True`.

    Returns:
        (train_graphs, val_graphs, test_graphs, label_encoder, num_classes)
    """
    result_keys = ('train_graphs', 'val_graphs', 'test_graphs', 'label_encoder', 'num_classes')

    # A forced reload discards any existing cache first.
    if force_reload and os.path.exists(cache_file):
        os.remove(cache_file)

    if os.path.exists(cache_file):
        print(f"Loading cached data from: {cache_file}")
        with open(cache_file, 'rb') as cache_fh:
            cached = pickle.load(cache_fh)
        return tuple(cached[key] for key in result_keys)

    print("Loading CSV data...")
    frame = pd.read_csv(csv_path)
    source_rows = frame[frame['CPU'].isin(source_cpus)]
    target_rows = frame[frame['CPU'].isin(target_cpus)]

    print(f"Training data: {len(source_rows)} samples (architectures: {source_cpus})")
    print(f"Test data: {len(target_rows)} samples (architectures: {target_cpus})")

    source_graphs, source_labels = load_graphs_from_df(source_rows, graph_dir, label_col='family')
    test_graphs, test_labels = load_graphs_from_df(target_rows, graph_dir, label_col='family')

    # Carve the validation set out of the source architecture only.
    train_graphs, val_graphs, train_labels, val_labels = train_test_split(
        source_graphs,
        source_labels,
        test_size=val_size,
        stratify=source_labels,
        random_state=random_state,
    )

    # One encoder over every split keeps the class ids consistent.
    label_encoder = LabelEncoder()
    label_encoder.fit(train_labels + val_labels + test_labels)
    num_classes = len(label_encoder.classes_)

    splits = (
        (train_graphs, train_labels),
        (val_graphs, val_labels),
        (test_graphs, test_labels),
    )
    for split_graphs, split_labels in splits:
        encoded = label_encoder.transform(split_labels)
        for graph, target in zip(split_graphs, encoded):
            graph.y = torch.tensor(target, dtype=torch.long)

    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    cache_data = dict(zip(
        result_keys,
        (train_graphs, val_graphs, test_graphs, label_encoder, num_classes),
    ))
    with open(cache_file, 'wb') as cache_fh:
        pickle.dump(cache_data, cache_fh)

    print(f"Data has been cached to: {cache_file}")
    return train_graphs, val_graphs, test_graphs, label_encoder, num_classes

def analyze_dataset_by_family(graphs, label_encoder):
    """
    Summarise the node-count distribution of the graphs in each family.

    Args:
        graphs: Sequence of graph objects exposing an integer-convertible `y`
            (encoded family id) and `num_nodes`.
        label_encoder: Object whose `inverse_transform` maps a list of encoded
            ids back to family names.

    Returns:
        dict: family name -> {"Total Graphs", "Avg Nodes", "Median Nodes",
        "Max Nodes", "Min Nodes"}. Empty when `graphs` is empty.
    """
    family_stats = {}
    if not graphs:
        return family_stats

    # Decode all labels in one call rather than one encoder call per graph.
    encoded_ids = [int(g.y) for g in graphs]
    family_names = label_encoder.inverse_transform(encoded_ids)

    # Group node counts by family, preserving first-seen family order.
    family_groups = defaultdict(list)
    for family, g in zip(family_names, graphs):
        family_groups[family].append(g.num_nodes)

    for family, node_counts in family_groups.items():
        family_stats[family] = {
            "Total Graphs": len(node_counts),
            "Avg Nodes": np.mean(node_counts),
            "Median Nodes": np.median(node_counts),
            "Max Nodes": np.max(node_counts),
            "Min Nodes": np.min(node_counts)
        }
    return family_stats

def compare_datasets(train_graphs, val_graphs, test_graphs, label_encoder):
    """
    Print a per-family node-count comparison of the train, validation and test sets.

    For every family present in any split, one table is printed with the
    metrics "Total Graphs", "Avg Nodes" (2 decimals), "Median Nodes",
    "Max Nodes" and "Min Nodes" per split. A family absent from a split is
    reported with zeros.

    The validation split is part of the signature and the documented purpose
    but was previously left out of the report; it now has its own column.
    """
    metrics = ["Total Graphs", "Avg Nodes", "Median Nodes", "Max Nodes", "Min Nodes"]
    absent = dict.fromkeys(metrics, 0)

    # Column title -> per-family statistics for that split.
    split_stats = {
        "Training Set": analyze_dataset_by_family(train_graphs, label_encoder),
        "Validation Set": analyze_dataset_by_family(val_graphs, label_encoder),
        "Test Set": analyze_dataset_by_family(test_graphs, label_encoder),
    }

    all_families = sorted(set().union(*(stats.keys() for stats in split_stats.values())))

    print("\n" + "="*50)
    print("Family-wise Node Count Comparison")
    print("="*50)

    for family in all_families:
        print(f"\n--- Family: {family} ---")

        columns = {"Metric": metrics}
        for split_name, stats in split_stats.items():
            values = stats.get(family, absent)
            columns[split_name] = [
                f"{values[m]:.2f}" if m == "Avg Nodes" else values[m]
                for m in metrics
            ]

        print(pd.DataFrame(columns).to_string(index=False))

# Main execution
if __name__ == "__main__":
    gnn_config = get_gnn_config()

    # Forward the loader's settings straight from the config dict; the cache
    # is reused when present (force_reload=False).
    (train_graphs, val_graphs, test_graphs,
     label_encoder, num_classes) = load_cross_arch_data_with_family_label(
        **{key: gnn_config[key]
           for key in ("csv_path", "graph_dir", "source_cpus", "target_cpus", "cache_file")},
        force_reload=False,
    )

    compare_datasets(train_graphs, val_graphs, test_graphs, label_encoder)

Adapter

In [None]:
from tqdm import tqdm

# Embedding-pair file; the loop further down unpacks each item as a 3-tuple (v1, v2, _).
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
# NOTE(review): `pickle`, `torch` and `np` are not imported in this cell; it
# relies on an earlier cell having imported them.
path = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_embeddings.pickle"

with open(path, "rb") as f:
    data = pickle.load(f)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Similarity function ===
def similarity_score(x1, x2):
    """Map the row-wise L2 distance between two batches to a score in (0, 1].

    Args:
        x1, x2: Tensors of shape (batch, dim); rows are compared pairwise.

    Returns:
        Tensor of shape (batch,) holding 1 / (1 + ||x1 - x2||_2); identical
        rows score exactly 1.
    """
    # torch.norm is deprecated; vector_norm is its replacement for the 2-norm.
    dist = torch.linalg.vector_norm(x1 - x2, dim=1)
    return 1 / (1 + dist)

# === Average similarity over all pairs ===
# Each pair is scored on its own as a batch of size 1; the third element of
# every item is ignored here.
scores = []
for v1, v2, _ in tqdm(data, desc="計算中"):
    # unsqueeze(0) adds the batch axis that similarity_score reduces over (dim=1).
    t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
    t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)

    score = similarity_score(t1, t2)
    scores.append(score.item())

avg_score = np.mean(scores)
print(f"平均 similarity score: {avg_score:.4f}")

In [None]:
import pickle
import torch
import numpy as np
import random
from tqdm import tqdm

# Source pairs, and the destination for the re-labelled, balanced dataset.
input_path = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_embeddings.pickle"
output_path = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_balanced_embeddings.pickle"

# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open(input_path, "rb") as f:
    data = pickle.load(f)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Similarity function ===
def similarity_score(x1, x2):
    """Return 1 / (1 + L2 distance) for each pair of rows in x1 and x2.

    Args:
        x1, x2: Tensors of shape (batch, dim).

    Returns:
        Tensor of shape (batch,) with values in (0, 1]; 1 means identical rows.
    """
    # vector_norm replaces the deprecated torch.norm for the L2 distance.
    dist = torch.linalg.vector_norm(x1 - x2, dim=1)
    return 1 / (1 + dist)

# === Score every pair ===
scored_data = []
for v1, v2, label in tqdm(data, desc="計算相似度"):
    t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
    t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)
    score = similarity_score(t1, t2).item()
    scored_data.append((v1, v2, label, score))

# === Sort by similarity, most similar first ===
scored_data.sort(key=lambda x: x[3], reverse=True)

# === Split in half: the top half becomes positives, the rest feed the negatives ===
half = len(scored_data) // 2
positive_samples = [(v1, v2, 1) for v1, v2, _, _ in scored_data[:half]]
negative_candidates = [(v1, v2) for v1, v2, _, _ in scored_data[half:]]

print(f"[INFO] 正樣本數: {len(positive_samples)}")
print(f"[INFO] 負樣本候選數: {len(negative_candidates)}")

# === Re-pair the candidates to build negatives ===
# Seeded so the saved dataset is reproducible.
rng = random.Random(42)
all_vec1 = [v1 for v1, _ in negative_candidates]
all_vec2 = [v2 for _, v2 in negative_candidates]

# A plain shuffle can leave a v2 at its original index, which would label a
# true pair as negative. Walking a shuffled cycle instead guarantees that
# every v1 is matched with the v2 of a *different* original pair.
order = list(range(len(negative_candidates)))
rng.shuffle(order)
if len(order) > 1:
    partner_of = dict(zip(order, order[1:] + order[:1]))
    negative_samples = [(all_vec1[i], all_vec2[partner_of[i]], 0) for i in range(len(all_vec1))]
else:
    # Zero or one candidate: no mismatched pairing exists.
    print("[WARN] Not enough negative candidates to build mismatched pairs.")
    negative_samples = []

# Show a few negative samples
print("\n[INFO] 負樣本範例:")
for v1, v2, _ in negative_samples[:5]:
    print(f"  - {v1} <-> {v2}")

# === Merge positives and negatives ===
final_dataset = positive_samples + negative_samples
rng.shuffle(final_dataset)

# === Save ===
with open(output_path, "wb") as f:
    pickle.dump(final_dataset, f)

print(f"[INFO] 原始樣本數: {len(data)}")
print(f"[INFO] 新資料集樣本數: {len(final_dataset)}")
print(f"[INFO] 已儲存至: {output_path}")

In [None]:
# Check the similarity score of one label-1 pair and one label-0 pair.
# NOTE(review): relies on `torch`, `device` and `similarity_score` left in the
# kernel by earlier cells; none of them is defined in this cell.
import pickle 

with open("/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_balanced_embeddings.pickle", "rb") as f:
    data = pickle.load(f)

# Check the label distribution.
from collections import Counter
labels = [label for _, _, label in data]
label_counts = Counter(labels)
print("Label distribution:", label_counts)

# Score only the first pair found with label 1, then stop.
for v1, v2, label in data:
    if label == 1:
        t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
        t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)
        pos_score = similarity_score(t1, t2)
        print(f"Positive pair similarity score: {pos_score.item():.4f}")
        break
# Same for the first pair found with label 0.
for v1, v2, label in data:
    if label == 0:
        t1 = torch.tensor(v1, dtype=torch.float32, device=device).unsqueeze(0)
        t2 = torch.tensor(v2, dtype=torch.float32, device=device).unsqueeze(0)
        neg_score = similarity_score(t1, t2)
        print(f"Negative pair similarity score: {neg_score.item():.4f}")
        break

Check adapter performance

In [None]:
import torch
import torch.nn as nn

class AdapterMapper(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> LayerNorm -> Linear) over embeddings.

    When `input_dim == output_dim` the MLP output is added to the input
    (residual connection); otherwise the MLP output is returned directly.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        # The position of each layer fixes its state_dict key
        # (mapper.0, mapper.2, mapper.3), so the order must stay as is.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.mapper = nn.Sequential(*layers)
        self.use_residual = input_dim == output_dim

    def forward(self, x):
        mapped = self.mapper(x)
        return x + mapped if self.use_residual else mapped


In [None]:
import pickle
import torch
import torch.nn as nn
from transformers import RobertaForMaskedLM, AutoTokenizer
import torch.nn.functional as F

class AdapterMapper(nn.Module):
    """Embedding mapper: Linear -> ReLU -> LayerNorm -> Linear.

    A residual connection is used only when input and output widths match.
    (A deeper GELU-based variant with an extra hidden Linear/LayerNorm stage
    was tried here previously and is not active.)
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        self.mapper = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
            nn.Linear(in_features=hidden_dim, out_features=output_dim),
        )
        # The skip connection needs matching shapes.
        self.use_residual = (input_dim == output_dim)

    def forward(self, x):
        delta = self.mapper(x)
        if not self.use_residual:
            return delta
        return x + delta

# === 定義 similarity 函式 ===
# def similarity_score(x1, x2):
#     dist = torch.norm(x1 - x2, dim=1)  
#     return 1 / (1 + dist)

# def similarity_score(x1, x2):
#     # F.cosine_similarity 輸出的範圍是 -1 到 1
#     # 這裡假設 x1 和 x2 都是 (batch_size, dim)
#     sim = F.cosine_similarity(x1, x2, dim=1)
    
#     # 您可以選擇是否要將其轉換到 0 到 1 之間
#     # return (sim + 1) / 2 
#     return sim

def similarity_score(x1, x2):
    """Row-wise cosine similarity between two (batch, dim) tensors.

    The raw cosine is returned unscaled, so values lie in [-1, 1]:
        [2, 0] vs [3, 0]  -> angle 0 deg   ->  1
        [2, 0] vs [0, 2]  -> angle 90 deg  ->  0
        [2, 0] vs [-2, 0] -> angle 180 deg -> -1
    """
    return F.cosine_similarity(x1, x2, dim=1)

# --- 1. Settings ---
FILE_PATH = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_functions_deduped.pickle"
BERT_PATH = '/home/tommy/Project/PcodeBERT/outputs/model_epoch_50'
ADAPTER_PATH = "/home/tommy/Project/PcodeBERT/outputs/adapter/adapter_model_100_cosine.pt"

# Number of leading pairs to evaluate, and how many texts to embed per batch.
NUM_SAMPLES = 1000
BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- 2. Load models ---
tokenizer = AutoTokenizer.from_pretrained(BERT_PATH)
roberta = RobertaForMaskedLM.from_pretrained(BERT_PATH).to(DEVICE)
roberta.eval()

hidden_size = roberta.config.hidden_size
print(f"RoBERTa hidden size: {hidden_size}")

# Input and output widths are equal, so AdapterMapper takes its residual branch.
adapter = AdapterMapper(input_dim=hidden_size, output_dim=hidden_size).to(DEVICE)
state_dict = torch.load(ADAPTER_PATH, map_location=DEVICE)
adapter.load_state_dict(state_dict)
adapter.eval()
print("Adapter state_dict 載入成功。")

# --- 3. Embedding helper ---
def text_to_embedding(texts, model, tokenizer, device):
    """Embed a batch of texts with the hidden state at sequence position 0.

    Args:
        texts: List of strings to tokenize (padded, truncated to 512 tokens).
        model: Object exposing a `.roberta` encoder callable.
        tokenizer: Tokenizer returning a mapping of tensors.
        device: Device the tokenized tensors are moved to.

    Returns:
        Tensor of shape (batch, hidden): `last_hidden_state[:, 0, :]`.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

    on_device = {}
    for key, tensor in encoded.items():
        on_device[key] = tensor.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model.roberta(**on_device).last_hidden_state

    first_token = hidden_states[:, 0, :]
    return first_token

# --- 4. Load data ---
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open(FILE_PATH, 'rb') as f:
    data = pickle.load(f)

# Keep at most NUM_SAMPLES pairs; the file may hold fewer.
data = data[:NUM_SAMPLES]
texts1 = [d[0] for d in data]
texts2 = [d[1] for d in data]

# --- 5. Embed in batches ---
v1_roberta_list, v2_roberta_list = [], []
v1_adapter_list, v2_adapter_list = [], []

with torch.no_grad():
    for i in range(0, len(texts1), BATCH_SIZE):
        print(f"Processing batch {i//BATCH_SIZE + 1}...")
        batch_texts1 = texts1[i:i+BATCH_SIZE]
        batch_texts2 = texts2[i:i+BATCH_SIZE]

        r_v1 = text_to_embedding(batch_texts1, roberta, tokenizer, DEVICE)
        r_v2 = text_to_embedding(batch_texts2, roberta, tokenizer, DEVICE)

        a_v1 = adapter(r_v1)
        a_v2 = adapter(r_v2)

        # Move results to the CPU so GPU memory is not held across batches.
        v1_roberta_list.append(r_v1.cpu())
        v2_roberta_list.append(r_v2.cpu())
        v1_adapter_list.append(a_v1.cpu())
        v2_adapter_list.append(a_v2.cpu())

v1_roberta = torch.cat(v1_roberta_list, dim=0)
v2_roberta = torch.cat(v2_roberta_list, dim=0)
v1_adapter = torch.cat(v1_adapter_list, dim=0)
v2_adapter = torch.cat(v2_adapter_list, dim=0)

# --- 6. Similarities and their difference ---
sim_roberta = similarity_score(v1_roberta, v2_roberta)
sim_adapter = similarity_score(v1_adapter, v2_adapter)
differences = sim_adapter - sim_roberta

# --- 7. Report ---
print("\n--- 比較結果 (前 5 筆) ---")
# Bound by the number of pairs actually scored, not by NUM_SAMPLES, so a
# file with fewer than 5 pairs does not raise IndexError.
for i in range(min(5, len(differences))):
    print(f"樣本 {i}: RoBERTa Sim={sim_roberta[i]:.4f}, Adapter Sim={sim_adapter[i]:.4f}, 差距={differences[i]:+.4f}")

print("\n--- 總體平均 ---")
print(f"平均 RoBERTa 相似度: {sim_roberta.mean().item():.4f}")
print(f"平均 Adapter 相似度: {sim_adapter.mean().item():.4f}")
print(f"平均 相似度差距 (Adapter - RoBERTa): {differences.mean().item():+.4f}")

In [1]:
import torch
import torch.nn as nn
from adapters import AutoAdapterModel
from transformers import AutoTokenizer
import os

class AdapterEmbeddingModel(nn.Module):
    """Wrap an `AutoAdapterModel` carrying one named adapter and expose an embedding per input.

    `input_dim`, `output_dim` and `hidden_dim` are accepted by `__init__` but
    are never read anywhere in this class.
    """
    def __init__(self, model_name, adapter_config, adapter_name, 
                 input_dim=256, output_dim=256, hidden_dim=128):
        """Load tokenizer and model from `model_name`, add `adapter_name` and activate it.

        Args:
            model_name: Path or identifier passed to both `from_pretrained` calls.
            adapter_config: Configuration passed to `add_adapter`.
            adapter_name: Name under which the adapter is added, trained and activated.
            input_dim, output_dim, hidden_dim: Unused in this class.
        """
        super().__init__()
        
        self.adapter_name = adapter_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        self.model = AutoAdapterModel.from_pretrained(model_name)
        self.model.add_adapter(adapter_name, config=adapter_config)
        self.model.train_adapter(adapter_name)
        self.model.set_active_adapters(adapter_name)
        
        # Freeze every parameter whose name does not contain 'adapter'.
        # NOTE(review): `train_adapter` presumably already freezes the base
        # weights -- confirm against the adapters library documentation.
        for name, param in self.model.named_parameters():
            if 'adapter' not in name.lower():
                param.requires_grad = False
        
        # Report the total and trainable parameter counts.
        total = sum(p.numel() for p in self.model.parameters())
        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"\nAdapter Model: {total:,} total, {trainable:,} trainable ({trainable/total*100:.1f}%)\n")
    
    def forward(self, input_ids, attention_mask=None):
        """Return position 0 (along the second axis) of the last entry of `hidden_states`.

        Assumes hidden states are (batch, seq, hidden) -- TODO confirm.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, 
                           output_hidden_states=True)
        return outputs.hidden_states[-1][:, 0, :]
    
    def save_adapter(self, save_path):
        """Create `save_path` if needed and save this model's adapter into it."""
        os.makedirs(save_path, exist_ok=True)
        self.model.save_adapter(save_path, self.adapter_name)
    
    def load_adapter(self, load_path):
        """Load an adapter from `load_path` under this model's adapter name and activate it."""
        self.model.load_adapter(load_path, load_as=self.adapter_name)
        self.model.set_active_adapters(self.adapter_name)
"""
Adapter 訓練配置檔案
使用 adapter-transformers 套件進行 Adapter 訓練
"""

def get_adapter_config():
    """
    Return the configuration used to train the adapter.

    Returns:
        dict: A fresh dictionary holding every training parameter.
    """
    config = {}

    # Base model and adapter architecture.
    config.update(
        model_name="/home/tommy/Project/PcodeBERT/outputs/models/RoBERTa/model_epoch_100",
        adapter_name="pcode_adapter",
        adapter_config="pfeiffer",
        reduction_factor=32,
        non_linearity="gelu",
        leave_out=[0, 1, 2, 3, 4],
        input_dim=256,
        output_dim=256,
        hidden_dim=128,
        use_projection=False,
    )

    # Training data.
    config.update(
        data_path="/home/tommy/Project/PcodeBERT/outputs/data/Adapters/train_x86_64_arm_32_functions_deduped.pickle",
        val_data_path=None,
        val_split=0.2,
    )

    # Optimisation.
    config.update(
        batch_size=128,
        learning_rate=1e-4,
        epochs=5,
        weight_decay=0.01,
    )

    # Loss.
    config.update(
        loss_functions=["mse"],
        triplet_margin=1.0,
        triplet_p=2,
    )

    # Learning-rate schedule and early stopping.
    config.update(
        scheduler_type="cosine",
        scheduler_patience=10,
        scheduler_factor=0.5,
        early_stop_patience=10,
    )

    # Runtime and output locations.
    config.update(
        device="cuda",
        save_dir="/home/tommy/Project/PcodeBERT/outputs/adapter",
        save_model_name="adapter_roberta",
        max_length=512,
        seed=42,
        log_interval=10,
    )

    return config


def get_inference_config():
    """
    Return the configuration used to apply a trained adapter at inference time.

    Returns:
        dict: A fresh dictionary of inference settings.
    """
    project_outputs = "/home/tommy/Project/PcodeBERT/outputs"
    gnn_data = project_outputs + "/data/GNN/gpickle_merged_adjusted_filtered"

    return dict(
        model_path=project_outputs + "/models/RoBERTa/model_epoch_100",
        adapter_path=project_outputs + "/adapter/adapter_roberta_mse",
        adapter_name="pcode_adapter",
        input_path=gnn_data,
        output_path=gnn_data + "_adapter",
        csv_path="/home/tommy/Project/PcodeBERT/dataset/csv/merged_adjusted_filtered.csv",
        target_cpus=["x86_64", "ARM"],
        batch_size=64,
        device="cuda",
        max_length=512,
    )


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
sys.path.append('/home/tommy/Project/PcodeBERT/src')

import pickle
import torch
import torch.nn.functional as F
from adapters import AdapterConfig

def similarity_score(x1, x2):
    """Cosine similarity of corresponding rows of x1 and x2 (reduced over dim 1)."""
    cosine = F.cosine_similarity(x1, x2, dim=1)
    return cosine

# --- Settings ---
FILE_PATH = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_functions_deduped.pickle"
ADAPTER_PATH = "/home/tommy/Project/PcodeBERT/outputs/adapter/adapter_roberta_mse"
# Number of leading pairs to evaluate, and how many texts to embed per batch.
NUM_SAMPLES = 1000
BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {DEVICE}")

# --- Load configuration and model ---
# NOTE(review): `get_adapter_config` and `AdapterEmbeddingModel` are not
# defined or imported in this cell; they come from the previous cell.
config = get_adapter_config()

adapter_config = AdapterConfig.load(
    config["adapter_config"],
    reduction_factor=config["reduction_factor"],
    non_linearity=config["non_linearity"],
    leave_out=config["leave_out"]
)

model = AdapterEmbeddingModel(
    model_name=config["model_name"],
    adapter_config=adapter_config,
    adapter_name=config["adapter_name"],
    input_dim=config["input_dim"],
    output_dim=config["output_dim"],
    hidden_dim=config["hidden_dim"]
).to(DEVICE)

# The constructor already added an adapter under this name; loading the saved
# one replaces it (the recorded output shows "Overwriting existing adapter").
model.load_adapter(ADAPTER_PATH)
model.eval()

# --- Embedding helper ---
def get_embeddings(texts, model, device, batch_size=32, max_length=512):
    """Embed `texts` in batches and return all embeddings on the CPU.

    Args:
        texts: List of strings.
        model: Object with a `.tokenizer` callable that is itself callable as
            `model(input_ids, attention_mask)`, returning one row per text.
        device: Device the tokenized tensors are moved to.
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer (default 512,
            the value that used to be hard-coded).

    Returns:
        Tensor with one row per text, in input order. For empty `texts` an
        empty tensor is returned instead of failing inside `torch.cat`.
    """
    if len(texts) == 0:
        return torch.empty(0)

    all_embeddings = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = model.tokenizer(batch_texts, padding=True, truncation=True,
                                     max_length=max_length, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            embeddings = model(inputs['input_ids'], inputs['attention_mask'])
            # Keep results on the CPU so GPU memory is freed between batches.
            all_embeddings.append(embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)

# --- Load data ---
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open(FILE_PATH, 'rb') as f:
    data = pickle.load(f)

# Keep at most NUM_SAMPLES pairs; the file may hold fewer.
data = data[:NUM_SAMPLES]
texts1 = [d[0] for d in data]
texts2 = [d[1] for d in data]

# --- Compute embeddings ---
print("Computing embeddings...")
v1_adapter = get_embeddings(texts1, model, DEVICE, BATCH_SIZE)
v2_adapter = get_embeddings(texts2, model, DEVICE, BATCH_SIZE)

# --- Compute similarity ---
sim_adapter = similarity_score(v1_adapter, v2_adapter)

# --- Report ---
print("\n--- 結果 (前 5 筆) ---")
# Bound by the number of pairs actually scored, not by NUM_SAMPLES, so a
# file with fewer than 5 pairs does not raise IndexError.
for i in range(min(5, len(sim_adapter))):
    print(f"樣本 {i}: Adapter Sim={sim_adapter[i]:.4f}")

print("\n--- 總體統計 ---")
print(f"平均相似度: {sim_adapter.mean().item():.4f}")
print(f"中位數: {sim_adapter.median().item():.4f}")
print(f"標準差: {sim_adapter.std().item():.4f}")
print(f"最小值: {sim_adapter.min().item():.4f}")
print(f"最大值: {sim_adapter.max().item():.4f}")


Some weights of RobertaAdapterModel were not initialized from the model checkpoint at /home/tommy/Project/PcodeBERT/outputs/models/RoBERTa/model_epoch_100 and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
There are adapters available but none are activated for the forward pass.


Using device: cuda

Adapter Model: 11,330,387 total, 4,360 trainable (0.0%)



Overwriting existing adapter 'pcode_adapter'.


Computing embeddings...

--- 結果 (前 5 筆) ---
樣本 0: Adapter Sim=0.9943
樣本 1: Adapter Sim=1.0000
樣本 2: Adapter Sim=0.9668
樣本 3: Adapter Sim=0.9891
樣本 4: Adapter Sim=0.9958

--- 總體統計 ---
平均相似度: 0.9875
中位數: 0.9895
標準差: 0.0096
最小值: 0.9136
最大值: 1.0000


Check training data format

In [None]:
import pickle 
# Balanced embedding-pair dataset to inspect.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
file = "/home/tommy/Project/PcodeBERT/outputs/alignment_sentences/train_x86_64_arm_32_balanced_embeddings.pickle"

with open(file, "rb") as f:
    data = pickle.load(f)

# Show the container type, then the type and content of its first item.

print(f"Data type: {type(data)}")
print(f"First item type: {type(data[0])}")
print(f"First item content: {data[0]}")



In [2]:
import pickle
# Preprocessed corpus file to inspect; the recorded output below shows a list
# whose first item is a list of P-code token strings.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
file = "/home/tommy/Project/PcodeBERT/outputs/preprocessed/pcode_corpus_x86_64_new_data.pkl"

with open(file, "rb") as f:
    data = pickle.load(f)

# Show the container type, then the type and content of its first item.
print(f"Data type: {type(data)}")
print(f"First item type: {type(data[0])}")
print(f"First item content: {data[0]}")


Data type: <class 'list'>
First item type: <class 'list'>
First item content: ['LOAD', 'UNIQUE', 'CONST', 'REG', 'INT_ZEXT', 'REG', 'UNIQUE', 'INT_ADD', 'UNIQUE', 'REG', 'CONST', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'CAST', 'UNIQUE', 'UNIQUE', 'COPY', 'STACK', 'UNIQUE', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'PTRSUB', 'UNIQUE', 'CONST', 'CONST', 'PTRADD', 'UNIQUE', 'UNIQUE', 'REG', 'CONST', 'INT_LESSEQUAL', 'UNIQUE', 'REG', 'UNIQUE', 'CBRANCH', 'MEM', 'UNIQUE', 'CAST', 'UNIQUE', 'UNIQUE', 'INT_EQUAL', 'REG', 'UNIQUE', 'CONST', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'PTRADD', 'UNIQUE', 'REG', 'CONST', 'CONST', 'CAST', 'UNIQUE', 'UNIQUE', 'CBRANCH', 'MEM', 'REG', 'INT_NOTEQUAL', 'REG', 'UNIQUE', 'CONST', 'CBRANCH', 'MEM', 'REG', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'INT_AND', 'UNIQUE', 'UNIQUE', 'CONST', 'STORE', 'CONST', 'UNIQUE', 'UNIQUE', 'INDIRECT', 'MEM', 'MEM', 'CONST', 'PTRADD', 'UNIQUE', 'REG', 'CONST', 'CONST', 'STORE', 'CONST', 'UNIQUE', 'CONST', 'INDIRECT', 'MEM', 'MEM', 'CONST', 

In [None]:
import pickle 

with