Gpickle

In [None]:
import pickle
import torch

path = "/home/tommy/Project/PcodeBERT/outputs/embeddings/19/19adb6449355f995b6119cd1ff97c20a0241e2f3b57ee26b6ad3c6c62d6de8a4.gpickle"

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that come from a trusted source.
with open(path, 'rb') as f:
    data = pickle.load(f)

print(data.keys())

# Printing every embedding floods the cell output, so report the total count
# and preview only the first few entries.
PREVIEW_COUNT = 3
node_embeddings = data['node_embeddings']
print(f"Total nodes with embeddings: {len(node_embeddings)}")

for node_id, embedding in list(node_embeddings.items())[:PREVIEW_COUNT]:
    print(f"Node ID: {node_id}")
    print(f"Embedding: {embedding}")

Node ID: 0x400c7cL
Embedding: [-0.04375347  0.7788704  -1.5390887   0.88755184  0.2612036   0.23350312
 -0.65545446  0.71358025  2.0820036  -0.26770324  1.0942057  -0.1646293
  0.5374596  -1.3948683  -1.1313785  -1.5135158  -1.3902414  -0.8858508
  0.38454613  1.4683092   0.5850619   0.21381345  0.6044717  -0.942066
 -0.14374267  0.9826433   0.8757215   0.90946776  0.65131515  0.33878854
 -2.1713219   0.29483244 -1.8595995   0.25510585  1.8910512   1.2818451
 -0.65115356 -1.4420083   0.682873    0.85709965  0.05592961  0.4202211
  0.63530266 -0.28647023 -1.0307099   0.89636475  0.2897106   0.6984294
  0.60195047 -1.1575505  -1.2377872   0.07677054 -0.7633637   0.8529213
 -1.1776712  -0.17683129  0.08594505 -0.30631095  0.56732666 -1.8134546
 -0.05288556  0.07962415 -1.8859808  -2.9215894   0.985836   -0.32625392
 -0.39131877 -0.780987    0.5169391   2.4824078  -0.8264567   0.4233305
  0.6252151   1.0398829  -0.8817629  -1.0199324  -1.8248454  -0.7003135
 -0.7574694  -0.3587801   0.4327

Embedding

In [None]:
import os
import sys
import pickle
import torch
import numpy as np
from transformers import RobertaForMaskedLM, AutoTokenizer

def load_pretrained_model(model_path="/home/tommy/Project/PcodeBERT/outputs/models/pretrain"):
    """Load the pretrained model and its tokenizer.

    Args:
        model_path: Location handed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to the project's pretraining
            output directory, so existing zero-argument calls are unchanged.

    Returns:
        A ``(model, tokenizer, device)`` tuple. The model has already been
        moved to ``device`` and switched to evaluation mode.
    """
    print(f"Loading model from: {model_path}")

    # Load the tokenizer and the model from the same checkpoint location.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = RobertaForMaskedLM.from_pretrained(model_path)

    # Use CUDA when it is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    print(f"Model loaded successfully on device: {device}")
    return model, tokenizer, device

def get_sentence_embedding(sentence, model, tokenizer, device):
    """Produce an embedding vector for a single sentence.

    Args:
        sentence: Text to embed.
        model: Object exposing a ``roberta`` encoder whose output has a
            ``last_hidden_state`` attribute.
        tokenizer: Callable that turns ``sentence`` into a mapping of tensors.
        device: Device the tokenized tensors are moved to before the forward pass.

    Returns:
        A one-dimensional NumPy array: the hidden state at the first token
        position (the [CLS]-style token) of the only sequence in the batch.
    """
    # Tokenize, then move every tensor onto the target device.
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass with gradient tracking disabled.
    with torch.no_grad():
        hidden_states = model.roberta(**batch).last_hidden_state
        # Keep only position 0 of every sequence and hand it back as NumPy.
        first_token_vectors = hidden_states[:, 0, :].cpu().numpy()

    # The batch holds a single sentence, so return its row as a 1-D array.
    return first_token_vectors[0]

def process_graph_data(graph_path):
    """Load a pickled graph and generate an embedding for each node sentence.

    Nodes whose ``'sentence'`` attribute is missing or empty are skipped.

    Args:
        graph_path: Path of the pickled graph file to read.

    Returns:
        A ``(node_embeddings, sentences)`` tuple: ``node_embeddings`` maps each
        processed node id to its embedding, and ``sentences`` lists the
        embedded sentences in the same order the nodes were processed.
    """
    # Load the model first, matching the original execution order.
    model, tokenizer, device = load_pretrained_model()

    # Load the graph.
    print(f"Loading graph data from: {graph_path}")
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only open pickles that come from a trusted source.
    with open(graph_path, 'rb') as f:
        graph = pickle.load(f)

    print(f"Graph type: {type(graph)}")
    print(f"Number of nodes: {graph.number_of_nodes()}")

    # Embed every node that carries a non-empty sentence.
    node_embeddings = {}
    sentences = []

    for node_id, node_data in graph.nodes(data=True):
        sentence = node_data.get('sentence', '')
        if not sentence:
            continue

        print(f"Processing node {node_id}: {sentence[:50]}...")
        node_embeddings[node_id] = get_sentence_embedding(sentence, model, tokenizer, device)
        sentences.append(sentence)

    print(f"Generated embeddings for {len(node_embeddings)} nodes")
    if node_embeddings:
        first_embedding = next(iter(node_embeddings.values()))
        print(f"Embedding dimension: {len(first_embedding)}")
    else:
        # Previously this indexed element 0 of an empty list and raised
        # IndexError when no node had a sentence.
        print("Embedding dimension: n/a (no node has a non-empty 'sentence')")

    return node_embeddings, sentences

def main(
    graph_path="/home/tommy/Project/PcodeBERT/outputs/gpickle/00/00a3a8743be45f6c561c08be96b325c9d6c0a8b619dc67a4e44c5423bcde5532.gpickle",
    output_path="/home/tommy/Project/PcodeBERT/outputs/embeddings/graph_embeddings.pkl",
):
    """Embed the nodes of one graph, print a short preview, and pickle the result.

    Args:
        graph_path: Pickled graph to process. Defaults to the test sample.
        output_path: Destination of the pickled result dictionary, which holds
            the keys ``'node_embeddings'``, ``'sentences'`` and ``'embedding_dim'``.

    Any exception is reported with its traceback instead of being propagated.
    """
    import traceback

    try:
        # Generate the embeddings.
        node_embeddings, sentences = process_graph_data(graph_path)

        if not node_embeddings:
            # Without this guard the 'embedding_dim' lookup below would index
            # into an empty collection and fail with an unhelpful IndexError.
            print("No node embeddings were generated; nothing to save.")
            return

        # Show only the first three results. Embeddings and sentences are
        # appended together by process_graph_data, so zipping keeps them paired.
        print("\n=== Embedding Results ===")
        for _, (node_id, embedding), sentence in zip(range(3), node_embeddings.items(), sentences):
            print(f"Node {node_id}:")
            print(f"  Sentence: {sentence[:100]}...")
            print(f"  Embedding shape: {embedding.shape}")
            print(f"  Embedding mean: {np.mean(embedding):.4f}")
            print(f"  Embedding std: {np.std(embedding):.4f}")
            print()

        # Persist the results. os.makedirs('') raises, so skip it when the
        # output path has no directory component.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        result = {
            'node_embeddings': node_embeddings,
            'sentences': sentences,
            'embedding_dim': len(next(iter(node_embeddings.values())))
        }

        with open(output_path, 'wb') as f:
            pickle.dump(result, f)

        print(f"Results saved to: {output_path}")

    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()

# Run the embedding pipeline only when this code executes as the main module.
if __name__ == "__main__":
    main()


In [None]:
import os
import sys
import pickle
import torch
import numpy as np
from transformers import RobertaForMaskedLM, AutoTokenizer

def load_pretrained_model(model_path="/home/tommy/Project/PcodeBERT/outputs/models/pretrain"):
    """Load the pretrained model and its tokenizer.

    Args:
        model_path: Location handed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to the project's pretraining
            output directory, so existing zero-argument calls are unchanged.

    Returns:
        A ``(model, tokenizer, device)`` tuple. The model has already been
        moved to ``device`` and switched to evaluation mode.
    """
    print(f"Loading model from: {model_path}")

    # Load the tokenizer and the model from the same checkpoint location.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = RobertaForMaskedLM.from_pretrained(model_path)

    # Use CUDA when it is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    print(f"Model loaded successfully on device: {device}")
    return model, tokenizer, device

# Pickled graph used for this single-node experiment.
graph_path = "/home/tommy/Project/PcodeBERT/outputs/gpickle/00/00a3a8743be45f6c561c08be96b325c9d6c0a8b619dc67a4e44c5423bcde5532.gpickle"

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that come from a trusted source.
with open(graph_path, 'rb') as f:
    data = pickle.load(f)

# Read the 'sentence' attribute of one hard-coded node; falls back to an empty
# string when the attribute is absent (a missing node id raises instead).
sample_sentence = data.nodes["0x10002c24L"].get('sentence', '')
print(f"Sample sentence: {sample_sentence}")


model, tokenizer, device = load_pretrained_model()

# Tokenize the sentence and move every resulting tensor onto the model's device.
inputs = tokenizer(sample_sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
inputs = {k: v.to(device) for k, v in inputs.items()}
# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model.roberta(**inputs)
    # Keep index 0 along the second axis of last_hidden_state (presumably the
    # first-token representation - confirm); the leading batch axis is kept.
    embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
print(f"Embedding shape: {embedding.shape}")