Gpickle

In [None]:
import pickle
import torch

# Embedding file to inspect; the loop below expects it to unpickle into a
# mapping that has a 'node_embeddings' entry.
path = "/home/tommy/Project/PcodeBERT/outputs/embeddings/19/19adb6449355f995b6119cd1ff97c20a0241e2f3b57ee26b6ad3c6c62d6de8a4.gpickle"

# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open(path, 'rb') as embedding_file:
    data = pickle.load(embedding_file)

# Print each node id followed by its stored embedding, one pair per node.
node_embeddings = data['node_embeddings']
for node_id, embedding in node_embeddings.items():
    print(f"Node ID: {node_id}", f"Embedding: {embedding}", sep="\n")

Embedding

In [None]:
import os
import sys
import pickle
import torch
import numpy as np
from transformers import RobertaForMaskedLM, AutoTokenizer

def load_pretrained_model(model_path="/home/tommy/Project/PcodeBERT/outputs/models/pretrain"):
    """Load the pretrained masked-LM model and its tokenizer.

    Args:
        model_path: Location handed to both ``from_pretrained`` calls.
            Defaults to the project's pretraining output directory, so
            existing zero-argument calls behave exactly as before.

    Returns:
        A ``(model, tokenizer, device)`` tuple. The model has been moved to
        ``device`` (CUDA when ``torch.cuda.is_available()`` is true, CPU
        otherwise) and switched to evaluation mode.
    """
    print(f"Loading model from: {model_path}")

    # Load the tokenizer and the model from the same location.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = RobertaForMaskedLM.from_pretrained(model_path)

    # Select the device and put the model in inference (eval) mode.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    print(f"Model loaded successfully on device: {device}")
    return model, tokenizer, device

def get_sentence_embedding(sentence, model, tokenizer, device):
    """Return the first-token hidden state of ``sentence`` as a NumPy array.

    The sentence is tokenized (truncation enabled, ``max_length=512``), the
    resulting tensors are moved to ``device``, and ``model.roberta`` is run
    with gradient tracking disabled. The last hidden state at sequence
    position 0 of the first batch row is returned.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Move every tokenizer output onto the requested device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model.roberta(**on_device).last_hidden_state
        # Position 0 holds the leading special token (the [CLS]-style slot).
        first_token = hidden_states[:, 0, :].cpu().numpy()

    # Drop the batch axis: only the first row is handed back.
    return first_token[0]

def process_graph_data(graph_path):
    """Embed the ``'sentence'`` attribute of every node in a pickled graph.

    Args:
        graph_path: Path to a pickle file holding a graph object that offers
            ``number_of_nodes()`` and ``nodes(data=True)``.

    Returns:
        ``(node_embeddings, sentences)``: ``node_embeddings`` maps each node
        id to its embedding and ``sentences`` lists the embedded sentences in
        the same order. Nodes whose ``'sentence'`` attribute is missing or
        empty are skipped; both containers are empty when no node qualifies.
    """
    # Load the model.
    model, tokenizer, device = load_pretrained_model()

    # Load the graph.
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    print(f"Loading graph data from: {graph_path}")
    with open(graph_path, 'rb') as f:
        graph = pickle.load(f)

    print(f"Graph type: {type(graph)}")
    print(f"Number of nodes: {graph.number_of_nodes()}")

    # Embed every node that carries a non-empty sentence.
    node_embeddings = {}
    sentences = []

    for node_id, node_data in graph.nodes(data=True):
        sentence = node_data.get('sentence', '')
        if not sentence:
            continue

        print(f"Processing node {node_id}: {sentence[:50]}...")
        node_embeddings[node_id] = get_sentence_embedding(sentence, model, tokenizer, device)
        sentences.append(sentence)

    print(f"Generated embeddings for {len(node_embeddings)} nodes")
    # With no embedded nodes there is no first embedding to measure; the
    # previous unconditional indexing raised IndexError in that case.
    if node_embeddings:
        first_embedding = next(iter(node_embeddings.values()))
        print(f"Embedding dimension: {len(first_embedding)}")

    return node_embeddings, sentences

def main():
    """Embed the sample graph, print a short preview, and pickle the result.

    Any exception is reported together with its traceback instead of
    propagating. When no node embeddings were produced, nothing is previewed
    or written and a short message is printed instead.
    """
    # Path of the test graph.
    graph_path = "/home/tommy/Project/PcodeBERT/outputs/gpickle/00/00a3a8743be45f6c561c08be96b325c9d6c0a8b619dc67a4e44c5423bcde5532.gpickle"

    try:
        # Process the data.
        node_embeddings, sentences = process_graph_data(graph_path)

        # Bail out before touching the filesystem: with no embeddings there
        # is no first entry to take the dimension from, and writing would
        # replace any earlier result file with an empty one.
        if not node_embeddings:
            print("No node embeddings were generated; nothing to save.")
            return

        # Show the results (first 3 entries only).
        print("\n=== Embedding Results ===")
        for i, (node_id, embedding) in enumerate(list(node_embeddings.items())[:3]):
            print(f"Node {node_id}:")
            print(f"  Sentence: {sentences[i][:100]}...")
            print(f"  Embedding shape: {embedding.shape}")
            print(f"  Embedding mean: {np.mean(embedding):.4f}")
            print(f"  Embedding std: {np.std(embedding):.4f}")
            print()

        # Save the results.
        output_path = "/home/tommy/Project/PcodeBERT/outputs/embeddings/graph_embeddings.pkl"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        first_embedding = next(iter(node_embeddings.values()))
        result = {
            'node_embeddings': node_embeddings,
            'sentences': sentences,
            'embedding_dim': len(first_embedding)
        }

        with open(output_path, 'wb') as f:
            pickle.dump(result, f)

        print(f"Results saved to: {output_path}")

    except Exception as e:
        # Best-effort cell: report the failure in full rather than raising.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()


In [None]:
import os
import sys
import pickle
import torch
import numpy as np
from transformers import RobertaForMaskedLM, AutoTokenizer

def load_pretrained_model(model_path="/home/tommy/Project/PcodeBERT/outputs/models/pretrain"):
    """Load the pretrained masked-LM model and its tokenizer.

    Args:
        model_path: Location handed to both ``from_pretrained`` calls.
            Defaults to the project's pretraining output directory, so
            existing zero-argument calls behave exactly as before.

    Returns:
        A ``(model, tokenizer, device)`` tuple. The model has been moved to
        ``device`` (CUDA when ``torch.cuda.is_available()`` is true, CPU
        otherwise) and switched to evaluation mode.
    """
    print(f"Loading model from: {model_path}")

    # Load the tokenizer and the model from the same location.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = RobertaForMaskedLM.from_pretrained(model_path)

    # Select the device and put the model in inference (eval) mode.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    print(f"Model loaded successfully on device: {device}")
    return model, tokenizer, device

# Pickled graph whose node "0x10002c24L" is used as the sample input.
graph_path = "/home/tommy/Project/PcodeBERT/outputs/gpickle/00/00a3a8743be45f6c561c08be96b325c9d6c0a8b619dc67a4e44c5423bcde5532.gpickle"

# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open(graph_path, 'rb') as graph_file:
    data = pickle.load(graph_file)

# Fetch the node's sentence, falling back to '' when the attribute is absent.
sample_node = data.nodes["0x10002c24L"]
sample_sentence = sample_node.get('sentence', '')
print(f"Sample sentence: {sample_sentence}")


model, tokenizer, device = load_pretrained_model()

# Tokenize, move the tensors to the model's device, and run the encoder
# with gradient tracking disabled.
encoded = tokenizer(
    sample_sentence,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
with torch.no_grad():
    outputs = model.roberta(**inputs)
    # Keep the hidden state at sequence position 0 for every batch row.
    first_token_states = outputs.last_hidden_state[:, 0, :]
    embedding = first_token_states.cpu().numpy()
print(f"Embedding shape: {embedding.shape}")

In [None]:
import pickle
import torch
# Embedding file whose structure is inspected below.
file_path = '/home/tommy/Project/PcodeBERT/outputs/embeddings/00/00a3a8743be45f6c561c08be96b325c9d6c0a8b619dc67a4e44c5423bcde5532.gpickle'

# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open(file_path, 'rb') as f:
    data = pickle.load(f)

# Summarize the loaded object once. The cell previously printed the same
# type/keys summary twice from two copy-pasted passes; they are merged
# here and both fallback prints are kept for objects without keys.
print('Data type:', type(data))
if hasattr(data, 'keys'):
    print('Data keys:', list(data.keys()))
    for key, value in data.items():
        # Values without a `shape` attribute are reported as "N/A".
        print(f'{key}: {type(value)}, shape: {getattr(value, "shape", "N/A")}')
else:
    print('Data keys:', 'No keys')
    print('Data:', data)
    print('Data attributes:', dir(data))

Data type: <class 'dict'>
Data keys: ['file_path', 'node_embeddings', 'node_sentences', 'num_nodes', 'embedding_dim']
file_path: <class 'str'>, shape: N/A
node_embeddings: <class 'dict'>, shape: N/A
node_sentences: <class 'dict'>, shape: N/A
num_nodes: <class 'int'>, shape: N/A
embedding_dim: <class 'int'>, shape: N/A




UnpicklingError: Weights only load failed. In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
Please file an issue with the following so that we can make `weights_only=True` compatible with your use case: WeightsUnpickler error: Unsupported operand 149

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [None]:
# Re-read the embedding file; `file_path` is the name assigned in the
# previous cell, so that cell has to run first.
with open(file_path, 'rb') as pickled_file:
    data = pickle.load(pickled_file)