In [1]:
# 1. Read the protein info file and build an ID -> preferred-name lookup.
id_name_map = {}
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('../Data/9606.protein.info.v12.0.txt', 'r', encoding='utf-8') as f_info:
    # Skip the header row (assumes the first line is a header — adjust if the
    # file layout changes). The default keeps an empty file from raising.
    next(f_info, None)
    for line in f_info:
        parts = line.strip().split('\t')  # assumes tab-separated columns
        if len(parts) < 2:
            # Blank or truncated line: there is no ID/name pair to record.
            continue
        protein_id, preferred_name = parts[0], parts[1]
        id_name_map[protein_id] = preferred_name

In [2]:
# 2. Rewrite the interaction file, replacing each protein ID with its name.
# Both files are opened with an explicit UTF-8 encoding so that the output is
# independent of the platform's default locale encoding.
with open('../Data/9606.protein.links.v12.0.onlyAB.txt', 'r', encoding='utf-8') as f_inter, \
     open('../Data/PPI_with_names.txt', 'w', encoding='utf-8') as f_out:
    # Write the new header.
    f_out.write('protein1_name\tprotein2_name\tcombined_score\n')
    for line in f_inter:
        if line.startswith('protein1'):  # skip the original header
            continue
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline at end of file): nothing to write.
            continue
        # Any other field count is still an error and raises ValueError here.
        p1, p2, score = fields
        # Fall back to the raw ID when it has no entry in id_name_map.
        p1_name = id_name_map.get(p1, p1)
        p2_name = id_name_map.get(p2, p2)
        f_out.write(f'{p1_name}\t{p2_name}\t{score}\n')

In [3]:
import pandas as pd

In [5]:
# Load the tab-separated PPI table (a plain text file, not an Excel workbook).
df = pd.read_csv('../Data/PPI_with_names.txt', sep='\t')

In [6]:
# Preview the first three rows of the freshly loaded table.
df.head(n=3)

Unnamed: 0,protein1_name,protein2_name,combined_score
0,ARF5,RALGPS2,173
1,ARF5,FHDC1,154
2,ARF5,ATP6V1E1,151


In [7]:
# Keep only the two endpoint columns; combined_score is dropped here.
df = df.loc[:, ['protein1_name', 'protein2_name']]

In [8]:
# Rename the endpoint columns to node1 / node2.
df = df.rename(
    columns={
        'protein1_name': 'node1',
        'protein2_name': 'node2',
    }
)

In [9]:
# Lower-case both endpoint columns.
for column in ('node1', 'node2'):
    df[column] = df[column].str.lower()

In [10]:
# Preview the first three rows after renaming and lower-casing.
df.head(n=3)

Unnamed: 0,node1,node2
0,arf5,ralgps2
1,arf5,fhdc1
2,arf5,atp6v1e1


In [12]:
# Show the (rows, columns) shape of the edge table.
df.shape

(6857702, 2)

In [13]:
# Drop exact duplicate (node1, node2) rows, keeping the first occurrence.
# Reversed pairs (node2, node1) are not treated as duplicates.
result_df = df.drop_duplicates(keep='first')

In [14]:
# Show the (rows, columns) shape of the de-duplicated edge table.
result_df.shape

(6857702, 2)

In [15]:
# Add a constant 'relationship' column set to 1.0 for every edge.
result_df = result_df.assign(
    relationship=1.0,
)

In [16]:
# Preview the first three weighted edges.
result_df.head(n=3)

Unnamed: 0,node1,node2,relationship
0,arf5,ralgps2,1.0
1,arf5,fhdc1,1.0
2,arf5,atp6v1e1,1.0


## Add PPI

In [17]:
# Load the existing edge list. Node columns are forced to str so that
# numeric-looking identifiers are not parsed as integers.
merged_df_long = pd.read_csv(
    '../Data/merged_df_add_filtered_disGeNet_disease_disease.txt',
    sep='\t',
    dtype={'node1': str, 'node2': str, 'relationship': float},
)

In [18]:
# Preview the first three rows of the existing edge list.
merged_df_long.head(n=3)

Unnamed: 0,node1,node2,relationship
0,1002133,I802,1.0
1,1002133,M1997,1.0
2,1002133,M201,1.0


In [19]:
# Pool every node that appears on either side of an edge in merged_df_long.
# A set is used so the membership tests below are fast.
merged_nodes = set(merged_df_long['node1']) | set(merged_df_long['node2'])


def _overlap_with_merged(column):
    """Return (distinct values of result_df[column], how many are in merged_nodes)."""
    distinct = result_df[column].unique()  # NumPy array of distinct values
    hits = pd.Series(distinct).isin(merged_nodes).sum()
    return distinct, hits


# node1 side
unique_nodes_node1, unique_matches_node1 = _overlap_with_merged('node1')
print(f"{len(unique_nodes_node1)} unique elements in result_df node1 column.")
print(f"去重后，result_df 的 node1 列中有 {unique_matches_node1} 个元素存在于 merged_df_long 的 node1 或 node2 列中。")

# node2 side
unique_nodes_node2, unique_matches_node2 = _overlap_with_merged('node2')
print(f"{len(unique_nodes_node2)} unique elements in result_df node2 column.")
print(f"去重后，result_df 的 node2 列中有 {unique_matches_node2} 个元素存在于 merged_df_long 的 node1 或 node2 列中。")

19466 unique elements in result_df node1 column.
去重后，result_df 的 node1 列中有 2870 个元素存在于 merged_df_long 的 node1 或 node2 列中。
19595 unique elements in result_df node2 column.
去重后，result_df 的 node2 列中有 2869 个元素存在于 merged_df_long 的 node1 或 node2 列中。


#### New nodes need corresponding embedding features, so we first use only the overlapping nodes.

In [20]:
# Every node already present in merged_df_long, from either endpoint column.
merged_nodes = set(merged_df_long['node1']) | set(merged_df_long['node2'])

# Keep only the PPI edges whose endpoints are BOTH already in merged_nodes.
node1_known = result_df['node1'].isin(merged_nodes)
node2_known = result_df['node2'].isin(merged_nodes)
filtered_df = result_df[node1_known & node2_known]

In [21]:
# Report how many PPI edges survive the overlap filter, then show a sample.
n_all_edges = len(result_df)
n_kept_edges = len(filtered_df)
print(f"result_df行数: {n_all_edges}")
print(f"筛选后的数据行数: {n_kept_edges}")
print(filtered_df.head())

result_df行数: 6857702
筛选后的数据行数: 376384
     node1    node2  relationship
1302  m6pr   rabep1           1.0
1303  m6pr     ppt1           1.0
1306  m6pr     scg3           1.0
1310  m6pr    aifm1           1.0
1314  m6pr  b4galt1           1.0


In [22]:
# Append the overlap-filtered PPI edges to the existing edge list and
# renumber the rows from 0.
merged_df_add_PPI = pd.concat(
    [merged_df_long, filtered_df],
    ignore_index=True,
)

In [23]:
# Save the combined edge list as tab-separated text, without the row index.
# Missing values are written as the literal string 'nan'.
merged_df_add_PPI.to_csv(
    '../Data/merged_df_add_filtered_disGeNet_disease_disease_PPI.txt',
    sep='\t',
    na_rep='nan',
    index=False,
)

## prepare input data

In [24]:
import numpy as np
import os
from scipy.sparse import csr_matrix
from utils import N2V
import pandas as pd
from fastnode2vec import Graph, Node2Vec 
import pickle
import json

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
# Path of the tab-separated edge list that the cells below read back in.
adj_path = '../Data/merged_df_add_filtered_disGeNet_disease_disease_PPI.txt'

In [26]:
# Read the edge list back with explicit dtypes and preview two rows.
merged_df = pd.read_csv(
    adj_path,
    sep='\t',
    dtype={'node1': str, 'node2': str, 'relationship': float},
)
merged_df.head(n=2)

Unnamed: 0,node1,node2,relationship
0,1002133,I802,1.0
1,1002133,M1997,1.0


In [27]:
# Read the edge list. Despite its name, adj_matrix is a long-format edge
# table with one row per edge, not a square adjacency matrix.
adj_matrix = pd.read_csv(
    adj_path,
    sep='\t',
    dtype={'node1': str, 'node2': str, 'relationship': float},
)

# Build the (node1, node2, weight) edge list for the graph.
# itertuples(index=False, name=None) yields plain tuples directly. iterrows()
# builds a Series for every row, which is far slower on a table this size.
edges = list(
    adj_matrix[['node1', 'node2', 'relationship']].itertuples(index=False, name=None)
)

# Build an undirected, weighted graph from the edge list.
graph = Graph(edges, directed=False, weighted=True)

                                                                                                                                                                                                                                                                                                                                    

In [28]:
# Display the graph's node names. Their order is reused later as the row
# order of the feature matrix.
graph.node_names

array(['1002133', 'I802', 'M1997', ..., 'Z90', 'Z994', 'Z99'],
      dtype='<U86')

In [29]:
# Display the graph object's repr.
graph

<fastnode2vec.graph.Graph at 0x7f8a00096f30>

In [30]:
# Serialize the graph object to disk with pickle.
with open('../Data/graph_add_filtered_disGeNet_disease_disease_PPI.pkl', mode='wb') as graph_file:
    pickle.dump(graph, graph_file)

In [31]:
# Load the pickled protein embeddings (presumably name -> vector; verify).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../Data/gpt_protein_embeddings_dict.pkl', mode='rb') as pkl_file:
    protein_embeddings_dict = pickle.load(pkl_file)

In [32]:
# Load the pickled general embeddings dictionary.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../Data/gpt_embeddings_dict.pkl', mode='rb') as pkl_file:
    embeddings_dict = pickle.load(pkl_file)

In [33]:
# Load the pickled metabolite embeddings dictionary.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../Data/metabolite_embeddings_dict.pkl', mode='rb') as pkl_file:
    metabolite_embeddings_dict = pickle.load(pkl_file)

In [34]:
# Load the pickled ICD-10 embeddings dictionary.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../Data/icd10_embeddings_dict.pkl', mode='rb') as pkl_file:
    icd10_embeddings_dict = pickle.load(pkl_file)

In [35]:
# Combine the four embedding dictionaries into a new dict, leaving the
# originals untouched. If a key occurs in more than one source, the value
# from the later source wins.
merged_dict = {}
for source_dict in (
    protein_embeddings_dict,
    embeddings_dict,
    metabolite_embeddings_dict,
    icd10_embeddings_dict,
):
    merged_dict.update(source_dict)

In [36]:
# Row order of the feature matrix follows the graph's node order, NOT the
# insertion order of merged_dict.
keys = list(graph.node_names)

# Fail early with a useful message if any graph node has no embedding. A plain
# lookup would only report the first missing key.
missing_keys = [key for key in keys if key not in merged_dict]
if missing_keys:
    raise KeyError(
        f"{len(missing_keys)} graph node(s) have no embedding in merged_dict; "
        f"first few: {missing_keys[:5]}"
    )

# Stack the per-node vectors into a NumPy matrix, one row per node.
matrix = np.array([merged_dict[key] for key in keys])

# Check the shape: (number of graph nodes, embedding dimension).
print("矩阵形状:", matrix.shape)

# Example: show the first three rows.
print("前3行:\n", matrix[:3])

矩阵形状: (49373, 1536)
前3行:
 [[-0.0182373  -0.02098356  0.02359651 ... -0.01561102 -0.01053844
  -0.00612576]
 [-0.02499108 -0.0067757   0.02219337 ... -0.02719697 -0.01021568
  -0.02587882]
 [ 0.00454228 -0.00239301  0.02163428 ... -0.01282641 -0.01000684
  -0.05185902]]


In [37]:
# Save the node order as JSON so each row of the feature matrix can be
# mapped back to its node name.
with open('../Data/keys_add_filtered_disGeNet_disease_disease_PPI.json', mode='w') as keys_file:
    json.dump(keys, keys_file)

In [38]:
# Save the node-feature matrix in NumPy's .npy format.
np.save(
    '../Data/UKB_node_feature_gpt_add_filtered_disGeNet_disease_disease_PPI.npy',
    matrix,
)