In [3]:
import pandas as pd

In [6]:
# Load the DisGeNET gene-disease association table from the Excel export.
df = pd.read_excel('../Data/DisGeNET_Complete_Results_gene_disease_association.xlsx')

In [9]:
# Preview the first three rows to check which columns were loaded.
df.head(3)

Unnamed: 0,Gene_Symbol,Gene_ID,ensemblid,geneNcbiType,geneDSI,geneDPI,genepLI,uniprotids,protein_classid,protein_class_name,...,numberPmidsWithChemsFiltered,Association_Score,yearInitial,yearFinal,evidence_level,evidence_index,diseaseid,Node_ID,ICD_Code,mapped_protein_names_complete
0,TNF,7124,ENSG00000232810,protein-coding,0.229,0.958,0.85687,"P01375, Q5STB3",DTO_05007599,Signaling,...,,0.75,,,,0.666667,C0041466,2910,A01.0,Tumor necrosis factor (Cachectin) (TNF-alpha) ...
1,CFTR,1080,ENSG00000001626,protein-coding,0.384,0.875,2.1646000000000003e-39,P13569,DTO_01300327,Ion channel,...,,0.7,,,,1.0,C0041466,2910,A01.0,Cystic fibrosis transmembrane conductance regu...
2,HLA-DRB1,3123,ENSG00000196126,protein-coding,0.366,0.875,,"D7RIH8, A0A224MM52, P01911, X5DNQ0, Q5Y7D1",DTO_05007608,Immune response,...,,0.7,2014.0,2014.0,,1.0,C0041466,2910,A01.0,HLA class II histocompatibility antigen DR bet...


## No need to convert IDs

In [28]:
from Bio import ExPASy
from Bio import SwissProt

In [33]:
# Example UniProt accession used to try out the lookup.
uniprot_id = "Q5STB3"

# Fetch the raw entry for this accession and parse it into a record.
# The handle is closed explicitly so the remote connection is released even
# if parsing raises, and is not leaked every time the cell is re-run.
handle = ExPASy.get_sprot_raw(uniprot_id)
try:
    record = SwissProt.read(handle)
finally:
    handle.close()

In [42]:
# Inspect the gene-name annotations parsed from the fetched record.
record.gene_name

[{'Name': 'TNF {ECO:0000313|EMBL:AHJ25918.1}',
  'Synonyms': ['TNF-alpha {ECO:0000313|EMBL:QCI55733.1}',
   'TNFA {ECO:0000313|EMBL:BAF31279.1}',
   'TNLG1F {ECO:0000313|EMBL:CTQ86082.1}'],
  'ORFNames': ['hCG_43716 {ECO:0000313|EMBL:EAX03424.1}']}]

## use gene symbols as protein names

In [51]:
# Keep only the gene symbol and ICD code columns of the DisGeNET table.
disGeNet_df = df[['Gene_Symbol', 'ICD_Code']]

In [52]:
# Rename to edge-list column names: gene symbol -> node1, ICD code -> node2.
disGeNet_df = disGeNet_df.rename(columns={'Gene_Symbol': 'node1', 'ICD_Code': 'node2'})

In [53]:
# Give every gene-disease pair a constant relationship value of 1.0.
disGeNet_df = disGeNet_df.assign(relationship=1.0)

In [54]:
# Preview the three-column edge list built from DisGeNET.
disGeNet_df.head(3)

Unnamed: 0,node1,node2,relationship
0,TNF,A01.0,1.0
1,CFTR,A01.0,1.0
2,HLA-DRB1,A01.0,1.0


In [55]:
# Normalise both node columns in a single step:
#   node1 -> lower-cased gene symbols
#   node2 -> ICD codes with the literal dot removed (e.g. "A01.0" -> "A010")
disGeNet_df = disGeNet_df.assign(
    node1=disGeNet_df['node1'].str.lower(),
    node2=disGeNet_df['node2'].str.replace('.', '', regex=False),
)

In [56]:
# Preview the edge list after lower-casing node1 and stripping dots from node2.
disGeNet_df.head(3)

Unnamed: 0,node1,node2,relationship
0,tnf,A010,1.0
1,cftr,A010,1.0
2,hla-drb1,A010,1.0


In [57]:
# Load the existing tab-separated edge list. Both node columns are read as str
# so that numeric-looking identifiers are kept as text.
merged_df_long = pd.read_csv('../Data/merged_df_long_convert.txt', sep='\t', dtype={'node1':str, 'node2': str, 'relationship': float})

In [58]:
# Preview the first three edges of the existing edge list.
merged_df_long.head(3)

Unnamed: 0,node1,node2,relationship
0,1002133,I802,1.0
1,1002133,M1997,1.0
2,1002133,M201,1.0


In [63]:
def _count_overlap(values, known_nodes):
    """Return the unique entries of `values` and how many of them are known.

    values: pandas Series of node identifiers.
    known_nodes: set of identifiers to test membership against.
    Returns a tuple (unique_values, n_matches): the array produced by
    Series.unique() and the number of those values found in known_nodes.
    """
    unique_values = values.unique()
    n_matches = pd.Series(unique_values).isin(known_nodes).sum()
    return unique_values, n_matches


# Every identifier that appears on either side of an edge in merged_df_long,
# held in a set for fast membership tests.
merged_nodes = set(merged_df_long['node1']) | set(merged_df_long['node2'])

# Gene side (node1): how many distinct symbols already exist as graph nodes.
unique_nodes_node1, unique_matches_node1 = _count_overlap(disGeNet_df['node1'], merged_nodes)
print(f"{len(unique_nodes_node1)} unique elements in disGeNet_df node1 column.")
print(f"去重后，disGeNet_df 的 node1 列中有 {unique_matches_node1} 个元素存在于 merged_df_long 的 node1 或 node2 列中。")

# Disease side (node2): the messages name node2 so that the two summaries
# can be told apart in the output.
unique_nodes_node2, unique_matches_node2 = _count_overlap(disGeNet_df['node2'], merged_nodes)
print(f"{len(unique_nodes_node2)} unique elements in disGeNet_df node2 column.")
print(f"去重后，disGeNet_df 的 node2 列中有 {unique_matches_node2} 个元素存在于 merged_df_long 的 node1 或 node2 列中。")

7690 unique elements in disGeNet_df node1 column.
去重后，disGeNet_df 的 node1 列中有 1547 个元素存在于 merged_df_long 的 node1 或 node2 列中。
1356 unique elements in disGeNet_df node1 column.
去重后，disGeNet_df 的 node1 列中有 1356 个元素存在于 merged_df_long 的 node1 或 node2 列中。


#### I think we should add all known disease-gene pairs from DisGeNET, not only those involving overlapping nodes, because the proteins detected in the Olink data are limited by the Olink technology, whereas the known pairs in DisGeNET come from various technologies.

In [64]:
# Append every DisGeNET gene-disease pair to the existing edge list.
# ignore_index=True renumbers the rows of the combined frame from 0.
merged_df_add_disGeNet = pd.concat([merged_df_long, disGeNet_df], ignore_index=True)

In [65]:
# Sanity check: the row count should equal len(merged_df_long) + len(disGeNet_df).
merged_df_add_disGeNet.shape

(1681546, 3)

In [66]:
# Size of the original edge list, for comparison with the combined frame.
merged_df_long.shape

(1650665, 3)

In [67]:
# Number of DisGeNET pairs that were appended.
disGeNet_df.shape

(30881, 3)

In [68]:
# Save as a tab-separated text file without the index column; missing values
# are written as the literal text 'nan'.
merged_df_add_disGeNet.to_csv('../Data/merged_df_add_disGeNet.txt', sep='\t', na_rep='nan', index=False)

#### New nodes need corresponding embedding features, so we first use only the overlapping nodes.

In [71]:
# Identifiers present on either side of any edge in merged_df_long.
merged_nodes = set(merged_df_long['node1']).union(merged_df_long['node2'])

# Keep only the DisGeNET rows whose gene (node1) is already one of those nodes.
node1_is_known = disGeNet_df['node1'].isin(merged_nodes)
filtered_df = disGeNet_df.loc[node1_is_known]

In [72]:
# Report how many rows survive the node1 filter, then show a sample.
n_rows_all = disGeNet_df.shape[0]
n_rows_kept = filtered_df.shape[0]
print(f"disGeNet_df行数: {n_rows_all}")
print(f"筛选后的数据行数: {n_rows_kept}")
print(filtered_df.head())

disGeNet_df行数: 30881
筛选后的数据行数: 9740
   node1 node2  relationship
0    tnf  A010           1.0
3    lta  A010           1.0
5  il12b  A020           1.0
6   mbl2  A072           1.0
7  cxcl8   A23           1.0


In [74]:
# Append only the filtered DisGeNET pairs (node1 already present in the
# existing edge list) and renumber the combined rows from 0.
merged_df_add_filtered_disGeNet = pd.concat([merged_df_long, filtered_df], ignore_index=True)

In [75]:
# Save as a tab-separated text file without the index column; missing values
# are written as the literal text 'nan'.
merged_df_add_filtered_disGeNet.to_csv('../Data/merged_df_add_filtered_disGeNet.txt', sep='\t', na_rep='nan', index=False)

## prepare input data

In [95]:
import numpy as np
import os
from scipy.sparse import csr_matrix
from utils import N2V
import pandas as pd
from fastnode2vec import Graph, Node2Vec 
import pickle
import json

In [98]:
# Tab-separated edge list (existing edges plus the filtered DisGeNET pairs)
# that the cells below read to build the graph inputs.
adj_path = '../Data/merged_df_add_filtered_disGeNet.txt'

In [99]:
# Reload the saved edge list, keeping both node columns as str, and preview it.
merged_df = pd.read_csv(adj_path, sep='\t',
    dtype={'node1': str, 'node2': str, 'relationship': float} )
merged_df.head(2)

Unnamed: 0,node1,node2,relationship
0,1002133,I802,1.0
1,1002133,M1997,1.0


In [100]:
# Placeholders; X, Y and node_type are not used further in this cell.
X, A, Y = [], None, []
n_node = 0

# Collect edges, their endpoints and their weights from the edge-list file.
edge_list = []
node_list = []
node_type = {}
relationship_list = []

with open(adj_path, 'rt', encoding='utf-8') as f:
    next(f)  # skip the header row
    for line in f:  # stream line by line instead of loading everything via readlines()
        line = line.strip()
        if not line:
            continue  # a blank line (e.g. at end of file) would break the unpacking below
        node1, node2, relationship, *_ = line.split('\t')
        edge_list.append((node1, node2))
        node_list.extend([node1, node2])
        # Parse the weight here so a malformed value fails on the offending
        # line rather than later inside the sparse-matrix constructor.
        relationship_list.append(float(relationship))

# Map every distinct node name to a row/column index; sorting makes the
# mapping deterministic across runs.
node_map = {name: index for index, name in enumerate(sorted(set(node_list)))}
n_node = len(node_map)

# Build a symmetric sparse adjacency matrix: each edge is written in both
# directions with its relationship value as the weight.
# NOTE: entries that share the same (row, col) are summed when the matrix is
# built, so an edge listed more than once ends up with a weight above its
# single-row value.
row = []
col = []
data = []
for (node1, node2), weight in zip(edge_list, relationship_list):
    source, target = node_map[node1], node_map[node2]
    row.extend((source, target))
    col.extend((target, source))
    data.extend((weight, weight))
A = csr_matrix((data, (row, col)), shape=(n_node, n_node), dtype=np.float32)

adj_matrix = pd.read_csv(adj_path, sep='\t',
    dtype={'node1': str, 'node2': str, 'relationship': float})
# Edge triples (node1, node2, weight) for the graph, taken column-wise; this
# avoids building one Series per row as iterrows() does.
edges = list(zip(adj_matrix['node1'], adj_matrix['node2'], adj_matrix['relationship']))

# Build the undirected, weighted graph.
graph = Graph(edges, directed=False, weighted=True)

                                                                                                                                                                                                                                                                                                                                    

In [101]:
# Inspect the node names stored on the graph object.
graph.node_names

array(['1002133', 'I802', 'M1997', ..., 'Z90', 'Z994', 'Z99'],
      dtype='<U86')

In [102]:
# Display the graph object to confirm that it was created.
graph

<fastnode2vec.graph.Graph at 0x7fa694a4f350>

In [103]:
# Serialise the graph object to disk with pickle.
with open('../Data/graph_add_filtered_disGeNet.pkl', 'wb') as graph_file:
    pickle.dump(graph, graph_file)

In [85]:
# Load the protein embedding lookup. pickle can execute arbitrary code, so
# only load files that were produced by this project.
with open('../Data/gpt_protein_embeddings_dict.pkl', 'rb') as protein_file:
    protein_embeddings_dict = pickle.load(protein_file)

In [86]:
# Load the general GPT embedding lookup (trusted, project-generated pickle).
with open('../Data/gpt_embeddings_dict.pkl', 'rb') as gpt_file:
    embeddings_dict = pickle.load(gpt_file)

In [87]:
# Load the metabolite embedding lookup (trusted, project-generated pickle).
with open('../Data/metabolite_embeddings_dict.pkl', 'rb') as metabolite_file:
    metabolite_embeddings_dict = pickle.load(metabolite_file)

In [88]:
# Load the ICD-10 embedding lookup (trusted, project-generated pickle).
with open('../Data/icd10_embeddings_dict.pkl', 'rb') as icd10_file:
    icd10_embeddings_dict = pickle.load(icd10_file)

In [90]:
# Combine the four embedding lookups into one new dict; the source dicts are
# left untouched. On duplicate keys the later source wins, in the order
# protein -> gpt -> metabolite -> icd10.
merged_dict = {}
for source_dict in (
    protein_embeddings_dict,
    embeddings_dict,
    metabolite_embeddings_dict,
    icd10_embeddings_dict,
):
    merged_dict.update(source_dict)

In [93]:
# Row order of the feature matrix follows graph.node_names, so row i holds
# the embedding of the i-th graph node.
# NOTE(review): the recorded execution counts show this cell (In [93]) ran
# before the cell that builds `graph` (In [100]); re-run it after rebuilding
# the graph so that keys and matrix match the saved graph.
keys = list(graph.node_names)

# Fail with one clear message listing the nodes that have no embedding,
# instead of a bare KeyError on whichever missing key comes first.
missing_keys = [key for key in keys if key not in merged_dict]
if missing_keys:
    raise KeyError(
        f"{len(missing_keys)} graph nodes have no embedding in merged_dict, "
        f"e.g. {missing_keys[:5]}"
    )

# Stack the per-node vectors into a NumPy matrix (one row per node).
matrix = np.array([merged_dict[key] for key in keys])

# Check the shape: (number of graph nodes, embedding dimension).
print("矩阵形状:", matrix.shape)

# Example: show the first three rows.
print("前3行:\n", matrix[:3])

矩阵形状: (49373, 1536)
前3行:
 [[-0.0182373  -0.02098356  0.02359651 ... -0.01561102 -0.01053844
  -0.00612576]
 [-0.02499108 -0.0067757   0.02219337 ... -0.02719697 -0.01021568
  -0.02587882]
 [ 0.00454228 -0.00239301  0.02163428 ... -0.01282641 -0.01000684
  -0.05185902]]


In [96]:
# Write the node-name list to JSON so that the row order of the feature
# matrix can be recovered later.
with open('../Data/keys_add_filtered_disGeNet.json', 'w') as keys_file:
    json.dump(keys, keys_file)

In [97]:
# Save the node feature matrix in NumPy .npy format.
np.save('../Data/UKB_node_feature_gpt_add_filtered_disGeNet.npy', matrix)