### Prepare data

In [1]:
import pandas as pd
import pickle
import numpy as np
import torch

### participant-disease pairs

In [2]:
# 读取原始数据（假设每个诊断记录为一行）
clinical_df = pd.read_csv("../Data/ICD10_long.csv")

In [3]:
clinical_df.shape

(7015554, 5)

In [4]:
clinical_df['eid'] = clinical_df['eid'].astype(str)

In [5]:
clinical_df.head(2)

Unnamed: 0,eid,ICD10_codes,Date,p40000_i0,p40023
0,1000122,J383,1999-07-06,2014-12-22,1.0
1,1000122,M255,2001-07-31,2014-12-22,1.0


In [6]:
len(clinical_df['ICD10_codes'].unique())

12315

In [7]:
clinical_df.dtypes

eid             object
ICD10_codes     object
Date            object
p40000_i0       object
p40023         float64
dtype: object

In [8]:
len(set(clinical_df['eid']))

446802

In [9]:
BaselineCharacteristics = pd.read_csv("../Data/BaselineCharacteristics.csv")

In [10]:
BaselineCharacteristics.shape

(502131, 6)

In [11]:
BaselineCharacteristics['eid'] = BaselineCharacteristics['eid'].astype(str)

In [12]:
BaselineCharacteristics.head(2)

Unnamed: 0,eid,p31,p34,p52,p21022,p22189
0,2600773,0,1946,11,62,-3.11
1,2163573,1,1948,5,57,5.33


In [13]:
# 转换为字符串并补零
BaselineCharacteristics['p34_str'] = BaselineCharacteristics['p34'].astype(str)
BaselineCharacteristics['p52_str'] = BaselineCharacteristics['p52'].astype(str).str.zfill(2)

# 拼接成生日格式
BaselineCharacteristics['birthday'] = BaselineCharacteristics['p34_str'] + '-' + BaselineCharacteristics['p52_str'] + '-01'

In [14]:
BaselineCharacteristics.head(2)

Unnamed: 0,eid,p31,p34,p52,p21022,p22189,p34_str,p52_str,birthday
0,2600773,0,1946,11,62,-3.11,1946,11,1946-11-01
1,2163573,1,1948,5,57,5.33,1948,5,1948-05-01


In [15]:
# 使用左连接将birthday列合并到clinical_df中
clinical_df = clinical_df.merge(
    BaselineCharacteristics[['eid', 'birthday']],  # 只选择需要的列
    on='eid',         # 连接键
    how='left'        # 左连接，保留clinical_df的所有行
)

In [16]:
clinical_df.head(2)

Unnamed: 0,eid,ICD10_codes,Date,p40000_i0,p40023,birthday
0,1000122,J383,1999-07-06,2014-12-22,1.0,1941-01-01
1,1000122,M255,2001-07-31,2014-12-22,1.0,1941-01-01


In [17]:
clinical_df['birthday'].isna().any()

False

In [18]:
# 转换Date和birthday列为datetime类型
clinical_df['Date'] = pd.to_datetime(clinical_df['Date'])
clinical_df['birthday'] = pd.to_datetime(clinical_df['birthday'])

In [19]:
# 计算时间差（天数）
clinical_df['DiagnosisAge'] = (clinical_df['Date'] - clinical_df['birthday']).dt.days

In [20]:
clinical_df.head(2)

Unnamed: 0,eid,ICD10_codes,Date,p40000_i0,p40023,birthday,DiagnosisAge
0,1000122,J383,1999-07-06,2014-12-22,1.0,1941-01-01,21370
1,1000122,M255,2001-07-31,2014-12-22,1.0,1941-01-01,22126


In [21]:
clinical_df_select_columns = clinical_df[['eid', 'ICD10_codes', 'DiagnosisAge']]

In [22]:
clinical_df_select_columns.to_csv("../Data/clinical_df_select_columns.csv", index=False)

### participant-protein pairs

In [23]:
Olink_df = pd.read_csv("../Data/Olink.csv")

In [24]:
Olink_df['olink_instance_0.eid'] = Olink_df['olink_instance_0.eid'].astype(str)

In [25]:
Olink_df.head(2)

Unnamed: 0,olink_instance_0.eid,olink_instance_0.a1bg,olink_instance_0.aamdc,olink_instance_0.aarsd1,olink_instance_0.abca2,olink_instance_0.abhd14b,olink_instance_0.abl1,olink_instance_0.abo,olink_instance_0.abraxas2,olink_instance_0.acaa1,...,olink_instance_0.zfyve19,olink_instance_0.zhx2,olink_instance_0.znf174,olink_instance_0.znf75d,olink_instance_0.znf830,olink_instance_0.znrd2,olink_instance_0.znrf4,olink_instance_0.zp3,olink_instance_0.zp4,olink_instance_0.zpr1
0,1002133,0.0957,0.59795,1.33215,-0.6441,0.76325,0.3784,0.5446,0.4682,0.2192,...,-0.2728,0.1954,-0.2586,-0.0127,0.0235,-0.26245,-0.1236,-6.0773,-0.0136,0.4131
1,1002201,-0.0473,0.39585,0.26585,0.0604,1.07765,1.3977,0.583,1.074,0.75625,...,0.9153,-0.0274,-0.3048,-0.8814,-0.5342,1.12255,-0.5537,-5.1443,-0.1713,-0.5605


In [26]:
Olink_df = Olink_df.rename(columns={'olink_instance_0.eid': 'eid'})

In [27]:
Olink_df.head(4)

Unnamed: 0,eid,olink_instance_0.a1bg,olink_instance_0.aamdc,olink_instance_0.aarsd1,olink_instance_0.abca2,olink_instance_0.abhd14b,olink_instance_0.abl1,olink_instance_0.abo,olink_instance_0.abraxas2,olink_instance_0.acaa1,...,olink_instance_0.zfyve19,olink_instance_0.zhx2,olink_instance_0.znf174,olink_instance_0.znf75d,olink_instance_0.znf830,olink_instance_0.znrd2,olink_instance_0.znrf4,olink_instance_0.zp3,olink_instance_0.zp4,olink_instance_0.zpr1
0,1002133,0.0957,0.59795,1.33215,-0.6441,0.76325,0.3784,0.5446,0.4682,0.2192,...,-0.2728,0.1954,-0.2586,-0.0127,0.0235,-0.26245,-0.1236,-6.0773,-0.0136,0.4131
1,1002201,-0.0473,0.39585,0.26585,0.0604,1.07765,1.3977,0.583,1.074,0.75625,...,0.9153,-0.0274,-0.3048,-0.8814,-0.5342,1.12255,-0.5537,-5.1443,-0.1713,-0.5605
2,1002534,-0.2239,-0.04545,-0.0758,0.2499,-0.4838,0.4578,-0.0678,-0.1641,-0.2901,...,-0.8164,0.2514,0.4768,-0.4894,-0.21715,0.0055,-0.0715,-5.9309,-0.2436,-0.4368
3,1003548,0.0255,,,,,,-0.2824,,,...,1.1505,0.1681,0.2402,,0.42805,0.5587,0.4882,,-0.06,0.7822


In [28]:
Olink_df.shape

(53013, 2924)

In [29]:
# 提取非eid列（蛋白质表达量列）
protein_cols = Olink_df.columns[1:]  # 第一列是eid

In [30]:
# 找出蛋白质表达量全为NaN的病人行
all_nan_mask = Olink_df[protein_cols].isna().all(axis=1)

In [31]:
# 过滤掉全NaN的病人
Olink_df_filtered = Olink_df[~all_nan_mask].copy()

In [32]:
Olink_df_filtered.shape

(53013, 2924)

In [33]:
# 2. 用各列最小值填补NaN
# 计算各列最小值（排除NaN）
col_min_values = Olink_df_filtered[protein_cols].min()

# 填补NaN（使用广播机制）
Olink_df_imputed = Olink_df_filtered.copy()
Olink_df_imputed[protein_cols] = Olink_df_filtered[protein_cols].fillna(col_min_values)

In [34]:
Olink_df_imputed.head(4)

Unnamed: 0,eid,olink_instance_0.a1bg,olink_instance_0.aamdc,olink_instance_0.aarsd1,olink_instance_0.abca2,olink_instance_0.abhd14b,olink_instance_0.abl1,olink_instance_0.abo,olink_instance_0.abraxas2,olink_instance_0.acaa1,...,olink_instance_0.zfyve19,olink_instance_0.zhx2,olink_instance_0.znf174,olink_instance_0.znf75d,olink_instance_0.znf830,olink_instance_0.znrd2,olink_instance_0.znrf4,olink_instance_0.zp3,olink_instance_0.zp4,olink_instance_0.zpr1
0,1002133,0.0957,0.59795,1.33215,-0.6441,0.76325,0.3784,0.5446,0.4682,0.2192,...,-0.2728,0.1954,-0.2586,-0.0127,0.0235,-0.26245,-0.1236,-6.0773,-0.0136,0.4131
1,1002201,-0.0473,0.39585,0.26585,0.0604,1.07765,1.3977,0.583,1.074,0.75625,...,0.9153,-0.0274,-0.3048,-0.8814,-0.5342,1.12255,-0.5537,-5.1443,-0.1713,-0.5605
2,1002534,-0.2239,-0.04545,-0.0758,0.2499,-0.4838,0.4578,-0.0678,-0.1641,-0.2901,...,-0.8164,0.2514,0.4768,-0.4894,-0.21715,0.0055,-0.0715,-5.9309,-0.2436,-0.4368
3,1003548,0.0255,-1.97485,-4.52005,-1.5961,-5.0921,-2.67535,-0.2824,-2.4128,-6.4471,...,1.1505,0.1681,0.2402,-5.0317,0.42805,0.5587,0.4882,-7.2245,-0.06,0.7822


In [35]:
# 计算除第一列外每列的最小值和最大值
range_df = Olink_df_imputed.iloc[:, 1:].agg(['min', 'max'])

# 转置结果以便更清晰地查看每列的范围
range_df = range_df.T
range_df.columns = ['最小值', '最大值']

# 打印结果
print(range_df)

                              最小值      最大值
olink_instance_0.a1bg    -1.24600  1.47870
olink_instance_0.aamdc   -1.97485  3.93705
olink_instance_0.aarsd1  -4.52005  4.04045
olink_instance_0.abca2   -1.59610  5.20110
olink_instance_0.abhd14b -5.09210  4.20725
...                           ...      ...
olink_instance_0.znrd2   -2.03170  7.07165
olink_instance_0.znrf4   -1.93520  7.30770
olink_instance_0.zp3     -7.22450  4.52160
olink_instance_0.zp4     -0.92460  7.15110
olink_instance_0.zpr1    -2.15270  9.32450

[2923 rows x 2 columns]


In [36]:
# 计算除第一列外每一行的最小值和最大值
range_df = Olink_df_imputed.iloc[:, 1:].agg(['min', 'max'], axis=1)

In [37]:
range_df.columns = ['最小值', '最大值']

# 打印结果
print(range_df)

           最小值      最大值
0     -8.97930  9.32795
1     -7.23585  4.90065
2     -8.81690  5.32800
3     -8.97930  8.92390
4     -5.79100  4.68585
...        ...      ...
53008 -4.42530  7.08710
53009 -8.90080  4.43580
53010 -5.25605  7.28690
53011 -6.38720  3.85655
53012 -8.97930  6.40180

[53013 rows x 2 columns]


In [38]:
# 对除第一列外的所有列进行行求和
row_sums = Olink_df_imputed.iloc[:, 1:].sum(axis=1)

# 显示结果示例（前5行）
print(row_sums)

0         144.02135
1         711.91540
2        -418.11685
3       -2705.07415
4        -589.10660
            ...    
53008     488.80845
53009    -557.52780
53010     556.30340
53011     279.96150
53012     158.69970
Length: 53013, dtype: float64


In [39]:
# 对除第一列外的所有列进行列求均值
col_means = Olink_df_imputed.iloc[:, 1:].mean()

print(col_means)

olink_instance_0.a1bg      -0.195471
olink_instance_0.aamdc     -0.332965
olink_instance_0.aarsd1    -0.129088
olink_instance_0.abca2     -0.281832
olink_instance_0.abhd14b   -0.181393
                              ...   
olink_instance_0.znrd2     -0.249826
olink_instance_0.znrf4     -0.283627
olink_instance_0.zp3       -2.180907
olink_instance_0.zp4       -0.144332
olink_instance_0.zpr1      -0.214294
Length: 2923, dtype: float64


In [40]:
# 将数据框从宽格式转换为长格式
Olink_long_df = pd.melt(Olink_df_imputed, id_vars=['eid'], var_name='protein_name', value_name='protein_expression')

print(Olink_long_df)  

               eid           protein_name  protein_expression
0          1002133  olink_instance_0.a1bg              0.0957
1          1002201  olink_instance_0.a1bg             -0.0473
2          1002534  olink_instance_0.a1bg             -0.2239
3          1003548  olink_instance_0.a1bg              0.0255
4          1003599  olink_instance_0.a1bg              0.2003
...            ...                    ...                 ...
154956994  6021692  olink_instance_0.zpr1              1.2116
154956995  6021914  olink_instance_0.zpr1             -0.0460
154956996  6022081  olink_instance_0.zpr1             -0.6499
154956997  6022213  olink_instance_0.zpr1             -0.1758
154956998  6022247  olink_instance_0.zpr1              1.4512

[154956999 rows x 3 columns]


In [41]:
53013 * 2923

154956999

In [42]:
print(Olink_long_df['protein_expression'].min())
print(Olink_long_df['protein_expression'].max())

-9.66045
13.4435


In [43]:
Olink_long_df['protein_expression'].describe()

count    1.549570e+08
mean    -2.591068e-01
std      1.175336e+00
min     -9.660450e+00
25%     -4.430000e-01
50%     -5.570000e-02
75%      2.548500e-01
max      1.344350e+01
Name: protein_expression, dtype: float64

In [44]:
Olink_long_df = Olink_long_df.dropna(subset=['protein_expression'])

In [45]:
Olink_long_df.shape

(154956999, 3)

In [46]:
RecruitmentDate = pd.read_csv("../Data/RecruitmentDate.csv")

In [47]:
RecruitmentDate["eid"] = RecruitmentDate["eid"].astype(str)

In [48]:
RecruitmentDate.head(2)

Unnamed: 0,eid,p34,p52,p53_i0,p53_i1,p53_i2,p53_i3,p21022
0,2600773,1946,11,2008-12-19,,,,62
1,2163573,1948,5,2006-03-21,,,,57


In [49]:
# 转换为字符串并补零
RecruitmentDate['p34_str'] = RecruitmentDate['p34'].astype(str)
RecruitmentDate['p52_str'] = RecruitmentDate['p52'].astype(str).str.zfill(2)

# 拼接成生日格式
RecruitmentDate['birthday'] = RecruitmentDate['p34_str'] + '-' + RecruitmentDate['p52_str'] + '-01'

In [50]:
# 转换Date和birthday列为datetime类型
RecruitmentDate['p53_i0'] = pd.to_datetime(RecruitmentDate['p53_i0'])
RecruitmentDate['birthday'] = pd.to_datetime(RecruitmentDate['birthday'])

In [51]:
# 计算时间差（天数）
RecruitmentDate['RecruitmentAge'] = (RecruitmentDate['p53_i0'] - RecruitmentDate['birthday']).dt.days

In [52]:
RecruitmentDate.head(2)

Unnamed: 0,eid,p34,p52,p53_i0,p53_i1,p53_i2,p53_i3,p21022,p34_str,p52_str,birthday,RecruitmentAge
0,2600773,1946,11,2008-12-19,,,,62,1946,11,1946-11-01,22694
1,2163573,1948,5,2006-03-21,,,,57,1948,5,1948-05-01,21143


In [53]:
Olink_long_df.head(2)

Unnamed: 0,eid,protein_name,protein_expression
0,1002133,olink_instance_0.a1bg,0.0957
1,1002201,olink_instance_0.a1bg,-0.0473


In [54]:
# 使用左连接将Recruitment列合并到Olink_long_df中
Olink_long_df = Olink_long_df.merge(
    RecruitmentDate[['eid', 'RecruitmentAge']],  # 只选择需要的列
    on='eid',         # 连接键
    how='left'        # 左连接，保留Olink_long_df的所有行
)

In [55]:
Olink_long_df.head(2)

Unnamed: 0,eid,protein_name,protein_expression,RecruitmentAge
0,1002133,olink_instance_0.a1bg,0.0957,23090
1,1002201,olink_instance_0.a1bg,-0.0473,25311


In [56]:
Olink_long_df['RecruitmentAge'].isna().any()

False

In [57]:
Olink_long_df['protein_name'] = Olink_long_df['protein_name'].str.replace("olink_instance_0.", "", regex=False)

In [58]:
Olink_long_df.head(2)

Unnamed: 0,eid,protein_name,protein_expression,RecruitmentAge
0,1002133,a1bg,0.0957,23090
1,1002201,a1bg,-0.0473,25311


In [59]:
Olink_long_df.to_csv("../Data/Olink_long_df.csv", index=False)

### participant-metabolite pairs

In [60]:
NMR_df = pd.read_csv("../Data/NMR.csv")

In [61]:
NMR_df.shape

(502131, 252)

In [62]:
NMR_df['participant.eid'] = NMR_df['participant.eid'].astype(str)

In [63]:
NMR_df.head(2)

Unnamed: 0,participant.eid,participant.p20280_i0,participant.p20281_i0,participant.p23400_i0,participant.p23401_i0,participant.p23402_i0,participant.p23403_i0,participant.p23404_i0,participant.p23405_i0,participant.p23406_i0,...,participant.p23639_i0,participant.p23640_i0,participant.p23641_i0,participant.p23642_i0,participant.p23643_i0,participant.p23644_i0,participant.p23645_i0,participant.p23646_i0,participant.p23647_i0,participant.p23648_i0
0,1000053,2.9721,0.282259,3.8459,2.9413,1.5075,0.83349,1.9998,1.4338,0.90459,...,49.941,40.646,33.031,7.6145,9.413,58.373,34.427,24.479,9.9478,7.2007
1,1000122,,,,,,,,,,...,,,,,,,,,,


In [64]:
NMR_df = NMR_df.rename(columns={'participant.eid': 'eid'})

In [65]:
NMR_df.head(4)

Unnamed: 0,eid,participant.p20280_i0,participant.p20281_i0,participant.p23400_i0,participant.p23401_i0,participant.p23402_i0,participant.p23403_i0,participant.p23404_i0,participant.p23405_i0,participant.p23406_i0,...,participant.p23639_i0,participant.p23640_i0,participant.p23641_i0,participant.p23642_i0,participant.p23643_i0,participant.p23644_i0,participant.p23645_i0,participant.p23646_i0,participant.p23647_i0,participant.p23648_i0
0,1000053,2.9721,0.282259,3.8459,2.9413,1.5075,0.83349,1.9998,1.4338,0.90459,...,49.941,40.646,33.031,7.6145,9.413,58.373,34.427,24.479,9.9478,7.2007
1,1000122,,,,,,,,,,...,,,,,,,,,,
2,1000148,5.79763,0.407074,6.5412,4.7789,2.1906,0.9426,3.9233,2.5883,1.7623,...,46.042,50.345,40.931,9.4135,3.6133,56.367,40.318,29.758,10.56,3.3156
3,1000150,5.32703,0.436711,4.2633,3.1027,1.4863,0.82748,1.9775,1.6164,1.1606,...,49.781,40.044,32.094,7.95,10.175,58.825,32.954,23.077,9.8765,8.221


In [66]:
# 提取非eid列
NMR_cols = NMR_df.columns[1:]  # 第一列是eid

In [67]:
# 找出表达量全为NaN的病人行
all_nan_mask = NMR_df[NMR_cols].isna().all(axis=1)

In [68]:
# 过滤掉全NaN的病人
NMR_df_filtered = NMR_df[~all_nan_mask].copy()

In [69]:
NMR_df_filtered.shape

(274236, 252)

In [70]:
# 2. 用各列最小值填补NaN
# 计算各列最小值（排除NaN）
col_min_values = NMR_df_filtered[NMR_cols].min()

# 填补NaN（使用广播机制）
NMR_df_imputed = NMR_df_filtered.copy()
NMR_df_imputed[NMR_cols] = NMR_df_filtered[NMR_cols].fillna(col_min_values)

In [71]:
NMR_df_imputed.head(4)

Unnamed: 0,eid,participant.p20280_i0,participant.p20281_i0,participant.p23400_i0,participant.p23401_i0,participant.p23402_i0,participant.p23403_i0,participant.p23404_i0,participant.p23405_i0,participant.p23406_i0,...,participant.p23639_i0,participant.p23640_i0,participant.p23641_i0,participant.p23642_i0,participant.p23643_i0,participant.p23644_i0,participant.p23645_i0,participant.p23646_i0,participant.p23647_i0,participant.p23648_i0
0,1000053,2.9721,0.282259,3.8459,2.9413,1.5075,0.83349,1.9998,1.4338,0.90459,...,49.941,40.646,33.031,7.6145,9.413,58.373,34.427,24.479,9.9478,7.2007
2,1000148,5.79763,0.407074,6.5412,4.7789,2.1906,0.9426,3.9233,2.5883,1.7623,...,46.042,50.345,40.931,9.4135,3.6133,56.367,40.318,29.758,10.56,3.3156
3,1000150,5.32703,0.436711,4.2633,3.1027,1.4863,0.82748,1.9775,1.6164,1.1606,...,49.781,40.044,32.094,7.95,10.175,58.825,32.954,23.077,9.8765,8.221
5,1000199,5.73236,0.38478,5.836,3.8974,1.809,0.8103,3.0389,2.0885,1.9386,...,46.181,48.841,39.874,8.9677,4.978,58.066,37.815,28.076,9.7397,4.1188


In [72]:
# 将数据框从宽格式转换为长格式
NMR_long_df = pd.melt(NMR_df_imputed, id_vars=['eid'], var_name='NMR_name', value_name='NMR_expression')

print(NMR_long_df)  

              eid               NMR_name  NMR_expression
0         1000053  participant.p20280_i0         2.97210
1         1000148  participant.p20280_i0         5.79763
2         1000150  participant.p20280_i0         5.32703
3         1000199  participant.p20280_i0         5.73236
4         1000428  participant.p20280_i0         5.26226
...           ...                    ...             ...
68833231  6022081  participant.p23648_i0         5.61880
68833232  6022102  participant.p23648_i0         5.26160
68833233  6022141  participant.p23648_i0         4.78620
68833234  6022197  participant.p23648_i0         3.85360
68833235  6022252  participant.p23648_i0         5.26830

[68833236 rows x 3 columns]


In [73]:
NMR_df_imputed.shape

(274236, 252)

In [74]:
274236 * 251

68833236

In [75]:
# 移除 NMR_expression 列中值为 NaN 的行
NMR_long_df = NMR_long_df.dropna(subset=['NMR_expression'])

print(NMR_long_df) 

              eid               NMR_name  NMR_expression
0         1000053  participant.p20280_i0         2.97210
1         1000148  participant.p20280_i0         5.79763
2         1000150  participant.p20280_i0         5.32703
3         1000199  participant.p20280_i0         5.73236
4         1000428  participant.p20280_i0         5.26226
...           ...                    ...             ...
68833231  6022081  participant.p23648_i0         5.61880
68833232  6022102  participant.p23648_i0         5.26160
68833233  6022141  participant.p23648_i0         4.78620
68833234  6022197  participant.p23648_i0         3.85360
68833235  6022252  participant.p23648_i0         5.26830

[68833236 rows x 3 columns]


In [76]:
# 使用左连接将Recruitment列合并到NMR_long_df中
NMR_long_df = NMR_long_df.merge(
    RecruitmentDate[['eid', 'RecruitmentAge']],  # 只选择需要的列
    on='eid',         # 连接键
    how='left'        # 左连接，保留NMR_long_df的所有行
)

In [77]:
NMR_long_df['NMR_name'] = NMR_long_df['NMR_name'].str.replace("participant.", "", regex=False)

In [78]:
NMR_name_description_mapping = pd.read_csv('../Data/NMR_name_description_mapping.csv',
                                           encoding='utf-8')

In [79]:
NMR_name_description_mapping.head(2)

Unnamed: 0,NMR_name,full_name
0,p20280_i0,Glucose-lactate | Instance 0
1,p20281_i0,Spectrometer-corrected alanine | Instance 0


In [80]:
NMR_name_description_mapping['full_name'] = NMR_name_description_mapping['full_name'].str.replace(" | Instance 0", "", regex=False)

In [81]:
NMR_name_description_mapping.head(2)

Unnamed: 0,NMR_name,full_name
0,p20280_i0,Glucose-lactate
1,p20281_i0,Spectrometer-corrected alanine


In [82]:
# 创建一个映射字典
name_mapping = dict(zip(NMR_name_description_mapping['NMR_name'], 
                        NMR_name_description_mapping['full_name']))

In [83]:
# 定义一个函数来执行映射
def map_node2(name):
    return name_mapping.get(name, name)  # 如果找不到映射，返回原值

In [84]:
# 应用映射到NMR_name列
NMR_long_df['NMR_name'] = NMR_long_df['NMR_name'].apply(map_node2)

In [85]:
# 查看结果
print(NMR_long_df.head(2))
print(NMR_long_df.tail(2))

       eid         NMR_name  NMR_expression  RecruitmentAge
0  1000053  Glucose-lactate         2.97210           24523
1  1000148  Glucose-lactate         5.79763           23304
              eid                                           NMR_name  \
68833234  6022197  Triglycerides to Total Lipids in Small HDL per...   
68833235  6022252  Triglycerides to Total Lipids in Small HDL per...   

          NMR_expression  RecruitmentAge  
68833234          3.8536           19613  
68833235          5.2683           22943  


In [86]:
NMR_long_df.to_csv("../Data/NMR_long_df.csv", index=False)

## read in embeddings

In [2]:
import pickle
import numpy as np

In [88]:
print(np.__version__)

1.26.4


In [89]:
with open('../Data/gpt_protein_embeddings_dict.pkl', 'rb') as f:
    protein_embeddings_dict = pickle.load(f)

In [90]:
# 提取键（按插入顺序，Python 3.7+字典有序）
protein_embeddings_dict_keys = list(protein_embeddings_dict.keys())

In [91]:
# 将字典值转换为NumPy矩阵（每行一个向量）
protein_matrix = np.array([protein_embeddings_dict[key] for key in protein_embeddings_dict_keys])

In [92]:
# 创建异构图数据对象
protein_features = torch.from_numpy(protein_matrix).float()

In [93]:
with open('../Data/gpt_embeddings_dict.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)

In [94]:
# 提取键（按插入顺序，Python 3.7+字典有序）
embeddings_dict_keys = list(embeddings_dict.keys())

In [95]:
# 将字典值转换为NumPy矩阵（每行一个向量）
participant_matrix = np.array([embeddings_dict[key] for key in embeddings_dict_keys])

In [96]:
# 创建异构图数据对象
participant_features = torch.from_numpy(participant_matrix).float()

In [97]:
with open('../Data/metabolite_embeddings_dict.pkl', 'rb') as f:
    metabolite_embeddings_dict = pickle.load(f)

In [98]:
# 提取键（按插入顺序，Python 3.7+字典有序）
metabolite_embeddings_dict_keys = list(metabolite_embeddings_dict.keys())

In [99]:
# 将字典值转换为NumPy矩阵（每行一个向量）
metabolite_matrix = np.array([metabolite_embeddings_dict[key] for key in metabolite_embeddings_dict_keys])

In [100]:
# 创建异构图数据对象
metabolite_features = torch.from_numpy(metabolite_matrix).float()

## ICD10 embedding

In [3]:
import glob
import json
import string

In [102]:
values = []
icd_index = []
for index in list(string.ascii_uppercase):
    file = glob.glob(f'../Data/icd10_embedding/{index}/icd10_results_*.json')[0]
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for key, value in data['embeddings'].items():
            values.append(value)
            icd_index.append(key)

embeddings = np.array(values)

In [103]:
icd_index[0:3]

['A00', 'A00.0', 'A00.1']

In [104]:
icd10_embeddings = {}

for index in string.ascii_uppercase:
    try:
        file = glob.glob(f'../Data/icd10_embedding/{index}/icd10_results_*.json')[0]
        
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
            # 处理每个键，移除小数点后再添加到总字典
            for key, value in data['embeddings'].items():
                clean_key = key.replace('.', '')  # 移除小数点
                icd10_embeddings[clean_key] = value
                
    except (IndexError, FileNotFoundError):
        print(f"未找到字母 {index} 对应的文件")
    except Exception as e:
        print(f"处理字母 {index} 时出错: {e}")

In [105]:
type(icd10_embeddings)

dict

In [106]:
# 检查键是否存在
if "C926" in icd10_embeddings:
    print("键存在于字典中")
else:
    print("键不存在于字典中")

键存在于字典中


In [107]:
with open('../Data/icd10_embeddings_dict.pkl', 'wb') as f:
    pickle.dump(icd10_embeddings, f)

In [108]:
# 提取键（按插入顺序，Python 3.7+字典有序）
icd10_embeddings_keys = list(icd10_embeddings.keys())

In [109]:
# 将字典值转换为NumPy矩阵（每行一个向量）
icd10_matrix = np.array([icd10_embeddings[key] for key in icd10_embeddings_keys])

In [110]:
# 创建异构图数据对象
icd10_features = torch.from_numpy(icd10_matrix).float()

## Build the Heterogenous Graph

In [4]:
from torch_geometric.data import HeteroData

  from .autonotebook import tqdm as notebook_tqdm


In [112]:
# HeteroData是PyG自带的一个异质图数据结构
HGData = HeteroData()

In [113]:
# 添加节点的信息
# data['paper'].x = ... # [num_papers, num_features_paper]
HGData['participant'].x = participant_features
HGData['disease'].x = icd10_features
HGData['protein'].x = protein_features
HGData['metabolite'].x = metabolite_features

In [114]:
print(HGData['participant'].x.shape)
print(HGData['disease'].x.shape)
print(HGData['protein'].x.shape)
print(HGData['metabolite'].x.shape)

torch.Size([47626, 1536])
torch.Size([18904, 1536])
torch.Size([2923, 1536])
torch.Size([251, 1536])


In [115]:
clinical_df_select_columns.head(2)

Unnamed: 0,eid,ICD10_codes,DiagnosisAge
0,1000122,J383,21370
1,1000122,M255,22126


In [116]:
# 创建映射字典，将eid和ICD10_codes映射到节点索引
participant_to_idx = {eid: idx for idx, eid in enumerate(embeddings_dict_keys)}
disease_to_idx = {icd10: idx for idx, icd10 in enumerate(icd10_embeddings_keys)}

# 提取临床数据中的边信息
participant_indices = []
disease_indices = []
DiagnosisAge = []

for _, row in clinical_df_select_columns.iterrows():
    eid = row['eid']
    icd10_code = row['ICD10_codes']
    
    # 检查eid和ICD10代码是否存在于节点列表中
    if eid in participant_to_idx and icd10_code in disease_to_idx:
        participant_idx = participant_to_idx[eid]
        disease_idx = disease_to_idx[icd10_code]
        
        participant_indices.append(participant_idx)
        disease_indices.append(disease_idx)
        DiagnosisAge.append(row['DiagnosisAge'])

# 转换为PyTorch张量
participant_disease_edge_index = torch.tensor([
    participant_indices,
    disease_indices
], dtype=torch.long)

In [117]:
Olink_long_df.head(2)

Unnamed: 0,eid,protein_name,protein_expression,RecruitmentAge
0,1002133,a1bg,0.0957,23090
1,1002201,a1bg,-0.0473,25311


In [118]:
# 创建映射字典，将eid和protein映射到节点索引
participant_to_idx = {eid: idx for idx, eid in enumerate(embeddings_dict_keys)}
protein_to_idx = {protein: idx for idx, protein in enumerate(protein_embeddings_dict_keys)}

# 筛选有效行
valid_df = Olink_long_df[
    Olink_long_df['eid'].isin(participant_to_idx) & 
    Olink_long_df['protein_name'].isin(protein_to_idx)
].copy()

# 映射ID到索引（向量化）
valid_df['participant_idx'] = valid_df['eid'].map(participant_to_idx)
valid_df['protein_idx'] = valid_df['protein_name'].map(protein_to_idx)

# 直接提取列数据（避免循环）
participant_indices = valid_df['participant_idx'].to_numpy()
protein_indices = valid_df['protein_idx'].to_numpy()
protein_expression = valid_df['protein_expression'].to_numpy()
protein_RecruitmentAge = valid_df['RecruitmentAge'].to_numpy()

In [119]:
# 转换为PyTorch张量
participant_protein_edge_index = torch.stack(
    [torch.from_numpy(participant_indices),
    torch.from_numpy(protein_indices)], dim=0
).to(torch.long)

In [120]:
NMR_long_df.head(2)

Unnamed: 0,eid,NMR_name,NMR_expression,RecruitmentAge
0,1000053,Glucose-lactate,2.9721,24523
1,1000148,Glucose-lactate,5.79763,23304


In [121]:
# 创建映射字典，将eid和protein映射到节点索引
participant_to_idx = {eid: idx for idx, eid in enumerate(embeddings_dict_keys)}
metabolite_to_idx = {metabolite: idx for idx, metabolite in enumerate(metabolite_embeddings_dict_keys)}

# 筛选有效行
valid_df = NMR_long_df[
    NMR_long_df['eid'].isin(participant_to_idx) & 
    NMR_long_df['NMR_name'].isin(metabolite_to_idx)
].copy()

# 映射ID到索引（向量化）
valid_df['participant_idx'] = valid_df['eid'].map(participant_to_idx)
valid_df['metabolite_idx'] = valid_df['NMR_name'].map(metabolite_to_idx)

# 直接提取列数据（避免循环）
participant_indices = valid_df['participant_idx'].to_numpy()
metabolite_indices = valid_df['metabolite_idx'].to_numpy()
NMR_expression = valid_df['NMR_expression'].to_numpy()
metabolite_RecruitmentAge = valid_df['RecruitmentAge'].to_numpy()

# 转换为PyTorch张量（优化内存分配）
participant_metabolite_edge_index = torch.stack([
    torch.from_numpy(participant_indices),
    torch.from_numpy(metabolite_indices)
], dim=0).to(torch.long)

In [122]:
# 添加边的连接信息
# data['author', 'writes', 'paper'].edge_index = ... # [2, num_edges_writes]
HGData['participant', 'expresses', 'protein'].edge_index = participant_protein_edge_index
HGData['participant', 'expresses', 'metabolite'].edge_index = participant_metabolite_edge_index
HGData['participant', 'suffers_from', 'disease'].edge_index = participant_disease_edge_index
#HGData['protein', 'interacts_with', 'protein'].edge_index = 
#HGData['protein', 'causes', 'disease'].edge_index = 
#HGData['disease', 'is_comorbid_with', 'disease'].edge_index = 

In [123]:
participant_features
icd10_features
protein_features
metabolite_features

tensor([[-0.0057,  0.0067,  0.0107,  ..., -0.0036, -0.0166, -0.0310],
        [ 0.0133,  0.0176,  0.0039,  ..., -0.0277, -0.0060, -0.0316],
        [ 0.0030,  0.0330, -0.0099,  ..., -0.0021, -0.0238, -0.0454],
        ...,
        [-0.0067,  0.0164, -0.0054,  ..., -0.0091, -0.0008, -0.0460],
        [ 0.0114,  0.0209, -0.0284,  ..., -0.0151, -0.0054, -0.0493],
        [-0.0164, -0.0015,  0.0004,  ..., -0.0115,  0.0018, -0.0472]])

In [124]:
participant_features.shape

torch.Size([47626, 1536])

In [125]:
participant_disease_edge_index.max(dim=1)

torch.return_types.max(
values=tensor([47625, 18903]),
indices=tensor([159697,    920]))

In [126]:
print(HGData)

HeteroData(
  participant={ x=[47626, 1536] },
  disease={ x=[18904, 1536] },
  protein={ x=[2923, 1536] },
  metabolite={ x=[251, 1536] },
  (participant, expresses, protein)={ edge_index=[2, 139210798] },
  (participant, expresses, metabolite)={ edge_index=[2, 6851045] },
  (participant, suffers_from, disease)={ edge_index=[2, 805376] }
)


In [127]:
# 创建一个训练掩码，随机选择40000节点作为训练集
train_mask = torch.zeros(47626, dtype=torch.bool)
train_indices = torch.randperm(47626)[:40000]  # 随机选择40000个节点
train_mask[train_indices] = True

# 将掩码赋值给HGData
HGData['participant'].train_mask = train_mask

In [128]:
val_mask = ~train_mask  # 直接取反
HGData['participant'].val_mask = val_mask

In [129]:
torch.save(HGData, '../Data/HGData.pt')

In [130]:
def check_edge_targets(data):
    print("\n===== 边目标节点分布检查 =====")
    for edge_type in data.edge_types:
        src_type, etype, dst_type = edge_type
        edge_index = data[edge_type].edge_index
        
        if src_type != 'participant':
            continue  # 只检查从participant出发的边
            
        print(f"\n边类型: {edge_type}")
        print(f"总边数: {edge_index.size(1)}")
        
        # 检查目标节点ID的唯一性和分布
        dst_nodes = edge_index[1]
        unique_dst = dst_nodes.unique()
        print(f"唯一目标节点数: {unique_dst.numel()} / {data[dst_type].num_nodes}")

check_edge_targets(HGData)


===== 边目标节点分布检查 =====

边类型: ('participant', 'expresses', 'protein')
总边数: 139210798
唯一目标节点数: 2923 / 2923

边类型: ('participant', 'expresses', 'metabolite')
总边数: 6851045
唯一目标节点数: 251 / 251

边类型: ('participant', 'suffers_from', 'disease')
总边数: 805376
唯一目标节点数: 8549 / 18904


In [5]:
from torch_geometric.loader import HGTLoader

In [6]:
from torch_geometric.transforms import ToUndirected

In [133]:
HGData = torch.load('../Data/HGData.pt', weights_only=False)
HGData_Undir = ToUndirected()(HGData)

In [134]:
torch.save(HGData_Undir, '../Data/HGData_Undir.pt')

In [135]:
print(HGData_Undir)

HeteroData(
  participant={
    x=[47626, 1536],
    train_mask=[47626],
    val_mask=[47626],
  },
  disease={ x=[18904, 1536] },
  protein={ x=[2923, 1536] },
  metabolite={ x=[251, 1536] },
  (participant, expresses, protein)={ edge_index=[2, 139210798] },
  (participant, expresses, metabolite)={ edge_index=[2, 6851045] },
  (participant, suffers_from, disease)={ edge_index=[2, 805376] },
  (protein, rev_expresses, participant)={ edge_index=[2, 139210798] },
  (metabolite, rev_expresses, participant)={ edge_index=[2, 6851045] },
  (disease, rev_suffers_from, participant)={ edge_index=[2, 805376] }
)


In [136]:
train_loader = HGTLoader(data=HGData, num_samples={key: [64] * 1 for key in HGData.node_types}, 
                         batch_size=512, shuffle=True,
                         input_nodes=('participant', None))

In [137]:
# 遍历前3个batch
for i, batch in enumerate(train_loader):
    if i > 1:
        break  # 只处理前3个batch
    
    print(f"\n===== Batch {i+1} 边信息 =====")
    print(batch)


===== Batch 1 边信息 =====
HeteroData(
  participant={
    x=[512, 1536],
    train_mask=[512],
    val_mask=[512],
    n_id=[512],
    input_id=[512],
    batch_size=512,
  },
  disease={
    x=[0, 1536],
    n_id=[0],
  },
  protein={
    x=[0, 1536],
    n_id=[0],
  },
  metabolite={
    x=[0, 1536],
    n_id=[0],
  },
  (participant, expresses, protein)={
    edge_index=[2, 0],
    e_id=[0],
  },
  (participant, expresses, metabolite)={
    edge_index=[2, 0],
    e_id=[0],
  },
  (participant, suffers_from, disease)={
    edge_index=[2, 0],
    e_id=[0],
  }
)

===== Batch 2 边信息 =====
HeteroData(
  participant={
    x=[512, 1536],
    train_mask=[512],
    val_mask=[512],
    n_id=[512],
    input_id=[512],
    batch_size=512,
  },
  disease={
    x=[0, 1536],
    n_id=[0],
  },
  protein={
    x=[0, 1536],
    n_id=[0],
  },
  metabolite={
    x=[0, 1536],
    n_id=[0],
  },
  (participant, expresses, protein)={
    edge_index=[2, 0],
    e_id=[0],
  },
  (participant, expresses, me

In [138]:
train_loader = HGTLoader(data=HGData_Undir, num_samples={key: [64] * 2 for key in HGData.node_types}, 
                         batch_size=512, shuffle=True,
                         input_nodes=('disease', None))

In [139]:
# 查看前三个batch的节点数量
print("前三个batch的节点数量统计：")
for i, batch in enumerate(train_loader):
    if i > 1:  # 只查看前3个batch
        break
        
    print(f"\n===== Batch {i+1} =====")
    node_counts = {}
    
    # 统计每个节点类型的数量
    for node_type in batch.x_dict:
        node_counts[node_type] = batch.x_dict[node_type].size(0)
    
    # 打印结果
    for node_type, count in node_counts.items():
        print(f"{node_type}节点数量: {count}")

前三个batch的节点数量统计：

===== Batch 1 =====
participant节点数量: 128
disease节点数量: 576
protein节点数量: 64
metabolite节点数量: 64

===== Batch 2 =====
participant节点数量: 128
disease节点数量: 576
protein节点数量: 64
metabolite节点数量: 64


### add self loops

In [140]:
def add_self_loops(hg):
    """为异构图的每个节点类型添加自环边"""
    new_hg = hg.clone()  # 克隆原图以避免修改原始数据
    
    # 遍历所有节点类型
    for ntype in hg.node_types:
        # 获取节点数量
        num_nodes = hg[ntype].num_nodes
    
        # 创建自环边的边索引：源节点和目标节点都是0到num_nodes-1
        src = torch.arange(num_nodes, dtype=torch.int64)
        dst = torch.arange(num_nodes, dtype=torch.int64)
        
        # 添加自环边到异构图
        new_hg[ntype, 'self_loop', ntype].edge_index = torch.stack([src, dst])
    
    return new_hg

HGData_Undir_with_self_loops = add_self_loops(HGData_Undir)

In [141]:
torch.save(HGData_Undir_with_self_loops, '../Data/HGData_Undir_with_self_loops.pt')

In [7]:
HGData_Undir_with_self_loops = torch.load('../Data/HGData_Undir_with_self_loops.pt', weights_only=False)

In [145]:
np.save('../Data/protein_expression.npy', protein_expression)
np.save('../Data/protein_RecruitmentAge.npy', protein_RecruitmentAge)
np.save('../Data/NMR_expression.npy', NMR_expression)
np.save('../Data/metabolite_RecruitmentAge.npy', metabolite_RecruitmentAge)
np.save('../Data/DiagnosisAge.npy', DiagnosisAge)

In [8]:
protein_expression = np.load('../Data/protein_expression.npy')
protein_RecruitmentAge = np.load('../Data/protein_RecruitmentAge.npy')
NMR_expression = np.load('../Data/NMR_expression.npy')
metabolite_RecruitmentAge = np.load('../Data/metabolite_RecruitmentAge.npy')
DiagnosisAge = np.load('../Data/DiagnosisAge.npy')

In [9]:
protein_expression_tensor = torch.tensor(protein_expression, dtype=torch.float32)

In [10]:
protein_RecruitmentAge_tensor = torch.tensor(protein_RecruitmentAge, dtype=torch.float32)

In [11]:
NMR_expression_tensor = torch.tensor(NMR_expression, dtype=torch.float32)

In [12]:
metabolite_RecruitmentAge_tensor = torch.tensor(metabolite_RecruitmentAge, dtype=torch.float32)

In [13]:
DiagnosisAge_tensor = torch.tensor(DiagnosisAge, dtype=torch.float32)

In [14]:
# Z-score归一化函数
def z_score_normalize(tensor):
    mean = tensor.mean()
    std = tensor.std()
    # 防止除零错误
    if std == 0:
        return tensor
    return (tensor - mean) / std

In [15]:
# 对年龄数据进行Z-score归一化
protein_RecruitmentAge_norm = z_score_normalize(protein_RecruitmentAge_tensor)

In [16]:
metabolite_RecruitmentAge_norm = z_score_normalize(metabolite_RecruitmentAge_tensor)

In [17]:
DiagnosisAge_norm = z_score_normalize(DiagnosisAge_tensor)

In [18]:
# Min-Max归一化函数
def min_max_normalize(tensor):
    min_val = tensor.min()
    max_val = tensor.max()
    # 防止除零错误
    if max_val == min_val:
        return tensor
    return (tensor - min_val) / (max_val - min_val)

In [19]:
# 对表达量数据进行Min-Max归一化
protein_expression_norm = min_max_normalize(protein_expression_tensor)

In [20]:
NMR_expression_norm = min_max_normalize(NMR_expression_tensor)

In [23]:
# 添加边的属性信息
# data['paper', 'cites', 'paper'].edge_attr = ... # [num_edges_cites, num_features_cites]
HGData_Undir_with_self_loops['participant', 'expresses', 'protein'].edge_attr = torch.cat([protein_expression_norm.unsqueeze(1),  # 形状：[num_edges, 1]
               protein_RecruitmentAge_norm.unsqueeze(1)], dim=1)

HGData_Undir_with_self_loops['participant', 'expresses', 'metabolite'].edge_attr = torch.cat([NMR_expression_norm.unsqueeze(1),
               metabolite_RecruitmentAge_norm.unsqueeze(1)], dim=1)

HGData_Undir_with_self_loops['participant', 'suffers_from', 'disease'].edge_attr = DiagnosisAge_norm.unsqueeze(1)

#HGData['protein', 'interacts_with', 'protein'].edge_attr = None
#HGData['protein', 'causes', 'disease'].edge_attr = None
#HGData['disease', 'is_comorbid_with', 'disease'].edge_attr = None

In [24]:
print(HGData_Undir_with_self_loops)

HeteroData(
  participant={
    x=[47626, 1536],
    train_mask=[47626],
    val_mask=[47626],
  },
  disease={ x=[18904, 1536] },
  protein={ x=[2923, 1536] },
  metabolite={ x=[251, 1536] },
  (participant, expresses, protein)={
    edge_index=[2, 139210798],
    edge_attr=[139210798, 2],
  },
  (participant, expresses, metabolite)={
    edge_index=[2, 6851045],
    edge_attr=[6851045, 2],
  },
  (participant, suffers_from, disease)={
    edge_index=[2, 805376],
    edge_attr=[805376, 1],
  },
  (protein, rev_expresses, participant)={ edge_index=[2, 139210798] },
  (metabolite, rev_expresses, participant)={ edge_index=[2, 6851045] },
  (disease, rev_suffers_from, participant)={ edge_index=[2, 805376] },
  (participant, self_loop, participant)={ edge_index=[2, 47626] },
  (disease, self_loop, disease)={ edge_index=[2, 18904] },
  (protein, self_loop, protein)={ edge_index=[2, 2923] },
  (metabolite, self_loop, metabolite)={ edge_index=[2, 251] }
)


In [25]:
torch.save(HGData_Undir_with_self_loops, '../Data/HGData_Undir_with_self_loops_with_attr.pt')

In [27]:
def add_default_edge_attr(data):
    """为没有边属性的边类型添加全1的边属性（维度为1）"""
    for edge_type in data.edge_types:
        # 检查边类型是否有edge_attr属性
        if 'edge_attr' not in data[edge_type]:
            # 获取该边类型的边数量
            num_edges = data[edge_type].edge_index.size(1)
            # 创建全1的边属性张量，维度为[num_edges, 1]
            data[edge_type].edge_attr = torch.ones(num_edges, 1)
            print(f"为边类型 {edge_type} 添加了默认边属性，形状为 {data[edge_type].edge_attr.shape}")
    return data

# 使用示例
HGData_Undir_with_self_loops_add_default_edge_attr = add_default_edge_attr(HGData_Undir_with_self_loops)

为边类型 ('protein', 'rev_expresses', 'participant') 添加了默认边属性，形状为 torch.Size([139210798, 1])
为边类型 ('metabolite', 'rev_expresses', 'participant') 添加了默认边属性，形状为 torch.Size([6851045, 1])
为边类型 ('disease', 'rev_suffers_from', 'participant') 添加了默认边属性，形状为 torch.Size([805376, 1])
为边类型 ('participant', 'self_loop', 'participant') 添加了默认边属性，形状为 torch.Size([47626, 1])
为边类型 ('disease', 'self_loop', 'disease') 添加了默认边属性，形状为 torch.Size([18904, 1])
为边类型 ('protein', 'self_loop', 'protein') 添加了默认边属性，形状为 torch.Size([2923, 1])
为边类型 ('metabolite', 'self_loop', 'metabolite') 添加了默认边属性，形状为 torch.Size([251, 1])


In [29]:
print(HGData_Undir_with_self_loops_add_default_edge_attr)

HeteroData(
  participant={
    x=[47626, 1536],
    train_mask=[47626],
    val_mask=[47626],
  },
  disease={ x=[18904, 1536] },
  protein={ x=[2923, 1536] },
  metabolite={ x=[251, 1536] },
  (participant, expresses, protein)={
    edge_index=[2, 139210798],
    edge_attr=[139210798, 2],
  },
  (participant, expresses, metabolite)={
    edge_index=[2, 6851045],
    edge_attr=[6851045, 2],
  },
  (participant, suffers_from, disease)={
    edge_index=[2, 805376],
    edge_attr=[805376, 1],
  },
  (protein, rev_expresses, participant)={
    edge_index=[2, 139210798],
    edge_attr=[139210798, 1],
  },
  (metabolite, rev_expresses, participant)={
    edge_index=[2, 6851045],
    edge_attr=[6851045, 1],
  },
  (disease, rev_suffers_from, participant)={
    edge_index=[2, 805376],
    edge_attr=[805376, 1],
  },
  (participant, self_loop, participant)={
    edge_index=[2, 47626],
    edge_attr=[47626, 1],
  },
  (disease, self_loop, disease)={
    edge_index=[2, 18904],
    edge_attr=[189

In [28]:
torch.save(HGData_Undir_with_self_loops_add_default_edge_attr, '../Data/HGData_Undir_with_self_loops_add_default_edge_attr.pt')

## get KNN

In [None]:
import numpy as np
from sklearn.metrics import pairwise_distances
from typing import List, Optional, Dict, Tuple
from scipy.stats import spearmanr

In [None]:
def get_knn( # 不包含自己
    embeddings: np.ndarray,
    names: List[str],
    metric: str,
    k: int,
    threshold: Optional[float] = None
) -> Dict[str, List[Tuple[str, float]]]:
    if metric=='spearman':
        corr_matrix, _ = spearmanr(embeddings, axis=1) #axis=1 means rows
        dists = 1.0 - np.abs(corr_matrix)
    else:
        dists = pairwise_distances(embeddings, metric=metric)
    n = len(names)
    result: Dict[str, List[Tuple[str, float]]] = {}
    for i, name in enumerate(names):
        dist_row = dists[i]
        sorted_idx = np.argsort(dist_row)
        neighbors: List[Tuple[str, float]] = []
        for j in sorted_idx:
            if j == i:
                # skip self
                continue
            dist_ij = float(dist_row[j])
            if threshold is not None and dist_ij > threshold:
                # skip if beyond threshold
                continue
            neighbors.append((names[j], dist_ij))
            if len(neighbors) >= k:
                break

        result[name] = neighbors

    return result

# embs = np.array([
#     [0.0, 1.0],
#     [1.0, 0.0],
#     [1.0, 1.0],
#     [2.0, 2.0],
# ])
# names = ["A", "B", "C", "D"]
# knn = get_knn(embs, names, metric="euclidean", k=2, threshold=None)
# for name, neighbors in knn.items():
#     print(f"{name}: {neighbors}")

##### Olink KNN

In [None]:
Olink_df_imputed.head(2)

In [None]:
# 复制数据并排除第一列（eid）
Olink_df_imputed_processed = Olink_df_imputed.iloc[:, 1:].copy()

# 移除列名中的 "olink_instance_0." 前缀
Olink_df_imputed_processed.columns = Olink_df_imputed_processed.columns.str.replace('olink_instance_0.', '', regex=False)

In [None]:
Olink_df_imputed_processed.head(2)

In [None]:
Olink_df_transposed = Olink_df_imputed_processed.T

In [None]:
Olink_knn = get_knn(embeddings=Olink_df_transposed.to_numpy(), 
                    names=Olink_df_transposed.index.tolist(), 
                    metric="spearman", k=10, threshold=None)

In [None]:
# 初始化空列表收集数据
rows = []
for node1, neighbors in Olink_knn.items():
    for neighbor in neighbors:
        node2, distance = neighbor
        rows.append({
            'node1': node1,
            'node2': node2,
            'relationship': 1
        })
# 创建DataFrame
Olink_knn_dataframe = pd.DataFrame(rows)

#### NMR KNN

In [None]:
NMR_df_imputed.head(2)

In [None]:
# 复制数据并排除第一列（eid）
NMR_df_imputed_processed = NMR_df_imputed.iloc[:, 1:].copy()

# 移除列名中的 "participant." 前缀
NMR_df_imputed_processed.columns = NMR_df_imputed_processed.columns.str.replace('participant.', '', regex=False)

In [None]:
NMR_df_imputed_processed.head(2)

In [None]:
NMR_df_transposed = NMR_df_imputed_processed.T

In [None]:
NMR_knn = get_knn(embeddings=NMR_df_transposed.to_numpy(), 
                    names=NMR_df_transposed.index.tolist(), 
                    metric="spearman", k=10, threshold=None)

In [None]:
# 初始化空列表收集数据
rows = []
for node1, neighbors in NMR_knn.items():
    for neighbor in neighbors:
        node2, distance = neighbor
        rows.append({
            'node1': node1,
            'node2': node2,
            'relationship': 1
        })
# 创建DataFrame
NMR_knn_dataframe = pd.DataFrame(rows)

In [None]:
# convert NMR names
NMR_knn_dataframe['node1'] = NMR_knn_dataframe['node1'].apply(map_node2)
NMR_knn_dataframe['node2'] = NMR_knn_dataframe['node2'].apply(map_node2)

#### ICD10 KNN

In [None]:
# 提取键（按插入顺序，Python 3.7+字典有序）
icd10_embeddings_keys = list(icd10_embeddings.keys())

In [None]:
# 将字典值转换为NumPy矩阵（每行一个向量）
icd10_matrix = np.array([icd10_embeddings[key] for key in icd10_embeddings_keys])

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=150, random_state=42)
icd10_matrix_pca = pca.fit_transform(icd10_matrix)
print(f"PCA降维后形状: {icd10_matrix_pca.shape}")
print(f"解释方差比例: {np.sum(pca.explained_variance_ratio_):.3f}")

In [None]:
icd10_knn = get_knn(embeddings=icd10_matrix_pca, 
                    names=icd10_embeddings_keys, 
                    metric="euclidean", k=10, threshold=None)

In [None]:
# 初始化空列表收集数据
rows = []
for node1, neighbors in icd10_knn.items():
    for neighbor in neighbors:
        node2, distance = neighbor
        #if node1[0] == node2[0]:
        rows.append({
                'node1': node1,
                'node2': node2,
                'relationship': 1})
# 创建DataFrame
icd10_knn_dataframe = pd.DataFrame(rows)

#### eid KNN

In [None]:
len(embeddings_dict)

In [None]:
# 提取键（按插入顺序，Python 3.7+字典有序）
embeddings_dict_keys = list(embeddings_dict.keys())

In [None]:
select_embeddings_dict_keys = list(set(embeddings_dict_keys) & set(subset_clinical_df['node1'].astype(str).drop_duplicates(keep='first').reset_index(drop=True)))

In [None]:
len(select_embeddings_dict_keys)

In [None]:
with open('../Data/select_embeddings_dict_keys.json', 'w') as f:
    json.dump(select_embeddings_dict_keys,f)

In [None]:
# 将字典值转换为NumPy矩阵（每行一个向量）
eid_matrix = np.array([embeddings_dict[key] for key in select_embeddings_dict_keys])

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100, random_state=42)
eid_matrix_pca = pca.fit_transform(eid_matrix)
print(f"PCA降维后形状: {eid_matrix_pca.shape}")
print(f"解释方差比例: {np.sum(pca.explained_variance_ratio_):.3f}")

In [None]:
eid_knn = get_knn(embeddings=eid_matrix_pca, 
                    names=select_embeddings_dict_keys, 
                    metric="euclidean", k=5, threshold=None)

In [None]:
# 初始化空列表收集数据
rows = []
for node1, neighbors in eid_knn.items():
    for neighbor in neighbors:
        node2, distance = neighbor
        rows.append({
            'node1': node1,
            'node2': node2,
            'relationship': 1
        })
# 创建DataFrame
eid_knn_dataframe = pd.DataFrame(rows)

In [None]:
# add KNN dataframes
merged_df_add_KNN = pd.concat([merged_df, Olink_knn_dataframe, NMR_knn_dataframe, icd10_knn_dataframe, eid_knn_dataframe], ignore_index=True)

In [None]:
pd.api.types.is_string_dtype(merged_df_add_KNN['node1'])

In [None]:
merged_df_add_KNN['relationship'] = 1

In [None]:
# 保存为以制表符分隔的文本文件
file_path = '../Data/merged_df_long_convert.txt'
merged_df_add_KNN.to_csv(file_path, sep='\t', na_rep='nan', index=False)

In [None]:
import numpy as np
import os
from scipy.sparse import csr_matrix
from utils import N2V
import pandas as pd
from fastnode2vec import Graph, Node2Vec 

In [None]:
adj_path = '../Data/merged_df_long_convert.txt'

In [None]:
merged_df = pd.read_csv(adj_path, sep='\t',
    dtype={'node1': str, 'node2': str, 'relationship': float} )
merged_df.head(2)

In [None]:
# X, A, Y = [], None, []
# n_node = 0

# # Acquire Edges
# edge_list = []
# node_list = []
# node_type = {}
# relationship_list = []

# with open(adj_path, 'rt', encoding='utf-8') as f:
#     next(f)
#     for line in f.readlines():
#         node1, node2, relationship, *_ = line.strip().split('\t')
#         edge_list.append((node1, node2))
#         node_list.extend([node1, node2])
#         relationship_list.append(relationship)

# node_map = {item: i for i, item in enumerate(sorted(list(set(node_list))))}
# n_node = len(node_map)

# # 使用稀疏矩阵存储邻接关系
# row = []
# col = []
# data = []
# for i, (node1, node2) in enumerate(edge_list):
#     row.append(node_map[node1])
#     col.append(node_map[node2])
#     data.append(relationship_list[i])  # 使用实际的关系值
#     row.append(node_map[node2])
#     col.append(node_map[node1])
#     data.append(relationship_list[i])  # 使用实际的关系值
# A = csr_matrix((data, (row, col)), shape=(n_node, n_node), dtype=np.float32)

In [None]:
adj_matrix = pd.read_csv(adj_path, sep='\t',
    dtype={'node1': str, 'node2': str, 'relationship': float})
# 构建图所需的边列表
edges = [(row['node1'], row['node2'], row['relationship']) for _, row in adj_matrix.iterrows()]

# 构建图
graph = Graph(edges, directed=False, weighted=True)

In [None]:
graph.node_names

In [None]:
graph

In [None]:
# 保存图对象
with open('../Data/graph.pkl', 'wb') as f:
    pickle.dump(graph, f)

In [None]:
# 创建新字典，保留原始字典不变
merged_dict = {**protein_embeddings_dict, **embeddings_dict, **metabolite_embeddings_dict, **icd10_embeddings}

In [None]:
all(isinstance(key, str) for key in merged_dict.keys())

In [None]:
all(isinstance(node_name, str) for node_name in graph.node_names)

In [None]:
# 提取所有键并保持顺序（Python 3.7+ 字典保持插入顺序）
keys = list(graph.node_names)

# 将字典值转换为NumPy矩阵（每行一个向量）
matrix = np.array([merged_dict[key] for key in keys])

# 检查矩阵形状
print("矩阵形状:", matrix.shape)  # 输出: (47626, 向量维度)

# 示例：查看前3行
print("前3行:\n", matrix[:3])

In [None]:
# 保存
with open('../Data/keys.json', 'w') as f:
    json.dump(keys, f)

In [None]:
np.save('../Data/UKB_node_feature_gpt.npy', matrix)