# 加载数据

In [1]:
import os
import torch
import numpy as np

# Load the graph data and labels
# NOTE(review): work_dir is a hard-coded absolute path; adjust it before running on another machine.
work_dir = r'D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data'
merged_file = os.path.join(work_dir, 'all_graphs_to_be_predicted.pt')
# torch.load relies on pickle, so only load files that come from a trusted source.
merged_graphs = torch.load(merged_file)
# Display how many items were loaded.
len(merged_graphs)

480

# 多层注意力模型

In [2]:
import torch
import torch.nn as nn
from torch_geometric.nn import GATConv, global_mean_pool
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Set the random seeds
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed passed to ``random``, ``numpy.random`` and ``torch``.

    Returns:
        None. The global RNG state of each library is modified.
    """
    import random
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

set_seed(42)  # seed all RNGs; change the value to use a different seed

# GAT model definition
class GATModel(nn.Module):
    """Four-layer graph attention network with a graph-level linear head.

    Three multi-head ``GATConv`` layers are followed by one single-head
    ``GATConv`` layer, global mean pooling over the nodes of each graph and a
    linear output layer.

    Args:
        in_dim: Number of input features per node.
        hidden_dim: Output channels per attention head of each GAT layer.
        out_dim: Size of the output vector produced per graph.
        num_heads: Number of attention heads in the first three GAT layers.
        dropout_rate: ``dropout`` value passed to every ``GATConv`` layer.
        dosage_weight: Multiplier applied to feature column
            ``DOSAGE_FEATURE_INDEX`` before it is clamped to ``[0, 10]``.
    """

    # Zero-based column of the node feature scaled by ``dosage_weight`` (the
    # 91st feature). Kept as a class attribute rather than an instance
    # attribute so that models saved as whole pickled objects, whose
    # ``__init__`` is not re-run on load, still resolve it.
    DOSAGE_FEATURE_INDEX = 90

    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, dropout_rate=0.3, dosage_weight=1.0):
        super().__init__()
        self.dosage_weight = dosage_weight  # multiplier for the scaled feature column

        # Four stacked GAT attention layers. Layers 2-4 take
        # hidden_dim * num_heads input channels, matching the multi-head
        # output width of the preceding layer.
        self.layer1 = GATConv(in_dim, hidden_dim, heads=num_heads, dropout=dropout_rate)
        self.layer2 = GATConv(hidden_dim * num_heads, hidden_dim, heads=num_heads, dropout=dropout_rate)
        self.layer3 = GATConv(hidden_dim * num_heads, hidden_dim, heads=num_heads, dropout=dropout_rate)
        self.layer4 = GATConv(hidden_dim * num_heads, hidden_dim, heads=1, dropout=dropout_rate)  # single-head fourth layer

        # Fully connected layer producing the final output
        self.fc = nn.Linear(hidden_dim, out_dim)

        # Weight initialisation
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialise the linear projection of every GAT layer.

        NOTE(review): assumes each ``GATConv`` exposes its projection as
        ``lin`` -- confirm for the installed torch_geometric version.
        """
        for layer in [self.layer1, self.layer2, self.layer3, self.layer4]:
            nn.init.xavier_uniform_(layer.lin.weight)
            if layer.lin.bias is not None:
                nn.init.zeros_(layer.lin.bias)  # biases start at zero

    def forward(self, data):
        """Run the network on a graph (or batch of graphs).

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``batch``
                attributes. ``data.x`` is left unmodified.

        Returns:
            A tuple ``(out, hg, attn)`` where ``out`` is the output of the
            linear head, ``hg`` is the mean-pooled graph embedding and
            ``attn`` is a 4-tuple holding the attention output returned by
            each GAT layer (in layer order).
        """
        edge_index, batch = data.edge_index, data.batch

        # Work on a copy: assigning into ``data.x`` directly would permanently
        # rescale/clamp the caller's features, and the scaling would compound
        # on every forward pass whenever dosage_weight != 1.
        x = data.x.clone()
        col = self.DOSAGE_FEATURE_INDEX
        x[:, col] = torch.clamp(x[:, col] * self.dosage_weight, min=0, max=10)

        # Run the GAT layers, keeping the attention weights of each one
        h, attn_weights_1 = self.layer1(x, edge_index, return_attention_weights=True)
        h = torch.relu(h)

        h, attn_weights_2 = self.layer2(h, edge_index, return_attention_weights=True)
        h = torch.relu(h)

        h, attn_weights_3 = self.layer3(h, edge_index, return_attention_weights=True)
        h = torch.relu(h)

        # No ReLU after the fourth layer; its output is pooled directly.
        h, attn_weights_4 = self.layer4(h, edge_index, return_attention_weights=True)

        # Aggregate node embeddings into one vector per graph
        hg = global_mean_pool(h, batch)

        # Output layer
        out = self.fc(hg)

        # Return the final output together with all attention weights
        return out, hg, (attn_weights_1, attn_weights_2, attn_weights_3, attn_weights_4)

# Model hyperparameters
in_dim = 91       # node feature dimension
hidden_dim = 64   # hidden layer dimension
out_dim = 4       # output dimension; NOTE(review): the original comment said this corresponds to 5 labels, but the value is 4 -- confirm
num_heads = 2     # number of GAT attention heads
dropout_rate = 0.4
dosage_weight = 1  # multiplier for the dosage feature; 1 leaves it unscaled before clamping

# Build the model
model = GATModel(in_dim, hidden_dim, out_dim, num_heads, dropout_rate, dosage_weight=dosage_weight)
print(model)

GATModel(
  (layer1): GATConv(91, 64, heads=2)
  (layer2): GATConv(128, 64, heads=2)
  (layer3): GATConv(128, 64, heads=2)
  (layer4): GATConv(128, 64, heads=1)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)


# 模型加载

In [3]:
import gc

# Delete the model
del model  # drop the reference to the model object
gc.collect()  # force a garbage-collection pass to free memory

85

In [4]:
# Load the model
# The checkpoint holds the whole pickled module, so no GATModel instance needs
# to be built beforehand; only the GATModel class definition must be in scope
# for unpickling.
# NOTE(review): unpickling can execute arbitrary code -- only load checkpoints
# from a trusted source.
# NOTE(review): from PyTorch 2.6 torch.load defaults to weights_only=True,
# which rejects fully pickled modules; pass weights_only=False there.
model_save_path = os.path.join(work_dir, 'gat_model.pth')
model = torch.load(model_save_path)  # load the entire model object
model.eval()  # switch to evaluation mode
print(f"Model loaded from {model_save_path}")


Model loaded from D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data\gat_model.pth


In [5]:
# Display the number of loaded graphs.
len(merged_graphs)

480

# 模型预测

In [13]:
#输出为TSV文件
import os
import torch
import pandas as pd
import numpy as np

# Helper that rounds a number to four decimal places
def format_value(val):
    """Return ``val`` rounded to four decimal places.

    This is ``round(val, 4)``: four digits after the decimal point, not
    four significant figures.
    """
    decimals = 4
    rounded = round(val, decimals)
    return rounded

# Prediction helpers
def _collect_attention_rows(cpm_id, node_names, attn_weights):
    """Flatten per-layer attention weights into one record per named edge.

    Args:
        cpm_id: Identifier of the sample, copied into every record.
        node_names: Sequence mapping node index to node name.
        attn_weights: Iterable of ``(edge_index, attn_weight)`` pairs, one
            per GAT layer, as returned by the model.

    Returns:
        A list of dicts with ``cpm_id``, ``Source``, ``Target`` and one
        ``attn_weights_<layer>_head_<head>`` entry per layer and head.
        Edges that share the same (source name, target name) pair share a
        single record; a later value overwrites an earlier one for the same
        column.
    """
    edge_rows = {}

    for layer_idx, (edge_index, attn_weight) in enumerate(attn_weights, start=1):
        # Transpose edge_index from [2, E] to [E, 2] so each row is one edge.
        edges = edge_index.detach().cpu().numpy().T
        attn_weight_np = attn_weight.detach().cpu().numpy()  # [E, heads]

        for (node_idx_1, node_idx_2), attn in zip(edges, attn_weight_np):
            node_name_1 = node_names[int(node_idx_1)]
            node_name_2 = node_names[int(node_idx_2)]

            row = edge_rows.setdefault(
                (node_name_1, node_name_2),
                {"cpm_id": cpm_id, "Source": node_name_1, "Target": node_name_2},
            )

            # One column per attention head, numbered from 1.
            for head_idx, attn_value in enumerate(attn, start=1):
                row[f"attn_weights_{layer_idx}_head_{head_idx}"] = format_value(attn_value)

    return list(edge_rows.values())


def predict_samples(start_index, end_index):
    """Run the model on a range of graphs and export the results as TSV.

    Reads the module-level ``merged_graphs``, ``model`` and ``work_dir``.
    Writes ``prediction_outputs.tsv`` (sigmoid probabilities and pooled graph
    embedding per sample) and ``attention_weights.tsv`` (per-edge attention
    weights of every layer and head) into ``work_dir`` and prints both paths.

    Args:
        start_index: Index of the first sample in ``merged_graphs``.
        end_index: Index of the last sample (inclusive).

    Returns:
        None.
    """
    output_results = []
    attn_results = []

    # Make sure dropout is disabled, and skip autograd bookkeeping: this is
    # inference only.
    model.eval()
    with torch.no_grad():
        for i in range(start_index, end_index + 1):
            sample = merged_graphs[i]
            cpm_id = sample.cpm_id  # unique identifier of the sample
            out, hg, attn_weights = model(sample)

            # Apply a sigmoid to the logits to obtain probabilities
            out_probs = torch.sigmoid(out).detach().cpu().numpy()
            hg_values = hg.detach().cpu().numpy()

            # First table: cpm_id, class probabilities and pooled embedding,
            # with columns numbered from 1.
            output_results.append({
                "cpm_id": cpm_id,
                **{f"Class_{j+1}": format_value(val) for j, val in enumerate(out_probs.flatten())},
                **{f"hg_{j+1}": format_value(val) for j, val in enumerate(hg_values.flatten())},
            })

            # Second table: cpm_id, node names and attention weights.
            # Requires the sample to carry a `node_names` attribute.
            attn_results.extend(
                _collect_attention_rows(cpm_id, sample.node_names, attn_weights)
            )

    # Export the first table as a TSV file
    output_df = pd.DataFrame(output_results)
    output_path = os.path.join(work_dir, 'prediction_outputs.tsv')
    output_df.to_csv(output_path, sep='\t', index=False)
    print(f"Prediction outputs exported to {output_path} as TSV")

    # Export the second table as a TSV file
    attn_df = pd.DataFrame(attn_results)
    attn_path = os.path.join(work_dir, 'attention_weights.tsv')
    attn_df.to_csv(attn_path, sep='\t', index=False)
    print(f"Attention weights exported to {attn_path} as TSV")

# Custom prediction range (both bounds are inclusive indices into merged_graphs)
start_index = 0  # change as needed
end_index = 479  # change as needed
predict_samples(start_index, end_index)



Prediction outputs exported to D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data\prediction_outputs.tsv as TSV
Attention weights exported to D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data\attention_weights.tsv as TSV
