# 样本集划分

In [1]:
import os
import torch
import numpy as np
from sklearn.model_selection import train_test_split


# Load the merged graph collection from disk.
work_dir = r'D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data'
merged_file = os.path.join(work_dir, 'all_graphs_to_be_predicted.pt')
merged_graphs = torch.load(merged_file)


def _label_as_numpy(y):
    """Return a graph label as NumPy data; non-tensor labels pass through."""
    return y.numpy() if isinstance(y, torch.Tensor) else y


# Each graph is assumed to carry its label in the ``y`` attribute; the
# per-graph labels are collected into a single NumPy array.
labels = np.array([_label_as_numpy(graph.y) for graph in merged_graphs])

def random_train_test_split(graphs, labels, test_size=0.3, random_state=42):
    """Randomly split graphs and their labels into two aligned subsets.

    Thin wrapper around ``train_test_split`` with shuffling always enabled.

    Args:
        graphs: Sequence of graph objects.
        labels: Labels aligned index-by-index with ``graphs``.
        test_size: Size of the second subset, forwarded to
            ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train_graphs, test_graphs, train_labels, test_labels)``.
    """
    split_options = {
        'test_size': test_size,
        'random_state': random_state,
        'shuffle': True,
    }
    kept_graphs, held_graphs, kept_labels, held_labels = train_test_split(
        graphs, labels, **split_options)
    return kept_graphs, held_graphs, kept_labels, held_labels

# Dataset split: first hold out a small remainder (test_size=0.003) ...
train_graphs, temp_graphs, train_labels, temp_labels = random_train_test_split(
    graphs=merged_graphs,
    labels=labels,
    test_size=0.003,
    random_state=42,
)

# ... then divide that remainder evenly into validation and test subsets.
val_graphs, test_graphs, val_labels, test_labels = random_train_test_split(
    graphs=temp_graphs,
    labels=temp_labels,
    test_size=0.5,
    random_state=42,
)

# Share of entries equal to 1 in a label array
def calculate_label_proportions(labels):
    """Return the fraction of entries equal to 1 along axis 0.

    Args:
        labels: NumPy array of labels; for a 2-D array each column is
            averaged separately.

    Returns:
        The mean of ``labels == 1`` taken over axis 0.
    """
    is_positive = labels == 1
    return np.mean(is_positive, axis=0)

# Per-label share of 1s in the training, validation and test subsets.
# This runs on the NumPy labels, before the tensor conversion below.
train_proportions, val_proportions, test_proportions = (
    calculate_label_proportions(subset_labels)
    for subset_labels in (train_labels, val_labels, test_labels)
)

# Convert the labels to torch.Tensor
train_labels, val_labels, test_labels = (
    torch.tensor(subset_labels)
    for subset_labels in (train_labels, val_labels, test_labels)
)

# Report the size of each subset
print(f"Training set: {len(train_graphs)} graphs")
print(f"Validation set: {len(val_graphs)} graphs")
print(f"Test set: {len(test_graphs)} graphs")

# Report the share of 1s per label
print("Proportion of '1's for each label in training set:", train_proportions)
print("Proportion of '1's for each label in validation set:", val_proportions)
print("Proportion of '1's for each label in test set:", test_proportions)


Training set: 478 graphs
Validation set: 1 graphs
Test set: 1 graphs
Proportion of '1's for each label in training set: [0.41004184 0.70502092 0.52719665 0.46443515]
Proportion of '1's for each label in validation set: [1. 0. 0. 0.]
Proportion of '1's for each label in test set: [1. 1. 1. 0.]


# 多层注意力模型

In [2]:
import torch
import torch.nn as nn
from torch_geometric.nn import GATConv, global_mean_pool
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Seed the random number generators
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    import random
    import numpy as np

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,  # relevant when a GPU is used
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)  # choose the random seed for this session

# GAT model definition
class GATModel(nn.Module):
    """Four-layer graph attention network with a mean-pooled graph readout.

    Three multi-head ``GATConv`` layers are followed by a single-head
    ``GATConv`` layer; node embeddings are averaged per graph and mapped to
    ``out_dim`` outputs by a linear layer.

    Args:
        in_dim: Node feature dimension.
        hidden_dim: Output width of each attention head.
        out_dim: Number of outputs of the final linear layer.
        num_heads: Number of attention heads in the first three layers.
        dropout_rate: Dropout passed to every ``GATConv`` layer.
        dosage_weight: Scaling factor intended for the dosage feature
            (column index 90). It is only stored; the scaling lines in
            ``forward`` are currently commented out.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, dropout_rate=0.3, dosage_weight=1.0):
        super(GATModel, self).__init__()
        self.dosage_weight = dosage_weight  # weight meant to scale feature no. 91

        # Four GAT attention layers; multi-head outputs are concatenated, so
        # layers 2-4 take hidden_dim * num_heads input features.
        self.layer1 = GATConv(in_dim, hidden_dim, heads=num_heads, dropout=dropout_rate)
        self.layer2 = GATConv(hidden_dim * num_heads, hidden_dim, heads=num_heads, dropout=dropout_rate)
        self.layer3 = GATConv(hidden_dim * num_heads, hidden_dim, heads=num_heads, dropout=dropout_rate)
        self.layer4 = GATConv(hidden_dim * num_heads, hidden_dim, heads=1, dropout=dropout_rate)  # fourth GAT layer

        # Fully connected layer producing the final output
        self.fc = nn.Linear(hidden_dim, out_dim)

        # Weight initialisation
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialise the linear projection of every GAT layer."""
        for layer in [self.layer1, self.layer2, self.layer3, self.layer4]:
            nn.init.xavier_uniform_(layer.lin.weight)  # projection weights
            if layer.lin.bias is not None:
                nn.init.zeros_(layer.lin.bias)  # biases start at zero

    def forward(self, data):
        """Run the network on one graph (or batch) object.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            Tuple ``(out, hg, attention)``: the linear-layer output, the
            mean-pooled graph embedding, and a 4-tuple with the attention
            result returned by each GAT layer.
        """
        edge_index, batch = data.edge_index, data.batch

        # Work on a copy: the masking below is an in-place write, and applying
        # it to data.x directly would permanently zero the caller's features.
        x = data.x.clone()

        # Ablation switches: zero the features of trailing nodes. Only the
        # active line takes effect; the others are alternative experiments.
        # NOTE(review): with a multi-graph batch this only touches the last
        # rows of the concatenated x — confirm it is used on single graphs.
        #x[-1, :] = 0  # 'Hot therapeutic'
        #x[-2, :] = 0  # 'Cold therapeutic'
        #x[-3, :] = 0  # 'Moist therapeutic'
        #x[-4, :] = 0  # 'Dry therapeutic'
        x[-5:, :] = 0  # CHP

        # Optional feature scaling by self.dosage_weight, clamped to [0, 10]
        #x[:, 90] = torch.clamp(x[:, 90] * self.dosage_weight, min=0, max=10)  #   Dosage weight
        #x[:, 0:15] = torch.clamp(x[:, 0:15] * self.dosage_weight, min=0, max=10)  # features: Sources
        #x[:, 15:30] = torch.clamp(x[:, 15:30] * self.dosage_weight, min=0, max=10)  # features: Medicinal properties
        #x[:, 30:60] = torch.clamp(x[:, 30:60] * self.dosage_weight, min=0, max=10)  # features: Combination
        #x[:, 60:90] = torch.clamp(x[:, 60:90] * self.dosage_weight, min=0, max=10)  # features: Efficacy

        # Run the GAT layers, keeping each layer's attention weights
        h, attn_weights_1 = self.layer1(x, edge_index, return_attention_weights=True)
        h = torch.relu(h)

        h, attn_weights_2 = self.layer2(h, edge_index, return_attention_weights=True)
        h = torch.relu(h)

        h, attn_weights_3 = self.layer3(h, edge_index, return_attention_weights=True)
        h = torch.relu(h)

        # No ReLU after the last GAT layer
        h, attn_weights_4 = self.layer4(h, edge_index, return_attention_weights=True)

        # Aggregate node embeddings with global mean pooling
        hg = global_mean_pool(h, batch)

        # Output layer
        out = self.fc(hg)

        # Return the output, the pooled embedding and all attention weights
        return out, hg, (attn_weights_1, attn_weights_2, attn_weights_3, attn_weights_4)

# Model hyperparameters
in_dim = 91        # node feature dimension
hidden_dim = 64    # hidden dimension
out_dim = 4        # output dimension (4 outputs)
num_heads = 2      # number of GAT attention heads
dropout_rate = 0.4
dosage_weight = 0  # weight for the dosage parameter

# Build the model
model = GATModel(
    in_dim=in_dim,
    hidden_dim=hidden_dim,
    out_dim=out_dim,
    num_heads=num_heads,
    dropout_rate=dropout_rate,
    dosage_weight=dosage_weight,
)
print(model)

GATModel(
  (layer1): GATConv(91, 64, heads=2)
  (layer2): GATConv(128, 64, heads=2)
  (layer3): GATConv(128, 64, heads=2)
  (layer4): GATConv(128, 64, heads=1)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)


# 模型 加载

In [3]:
import gc

# Discard the current model before a saved one is loaded
del model  # remove the model object
gc.collect()  # force a garbage-collection pass to free memory

85

In [4]:
# Load the model
# The file stores the whole pickled model object, so torch.load returns it
# directly; building a fresh GATModel first would only be discarded.
# NOTE(review): unpickling can execute arbitrary code — load only trusted
# files. Recent PyTorch releases may need weights_only=False for full-model
# pickles — confirm against the installed version.
model_save_path = r'D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data\gat_model.pth'
model = torch.load(model_save_path)  # load the entire model object
model.eval()  # switch to evaluation mode
print(f"Model loaded from {model_save_path}")

# Set the loaded model's dosage_weight to 0.
# NOTE(review): the dosage scaling in GATModel.forward is commented out, so
# this attribute currently does not influence the outputs.
model.dosage_weight = 0
print(f"Updated dosage_weight to: {model.dosage_weight}")

Model loaded from D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data\gat_model.pth
Updated dosage_weight to: 0


# 模型评价

In [5]:
import torch
import numpy as np
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, accuracy_score, confusion_matrix
import pandas as pd
import os
from matplotlib import rcParams

# Use Arial as the global matplotlib font
rcParams['font.family'] = 'Arial'

def evaluate_model(graphs, labels, model, output_dir, data_name, cpm_id=None):
    """Run the model over every graph, then compute and save metrics.

    Args:
        graphs: Iterable of graph objects accepted by ``model``.
        labels: Indexable collection of label tensors aligned with ``graphs``.
        model: Module returning ``(output, pooled, attention)`` per graph.
        output_dir: Directory handed to the metric/attention writers.
        data_name: Name handed to ``compute_and_save_metrics``.
        cpm_id: When given, attention weights are additionally exported via
            ``output_attention_weights`` for this id.
    """
    model.eval()
    logits_per_graph = []
    targets_per_graph = []
    attention_per_graph = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for idx, graph in enumerate(graphs):
            logits, _pooled, attention = model(graph)
            logits_per_graph.append(logits.cpu().numpy())
            targets_per_graph.append(labels[idx].cpu().numpy())
            attention_per_graph.append(attention)

    # Stack the per-graph rows into (num_graphs, ...) arrays.
    stacked_logits = np.vstack(logits_per_graph)
    stacked_targets = np.vstack(targets_per_graph)

    compute_and_save_metrics(stacked_targets, stacked_logits, output_dir, data_name)

    if cpm_id is None:
        return
    output_attention_weights(attention_per_graph, graphs, cpm_id, output_dir)

def _save_attention_layer(layer_attention, node_names, csv_path):
    """Write one layer's attention weights, labelled by node name, to CSV."""
    # layer_attention holds two tensors: an index array with one column per
    # edge (transposed below into index pairs) and the matching weights.
    arrays = [part.cpu().numpy() for part in layer_attention]
    index_pairs, weights = arrays[0].T, arrays[1]

    # Replace every node index by its node name.
    name_pairs = np.array(
        [[node_names[int(idx)] for idx in pair] for pair in index_pairs]
    )

    # One row per index pair: the two node names followed by the weights.
    merged = np.column_stack((name_pairs, weights))
    np.savetxt(csv_path, merged, delimiter=',', fmt='%s')


def output_attention_weights(all_attn_weights, graphs, cpm_id, output_dir):
    """Export first- and second-layer attention weights for matching graphs.

    For every graph whose ``cpm_id`` attribute equals ``cpm_id``, one CSV
    file per layer (layers 1 and 2) is written to ``output_dir``.

    Args:
        all_attn_weights: Per-graph sequences of per-layer attention results,
            aligned with ``graphs``. Any entries beyond the first two layers
            are ignored, so the model's 4-layer output is accepted.
        graphs: Graph objects; matching ones must expose ``node_names``.
        cpm_id: Identifier selecting which graphs to export.
        output_dir: Directory receiving the CSV files.
    """
    for i, graph in enumerate(graphs):
        if not (hasattr(graph, 'cpm_id') and graph.cpm_id == cpm_id):
            continue

        node_names = graph.node_names  # must exist on the matching graph

        # Slicing (instead of fixed-length unpacking) tolerates models that
        # return more than two layers of attention weights.
        for layer_no, layer_attention in enumerate(all_attn_weights[i][:2], start=1):
            file_name = f'{cpm_id}_attn_weights_{layer_no}-多层注意力.csv'
            _save_attention_layer(
                layer_attention, node_names, os.path.join(output_dir, file_name)
            )
            # Report the name of the file that was actually written.
            print(f"注意力权重 attn_weights_{layer_no} 已保存为 {file_name}")



# Compute and save the evaluation metrics
def compute_and_save_metrics(labels, outputs, output_dir, data_name):
    """Compute per-class metrics from logits and save them as CSV files.

    Each column is treated as an independent binary problem. Logits are
    converted to probabilities with a sigmoid and thresholded at 0.5.
    Two files are written to ``output_dir``: the per-sample reference and
    predicted values in long format, and the metric table with an
    ``Average`` row appended.

    Args:
        labels: Array of shape (num_samples, num_classes) with 0/1 targets.
        outputs: Array of logits with the same shape as ``labels``.
        output_dir: Directory receiving the CSV files.
        data_name: Prefix used in both file names.
    """
    num_classes = labels.shape[1]
    metrics = {
        'Class': [],
        'Precision': [],
        'Recall': [],
        'F1 Score': [],
        'AUC': [],
        'Accuracy': [],
        'Specificity': []
    }
    roc_data_long_format = {'Class': [], 'Reference': [], 'Predicted': []}

    # The sigmoid does not depend on the class, so apply it once up front.
    probabilities = torch.sigmoid(torch.tensor(outputs)).numpy()

    for i in range(num_classes):
        class_name = f'Class_{i+1}'
        y_true = labels[:, i]
        y_prob = probabilities[:, i]

        # ROC curve
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        roc_auc = auc(fpr, tpr)

        # Store reference and predicted values in long format
        roc_data_long_format['Class'].extend([class_name] * len(y_true))
        roc_data_long_format['Reference'].extend(y_true)
        roc_data_long_format['Predicted'].extend(y_prob)

        # Precision, Recall, F1, Accuracy, Specificity at a 0.5 threshold
        pred_binary = (y_prob > 0.5).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, pred_binary, average='binary')
        accuracy = accuracy_score(y_true, pred_binary)

        # Pin both classes so the matrix is always 2x2; without labels=[0, 1]
        # a column containing a single class yields a 1x1 matrix and the
        # four-way unpack below fails.
        tn, fp, fn, tp = confusion_matrix(y_true, pred_binary, labels=[0, 1]).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # avoid division by zero

        # Save metrics
        metrics['Class'].append(class_name)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['F1 Score'].append(f1)
        metrics['AUC'].append(roc_auc)
        metrics['Accuracy'].append(accuracy)
        metrics['Specificity'].append(specificity)

    # Averages across classes
    avg_metrics = {
        'Class': ['Average'],
        'Precision': [np.mean(metrics['Precision'])],
        'Recall': [np.mean(metrics['Recall'])],
        'F1 Score': [np.mean(metrics['F1 Score'])],
        'AUC': [np.mean(metrics['AUC'])],
        'Accuracy': [np.mean(metrics['Accuracy'])],
        'Specificity': [np.mean(metrics['Specificity'])]
    }

    # Append the averages as the final row
    for key in metrics:
        metrics[key].append(avg_metrics[key][0])

    # Save the long-format ROC data to CSV
    roc_df_long = pd.DataFrame(roc_data_long_format)
    roc_df_long.to_csv(os.path.join(output_dir, f'{data_name}_roc_data_多层注意力.csv'), index=False)

    # Save the metric table to CSV
    metrics_df = pd.DataFrame(metrics)
    print(metrics_df)
    metrics_df.to_csv(os.path.join(output_dir, f'{data_name}_metrics_多层注意力.csv'), index=False)
    print(f"Metrics and ROC data saved to {output_dir}.")

# Example invocation
work_dir = r'D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data'

# Evaluate the training set
evaluate_model(
    graphs=train_graphs,
    labels=train_labels,
    model=model,
    output_dir=work_dir,
    data_name="train-CHP",
)

# Evaluate the validation set (disabled)
#evaluate_model(val_graphs, val_labels, model, work_dir, "validation-饮片")

# Evaluate the test set, exporting attention weights for one CPM (disabled)
#evaluate_model(test_graphs, test_labels, model, work_dir, "test-0", cpm_id='CPM05651')

# Evaluate the test set (disabled)
#evaluate_model(test_graphs, test_labels, model, work_dir, "test-Medicinal flavor2")


     Class  Precision    Recall  F1 Score       AUC  Accuracy  Specificity
0  Class_1   0.573171  0.719388  0.638009  0.720356  0.665272     0.627660
1  Class_2   0.759717  0.637982  0.693548  0.618242  0.602510     0.517730
2  Class_3   0.618026  0.571429  0.593814  0.650706  0.587866     0.606195
3  Class_4   0.601286  0.842342  0.701689  0.767253  0.667364     0.515625
4  Average   0.638050  0.692785  0.656765  0.689139  0.630753     0.566802
Metrics and ROC data saved to D:\博士文件\博士毕业课题材料\维吾尔医药配伍机制量化分析\data.
