# 计算节点之间的注意力

In [5]:
import os
import pandas as pd

# 设置工作目录
work_dir = r'D:/博士文件/博士毕业课题材料/维吾尔医药配伍机制量化分析/data/'
os.chdir(work_dir)

# 读取数据
SD10data = pd.read_csv("attention_weights.tsv", sep="\t")

# 计算每层的平均注意力并保留四位有效数字
SD10data['attn_layer_1_avg'] = SD10data.filter(regex='^attn_weights_1_head').mean(axis=1).round(4)
SD10data['attn_layer_2_avg'] = SD10data.filter(regex='^attn_weights_2_head').mean(axis=1).round(4)
SD10data['attn_layer_3_avg'] = SD10data.filter(regex='^attn_weights_3_head').mean(axis=1).round(4)
SD10data['attn_layer_4_avg'] = SD10data.filter(regex='^attn_weights_4_head').mean(axis=1).round(4)

# 选择需要的列，包括 Source、Target 和三层的平均注意力
attention_data = SD10data[['cpm_id','Source', 'Target', 'attn_layer_1_avg', 'attn_layer_2_avg', 'attn_layer_3_avg', 'attn_layer_4_avg']]

# 将结果输出为新的 TSV 文件
output_file = os.path.join(work_dir, "attention_averages.tsv")
attention_data.to_csv(output_file, sep="\t", index=False)

print(f"Average attention weights saved to {output_file}")


Average attention weights saved to D:/博士文件/博士毕业课题材料/维吾尔医药配伍机制量化分析/data/attention_averages.tsv


# 计算节点相互注意力

In [6]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm

# 设置工作目录路径
work_dir = r'D:/博士文件/博士毕业课题材料/维吾尔医药配伍机制量化分析/data/'
file_path = os.path.join(work_dir, "attention_averages.tsv")

# 读取平均注意力数据
attention_data = pd.read_csv(file_path, sep='\t')

# 获取所有的 cpm_id
cpm_ids = attention_data['cpm_id'].unique()

# 准备输出文件路径
output_path = os.path.join(work_dir, "calculated_attention_weights.tsv")

# 如果输出文件已存在，删除它（防止旧数据干扰）
if os.path.exists(output_path):
    os.remove(output_path)

# 初始化输出文件，写入表头
with open(output_path, 'w', encoding='utf-8') as f_out:
    f_out.write('cpm_id\tSource\tTarget\tattention\n')

# 设置批量处理的数量
batch_size = 100
batch_results = []

# 使用 tqdm 显示进度条
for idx, cpm_id in enumerate(tqdm(cpm_ids, desc="Processing cpm_id")):
    cpm_data = attention_data[attention_data['cpm_id'] == cpm_id]

    # 获取所有的节点并排序
    nodes = sorted(pd.unique(cpm_data[['Source', 'Target']].values.ravel()))

    # 使用 defaultdict 累加权重
    attention_dict = defaultdict(float)

    # 逐层计算注意力贡献
    for target in nodes:
        # 第四层：直接连接到目标节点的所有节点
        fourth_layer = cpm_data[cpm_data['Target'] == target]

        for _, fourth_row in fourth_layer.iterrows():
            source_4 = fourth_row['Source']
            weight_4 = fourth_row['attn_layer_4_avg']

            # 第三层：连接到第四层节点的节点
            third_layer = cpm_data[cpm_data['Target'] == source_4]

            for _, third_row in third_layer.iterrows():
                source_3 = third_row['Source']
                weight_3 = third_row['attn_layer_3_avg']

                # 第二层：连接到第三层节点的节点
                second_layer = cpm_data[cpm_data['Target'] == source_3]

                for _, second_row in second_layer.iterrows():
                    source_2 = second_row['Source']
                    weight_2 = second_row['attn_layer_2_avg']

                    # 第一层：连接到第二层节点的节点
                    first_layer = cpm_data[cpm_data['Target'] == source_2]

                    for _, first_row in first_layer.iterrows():
                        source_1 = first_row['Source']
                        weight_1 = first_row['attn_layer_1_avg']

                        # 计算总权重（四层的乘积）
                        total_weight = weight_1 * weight_2 * weight_3 * weight_4

                        # 累加到字典
                        attention_dict[(source_1, target)] += total_weight

    # 将字典转换为 DataFrame
    results = [[cpm_id, src, tgt, weight] for (src, tgt), weight in attention_dict.items()]
    attention_df = pd.DataFrame(results, columns=["cpm_id", "Source", "Target", "attention"])

    # 对每个目标节点的注意力进行标准化（可选）
    #attention_df['attention'] = attention_df.groupby('Target')['attention'].transform(lambda x: x / x.sum())

    # 保留四位有效数字
    attention_df['attention'] = attention_df['attention'].apply(lambda x: round(x, 6))

    # 将结果添加到批量结果列表
    batch_results.append(attention_df)

    # 每处理 batch_size 个 cpm_id，将数据写入文件并清空批量结果
    if (idx + 1) % batch_size == 0 or (idx + 1) == len(cpm_ids):
        # 将批量结果列表合并为一个 DataFrame
        combined_df = pd.concat(batch_results, ignore_index=True)
        # 追加保存到文件
        combined_df.to_csv(output_path, sep='\t', index=False, header=False, mode='a', encoding='utf-8')
        # 清空批量结果列表
        batch_results = []

# 输出完成信息
print(f"所有 cpm_id 的注意力权重已保存至 {output_path}")


Processing cpm_id: 100%|█████████████████████████████████████████████████████████████| 480/480 [05:26<00:00,  1.47it/s]

所有 cpm_id 的注意力权重已保存至 D:/博士文件/博士毕业课题材料/维吾尔医药配伍机制量化分析/data/calculated_attention_weights.tsv



