In [1]:
import os
# Move the working directory to the repository root.
# The later cells import `src.gnn_clustering.*`; presumably those imports rely
# on the current directory being the folder that contains `src/`.
#
# The root is located instead of hard-coded, so the notebook is not tied to
# one user's home directory:
#   1. if the MH_GRAG_ROOT environment variable is set, it is used as-is;
#   2. otherwise the search starts at the current directory and walks upward
#      until a directory containing `src/gnn_clustering` is found.
# Re-running the cell is safe: once at the root, the search stops immediately.
print(os.getcwd())

project_root = os.environ.get('MH_GRAG_ROOT')
if not project_root:
    project_root = os.getcwd()
    while not os.path.isdir(os.path.join(project_root, 'src', 'gnn_clustering')):
        parent_dir = os.path.dirname(project_root)
        if parent_dir == project_root:
            # Reached the filesystem root without finding the package.
            raise FileNotFoundError(
                "Could not find a directory containing 'src/gnn_clustering' "
                "at or above " + os.getcwd() + "; start the notebook inside "
                "the repository or set MH_GRAG_ROOT."
            )
        project_root = parent_dir

os.chdir(project_root)
print(os.getcwd())

/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1/src/gnn_clustering
/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1


In [5]:
from src.gnn_clustering.data_loader import load_data
from src.gnn_clustering.model import get_model
from src.gnn_clustering.loss_functions import modularity_loss
from src.gnn_clustering.train import train_model
from src.gnn_clustering.evaluate import (
    get_embeddings,
    kmeans_clustering,
    leiden_clustering,
    random_clustering,
    compute_modularity,
    format_communities
)
from src.gnn_clustering.utils import get_device, get_dense_adj
import torch
import numpy as np

# Single-head GNN clustering experiment.
#
# Steps: load the graph, run Leiden as a reference, score KMeans on the
# untrained model's embeddings, train the model with the modularity loss,
# score KMeans on the trained embeddings, score a random partition, then
# print all modularity values together.

# Number of clusters requested from KMeans and from the random baseline.
# The same value was previously repeated as a literal at every call site.
N_CLUSTERS = 7

# Seed the global torch and NumPy RNGs so repeated runs are comparable.
# NOTE(review): whether the project helpers draw from other RNGs (e.g. the
# stdlib `random` module) is not visible from this notebook - confirm.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Select the compute device
device = get_device()

# Load the data and move it to the device
data = load_data()
data = data.to(device)

# Reference partition from the Leiden algorithm.
# NOTE(review): the Leiden cluster count is only reported below; it is not
# used as the KMeans cluster count (that is N_CLUSTERS).
communities_leiden, modularity_leiden = leiden_clustering(data)
num_clusters_leiden = len(communities_leiden)

# Build the model
model = get_model(data, device=device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Dense adjacency matrix built from the edge index
adj = get_dense_adj(data.edge_index, device=device)

# Baseline: KMeans on the embeddings of the untrained model
initial_embeddings = get_embeddings(model, data, device=device)
clusters_kmeans_initial = kmeans_clustering(initial_embeddings, n_clusters=N_CLUSTERS)
communities_kmeans_initial = format_communities(clusters_kmeans_initial, n_clusters=N_CLUSTERS)
modularity_kmeans_initial = compute_modularity(data, communities_kmeans_initial)

# Train the model
model = train_model(model, data, adj, modularity_loss, optimizer)

# Embeddings after training
embeddings = get_embeddings(model, data, device=device)

# KMeans on the trained embeddings
clusters_kmeans = kmeans_clustering(embeddings, n_clusters=N_CLUSTERS)
communities_kmeans = format_communities(clusters_kmeans, n_clusters=N_CLUSTERS)
modularity_kmeans = compute_modularity(data, communities_kmeans)

# Random partition as a lower-bound baseline
communities_random = random_clustering(data.num_nodes, n_clusters=N_CLUSTERS)
modularity_random = compute_modularity(data, communities_random)

# Report all results together
print(f'Leiden 算法的模块度: {modularity_leiden:.4f}')
print(f'Leiden 算法得到的簇数: {num_clusters_leiden}')
print(f'未训练模型（初始嵌入）的 KMeans 聚类模块度: {modularity_kmeans_initial:.4f}')
print(f'KMeans 聚类的模块度: {modularity_kmeans:.4f}')
print(f'随机聚类的模块度: {modularity_random:.4f}')



Epoch 50, Loss: 1.7670
Epoch 100, Loss: 1.7559
Epoch 150, Loss: 1.7511
Epoch 200, Loss: 1.7490
Epoch 250, Loss: 1.7478
Epoch 300, Loss: 1.7470
Epoch 350, Loss: 1.7465
Epoch 400, Loss: 1.7460
Epoch 450, Loss: 1.7457
Epoch 500, Loss: 1.7454
Epoch 550, Loss: 1.7451
Epoch 600, Loss: 1.7449
Epoch 650, Loss: 1.7447
Epoch 700, Loss: 1.7446
Epoch 750, Loss: 1.7445
Epoch 800, Loss: 1.7444
Epoch 850, Loss: 1.7443
Epoch 900, Loss: 1.7442
Epoch 950, Loss: 1.7442
Epoch 1000, Loss: 1.7441
Leiden 算法的模块度: 0.8213
Leiden 算法得到的簇数: 109
未训练模型（初始嵌入）的 KMeans 聚类模块度: 0.2946
KMeans 聚类的模块度: 0.7055
随机聚类的模块度: 0.0061


In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
from torch_geometric.nn import GCNConv
import networkx as nx
import numpy as np
from sklearn.cluster import KMeans
from src.gnn_clustering.model import get_multi_head_model
from src.gnn_clustering.data_loader import load_data
from src.gnn_clustering.utils import get_device, get_dense_adj
from src.gnn_clustering.train import train_model_multi_head
from src.gnn_clustering.evaluate import (
    get_embeddings_list,
    kmeans_clustering,
    leiden_clustering,
    compute_modularity,
    format_communities
)
# Multi-head GNN clustering experiment.
#
# Steps: load the graph, run Leiden as a reference, train a multi-head model,
# score KMeans on each head's embeddings by modularity, then measure how much
# the heads' clusterings agree using pairwise adjusted mutual information.

# Kept inside this cell (it was imported mid-cell originally) so the cell
# stays self-contained; hoisted to the top so dependencies are visible.
from sklearn.metrics import adjusted_mutual_info_score

# Number of clusters requested from KMeans for every head.
# The same value was previously repeated as a literal at every call site.
N_CLUSTERS = 7

# Seed the global torch and NumPy RNGs so repeated runs are comparable.
# NOTE(review): whether the project helpers draw from other RNGs is not
# visible from this notebook - confirm.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Select the compute device
device = get_device()

# Load the data and move it to the device
data = load_data()
data = data.to(device)

# Reference partition from the Leiden algorithm (not used further in this cell)
communities_leiden, modularity_leiden = leiden_clustering(data)
num_clusters_leiden = len(communities_leiden)

num_heads = 3  # number of heads
model = get_multi_head_model(data=data, device=device, num_heads=num_heads)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Dense adjacency matrix built from the edge index
adj = get_dense_adj(data.edge_index, device=device)

# Train the model
model = train_model_multi_head(model, data, adj, optimizer, num_heads)

embeddings_list = get_embeddings_list(model, data, device)

# Per-head KMeans clustering scored by modularity
for idx, embeddings in enumerate(embeddings_list):
    clusters_kmeans = kmeans_clustering(embeddings, n_clusters=N_CLUSTERS)
    communities_kmeans = format_communities(clusters_kmeans, n_clusters=N_CLUSTERS)
    modularity_kmeans = compute_modularity(data, communities_kmeans)
    print(f'头 {idx} 的模块度: {modularity_kmeans:.4f}')

# Pairwise adjusted mutual information between the heads' clusterings.
# Each head is clustered once up front; the fit is deterministic
# (random_state=0), so reusing the labels gives the same scores as
# re-fitting KMeans for every pair.
head_labels = [
    KMeans(n_clusters=N_CLUSTERS, random_state=0).fit_predict(
        embeddings_list[head].cpu().numpy()
    )
    for head in range(num_heads)
]
for i in range(num_heads):
    for j in range(i + 1, num_heads):
        mi = adjusted_mutual_info_score(head_labels[i], head_labels[j])
        print(f'头 {i} 和头 {j} 之间的调整互信息: {mi:.4f}')


Epoch 1, Loss: 7.0753
Epoch 20, Loss: 5.3945
Epoch 40, Loss: 5.3420
Epoch 60, Loss: 5.3230
Epoch 80, Loss: 5.3120
Epoch 100, Loss: 5.3041
Epoch 120, Loss: 5.2980
Epoch 140, Loss: 5.2931
Epoch 160, Loss: 5.2892
Epoch 180, Loss: 5.2861
Epoch 200, Loss: 5.2835
头 0 的模块度: 0.6610
头 1 的模块度: 0.6243
头 2 的模块度: 0.6386
头 0 和头 1 之间的调整互信息: 0.2997
头 0 和头 2 之间的调整互信息: 0.3313
头 1 和头 2 之间的调整互信息: 0.3114


In [33]:
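# NOTE: every line of this cell is commented out, so running it prints nothing.
# The output shown beneath it cannot come from the cell as it stands; presumably it is left over from a run made before the code was disabled.
# import torch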
# import torch.nn.functional as F
# from torch_geometric.datasets import Planetoid
# from torch_geometric.utils import to_dense_adj, to_networkx
# from torch_geometric.nn import GCNConv
# import networkx as nx
# import numpy as np
# from sklearn.cluster import KMeans

# # 数据准备：加载 Cora 数据集
# dataset = Planetoid(root='/tmp/Cora', name='Cora')
# data = dataset[0]

# # 定义单头 GNN 模型
# class GNN(torch.nn.Module):
#     def __init__(self, in_channels, out_channels):
#         super(GNN, self).__init__()
#         self.conv1 = GCNConv(in_channels, 16)
#         self.conv2 = GCNConv(16, out_channels)

#     def forward(self, x, edge_index):
#         x = F.relu(self.conv1(x, edge_index))
#         x = self.conv2(x, edge_index)
#         return x

# # 定义多头 GNN 模型
# class MultiHeadGNN(torch.nn.Module):
#     def __init__(self, in_channels, out_channels, num_heads):
#         super(MultiHeadGNN, self).__init__()
#         self.num_heads = num_heads
#         self.gnns = torch.nn.ModuleList([
#             GNN(in_channels, out_channels) for _ in range(num_heads)
#         ])

#     def forward(self, x, edge_index):
#         embeddings_list = []
#         for gnn in self.gnns:
#             embeddings = gnn(x, edge_index)
#             embeddings_list.append(embeddings)
#         return embeddings_list

# # 定义模块度损失函数，包含正交正则化
# def modularity_loss_multi_head(embeddings_list, adj, num_heads, reg_lambda=1e-3, orth_lambda=1.0):
#     losses = []
#     for i in range(num_heads):
#         embeddings = embeddings_list[i]
#         embeddings = F.normalize(embeddings, p=2, dim=1)
#         sim = torch.mm(embeddings, embeddings.t())
#         degrees = adj.sum(dim=1)
#         m = adj.sum()
#         expected = torch.outer(degrees, degrees) / m
#         B = adj - expected
#         modularity = (sim * B).sum() / m
#         reg = reg_lambda * (embeddings.norm(dim=1) ** 2).sum()
#         loss = - modularity + reg
#         losses.append(loss)
    
#     orth_loss = 0
#     for i in range(num_heads):
#         for j in range(i + 1, num_heads):
#             h_i = F.normalize(embeddings_list[i], p=2, dim=1)
#             h_j = F.normalize(embeddings_list[j], p=2, dim=1)
#             inner_product = torch.mm(h_i.t(), h_j)
#             orth_loss += torch.norm(inner_product, p='fro') ** 2

#     total_loss = sum(losses) + orth_lambda * orth_loss
#     return total_loss

# # 对抗训练函数
# def adversarial_training(model, data, adj, num_heads, epsilon=0.01, reg_lambda=1e-3, orth_lambda=1.0):
#     model.train()
#     optimizer.zero_grad()
#     x_adv = data.x.clone().detach().requires_grad_(True)

#     # 正常前向传播
#     embeddings_list = model(x_adv, data.edge_index)
#     loss = modularity_loss_multi_head(embeddings_list, adj, num_heads, reg_lambda, orth_lambda)
#     loss.backward()

#     # 生成对抗扰动
#     x_adv_grad = x_adv.grad.data
#     x_adv = x_adv + epsilon * x_adv_grad.sign()
#     x_adv = torch.clamp(x_adv, 0, 1)  # 根据特征值范围进行裁剪

#     # 使用对抗样本重新计算损失
#     optimizer.zero_grad()
#     embeddings_list_adv = model(x_adv.detach(), data.edge_index)
#     loss_adv = modularity_loss_multi_head(embeddings_list_adv, adj, num_heads, reg_lambda, orth_lambda)
#     loss_adv.backward()
#     optimizer.step()

#     return loss_adv.item()

# # 设置设备和模型
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# num_heads = 3  # 设置头的数量
# model = MultiHeadGNN(dataset.num_features, 16, num_heads).to(device)
# data = data.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# # 将稀疏邻接矩阵转换为密集矩阵
# adj = to_dense_adj(data.edge_index)[0].to(device)

# # 训练模型
# for epoch in range(200):
#     loss_value = adversarial_training(model, data, adj, num_heads, epsilon=0.01, orth_lambda=1e-8)
#     if (epoch+1) % 20 == 0 or epoch == 0:
#         print(f'Epoch {epoch+1}, Loss: {loss_value:.4f}')

# # 测试模型并获取嵌入
# model.eval()
# with torch.no_grad():
#     embeddings_list = model(data.x, data.edge_index)
#     G_nx = to_networkx(data, to_undirected=True)

#     for idx, embeddings in enumerate(embeddings_list):
#         embeddings_np = embeddings.cpu().numpy()
#         # 使用 KMeans 进行聚类
#         clusters = KMeans(n_clusters=7, random_state=0).fit_predict(embeddings_np)

#         # 计算模块度
#         communities = [[] for _ in range(7)]
#         for node_idx, label in enumerate(clusters):
#             communities[label].append(node_idx)
#         modularity = nx.algorithms.community.modularity(G_nx, communities)
#         print(f'头 {idx} 的模块度: {modularity:.4f}')

#     # 计算头之间的互信息
#     from sklearn.metrics import adjusted_mutual_info_score
#     for i in range(num_heads):
#         for j in range(i + 1, num_heads):
#             clusters_i = KMeans(n_clusters=7, random_state=0).fit_predict(embeddings_list[i].cpu().numpy())
#             clusters_j = KMeans(n_clusters=7, random_state=0).fit_predict(embeddings_list[j].cpu().numpy())
#             mi = adjusted_mutual_info_score(clusters_i, clusters_j)
#             print(f'头 {i} 和头 {j} 之间的调整互信息: {mi:.4f}')


Epoch 1, Loss: 7.8728
Epoch 20, Loss: 5.7533
Epoch 40, Loss: 5.4928
Epoch 60, Loss: 5.4233
Epoch 80, Loss: 5.3916
Epoch 100, Loss: 5.3719
Epoch 120, Loss: 5.3577
Epoch 140, Loss: 5.3488
Epoch 160, Loss: 5.3404
Epoch 180, Loss: 5.3355
Epoch 200, Loss: 5.3275
头 0 的模块度: 0.7174
头 1 的模块度: 0.6670
头 2 的模块度: 0.5523
头 0 和头 1 之间的调整互信息: 0.4247
头 0 和头 2 之间的调整互信息: 0.3555
头 1 和头 2 之间的调整互信息: 0.3272
