In [1]:
import os
from pathlib import Path


def _find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above `start` that contains `src/gnn_clustering`.

    Raises:
        FileNotFoundError: if neither `start` nor any of its parents contains
            a `src/gnn_clustering` directory.
    """
    for candidate in (start, *start.parents):
        if (candidate / 'src' / 'gnn_clustering').is_dir():
            return candidate
    raise FileNotFoundError(
        f"Could not find a directory containing 'src/gnn_clustering' at or above {start}; "
        "set the MH_GRAG_ROOT environment variable to the project root."
    )


# Switch the working directory to the project root instead of a machine-specific
# absolute path. The later cells import `src.gnn_clustering.*`; presumably those
# imports are resolved relative to the working directory.
# MH_GRAG_ROOT, when set, overrides the automatic upward search.
print(os.getcwd())
project_root = os.environ.get('MH_GRAG_ROOT') or _find_project_root(Path.cwd())
os.chdir(project_root)
print(os.getcwd())

/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1/src/gnn_clustering
/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1


In [5]:
from src.gnn_clustering.data_loader import load_data
from src.gnn_clustering.model import get_model
from src.gnn_clustering.loss_functions import modularity_loss
from src.gnn_clustering.train import train_model
from src.gnn_clustering.evaluate import (
    get_embeddings,
    kmeans_clustering,
    leiden_clustering,
    random_clustering,
    compute_modularity,
    format_communities
)
from src.gnn_clustering.utils import get_device, get_dense_adj
import torch
import numpy as np

# Single-head experiment: train one model with the modularity loss and compare the
# modularity of KMeans clusters on its embeddings against Leiden and a random partition.

# Number of clusters used for every KMeans / random partition in this cell.
# NOTE(review): presumably the number of ground-truth classes in the dataset
# returned by load_data() -- confirm. The Leiden cluster count is only reported,
# it is not used as the KMeans cluster count.
N_CLUSTERS = 7

# Seed the global NumPy and PyTorch generators so repeated runs draw the same values
# from them.
# NOTE(review): whether get_model / kmeans_clustering / random_clustering actually
# draw from these global generators is not visible in this notebook -- confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Select the device
device = get_device()

# Load the data and move it to the device
data = load_data()
data = data.to(device)

# Leiden reference partition: its modularity and the number of communities it finds
communities_leiden, modularity_leiden = leiden_clustering(data)
num_clusters_leiden = len(communities_leiden)

# Build the model
model = get_model(data, device=device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Dense adjacency matrix
adj = get_dense_adj(data.edge_index, device=device)

# Baseline: KMeans on the embeddings of the still-untrained model
initial_embeddings = get_embeddings(model, data, device=device)
clusters_kmeans_initial = kmeans_clustering(initial_embeddings, n_clusters=N_CLUSTERS)
communities_kmeans_initial = format_communities(clusters_kmeans_initial, n_clusters=N_CLUSTERS)
modularity_kmeans_initial = compute_modularity(data, communities_kmeans_initial)

# Train the model
model = train_model(model, data, adj, modularity_loss, optimizer)

# Embeddings after training
embeddings = get_embeddings(model, data, device=device)

# KMeans on the trained embeddings
clusters_kmeans = kmeans_clustering(embeddings, n_clusters=N_CLUSTERS)
communities_kmeans = format_communities(clusters_kmeans, n_clusters=N_CLUSTERS)
modularity_kmeans = compute_modularity(data, communities_kmeans)

# Random-partition baseline
communities_random = random_clustering(data.num_nodes, n_clusters=N_CLUSTERS)
modularity_random = compute_modularity(data, communities_random)

# Report
print(f'Leiden 算法的模块度: {modularity_leiden:.4f}')
print(f'Leiden 算法得到的簇数: {num_clusters_leiden}')
print(f'未训练模型（初始嵌入）的 KMeans 聚类模块度: {modularity_kmeans_initial:.4f}')
print(f'KMeans 聚类的模块度: {modularity_kmeans:.4f}')
print(f'随机聚类的模块度: {modularity_random:.4f}')



Epoch 50, Loss: 1.7670
Epoch 100, Loss: 1.7559
Epoch 150, Loss: 1.7511
Epoch 200, Loss: 1.7490
Epoch 250, Loss: 1.7478
Epoch 300, Loss: 1.7470
Epoch 350, Loss: 1.7465
Epoch 400, Loss: 1.7460
Epoch 450, Loss: 1.7457
Epoch 500, Loss: 1.7454
Epoch 550, Loss: 1.7451
Epoch 600, Loss: 1.7449
Epoch 650, Loss: 1.7447
Epoch 700, Loss: 1.7446
Epoch 750, Loss: 1.7445
Epoch 800, Loss: 1.7444
Epoch 850, Loss: 1.7443
Epoch 900, Loss: 1.7442
Epoch 950, Loss: 1.7442
Epoch 1000, Loss: 1.7441
Leiden 算法的模块度: 0.8213
Leiden 算法得到的簇数: 109
未训练模型（初始嵌入）的 KMeans 聚类模块度: 0.2946
KMeans 聚类的模块度: 0.7055
随机聚类的模块度: 0.0061


In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
from torch_geometric.nn import GCNConv
import networkx as nx
import numpy as np
from sklearn.cluster import KMeans
from src.gnn_clustering.model import get_multi_head_model
from src.gnn_clustering.data_loader import load_data
from src.gnn_clustering.utils import get_device, get_dense_adj
from src.gnn_clustering.train import train_model_multi_head
from src.gnn_clustering.evaluate import (
    get_embeddings_list,
    kmeans_clustering,
    leiden_clustering,
    compute_modularity,
    format_communities
)
# Multi-head experiment: train a multi-head model, report the KMeans modularity of
# each head's embeddings, then measure how much the heads' clusterings agree.
from sklearn.metrics import adjusted_mutual_info_score

# Number of clusters used for every KMeans run in this cell.
# NOTE(review): presumably the number of ground-truth classes in the dataset
# returned by load_data() -- confirm.
N_CLUSTERS = 7

# Seed the global NumPy and PyTorch generators so repeated runs draw the same values
# from them.
# NOTE(review): whether the project helpers draw from these global generators is
# not visible in this notebook -- confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Select the device
device = get_device()

# Load the data and move it to the device
data = load_data()
data = data.to(device)

# Leiden reference partition: its modularity and the number of communities it finds
communities_leiden, modularity_leiden = leiden_clustering(data)
num_clusters_leiden = len(communities_leiden)

num_heads = 3  # number of heads
model = get_multi_head_model(data=data, device=device, num_heads=num_heads)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Dense adjacency matrix
adj = get_dense_adj(data.edge_index, device=device)

# Train the model
model = train_model_multi_head(model, data, adj, optimizer, num_heads)

embeddings_list = get_embeddings_list(model, data, device)

# Not used further down in this cell; kept because other cells may read it.
G_nx = to_networkx(data, to_undirected=True)

for idx, embeddings in enumerate(embeddings_list):
    # KMeans modularity of this head's embeddings
    clusters_kmeans = kmeans_clustering(embeddings, n_clusters=N_CLUSTERS)
    communities_kmeans = format_communities(clusters_kmeans, n_clusters=N_CLUSTERS)
    modularity_kmeans = compute_modularity(data, communities_kmeans)
    print(f'头 {idx} 的模块度: {modularity_kmeans:.4f}')

# Adjusted mutual information between the heads' clusterings.
# Each head is clustered once up front; with a fixed random_state the fit is
# deterministic, so this matches re-fitting inside the pair loop.
# NOTE(review): these labels come from sklearn's KMeans directly, not from
# kmeans_clustering() used for the modularity above, so the two steps may be
# comparing different partitions of the same head -- confirm this is intended.
head_labels = [
    KMeans(n_clusters=N_CLUSTERS, random_state=SEED).fit_predict(head_embeddings.cpu().numpy())
    for head_embeddings in embeddings_list[:num_heads]
]
for i in range(num_heads):
    for j in range(i + 1, num_heads):
        clusters_i = head_labels[i]
        clusters_j = head_labels[j]
        mi = adjusted_mutual_info_score(clusters_i, clusters_j)
        print(f'头 {i} 和头 {j} 之间的调整互信息: {mi:.4f}')


Epoch 1, Loss: 7.3737
Epoch 20, Loss: 5.4109
Epoch 40, Loss: 5.3432
Epoch 60, Loss: 5.3220
Epoch 80, Loss: 5.3101
Epoch 100, Loss: 5.3021
Epoch 120, Loss: 5.2962
Epoch 140, Loss: 5.2916
Epoch 160, Loss: 5.2878
Epoch 180, Loss: 5.2848
Epoch 200, Loss: 5.2823
头 0 的模块度: 0.7003
头 1 的模块度: 0.6706
头 2 的模块度: 0.7265
头 0 和头 1 之间的调整互信息: 0.2842
头 0 和头 2 之间的调整互信息: 0.3700
头 1 和头 2 之间的调整互信息: 0.3118


In [2]:
# NOTE: this entire cell is commented out. It is the single-head experiment run on
# load_random_data(1024, 2048) instead of load_data().
# NOTE(review): the output stored under this cell presumably comes from a run made
# before the code was commented out; it cannot be regenerated from the cell as it
# stands. Either re-enable the code or clear the stale output.

# from src.gnn_clustering.data_loader import load_random_data
# from src.gnn_clustering.model import get_model
# from src.gnn_clustering.loss_functions import modularity_loss
# from src.gnn_clustering.train import train_model
# from src.gnn_clustering.evaluate import (
#     get_embeddings,
#     kmeans_clustering,
#     leiden_clustering,
#     random_clustering,
#     compute_modularity,
#     format_communities
# )
# from src.gnn_clustering.utils import get_device, get_dense_adj
# import torch
# import numpy as np

# # Select the device
# device = get_device()

# # Load the data
# data = load_random_data(1024,2048)
# data = data.to(device)

# # Run Leiden clustering to get the number of clusters
# communities_leiden, modularity_leiden = leiden_clustering(data)
# num_clusters_leiden = len(communities_leiden)

# # Build the model
# model = get_model(data, device=device)

# # Define the optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# # Dense adjacency matrix
# adj = get_dense_adj(data.edge_index, device=device)

# # Initial evaluation (untrained model)
# initial_embeddings = get_embeddings(model, data, device=device)
# clusters_kmeans_initial = kmeans_clustering(initial_embeddings, n_clusters=7)
# communities_kmeans_initial = format_communities(clusters_kmeans_initial, n_clusters=7)
# modularity_kmeans_initial = compute_modularity(data, communities_kmeans_initial)

# # Train the model
# model = train_model(model, data, adj, modularity_loss, optimizer)

# # Embeddings after training
# embeddings = get_embeddings(model, data, device=device)

# # KMeans clustering evaluation
# clusters_kmeans = kmeans_clustering(embeddings, n_clusters=7)
# communities_kmeans = format_communities(clusters_kmeans, n_clusters=7)
# modularity_kmeans = compute_modularity(data, communities_kmeans)

# # # Leiden evaluation
# # communities_leiden, modularity_leiden = leiden_clustering(data)
# # print(f'Leiden 算法的模块度: {modularity_leiden:.4f}')

# # Random clustering evaluation
# communities_random = random_clustering(data.num_nodes, n_clusters=7)
# modularity_random = compute_modularity(data, communities_random)
# print(f'Leiden 算法的模块度: {modularity_leiden:.4f}')
# print(f'Leiden 算法得到的簇数: {num_clusters_leiden}')
# print(f'未训练模型（初始嵌入）的 KMeans 聚类模块度: {modularity_kmeans_initial:.4f}')
# print(f'KMeans 聚类的模块度: {modularity_kmeans:.4f}')
# print(f'随机聚类的模块度: {modularity_random:.4f}')



  edge_index = torch.tensor([source_indices, target_indices], dtype=torch.long)


Epoch 50, Loss: 0.2406
Epoch 100, Loss: 0.2252
Epoch 150, Loss: 0.2189
Epoch 200, Loss: 0.2156
Epoch 250, Loss: 0.2138
Epoch 300, Loss: 0.2124
Epoch 350, Loss: 0.2114
Epoch 400, Loss: 0.2109
Epoch 450, Loss: 0.2107
Epoch 500, Loss: 0.2106
Epoch 550, Loss: 0.2105
Epoch 600, Loss: 0.2105
Epoch 650, Loss: 0.2104
Epoch 700, Loss: 0.2104
Epoch 750, Loss: 0.2103
Epoch 800, Loss: 0.2103
Epoch 850, Loss: 0.2101
Epoch 900, Loss: 0.2100
Epoch 950, Loss: 0.2097
Epoch 1000, Loss: 0.2096
Leiden 算法的模块度: 0.5381
Leiden 算法得到的簇数: 42
未训练模型（初始嵌入）的 KMeans 聚类模块度: 0.2129
KMeans 聚类的模块度: 0.4834
随机聚类的模块度: 0.0046


In [11]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
from torch_geometric.nn import GCNConv
import networkx as nx
import numpy as np
from sklearn.cluster import KMeans
from src.gnn_clustering.model import get_multi_head_model, get_model
from src.gnn_clustering.data_loader import load_random_data
from src.gnn_clustering.utils import get_device, get_dense_adj
from src.gnn_clustering.train import train_model_multi_head
from src.gnn_clustering.evaluate import (
    get_embeddings_list,
    kmeans_clustering,
    leiden_clustering,
    compute_modularity,
    format_communities,
    random_clustering
)
# Multi-head experiment on load_random_data(1024, 1024): report the Leiden,
# untrained-model and random baselines, then the KMeans modularity of each trained
# head and the agreement between the heads' clusterings.

# `get_embeddings` is called below but is missing from this cell's import list;
# import it here so the cell does not depend on an earlier cell having run.
from src.gnn_clustering.evaluate import get_embeddings
from sklearn.metrics import adjusted_mutual_info_score

# Number of clusters used for every KMeans / random partition in this cell.
# NOTE(review): the reason for choosing 7 on this data is not visible here -- confirm.
N_CLUSTERS = 7

# Seed the global NumPy and PyTorch generators so repeated runs draw the same values
# from them.
# NOTE(review): whether load_random_data and the other project helpers draw from
# these global generators is not visible in this notebook -- confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Select the device
device = get_device()

# Load the data and move it to the device
data = load_random_data(1024, 1024)
data = data.to(device)

# Leiden reference partition: its modularity and the number of communities it finds
communities_leiden, modularity_leiden = leiden_clustering(data)
num_clusters_leiden = len(communities_leiden)

num_heads = 3  # number of heads
model = get_multi_head_model(data=data, device=device, num_heads=num_heads)
# Never trained in this cell: it only provides the untrained-embedding baseline.
single_head_model = get_model(data, device=device)
# The optimizer covers the multi-head model's parameters only.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Dense adjacency matrix
adj = get_dense_adj(data.edge_index, device=device)

# Baseline: KMeans on the embeddings of the untrained single-head model
initial_embeddings = get_embeddings(single_head_model, data, device=device)
clusters_kmeans_initial = kmeans_clustering(initial_embeddings, n_clusters=N_CLUSTERS)
communities_kmeans_initial = format_communities(clusters_kmeans_initial, n_clusters=N_CLUSTERS)
modularity_kmeans_initial = compute_modularity(data, communities_kmeans_initial)

# Random-partition baseline
communities_random = random_clustering(data.num_nodes, n_clusters=N_CLUSTERS)
modularity_random = compute_modularity(data, communities_random)

# Train the multi-head model
model = train_model_multi_head(model, data, adj, optimizer, num_heads)

embeddings_list = get_embeddings_list(model, data, device)

# Not used further down in this cell; kept because other cells may read it.
G_nx = to_networkx(data, to_undirected=True)

print(f'Leiden 算法的模块度: {modularity_leiden:.4f}')
print(f'Leiden 算法得到的簇数: {num_clusters_leiden}')
print(f'未训练模型（初始嵌入）的 KMeans 聚类模块度: {modularity_kmeans_initial:.4f}')
print(f'随机聚类的模块度: {modularity_random:.4f}')

for idx, embeddings in enumerate(embeddings_list):
    # KMeans modularity of this head's embeddings
    clusters_kmeans = kmeans_clustering(embeddings, n_clusters=N_CLUSTERS)
    communities_kmeans = format_communities(clusters_kmeans, n_clusters=N_CLUSTERS)
    modularity_kmeans = compute_modularity(data, communities_kmeans)
    print(f'头 {idx} 的模块度: {modularity_kmeans:.4f}')

# Adjusted mutual information between the heads' clusterings.
# Each head is clustered once up front; with a fixed random_state the fit is
# deterministic, so this matches re-fitting inside the pair loop.
# NOTE(review): these labels come from sklearn's KMeans directly, not from
# kmeans_clustering() used for the modularity above, so the two steps may be
# comparing different partitions of the same head -- confirm this is intended.
head_labels = [
    KMeans(n_clusters=N_CLUSTERS, random_state=SEED).fit_predict(head_embeddings.cpu().numpy())
    for head_embeddings in embeddings_list[:num_heads]
]
for i in range(num_heads):
    for j in range(i + 1, num_heads):
        clusters_i = head_labels[i]
        clusters_j = head_labels[j]
        mi = adjusted_mutual_info_score(clusters_i, clusters_j)
        print(f'头 {i} 和头 {j} 之间的调整互信息: {mi:.4f}')


Epoch 1, Loss: 2.1456
Epoch 20, Loss: 0.3552
Epoch 40, Loss: 0.2841
Epoch 60, Loss: 0.2553
Epoch 80, Loss: 0.2390
Epoch 100, Loss: 0.2285
Epoch 120, Loss: 0.2214
Epoch 140, Loss: 0.2162
Epoch 160, Loss: 0.2121
Epoch 180, Loss: 0.2086
Epoch 200, Loss: 0.2057
Leiden 算法的模块度: 0.7997
Leiden 算法得到的簇数: 199
未训练模型（初始嵌入）的 KMeans 聚类模块度: 0.3282
头 0 的模块度: 0.6775
头 1 的模块度: 0.7060
头 2 的模块度: 0.6686
头 0 和头 1 之间的调整互信息: 0.2152
头 0 和头 2 之间的调整互信息: 0.0936
头 1 和头 2 之间的调整互信息: 0.1237
