In [1]:
import os

# Move the working directory to the project root so that the `src.gnn_clustering`
# package imported by the following cells can be resolved.
# The root is discovered by walking up from the current directory until a folder
# containing `src/gnn_clustering` is found, instead of relying on an absolute,
# machine-specific path. Re-running this cell is a no-op once the root is reached.
print(os.getcwd())
project_root = os.getcwd()
while not os.path.isdir(os.path.join(project_root, 'src', 'gnn_clustering')):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # Reached the filesystem root without finding the package: fail loudly
        # here rather than with a confusing import error in a later cell.
        raise FileNotFoundError(
            "Could not locate a parent directory containing 'src/gnn_clustering'; "
            "start the notebook from inside the project tree."
        )
    project_root = parent_dir
os.chdir(project_root)
print(os.getcwd())

/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1/tests
/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1


# Multihead-GRAG 随机网络聚类测试

In [None]:
import torch
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score
from src.gnn_clustering.train import train_model_multi_head
from src.gnn_clustering.evaluate import (
    get_embeddings,
    get_embeddings_list, 
    kmeans_clustering, 
    leiden_clustering, 
    compute_modularity, 
    format_communities, 
    random_clustering
)
from src.gnn_clustering.data_loader import load_random_data
from src.gnn_clustering.model import get_multi_head_model, get_model
from src.gnn_clustering.utils import get_device, get_dense_adj

# Set up the device: whatever get_device() returns is shared as a module-level
# global and passed to the model/embedding helpers by test_model_performance below.
device = get_device()

def test_model_performance(num_tests, num_nodes, num_edges):
    results = []

    for test_idx in range(num_tests):
        # 加载随机数据
        data = load_random_data(num_nodes, num_edges)
        data = data.to(device)

        # 使用Leiden算法进行聚类，获取簇数和模块度
        communities_leiden, modularity_leiden = leiden_clustering(data)
        num_clusters_leiden = len(communities_leiden)

        # 初始模型和优化器
        num_heads = 3
        model = get_multi_head_model(data=data, device=device, num_heads=num_heads)
        single_head_model = get_model(data, device=device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # 获取密集邻接矩阵
        adj = get_dense_adj(data.edge_index, device=device)

        # 初始嵌入和KMeans聚类模块度
        initial_embeddings = get_embeddings(single_head_model, data, device=device)
        clusters_kmeans_initial = kmeans_clustering(initial_embeddings, n_clusters=7)
        communities_kmeans_initial = format_communities(clusters_kmeans_initial, n_clusters=7)
        modularity_kmeans_initial = compute_modularity(data, communities_kmeans_initial)

        # 随机聚类模块度
        communities_random = random_clustering(data.num_nodes, n_clusters=7)
        modularity_random = compute_modularity(data, communities_random)

        # 模型训练
        model = train_model_multi_head(model, data, adj, optimizer, num_heads)
        embeddings_list = get_embeddings_list(model, data, device)

        # 头的模块度和互信息
        head_modularities = []
        head_mutual_info = []

        for idx, embeddings in enumerate(embeddings_list):
            clusters_kmeans = kmeans_clustering(embeddings, n_clusters=7)
            communities_kmeans = format_communities(clusters_kmeans, n_clusters=7)
            modularity_kmeans = compute_modularity(data, communities_kmeans)
            head_modularities.append(modularity_kmeans)

        for i in range(num_heads):
            for j in range(i + 1, num_heads):
                clusters_i = KMeans(n_clusters=7, random_state=0).fit_predict(embeddings_list[i].cpu().numpy())
                clusters_j = KMeans(n_clusters=7, random_state=0).fit_predict(embeddings_list[j].cpu().numpy())
                mi = adjusted_mutual_info_score(clusters_i, clusters_j)
                head_mutual_info.append(mi)

        # 多头平均模块度和互信息
        avg_modularity = sum(head_modularities) / len(head_modularities)
        avg_mutual_info = sum(head_mutual_info) / len(head_mutual_info)

        # 多头平均模块度与Leiden模块度的百分比
        modularity_percent = (avg_modularity / modularity_leiden) * 100

        # 记录实验结果
        result = {
            'test_index': test_idx,
            'num_nodes': num_nodes,
            'num_edges': num_edges,
            'modularity_leiden': modularity_leiden,
            'num_clusters_leiden': num_clusters_leiden,
            'modularity_kmeans_initial': modularity_kmeans_initial,
            'modularity_random': modularity_random,
            'head_0_modularity': head_modularities[0],
            'head_1_modularity': head_modularities[1],
            'head_2_modularity': head_modularities[2],
            'head_0_vs_1_mutual_info': head_mutual_info[0],
            'head_0_vs_2_mutual_info': head_mutual_info[1],
            'head_1_vs_2_mutual_info': head_mutual_info[2],
            'avg_modularity': avg_modularity,
            'avg_mutual_info': avg_mutual_info,
            'modularity_percent': modularity_percent
        }
        results.append(result)

    # 将实验结果转换为DataFrame
    df_results = pd.DataFrame(results)
    return df_results

# Run the test: 5 repetitions with num_nodes=1024 and num_edges=1024.
# NOTE: `df_report` is reassigned by the later runs in this notebook.
df_report = test_model_performance(num_tests=5, num_nodes=1024, num_edges=1024)

In [14]:
# Display the most recently computed results table
df_report

Unnamed: 0,test_index,num_nodes,num_edges,modularity_leiden,num_clusters_leiden,modularity_kmeans_initial,modularity_random,head_0_modularity,head_1_modularity,head_2_modularity,head_0_vs_1_mutual_info,head_0_vs_2_mutual_info,head_1_vs_2_mutual_info,avg_modularity,avg_mutual_info,modularity_percent
0,0,1024,1024,0.803401,190,0.361889,0.000213,0.681856,0.714348,0.6912,0.127509,0.161088,0.172739,0.695801,0.153779,86.606998
1,1,1024,1024,0.804371,188,0.367263,0.008207,0.701479,0.689469,0.666657,0.107097,0.182355,0.136652,0.685868,0.142035,85.267642
2,2,1024,1024,0.812207,188,0.36715,-0.003109,0.685268,0.700477,0.689586,0.173727,0.201858,0.185094,0.691777,0.186893,85.172523
3,3,1024,1024,0.818721,190,0.337591,-0.024745,0.691602,0.699014,0.716218,0.163765,0.191526,0.157118,0.702278,0.170803,85.777474
4,4,1024,1024,0.811377,189,0.3424,-0.023731,0.691478,0.70729,0.694745,0.134703,0.079257,0.155786,0.697838,0.123249,86.006633


In [None]:
# Run the test: 5 repetitions with num_nodes=2048 and num_edges=2048
# (overwrites the previous `df_report`).
df_report = test_model_performance(num_tests=5, num_nodes=2048, num_edges=2048)

In [16]:
# Display the most recently computed results table
df_report

Unnamed: 0,test_index,num_nodes,num_edges,modularity_leiden,num_clusters_leiden,modularity_kmeans_initial,modularity_random,head_0_modularity,head_1_modularity,head_2_modularity,head_0_vs_1_mutual_info,head_0_vs_2_mutual_info,head_1_vs_2_mutual_info,avg_modularity,avg_mutual_info,modularity_percent
0,0,2048,2048,0.822559,365,0.326056,0.005659,0.692971,0.703276,0.695821,0.053369,0.046698,0.052551,0.697356,0.050873,84.778807
1,1,2048,2048,0.818237,370,0.32201,0.00342,0.670239,0.700328,0.699431,0.025275,0.030483,0.069321,0.689999,0.041693,84.327526
2,2,2048,2048,0.811265,397,0.325104,0.002966,0.696697,0.681323,0.684605,0.074697,0.048963,0.05483,0.687542,0.059497,84.749341
3,3,2048,2048,0.822826,365,0.346365,0.004336,0.694722,0.693643,0.695781,0.074632,0.032607,0.06513,0.694715,0.057456,84.430397
4,4,2048,2048,0.825514,359,0.335969,0.006577,0.689677,0.692953,0.707248,0.028664,0.034075,0.064198,0.696626,0.042313,84.386892


In [10]:
# Run the test: a single repetition with num_nodes=8192 and num_edges=8192
# (overwrites the previous `df_report`).
df_report = test_model_performance(num_tests=1, num_nodes=8192, num_edges=8192)

Epoch 1, Loss: 23.6427
Epoch 20, Loss: 21.9419
Epoch 40, Loss: 21.8485
Epoch 60, Loss: 21.8087
Epoch 80, Loss: 21.7860
Epoch 100, Loss: 21.7702
Epoch 120, Loss: 21.7582
Epoch 140, Loss: 21.7485
Epoch 160, Loss: 21.7403
Epoch 180, Loss: 21.7334
Epoch 200, Loss: 21.7274


In [11]:
# Display the most recently computed results table
df_report

Unnamed: 0,test_index,num_nodes,num_edges,modularity_leiden,num_clusters_leiden,modularity_kmeans_initial,modularity_random,head_0_modularity,head_1_modularity,head_2_modularity,head_0_vs_1_mutual_info,head_0_vs_2_mutual_info,head_1_vs_2_mutual_info,avg_modularity,avg_mutual_info,modularity_percent
0,0,8192,8192,0.83678,1393,0.318922,-0.005135,0.64521,0.669093,0.678187,0.001994,0.001141,0.003631,0.664164,0.002255,79.371406
