In [1]:
import os

# Make the repository root the working directory so that the `src.*` imports
# used by this notebook resolve. The root is located by walking upwards from
# the launch directory until a folder containing `src` is found, instead of
# relying on a machine-specific absolute path.
print(os.getcwd())
project_root = os.getcwd()
while not os.path.isdir(os.path.join(project_root, 'src')):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # Reached the filesystem root without finding the project.
        raise FileNotFoundError(
            "Could not locate a parent directory containing 'src'; "
            "start the notebook from inside the project checkout."
        )
    project_root = parent_dir
os.chdir(project_root)
print(os.getcwd())

/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1/tests
/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1


# Multihead-GRAG 随机网络聚类测试

In [2]:
import torch
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score
from src.gnn_clustering.train import train_model_multi_head
from src.gnn_clustering.evaluate import (
    get_embeddings,
    get_embeddings_list, 
    kmeans_clustering, 
    leiden_clustering, 
    compute_modularity, 
    format_communities, 
    random_clustering
)
from src.gnn_clustering.data_loader import load_random_data
from src.gnn_clustering.model import get_multi_head_model, get_model
from src.gnn_clustering.utils import get_device, get_dense_adj

# Set the device; it is passed to the data, model and embedding helpers below.
device = get_device()

def test_model_performance(num_tests, num_nodes, num_edges, num_heads=3, n_clusters=7):
    """Benchmark multi-head GNN clustering against Leiden over repeated runs.

    Each run loads data via ``load_random_data(num_nodes, num_edges)``, computes
    a Leiden reference partition, trains a multi-head model, clusters every
    head's embeddings with KMeans, and records modularity and pairwise adjusted
    mutual information between heads.

    Args:
        num_tests: Number of independent runs (one result row per run).
        num_nodes: Forwarded to ``load_random_data``.
        num_edges: Forwarded to ``load_random_data``.
        num_heads: Number of heads of the multi-head model. Defaults to 3,
            the value that used to be hard-coded.
        n_clusters: Cluster count used for every KMeans / random partition.
            Defaults to 7, the value that used to be hard-coded. It is
            independent of the number of communities Leiden finds.

    Returns:
        pandas.DataFrame with one row per run. Columns: run parameters, Leiden
        modularity and community count, baseline modularities (untrained
        single-head model, random partition), ``head_<i>_modularity`` per head,
        ``head_<i>_vs_<j>_mutual_info`` per head pair, their averages, and
        ``modularity_percent`` (average head modularity as a percentage of the
        Leiden modularity). ``avg_mutual_info`` is NaN when there is only one
        head; ``modularity_percent`` is NaN when the Leiden modularity is zero.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1.
    """
    if num_heads < 1:
        raise ValueError(f"num_heads must be at least 1, got {num_heads}")

    results = []

    for test_idx in range(num_tests):
        # Load a fresh data set for this run and move it to the device.
        data = load_random_data(num_nodes, num_edges)
        data = data.to(device)

        # Reference partition: Leiden communities and their modularity.
        communities_leiden, modularity_leiden = leiden_clustering(data)
        num_clusters_leiden = len(communities_leiden)

        # Fresh models and optimizer. The single-head model is never trained
        # here; it only provides the "initial embeddings" baseline.
        model = get_multi_head_model(data=data, device=device, num_heads=num_heads)
        single_head_model = get_model(data, device=device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # Dense adjacency matrix consumed by the training routine.
        adj = get_dense_adj(data.edge_index, device=device)

        # Baseline 1: KMeans on the untrained single-head embeddings.
        initial_embeddings = get_embeddings(single_head_model, data, device=device)
        clusters_kmeans_initial = kmeans_clustering(initial_embeddings, n_clusters=n_clusters)
        communities_kmeans_initial = format_communities(clusters_kmeans_initial, n_clusters=n_clusters)
        modularity_kmeans_initial = compute_modularity(data, communities_kmeans_initial)

        # Baseline 2: random partition with the same number of clusters.
        communities_random = random_clustering(data.num_nodes, n_clusters=n_clusters)
        modularity_random = compute_modularity(data, communities_random)

        # Train the multi-head model and fetch one embedding per head.
        model = train_model_multi_head(model, data, adj, optimizer, num_heads)
        embeddings_list = get_embeddings_list(model, data, device)

        # Modularity of the KMeans partition of each head.
        head_modularities = []
        for embeddings in embeddings_list:
            clusters_kmeans = kmeans_clustering(embeddings, n_clusters=n_clusters)
            communities_kmeans = format_communities(clusters_kmeans, n_clusters=n_clusters)
            head_modularities.append(compute_modularity(data, communities_kmeans))

        # Fit KMeans once per head. With random_state=0 the labels are
        # deterministic, so this matches refitting inside the pair loop
        # while avoiding the repeated work.
        head_labels = [
            KMeans(n_clusters=n_clusters, random_state=0).fit_predict(
                embeddings_list[head].cpu().numpy()
            )
            for head in range(num_heads)
        ]

        # Adjusted mutual information for every unordered pair of heads,
        # in the order (0,1), (0,2), ..., (1,2), ...
        head_pairs = [
            (i, j) for i in range(num_heads) for j in range(i + 1, num_heads)
        ]
        head_mutual_info = [
            adjusted_mutual_info_score(head_labels[i], head_labels[j])
            for i, j in head_pairs
        ]

        # Averages across heads / head pairs.
        avg_modularity = sum(head_modularities) / len(head_modularities)
        if head_mutual_info:
            avg_mutual_info = sum(head_mutual_info) / len(head_mutual_info)
        else:
            # A single head has no pairs to compare.
            avg_mutual_info = float('nan')

        # Average head modularity as a percentage of the Leiden modularity.
        if modularity_leiden:
            modularity_percent = (avg_modularity / modularity_leiden) * 100
        else:
            modularity_percent = float('nan')

        # Record the run; key insertion order defines the DataFrame columns.
        result = {
            'test_index': test_idx,
            'num_nodes': num_nodes,
            'num_edges': num_edges,
            'modularity_leiden': modularity_leiden,
            'num_clusters_leiden': num_clusters_leiden,
            'modularity_kmeans_initial': modularity_kmeans_initial,
            'modularity_random': modularity_random,
        }
        for head, head_modularity in enumerate(head_modularities):
            result[f'head_{head}_modularity'] = head_modularity
        for (i, j), mutual_info in zip(head_pairs, head_mutual_info):
            result[f'head_{i}_vs_{j}_mutual_info'] = mutual_info
        result['avg_modularity'] = avg_modularity
        result['avg_mutual_info'] = avg_mutual_info
        result['modularity_percent'] = modularity_percent
        results.append(result)

    # Convert the collected per-run records into a DataFrame.
    df_results = pd.DataFrame(results)
    return df_results

# Run the test: 5 runs with num_nodes=1024 and num_edges=1024.
df_report = test_model_performance(num_tests=5, num_nodes=1024, num_edges=1024)

  edge_index = torch.tensor([source_indices, target_indices], dtype=torch.long)


Epoch 1, Loss: 1.9755
Epoch 20, Loss: 0.3329
Epoch 40, Loss: 0.2739
Epoch 60, Loss: 0.2484
Epoch 80, Loss: 0.2336
Epoch 100, Loss: 0.2244
Epoch 120, Loss: 0.2177
Epoch 140, Loss: 0.2123
Epoch 160, Loss: 0.2077
Epoch 180, Loss: 0.2036
Epoch 200, Loss: 0.2004
Epoch 1, Loss: 2.1573
Epoch 20, Loss: 0.3487
Epoch 40, Loss: 0.2822
Epoch 60, Loss: 0.2515
Epoch 80, Loss: 0.2331
Epoch 100, Loss: 0.2220
Epoch 120, Loss: 0.2151
Epoch 140, Loss: 0.2100
Epoch 160, Loss: 0.2061
Epoch 180, Loss: 0.2031
Epoch 200, Loss: 0.2006
Epoch 1, Loss: 1.8615
Epoch 20, Loss: 0.3412
Epoch 40, Loss: 0.2719
Epoch 60, Loss: 0.2462
Epoch 80, Loss: 0.2310
Epoch 100, Loss: 0.2208
Epoch 120, Loss: 0.2138
Epoch 140, Loss: 0.2085
Epoch 160, Loss: 0.2046
Epoch 180, Loss: 0.2017
Epoch 200, Loss: 0.1996
Epoch 1, Loss: 1.8372
Epoch 20, Loss: 0.3425
Epoch 40, Loss: 0.2647
Epoch 60, Loss: 0.2361
Epoch 80, Loss: 0.2203
Epoch 100, Loss: 0.2103
Epoch 120, Loss: 0.2034
Epoch 140, Loss: 0.1983
Epoch 160, Loss: 0.1942
Epoch 180, Loss:

In [3]:
# Display the per-run metrics from the most recent test_model_performance call.
df_report

Unnamed: 0,test_index,num_nodes,num_edges,modularity_leiden,num_clusters_leiden,modularity_kmeans_initial,modularity_random,head_0_modularity,head_1_modularity,head_2_modularity,head_0_vs_1_mutual_info,head_0_vs_2_mutual_info,head_1_vs_2_mutual_info,avg_modularity,avg_mutual_info,modularity_percent
0,0,1024,1024,0.806201,193,0.342909,-0.025326,0.699622,0.682503,0.694019,0.105247,0.112769,0.141663,0.692048,0.119893,85.840622
1,1,1024,1024,0.802696,201,0.329665,0.014791,0.683546,0.702574,0.693449,0.137163,0.185263,0.143675,0.69319,0.155367,86.357668
2,2,1024,1024,0.800705,200,0.313768,0.012271,0.685036,0.688435,0.710514,0.136661,0.144375,0.140072,0.694662,0.140369,86.756289
3,3,1024,1024,0.813741,187,0.375187,-0.010347,0.682816,0.698234,0.706079,0.126057,0.174288,0.166242,0.69571,0.155529,85.495268
4,4,1024,1024,0.801322,200,0.299023,0.001437,0.663086,0.681426,0.681632,0.135311,0.10356,0.095643,0.675381,0.111504,84.283321


In [None]:
# Run the test: 5 runs with num_nodes=2048 and num_edges=2048.
# NOTE(review): this cell has no recorded execution count; confirm the notebook
# still reproduces its tables after Restart Kernel -> Run All.
df_report = test_model_performance(num_tests=5, num_nodes=2048, num_edges=2048)

In [16]:
# Display the per-run metrics; df_report holds whatever run was assigned last.
df_report

Unnamed: 0,test_index,num_nodes,num_edges,modularity_leiden,num_clusters_leiden,modularity_kmeans_initial,modularity_random,head_0_modularity,head_1_modularity,head_2_modularity,head_0_vs_1_mutual_info,head_0_vs_2_mutual_info,head_1_vs_2_mutual_info,avg_modularity,avg_mutual_info,modularity_percent
0,0,2048,2048,0.822559,365,0.326056,0.005659,0.692971,0.703276,0.695821,0.053369,0.046698,0.052551,0.697356,0.050873,84.778807
1,1,2048,2048,0.818237,370,0.32201,0.00342,0.670239,0.700328,0.699431,0.025275,0.030483,0.069321,0.689999,0.041693,84.327526
2,2,2048,2048,0.811265,397,0.325104,0.002966,0.696697,0.681323,0.684605,0.074697,0.048963,0.05483,0.687542,0.059497,84.749341
3,3,2048,2048,0.822826,365,0.346365,0.004336,0.694722,0.693643,0.695781,0.074632,0.032607,0.06513,0.694715,0.057456,84.430397
4,4,2048,2048,0.825514,359,0.335969,0.006577,0.689677,0.692953,0.707248,0.028664,0.034075,0.064198,0.696626,0.042313,84.386892


In [None]:
# Run the test: a single run with num_nodes=8192 and num_edges=8192.
# NOTE(review): this cell has no recorded execution count; confirm the notebook
# still reproduces its tables after Restart Kernel -> Run All.
df_report = test_model_performance(num_tests=1, num_nodes=8192, num_edges=8192)

In [11]:
# Display the per-run metrics; df_report holds whatever run was assigned last.
df_report

Unnamed: 0,test_index,num_nodes,num_edges,modularity_leiden,num_clusters_leiden,modularity_kmeans_initial,modularity_random,head_0_modularity,head_1_modularity,head_2_modularity,head_0_vs_1_mutual_info,head_0_vs_2_mutual_info,head_1_vs_2_mutual_info,avg_modularity,avg_mutual_info,modularity_percent
0,0,8192,8192,0.83678,1393,0.318922,-0.005135,0.64521,0.669093,0.678187,0.001994,0.001141,0.003631,0.664164,0.002255,79.371406


In [None]:
# Run the test: 5 runs with num_nodes=1024 and ten times as many edges (10240).
# NOTE(review): this cell has no recorded execution count; confirm the notebook
# still reproduces its tables after Restart Kernel -> Run All.
df_report = test_model_performance(num_tests=5, num_nodes=1024, num_edges=10240)

In [18]:
# Display the per-run metrics; df_report holds whatever run was assigned last.
df_report

Unnamed: 0,test_index,num_nodes,num_edges,modularity_leiden,num_clusters_leiden,modularity_kmeans_initial,modularity_random,head_0_modularity,head_1_modularity,head_2_modularity,head_0_vs_1_mutual_info,head_0_vs_2_mutual_info,head_1_vs_2_mutual_info,avg_modularity,avg_mutual_info,modularity_percent
0,0,1024,10240,0.210588,9,0.047817,0.001738,0.166589,0.166266,0.168889,0.131797,0.071643,0.116753,0.167248,0.106731,79.419599
1,1,1024,10240,0.212206,8,0.062953,-0.000539,0.173959,0.158857,0.174614,0.112726,0.136973,0.133666,0.169143,0.127789,79.707125
2,2,1024,10240,0.209645,8,0.044193,-0.004052,0.162592,0.169736,0.153892,0.115544,0.099088,0.111719,0.162073,0.108784,77.308441
3,3,1024,10240,0.207415,9,0.058909,-0.001902,0.164447,0.158583,0.163961,0.093864,0.122568,0.124062,0.16233,0.113498,78.263577
4,4,1024,10240,0.209317,9,0.047548,-0.00529,0.150468,0.161167,0.164468,0.137254,0.085839,0.106415,0.158701,0.109836,75.81867
