In [2]:
import os
# Print the starting directory, switch to the project root, and print again to
# confirm the change. Later cells use `src.` imports and relative
# `export_temp/` paths, so presumably they need the project root as the cwd.
# The root can be overridden with the MH_GRAG_ROOT environment variable so the
# notebook is not tied to one machine; the fallback is the original local path.
print(os.getcwd())
os.chdir(os.environ.get('MH_GRAG_ROOT', '/Users/boyuren/Documents/multi_head_graph_rag/MH-GRAG-V1'))
print(os.getcwd())

In [13]:
from src.gnn_clustering.data_loader import load_data
from src.gnn_clustering.model import get_model
from src.gnn_clustering.loss_functions import modularity_loss
from src.gnn_clustering.train import train_model
from src.gnn_clustering.evaluate import (
    get_embeddings,
    kmeans_clustering,
    leiden_clustering,
    random_clustering,
    compute_modularity,
    format_communities
)
from src.gnn_clustering.utils import get_device, get_dense_adj
import torch
import numpy as np

def main(n_clusters=7):
    """Train the GNN clustering model on the default dataset and report modularity.

    The run compares four partitions of the graph returned by ``load_data``:
    Leiden, KMeans on the untrained model's embeddings, KMeans on the trained
    model's embeddings, and a random assignment. The Leiden cluster count is
    only reported; it is not used as the KMeans cluster count here.

    Args:
        n_clusters: Number of clusters used for both KMeans runs and for the
            random baseline. Defaults to 7, the value that was previously
            hard-coded at every call site.

    Returns:
        None. All results are printed.
    """
    # Select the compute device.
    device = get_device()

    # Load the graph data and move it to the device.
    data = load_data()
    data = data.to(device)

    # Leiden baseline: keep its modularity and the number of communities found.
    communities_leiden, modularity_leiden = leiden_clustering(data)
    num_clusters_leiden = len(communities_leiden)

    # Build the model and its optimizer.
    model = get_model(data, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Dense adjacency matrix passed to the training routine.
    adj = get_dense_adj(data.edge_index, device=device)

    def _kmeans_modularity(current_model):
        """Embed with ``current_model``, cluster with KMeans, return the modularity."""
        embeddings = get_embeddings(current_model, data, device=device)
        labels = kmeans_clustering(embeddings, n_clusters=n_clusters)
        communities = format_communities(labels, n_clusters=n_clusters)
        return compute_modularity(data, communities)

    # Baseline: KMeans on embeddings from the still-untrained model.
    modularity_kmeans_initial = _kmeans_modularity(model)

    # Train with the modularity loss.
    model = train_model(model, data, adj, modularity_loss, optimizer)

    # KMeans on embeddings from the trained model.
    modularity_kmeans = _kmeans_modularity(model)

    # Random-assignment baseline with the same number of clusters.
    communities_random = random_clustering(data.num_nodes, n_clusters=n_clusters)
    modularity_random = compute_modularity(data, communities_random)

    print(f'Leiden 算法的模块度: {modularity_leiden:.4f}')
    print(f'Leiden 算法得到的簇数: {num_clusters_leiden}')
    print(f'未训练模型（初始嵌入）的 KMeans 聚类模块度: {modularity_kmeans_initial:.4f}')
    print(f'KMeans 聚类的模块度: {modularity_kmeans:.4f}')
    print(f'随机聚类的模块度: {modularity_random:.4f}')

# Entry point: run the experiment when this code executes as the main module.
# The captured output that follows this cell shows the guard evaluated true here.
if __name__ == '__main__':
    main()


Epoch 50, Loss: 1.7688
Epoch 100, Loss: 1.7568
Epoch 150, Loss: 1.7521
Epoch 200, Loss: 1.7496
Epoch 250, Loss: 1.7481
Epoch 300, Loss: 1.7471
Epoch 350, Loss: 1.7464
Epoch 400, Loss: 1.7459
Epoch 450, Loss: 1.7456
Epoch 500, Loss: 1.7454
Epoch 550, Loss: 1.7452
Epoch 600, Loss: 1.7451
Epoch 650, Loss: 1.7450
Epoch 700, Loss: 1.7448
Epoch 750, Loss: 1.7448
Epoch 800, Loss: 1.7447
Epoch 850, Loss: 1.7446
Epoch 900, Loss: 1.7446
Epoch 950, Loss: 1.7445
Epoch 1000, Loss: 1.7445
Leiden 算法的模块度: 0.8211
Leiden 算法得到的簇数: 107
未训练模型（初始嵌入）的 KMeans 聚类模块度: 0.3358
KMeans 聚类的模块度: 0.6942
随机聚类的模块度: 0.0004


In [14]:
from src.utils.embedding_utils import load_df_from_csv_with_embedding

# Load the exported relationship table. The second argument names the
# 'embedding' column; presumably the loader parses it from its CSV string
# form — confirm in src.utils.embedding_utils.
df_relationships = load_df_from_csv_with_embedding(
    "export_temp/all_relationships_with_embedding.csv",
    ['embedding'],
)
# Bare last expression so the notebook renders the frame.
df_relationships

Unnamed: 0,relationship_id,source_entity,source_entity_id,target_entity,target_entity_id,relationship_type,relationship_description,summary,embedding
0,dbaadf42-029c-456d-be2e-ba3b587d8989,醇贤亲王,d1e4c6a0-905e-4467-b73e-3aa7fbf65e59,醇王府,967d5d07-e3d7-4538-ac86-da76e64c904f,Located-in,醇贤亲王的府邸,醇贤亲王的府邸是醇王府。,"[0.020783402025699615, 0.0048711649142205715, ..."
1,09d71793-5e38-4bd8-9e0e-c2f82e6f8b42,醇贤亲王,d1e4c6a0-905e-4467-b73e-3aa7fbf65e59,北京,0a29970a-7106-49ce-a64c-de5486639a85,Located-in,醇贤亲王的府邸位于北京,醇贤亲王的府邸位于北京。,"[0.017970820888876915, 0.011698826216161251, -..."
2,d57b5c1c-c856-45c6-adb8-596fe5f6cac4,光绪三十二年,9e6d9324-8882-440a-b4dd-8a474a61e807,北京,0a29970a-7106-49ce-a64c-de5486639a85,Located-in,光绪三十二年在北京,北京在光绪三十二年。,"[0.021504301577806473, 0.006124526262283325, -..."
3,af2bb431-2b0b-4ebc-8853-a7abc34a68dc,宣武门内的太平湖东岸,c5abb6ee-d4a1-4926-ad78-04717b0701a0,醇王府,967d5d07-e3d7-4538-ac86-da76e64c904f,Located-in,醇王府第一座府邸位于宣武门内的太平湖东岸,The first residence of the Prince Chun's Mansi...,"[0.039672575891017914, 0.0028197690844535828, ..."
4,da9dc6ce-e8a9-4165-9dcd-3f2acb755147,什刹后海的一座贝子府,32f76620-bebe-4f11-a2b9-790482a2cac5,醇王府,967d5d07-e3d7-4538-ac86-da76e64c904f,Located-in,醇王府第三座府邸位于什刹后海的一座贝子府,醇王府第三座府邸位于什刹后海的一座贝子府。,"[0.04397919774055481, -0.008292053826153278, -..."
...,...,...,...,...,...,...,...,...,...
143,2e9d1822-3986-4b40-880b-933924d039e7,醇亲王载沣,2cd26bd0-92b0-4313-821e-00ed86190c22,慈禧,da3fad0b-bf49-4392-9fc9-b510ca5dc0e6,Family-Relation,醇亲王载沣是慈禧的侄子,C醇亲王载沣 was the nephew of Empress Dowager Cixi.,"[0.02166585810482502, -0.0021259270142763853, ..."
144,d22b9404-2464-4375-ad8b-182087cb3db5,西太后,88f2d7a7-aec3-4f30-893e-fedfd3f1bfe0,醇王府,b461aae9-8d50-4518-9fa1-b2badd948654,Located-in,醇王府位于西太后的管辖范围内,西太后管辖醇王府。,"[0.04686933383345604, -0.00954390037804842, -0..."
145,4edf1f67-2d05-4b3e-8aaa-f2496b79ab94,西太后,88f2d7a7-aec3-4f30-893e-fedfd3f1bfe0,光绪,677e95ae-d91f-4f6a-b5fd-017018a6510c,Family-Relation,西太后是光绪的姨母,西太后是光绪的姨母。,"[0.03296554461121559, -0.007351777050644159, -..."
146,a77454d7-a1a7-4fd9-95a1-df7a9a739ad3,西太后,88f2d7a7-aec3-4f30-893e-fedfd3f1bfe0,荣禄,f2c6a49e-f1e5-4eb5-ad11-cca0743a8620,Affiliated-with,西太后与荣禄关系密切,西太后与荣禄关系密切。,"[0.03707129508256912, -0.015029300935566425, -..."


In [15]:
# Load the exported entity table. Two columns are named for the loader:
# 'embedding' and 'node2vec_embedding'; presumably both are parsed from their
# CSV string form — confirm in src.utils.embedding_utils.
df_entities_with_embeddings = load_df_from_csv_with_embedding(
    "export_temp/all_entities_with_node2vec_embedding.csv",
    ['embedding', 'node2vec_embedding'],
)
# Bare last expression so the notebook renders the frame.
df_entities_with_embeddings

Unnamed: 0,entity_name,entity_type,description,entity_id,summary,embedding,community_id,sub_community_id,node2vec_embedding
0,醇贤亲王,Person,奕讠瞏，道光皇帝的第七子，初封郡王，后晋亲王，死后谥法“贤”，所以后来称做醇贤亲王。,d1e4c6a0-905e-4467-b73e-3aa7fbf65e59,奕讠瞏，道光帝第七子，初封郡王，后晋亲王，谥号贤，世称醇贤亲王。,"[0.04685084894299507, 0.017370907589793205, -0...",1,0,"[-0.130188450217247, -0.16089242696762085, 0.2..."
1,醇王府,Location,醇王府，在北京曾占据过三处地方。,967d5d07-e3d7-4538-ac86-da76e64c904f,醇王府，曾在北京拥有三处府邸。,"[0.031854208558797836, 0.009978203102946281, -...",1,0,"[-0.0875520184636116, -0.20301604270935059, 0...."
2,北京,Location,中国首都,0a29970a-7106-49ce-a64c-de5486639a85,"Beijing (Location), the capital of China.","[0.03565094619989395, 0.03580751642584801, -0....",1,0,"[-0.10584648698568344, 0.07868792116641998, 0...."
3,光绪三十二年,Date,1906 年，清朝光绪三十二年的旧历正月十四,9e6d9324-8882-440a-b4dd-8a474a61e807,1906 年旧历正月十四为光绪三十二年。,"[0.023060264065861702, 0.015912292525172234, -...",1,1,"[-0.1903756707906723, -0.037391722202301025, 0..."
4,宣武门内的太平湖东岸,Location,醇王府第一座府邸所在地,c5abb6ee-d4a1-4926-ad78-04717b0701a0,太平湖东岸的醇王府第一座府邸位于宣武门内。,"[0.03185725212097168, 0.02970491722226143, 0.0...",1,0,"[-0.08088800311088562, -0.25965312123298645, 0..."
...,...,...,...,...,...,...,...,...,...
190,醇王府,Location,位于北京什刹海后海北沿的醇亲王府,b461aae9-8d50-4518-9fa1-b2badd948654,醇亲王府位于北京什刹海后海北沿。,"[0.024454347789287567, 0.0030572873074561357, ...",14,0,"[0.2772037088871002, -0.4804370403289795, -0.0..."
191,光绪,Person,清德宗光绪皇帝,677e95ae-d91f-4f6a-b5fd-017018a6510c,清德宗光绪皇帝，名为爱新觉罗·载湉，是清朝第十一位皇帝。,"[0.02224552258849144, 0.040778279304504395, -0...",14,1,"[0.26255157589912415, -0.46251654624938965, -0..."
192,荣禄,Person,晚清军事家、政治家,f2c6a49e-f1e5-4eb5-ad11-cca0743a8620,荣禄是晚清军事家、政治家。,"[0.03118782304227352, 0.02121950499713421, -0....",14,0,"[0.28197216987609863, -0.4737991988658905, -0...."
193,克林德,Person,德国驻华公使,2a83f563-c7f0-4c7f-832a-338a9bc9057a,克林德是德国驻华公使。,"[-0.015106506645679474, 0.018515728414058685, ...",72,0,"[0.006551899015903473, 0.0023413794115185738, ..."


In [16]:
from src.gnn_clustering.data_loader import load_custom_data
from src.gnn_clustering.model import get_model
from src.gnn_clustering.loss_functions import modularity_loss
from src.gnn_clustering.train import train_model
from src.gnn_clustering.evaluate import (
    get_embeddings,
    kmeans_clustering,
    leiden_clustering,
    random_clustering,
    compute_modularity,
    format_communities
)
from src.gnn_clustering.utils import get_device, get_dense_adj
import torch
import numpy as np

def main(entities_df=None, relationships_df=None):
    """Train the GNN clustering model on the custom graph and report modularity.

    The graph is built by ``load_custom_data`` from an entity table and a
    relationship table. The Leiden community count is computed first and then
    reused as the cluster count for both KMeans runs and the random baseline.
    Results are printed as soon as each stage finishes.

    Args:
        entities_df: Entity table passed to ``load_custom_data``. When omitted,
            the notebook-level ``df_entities_with_embeddings`` is used, which
            matches the previous behaviour.
        relationships_df: Relationship table passed to ``load_custom_data``.
            When omitted, the notebook-level ``df_relationships`` is used.

    Returns:
        None. All results are printed.
    """
    # Fall back to the frames loaded by earlier cells so `main()` still works,
    # while letting callers pass their inputs explicitly.
    if entities_df is None:
        entities_df = df_entities_with_embeddings
    if relationships_df is None:
        relationships_df = df_relationships

    # Select the compute device.
    device = get_device()

    # Build the graph data from the two tables and move it to the device.
    data = load_custom_data(entities_df, relationships_df)
    data = data.to(device)

    # Leiden baseline; its community count drives every later clustering.
    communities_leiden, modularity_leiden = leiden_clustering(data)
    num_clusters_leiden = len(communities_leiden)
    print(f'Leiden 算法的模块度: {modularity_leiden:.4f}')
    print(f'Leiden 算法得到的簇数: {num_clusters_leiden}')

    # Build the model and its optimizer.
    model = get_model(data, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Dense adjacency matrix passed to the training routine.
    adj = get_dense_adj(data.edge_index, device=device)

    # Baseline: KMeans on embeddings from the still-untrained model.
    initial_embeddings = get_embeddings(model, data, device=device)
    clusters_kmeans_initial = kmeans_clustering(initial_embeddings, n_clusters=num_clusters_leiden)
    communities_kmeans_initial = format_communities(clusters_kmeans_initial, n_clusters=num_clusters_leiden)
    modularity_kmeans_initial = compute_modularity(data, communities_kmeans_initial)
    print(f'未训练模型（初始嵌入）的 KMeans 聚类模块度: {modularity_kmeans_initial:.4f}')

    # Train with the modularity loss.
    model = train_model(model, data, adj, modularity_loss, optimizer)

    # KMeans on embeddings from the trained model.
    embeddings = get_embeddings(model, data, device=device)
    clusters_kmeans = kmeans_clustering(embeddings, n_clusters=num_clusters_leiden)
    communities_kmeans = format_communities(clusters_kmeans, n_clusters=num_clusters_leiden)
    modularity_kmeans = compute_modularity(data, communities_kmeans)
    print(f'KMeans 聚类的模块度: {modularity_kmeans:.4f}')

    # Random-assignment baseline with the same number of clusters.
    communities_random = random_clustering(data.num_nodes, n_clusters=num_clusters_leiden)
    modularity_random = compute_modularity(data, communities_random)
    print(f'随机聚类的模块度: {modularity_random:.4f}')

# Entry point: run the custom-data experiment when this code executes as the
# main module. This calls the `main` defined in this cell.
if __name__ == '__main__':
    main()



Leiden 算法的模块度: 0.9486
Leiden 算法得到的簇数: 73
未训练模型（初始嵌入）的 KMeans 聚类模块度: 0.5197
Epoch 50, Loss: -0.8034
Epoch 100, Loss: -0.8044
Epoch 150, Loss: -0.8047
Epoch 200, Loss: -0.8048
Epoch 250, Loss: -0.8049
Epoch 300, Loss: -0.8049
Epoch 350, Loss: -0.8050
Epoch 400, Loss: -0.8050
Epoch 450, Loss: -0.8050
Epoch 500, Loss: -0.8050
Epoch 550, Loss: -0.8050
Epoch 600, Loss: -0.8050
Epoch 650, Loss: -0.8050
Epoch 700, Loss: -0.8050
Epoch 750, Loss: -0.8050
Epoch 800, Loss: -0.8050
Epoch 850, Loss: -0.8050
Epoch 900, Loss: -0.8050
Epoch 950, Loss: -0.8050
Epoch 1000, Loss: -0.8050
KMeans 聚类的模块度: 0.5099
随机聚类的模块度: 0.0109
