In [3]:
# For the contents of the knowledge-base artifact files and the order in which
# they are generated, see the workflow_dependencies variable in
# graphrag/index/run.py
# https://www.bilibili.com/video/BV1u6iFeAEx9?p=8
# The bare string below is the cell's last expression, so the notebook echoes
# it back as the cell output.
"""
├── Loading Input (text)
├── create_base_text_units
├── create_base_extracted_entities
├── create_summarized_entities
├── create_base_entity_graph
├── create_final_entities
├── create_final_nodes
├── create_final_communities
├── join_text_units_to_entity_ids
├── create_final_relationships
├── join_text_units_to_relationship_ids
├── create_final_community_reports
├── create_final_text_units
├── create_base_documents
└── create_final_documents
"""

'\n├── Loading Input (text)\n├── create_base_text_units\n├── create_base_extracted_entities\n├── create_summarized_entities\n├── create_base_entity_graph\n├── create_final_entities\n├── create_final_nodes\n├── create_final_communities\n├── join_text_units_to_entity_ids\n├── create_final_relationships\n├── join_text_units_to_relationship_ids\n├── create_final_community_reports\n├── create_final_text_units\n├── create_base_documents\n└── create_final_documents\n'

In [4]:
import io

import pandas as pd
import networkx as nx
from pyvis.network import Network

# Directory (relative to this notebook) that holds the parquet artifacts; the
# read_parquet calls in this notebook prepend it to each artifact file name.
path_prefix = "../kongyiji/output/kongyiji/artifacts/"

In [5]:
# Workflow name -> list of the workflows it depends on.
dependencies = {
    'create_base_text_units': [],
    'create_base_extracted_entities': ['create_base_text_units'],
    'create_summarized_entities': ['create_base_extracted_entities'],
    'create_base_entity_graph': ['create_summarized_entities'],
    'create_final_entities': ['create_base_entity_graph'],
    'create_final_nodes': ['create_base_entity_graph'],
    'create_final_communities': ['create_base_entity_graph'],
    'join_text_units_to_entity_ids': ['create_final_entities'],
    'create_final_relationships': ['create_final_nodes', 'create_base_entity_graph'],
    'join_text_units_to_relationship_ids': ['create_final_relationships'],
    'create_final_community_reports': ['create_final_relationships', 'create_final_nodes'],
    'create_final_text_units': [
        'join_text_units_to_relationship_ids',
        'join_text_units_to_entity_ids',
        'create_base_text_units',
    ],
    'create_base_documents': ['create_final_text_units'],
    'create_final_documents': ['create_base_documents'],
}

# Show the topology between workflows: every workflow is a node and each
# prerequisite points at the workflow that depends on it.
dependencies_graph = nx.DiGraph()
for workflow_name, prerequisites in dependencies.items():
    dependencies_graph.add_node(workflow_name)
    dependencies_graph.add_edges_from(
        (prerequisite, workflow_name) for prerequisite in prerequisites
    )

# Convert the networkx graph to a directed pyvis network and write it to HTML.
dependencies_vis = Network(notebook=True, cdn_resources='remote', directed=True)
dependencies_vis.from_nx(dependencies_graph)
dependencies_vis.show("workflow_dependencies.html")

workflow_dependencies.html


In [6]:
# Result of splitting the raw text into chunks.
# document_ids holds the ids of the original text files.
# chunk_id is the id of the chunk.
df21 = pd.read_parquet(f"{path_prefix}create_base_text_units.parquet")
df21

Unnamed: 0,id,chunk,chunk_id,document_ids,n_tokens
0,1fae6f3d55b5cb0dba1f89f668218cd2,鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,1fae6f3d55b5cb0dba1f89f668218cd2,[129084b0fa1815780605fa5c38c85b77],600
1,355e705881fd7c83e813a83d24db74b7,了这事。幸亏荐头的情面大，辞退不得，便改为专管温酒的一种无聊职务了。\n\n我从此便整天的站...,355e705881fd7c83e813a83d24db74b7,[129084b0fa1815780605fa5c38c85b77],600
2,dee20f36e11c4185c682fd582d02ed69,东西了!”孔乙己睁大眼睛说，“你怎么这样凭空污人清白……”“什么清白?我前天亲眼见你偷了何家...,dee20f36e11c4185c682fd582d02ed69,[129084b0fa1815780605fa5c38c85b77],600
3,5d3d851e7c8a5cea251f64785f55435d,渐复了原，旁人便又问道，“孔乙己，你当真认识字么?”孔乙己看着问他的人，显出不屑置辩的神气。...,5d3d851e7c8a5cea251f64785f55435d,[129084b0fa1815780605fa5c38c85b77],600
4,07730a19b8110138a785adc20fce8020,�的答他道，“谁要你教，不是草头底下一个来回的回字么?”孔乙己显出极高兴的样子，将两个指头的...,07730a19b8110138a785adc20fce8020,[129084b0fa1815780605fa5c38c85b77],600
5,b2c9b122d0e796bb26d85487fdaae53a,…他打折了腿了。”掌柜说，“哦!”“他总仍旧是偷。这一回，是自己发昏，竟偷到丁举人家里去了。...,b2c9b122d0e796bb26d85487fdaae53a,[129084b0fa1815780605fa5c38c85b77],600
6,138b67752488e94c5e5a9d788efce181,��要好。”掌柜仍然同平常一样，笑着对他说，“孔乙己，你又偷了东西了!”但他这回却不十分分辩...,138b67752488e94c5e5a9d788efce181,[129084b0fa1815780605fa5c38c85b77],408


In [7]:
# Result of running entity extraction over the text chunks.
# source_id is the id of the text chunk an entity was extracted from.
# The same entity may show up in several chunks, so source_id is not unique.
# When an entity is extracted more than once, the description from each
# extraction is collected into a list that is summarized in a later step.
df31 = pd.read_parquet(f"{path_prefix}create_base_extracted_entities.parquet")
df31

Unnamed: 0,entity_graph
0,"<graphml xmlns=""http://graphml.graphdrawing.or..."


In [8]:
# Parse the GraphML string stored in the extracted-entities table and render
# it as an interactive pyvis network.
# .iloc[0] picks the first row by position, independent of the index labels.
graph_file31 = io.StringIO(df31["entity_graph"].iloc[0])
graph31 = nx.read_graphml(graph_file31)
net_vis31 = Network(notebook=True, cdn_resources='remote')

# Node colour per entity type. The keys keep the embedded double quotes
# because that is the form the type attribute is compared against.
entity_type_colors31 = {
    '"ORGANIZATION"': "#FFA500",
    '"PERSON"': "#00FF00",
    '"GEO"': "#0000FF",
    '"EVENT"': "#FF0000",
}

for node, node_data in graph31.nodes(data=True):
    # Tooltip listing every attribute of the node.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in node_data.items())
    title = f"Node: {node}\n\n{attr_text}"
    node_options = {"title": title, "label": node}
    # A missing or unlisted type keeps the pyvis default colour.
    node_color = entity_type_colors31.get(node_data.get("type", ""))
    if node_color is not None:
        node_options["color"] = node_color
    # Add each node exactly once; pyvis ignores a second add_node for an id
    # it already holds, so the former trailing add_node call had no effect.
    net_vis31.add_node(node, **node_options)

for source, target, edge_data in graph31.edges(data=True):
    # Tooltip listing every attribute of the edge.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in edge_data.items())
    title = f"Edge from {source} to {target}\n\n{attr_text}"
    edge_options = {"title": title}
    # Colour the edge by weight; weights of 1 or less keep the default colour.
    edge_weight = edge_data.get("weight", 0)
    if edge_weight > 2:
        edge_options["color"] = "#FF0000"
    elif edge_weight > 1:
        edge_options["color"] = "#00FF00"
    # Add each edge exactly once; an undirected pyvis network skips an edge
    # that already exists, so the former trailing add_edge call had no effect.
    net_vis31.add_edge(source, target, **edge_options)

net_vis31.show("create_base_extracted_entities.html")

create_base_extracted_entities.html


In [9]:
# Summarize the multiple descriptions of the same entity into a single
# combined description.
df32 = pd.read_parquet(f"{path_prefix}create_summarized_entities.parquet")
df32

Unnamed: 0,entity_graph
0,"<graphml xmlns=""http://graphml.graphdrawing.or..."


In [10]:
# Parse the GraphML string stored in the summarized-entities table and render
# it as an interactive pyvis network.
# .iloc[0] picks the first row by position, independent of the index labels.
graph_file32 = io.StringIO(df32["entity_graph"].iloc[0])
graph32 = nx.read_graphml(graph_file32)
net_vis32 = Network(notebook=True, cdn_resources='remote')

# Node colour per entity type. The keys keep the embedded double quotes
# because that is the form the type attribute is compared against.
entity_type_colors32 = {
    '"ORGANIZATION"': "#FFA500",
    '"PERSON"': "#00FF00",
    '"GEO"': "#0000FF",
    '"EVENT"': "#FF0000",
}

for node, node_data in graph32.nodes(data=True):
    # Tooltip listing every attribute of the node.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in node_data.items())
    title = f"Node: {node}\n\n{attr_text}"
    node_options = {"title": title, "label": node}
    # A missing or unlisted type keeps the pyvis default colour.
    node_color = entity_type_colors32.get(node_data.get("type", ""))
    if node_color is not None:
        node_options["color"] = node_color
    # Add each node exactly once; pyvis ignores a second add_node for an id
    # it already holds, so the former trailing add_node call had no effect.
    net_vis32.add_node(node, **node_options)

for source, target, edge_data in graph32.edges(data=True):
    # Tooltip listing every attribute of the edge.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in edge_data.items())
    title = f"Edge from {source} to {target}\n\n{attr_text}"
    edge_options = {"title": title}
    # Colour the edge by weight; weights of 1 or less keep the default colour.
    edge_weight = edge_data.get("weight", 0)
    if edge_weight > 2:
        edge_options["color"] = "#FF0000"
    elif edge_weight > 1:
        edge_options["color"] = "#00FF00"
    # Add each edge exactly once; an undirected pyvis network skips an edge
    # that already exists, so the former trailing add_edge call had no effect.
    net_vis32.add_edge(source, target, **edge_options)

net_vis32.show("create_summarized_entities.html")

create_summarized_entities.html


In [11]:
# Graph after hierarchical Leiden clustering, i.e. after levels and
# communities have been assigned.
# Each row of df33 is one level of the graph; a level holds several communities.
# Communities at a higher level further subdivide communities of the previous
# level, for example:
# level 0 contains communities 0 and 1, level 1 contains communities 2 and 3;
# the entities in level-1 community 2 and level-1 community 3 all come from
# level-0 community 1.
df33 = pd.read_parquet(f"{path_prefix}create_base_entity_graph.parquet")
df33

Unnamed: 0,level,clustered_graph,embeddings
0,0,"<graphml xmlns=""http://graphml.graphdrawing.or...","{'""丁举人""': [0.002083270810544491, -0.0186006743..."
1,1,"<graphml xmlns=""http://graphml.graphdrawing.or...","{'""丁举人""': [0.002083270810544491, -0.0186006743..."


In [12]:
# Render the level-0 nodes of the clustered graph stored in the first row of
# df33. .iloc[0] picks that row by position, independent of the index labels.
graph_file330 = io.StringIO(df33["clustered_graph"].iloc[0])
graph330 = nx.read_graphml(graph_file330)
net_vis330 = Network(notebook=True, cdn_resources='remote')

# Node colour per cluster id; any other cluster falls back to blue.
cluster_colors330 = {'0': "#FF0000", '1': "#00FF00"}
default_cluster_color330 = "#0000FF"
# Ids of the nodes actually added, used to drop edges to filtered-out nodes.
kept_nodes330 = set()

for node, node_data in graph330.nodes(data=True):
    # Keep only level-0 nodes; filter first so skipped nodes cost nothing.
    entity_level = node_data.get("level", -1)
    if entity_level != 0:
        continue
    # Tooltip listing every attribute of the node.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in node_data.items())
    title = f"Node: {node}\n\n{attr_text}"
    # Colour the node by its cluster attribute.
    entity_cluster = node_data.get("cluster", "")
    node_color = cluster_colors330.get(entity_cluster, default_cluster_color330)
    # Add each node exactly once; pyvis ignores a second add_node for an id
    # it already holds, so the former trailing add_node call had no effect.
    net_vis330.add_node(node, title=title, label=node, color=node_color)
    kept_nodes330.add(node)

for source, target, edge_data in graph330.edges(data=True):
    # Skip edges whose source or target was not added to the network above.
    if source not in kept_nodes330 or target not in kept_nodes330:
        continue
    # Tooltip listing every attribute of the edge.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in edge_data.items())
    title = f"Edge from {source} to {target}\n\n{attr_text}"
    edge_options = {"title": title}
    # Colour the edge by weight; weights of 1 or less keep the default colour.
    edge_weight = edge_data.get("weight", 0)
    if edge_weight > 3:
        edge_options["color"] = "#FFA500"
    elif edge_weight > 1:
        edge_options["color"] = "#00FF00"
    # Add each edge exactly once; an undirected pyvis network skips an edge
    # that already exists, so the former trailing add_edge call had no effect.
    net_vis330.add_edge(source, target, **edge_options)

net_vis330.show("create_base_entity_graph_0.html")

create_base_entity_graph_0.html


In [13]:
# Render the level-1 nodes of the clustered graph stored in the second row of
# df33. .iloc[1] picks that row by position, independent of the index labels.
graph_file331 = io.StringIO(df33["clustered_graph"].iloc[1])
graph331 = nx.read_graphml(graph_file331)
net_vis331 = Network(notebook=True, cdn_resources='remote')

# Node colour per cluster id; any other cluster falls back to blue.
cluster_colors331 = {'2': "#FFA500", '3': "#00FFA5"}
default_cluster_color331 = "#0000FF"
# Ids of the nodes actually added, used to drop edges to filtered-out nodes.
kept_nodes331 = set()

for node, node_data in graph331.nodes(data=True):
    # Keep only level-1 nodes; filter first so skipped nodes cost nothing.
    entity_level = node_data.get("level", -1)
    if entity_level != 1:
        continue
    # Tooltip listing every attribute of the node.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in node_data.items())
    title = f"Node: {node}\n\n{attr_text}"
    # Colour the node by its cluster attribute.
    entity_cluster = node_data.get("cluster", "")
    node_color = cluster_colors331.get(entity_cluster, default_cluster_color331)
    # Add each node exactly once; pyvis ignores a second add_node for an id
    # it already holds, so the former trailing add_node call had no effect.
    net_vis331.add_node(node, title=title, label=node, color=node_color)
    kept_nodes331.add(node)

for source, target, edge_data in graph331.edges(data=True):
    # Skip edges whose source or target was not added to the network above.
    if source not in kept_nodes331 or target not in kept_nodes331:
        continue
    # Tooltip listing every attribute of the edge.
    attr_text = "\n\n".join(f"{key}: {value}" for key, value in edge_data.items())
    title = f"Edge from {source} to {target}\n\n{attr_text}"
    edge_options = {"title": title}
    # Colour the edge by weight; weights of 1 or less keep the default colour.
    edge_weight = edge_data.get("weight", 0)
    if edge_weight > 2:
        edge_options["color"] = "#FF0000"
    elif edge_weight > 1:
        edge_options["color"] = "#00FF00"
    # Add each edge exactly once; an undirected pyvis network skips an edge
    # that already exists, so the former trailing add_edge call had no effect.
    net_vis331.add_edge(source, target, **edge_options)

net_vis331.show("create_base_entity_graph_1.html")

create_base_entity_graph_1.html


In [14]:
# Consolidated information for every entity, with the node2vec-based node
# embedding, the entity-name embedding and the description embedding added.
df34 = pd.read_parquet(f"{path_prefix}create_final_entities.parquet")
df34

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,name_embedding,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,"""鲁镇""","""GEO""",鲁镇是一个充满浓厚地方特色和独特社会结构的背景小镇。在这个地方，酒店的布局独具一格，反映了当...,0,"[0.0012126521905884147, -0.01631433144211769, ...","[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.0735669955611229, 0.09959721565246582, -0....","[-0.055016759783029556, 0.031515203416347504, ..."
1,4119fd06010c494caa07f439b333f4c5,"""咸亨酒店""","""ORGANIZATION""",咸亨酒店是鲁镇一家著名的酒店，既是故事发生的具体地点，也是背景场所。它是一个充满人间百态和市...,1,"[0.002068893751129508, -0.021418889984488487, ...","[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.022785751149058342, -0.0065047163516283035...","[-0.012898093089461327, 0.013879787176847458, ..."
2,d3835bf3dda84ead99deadbeac5d0d7d,"""短衣帮""","""PERSON""","""短衣帮""是指一群做工的普通劳动者，他们通常在酒店外面站着喝酒，消费较为节俭。这些劳动者与穿...",2,"[0.0014271448599174619, -0.018397178500890732,...","[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.06655868887901306, 0.019998403266072273, -...","[-0.06994222104549408, 0.003092781640589237, -..."
3,077d2820ae1845bcbb1803379a3d1eae,"""长衫主顾""","""PERSON""","""长衫主顾""是指那些穿着长衫的顾客，他们通常较为富裕，具有较高的消费能力。在酒店环境中，长衫...",3,"[0.001347171375527978, -0.01570451632142067, 0...","[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.06811540573835373, 0.02075207605957985, -0...","[-0.0612790621817112, -0.025796692818403244, -..."
4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,"""我""","""PERSON""","""我是咸亨酒店的一名伙计，从十二岁起在酒店工作，负责温酒等事务。""",4,"[0.002342285355553031, -0.017849063500761986, ...",[1fae6f3d55b5cb0dba1f89f668218cd2],"[-0.04335128888487816, 0.0332072377204895, -0....","[-0.024990806356072426, 0.0028743341099470854,..."
5,19a7f254a5d64566ab5cc15472df02de,"""掌柜""","""PERSON""",掌柜是咸亨酒店的老板或管理者，负责监督酒店的运营、员工的工作以及日常管理事务，包括结账等。他...,5,"[0.0016673633363097906, -0.0175942275673151, 0...","[07730a19b8110138a785adc20fce8020, 138b6775248...","[-0.035059791058301926, 0.01561013888567686, -...","[-0.017183413729071617, -0.029239201918244362,..."
6,e7ffaee9d31d4d3c96e04f911d0a8f9e,"""丁举人""","""PERSON""",丁举人是鲁镇一个有地位、有权势且富有影响力的人物。他通过科举考试获得了功名，成为社会上层的一...,6,"[0.002083270810544491, -0.018600674346089363, ...","[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.03734252229332924, 0.0145729286596179, -0....","[-0.028294086456298828, 0.04944520816206932, -..."
7,f7e11b0e297a44a896dc67928368f600,"""孩子们""","""PERSON""","""孩子们是鲁镇的一部分居民，他们有时会在咸亨酒店外面聚集，观察或参与酒店内外的活动。""",7,"[0.0011556732933968306, -0.013554797507822514,...",[1fae6f3d55b5cb0dba1f89f668218cd2],"[-0.03457967936992645, 0.03139243647456169, -0...","[-0.013658163137733936, 0.014908626675605774, ..."
8,1fd3fa8bb5a2408790042ab9573779ee,"""酒客""","""PERSON""","""酒客是指在咸亨酒店喝酒的顾客，包括短衣帮和长衫主顾等不同群体。""",8,"[0.0015726208221167326, -0.015209603123366833,...",[1fae6f3d55b5cb0dba1f89f668218cd2],"[-0.08454140275716782, 0.036982156336307526, -...","[-0.05934860184788704, -0.005974501371383667, ..."
9,27f9fbe6ad8c4a8b9acee0d3596ed57c,"""碗碟""","""OBJECT""","""碗碟是咸亨酒店内用于盛放食物和酒水的器皿。""",9,"[0.0015572767006233335, -0.014106987044215202,...",[1fae6f3d55b5cb0dba1f89f668218cd2],"[-0.03174711763858795, 0.05417044833302498, -0...","[0.005624593701213598, 0.017494555562734604, -..."


In [15]:
# Consolidated information for every node.
# The node2vec embeddings are reduced with UMAP to produce the 2-D coordinates
# used when visualizing each node.
# The same node can appear in several levels, so the table length is
# (number of nodes) * (number of levels).
df35 = pd.read_parquet(f"{path_prefix}create_final_nodes.parquet")
df35

Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,top_level_node_id,x,y
0,0,"""鲁镇""","""GEO""",鲁镇是一个充满浓厚地方特色和独特社会结构的背景小镇。在这个地方，酒店的布局独具一格，反映了当...,"07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,2,0,b45241d70f0e43fca764df95b2b81f77,2,"[0.0012126521905884147, -0.01631433144211769, ...",b45241d70f0e43fca764df95b2b81f77,-31.406649,-19.495066
1,0,"""咸亨酒店""","""ORGANIZATION""",咸亨酒店是鲁镇一家著名的酒店，既是故事发生的具体地点，也是背景场所。它是一个充满人间百态和市...,"07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,14,1,4119fd06010c494caa07f439b333f4c5,14,"[0.002068893751129508, -0.021418889984488487, ...",4119fd06010c494caa07f439b333f4c5,15.223695,4.372850
2,0,"""短衣帮""","""PERSON""","""短衣帮""是指一群做工的普通劳动者，他们通常在酒店外面站着喝酒，消费较为节俭。这些劳动者与穿...","07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,3,2,d3835bf3dda84ead99deadbeac5d0d7d,3,"[0.0014271448599174619, -0.018397178500890732,...",d3835bf3dda84ead99deadbeac5d0d7d,17.560656,5.461694
3,0,"""长衫主顾""","""PERSON""","""长衫主顾""是指那些穿着长衫的顾客，他们通常较为富裕，具有较高的消费能力。在酒店环境中，长衫...","07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,2,3,077d2820ae1845bcbb1803379a3d1eae,2,"[0.001347171375527978, -0.01570451632142067, 0...",077d2820ae1845bcbb1803379a3d1eae,22.915752,24.380701
4,0,"""我""","""PERSON""","""我是咸亨酒店的一名伙计，从十二岁起在酒店工作，负责温酒等事务。""",1fae6f3d55b5cb0dba1f89f668218cd2,1,2,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,2,"[0.002342285355553031, -0.017849063500761986, ...",3671ea0dd4e84c1a9b02c5ab2c8f4bac,16.253996,4.803540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,1,"""笔墨纸砚""","""ITEM""","""笔墨纸砚是文人常用的书写工具，孔乙己曾用它们写字教人知识。""",07730a19b8110138a785adc20fce8020,,1,32,85c79fd84f5e4f918471c386852204c5,1,"[0.0018167212838307023, -0.016923651099205017,...",85c79fd84f5e4f918471c386852204c5,23.272655,24.090113
70,1,"""秀才""","""PERSON""","""秀才是科举制度中的一种功名，孔乙己曾经努力考取但未成功。""",07730a19b8110138a785adc20fce8020,,1,33,eae4259b19a741ab9f9f6af18c4a0470,1,"[0.0019483972573652864, -0.015298962593078613,...",eae4259b19a741ab9f9f6af18c4a0470,4.452078,12.036289
71,1,"""初冬""","""GEO""","""初冬是一个季节描述，表示故事发生的季节背景。""",b2c9b122d0e796bb26d85487fdaae53a,,1,34,3138f39f2bcd43a69e0697cd3b05bc4d,1,"[0.0013473564758896828, -0.015686839818954468,...",3138f39f2bcd43a69e0697cd3b05bc4d,2.766302,9.519711
72,1,"""年关""","""EVENT""","""年关是指每年年底的时期，在文中提到两次，用于描述掌柜对孔乙己欠款的提及。""",138b67752488e94c5e5a9d788efce181,,1,35,dde131ab575d44dbb55289a6972be18f,1,"[0.0010946310358121991, -0.015775222331285477,...",dde131ab575d44dbb55289a6972be18f,3.723950,11.748106


In [16]:
# Consolidated result of the community partitioning.
df41 = pd.read_parquet(f"{path_prefix}create_final_communities.parquet")
df41

Unnamed: 0,id,title,level,raw_community,relationship_ids,text_unit_ids
0,1,Community 1,0,1,"[e2bf260115514fb3b252fd879fb3e7be, b462b94ce47...","[07730a19b8110138a785adc20fce8020,1fae6f3d55b5..."
1,0,Community 0,0,0,"[b785a9025069417f94950ad231bb1441, 3b6cd96a273...","[07730a19b8110138a785adc20fce8020,138b67752488..."
2,3,Community 3,1,3,"[e2bf260115514fb3b252fd879fb3e7be, b462b94ce47...","[07730a19b8110138a785adc20fce8020,1fae6f3d55b5..."
3,2,Community 2,1,2,"[17ed1d92075643579a712cc6c29e8ddb, 3ce7c210a21...","[07730a19b8110138a785adc20fce8020,1fae6f3d55b5..."


In [17]:
# Load and display the join_text_units_to_entity_ids artifact.
df23 = pd.read_parquet(f"{path_prefix}join_text_units_to_entity_ids.parquet")
df23

Unnamed: 0,text_unit_ids,entity_ids,id
0,07730a19b8110138a785adc20fce8020,"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...",07730a19b8110138a785adc20fce8020
1,1fae6f3d55b5cb0dba1f89f668218cd2,"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...",1fae6f3d55b5cb0dba1f89f668218cd2
2,355e705881fd7c83e813a83d24db74b7,"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...",355e705881fd7c83e813a83d24db74b7
3,5d3d851e7c8a5cea251f64785f55435d,"[4119fd06010c494caa07f439b333f4c5, 19a7f254a5d...",5d3d851e7c8a5cea251f64785f55435d
4,dee20f36e11c4185c682fd582d02ed69,"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...",dee20f36e11c4185c682fd582d02ed69
5,138b67752488e94c5e5a9d788efce181,"[19a7f254a5d64566ab5cc15472df02de, 9646481f66c...",138b67752488e94c5e5a9d788efce181
6,b2c9b122d0e796bb26d85487fdaae53a,"[19a7f254a5d64566ab5cc15472df02de, e7ffaee9d31...",b2c9b122d0e796bb26d85487fdaae53a


In [18]:
# Load and display the create_final_relationships artifact.
df36 = pd.read_parquet(f"{path_prefix}create_final_relationships.parquet")
df36

Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""鲁镇""","""咸亨酒店""",1.0,"""咸亨酒店位于鲁镇，是当地居民日常生活中的一部分，反映了鲁镇的社会结构。""",[1fae6f3d55b5cb0dba1f89f668218cd2],e2bf260115514fb3b252fd879fb3e7be,0,2,14,16
1,"""鲁镇""","""酒店""",1.0,"""酒店位于鲁镇，是镇上的一个重要社交场所。""",[07730a19b8110138a785adc20fce8020],b462b94ce47a4b8c8fffa33f7242acec,1,2,4,6
2,"""咸亨酒店""","""短衣帮""",1.0,"""短衣帮是咸亨酒店的主要顾客之一，他们在酒店外面喝酒，形成了酒店的一种典型景象。""",[1fae6f3d55b5cb0dba1f89f668218cd2],17ed1d92075643579a712cc6c29e8ddb,2,14,3,17
3,"""咸亨酒店""","""长衫主顾""",1.0,"""长衫主顾是咸亨酒店的重要顾客，他们在店内享受更高档次的服务，受到酒店掌柜的特别关注。""",[1fae6f3d55b5cb0dba1f89f668218cd2],3ce7c210a21b4deebad7cc9308148d86,3,14,2,16
4,"""咸亨酒店""","""我""",1.0,"""我在咸亨酒店担任伙计，负责温酒等事务，经历了酒店内外的不同顾客群体。""",[1fae6f3d55b5cb0dba1f89f668218cd2],d64ed762ea924caa95c8d06f072a9a96,4,14,2,16
5,"""咸亨酒店""","""掌柜""",1.0,"""掌柜是咸亨酒店的管理者，负责监督酒店的运营和员工的工作。""",[1fae6f3d55b5cb0dba1f89f668218cd2],adf4ee3fbe9b4d0381044838c4f889c8,5,14,3,17
6,"""咸亨酒店""","""孩子们""",1.0,"""孩子们常常在咸亨酒店外面玩耍或围观，增加了酒店周围的活力。""",[1fae6f3d55b5cb0dba1f89f668218cd2],32ee140946e5461f9275db664dc541a5,6,14,1,15
7,"""咸亨酒店""","""酒客""",1.0,"""酒客们构成了咸亨酒店的主要顾客群体，他们的行为和习惯反映了鲁镇的社会风貌。""",[1fae6f3d55b5cb0dba1f89f668218cd2],c160b9cb27d6408ba6ab20214a2f3f81,7,14,1,15
8,"""咸亨酒店""","""碗碟""",1.0,"""碗碟是咸亨酒店日常运营中不可或缺的用具，用于服务顾客。""",[1fae6f3d55b5cb0dba1f89f668218cd2],23527cd679ff4d5a988d52e7cd056078,8,14,1,15
9,"""咸亨酒店""","""茴香豆""",1.0,"""茴香豆是咸亨酒店提供给顾客的一种受欢迎的小吃。""",[1fae6f3d55b5cb0dba1f89f668218cd2],f1c6eed066f24cbdb376b910fce29ed4,9,14,2,16


In [19]:
# Load and display the join_text_units_to_relationship_ids artifact.
df24 = pd.read_parquet(f"{path_prefix}join_text_units_to_relationship_ids.parquet")
df24

Unnamed: 0,id,relationship_ids
0,1fae6f3d55b5cb0dba1f89f668218cd2,"[e2bf260115514fb3b252fd879fb3e7be, 17ed1d92075..."
1,07730a19b8110138a785adc20fce8020,"[b462b94ce47a4b8c8fffa33f7242acec, de6fa244808..."
2,355e705881fd7c83e813a83d24db74b7,"[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
3,dee20f36e11c4185c682fd582d02ed69,"[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
4,5d3d851e7c8a5cea251f64785f55435d,"[6fae5ee1a831468aa585a1ea09095998, b785a902506..."
5,138b67752488e94c5e5a9d788efce181,"[b785a9025069417f94950ad231bb1441, 56d0e5ebe79..."
6,b2c9b122d0e796bb26d85487fdaae53a,"[b785a9025069417f94950ad231bb1441, 3b6cd96a273..."


In [20]:
# Load and display the create_final_community_reports artifact.
# NOTE(review): the saved output of this cell shows every report titled
# "Tech Innovators Network - Silicon Valley", which looks unrelated to the
# input text - confirm the report-generation step produced valid reports.
df42 = pd.read_parquet(f"{path_prefix}create_final_community_reports.parquet")
df42

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,2,# Tech Innovators Network - Silicon Valley\n\n...,1,8.5,Tech Innovators Network - Silicon Valley,该社区在科技创新和技术发展方面具有显著影响力，其成员和活动对全球科技趋势有重要贡献。,Tech Innovators Network (TIN) 是一个位于硅谷的科技社区，汇集了...,[{'explanation': 'Tech Innovators Network 包含了几...,"{\n ""title"": ""Tech Innovators Network - Sil...",43f967ba-0ef2-4824-8d6c-3b3cb09339ec
1,3,# Tech Innovators Network - Silicon Valley\n\n...,1,7.5,Tech Innovators Network - Silicon Valley,The community's significant influence on globa...,Tech Innovators Network (TIN) is a prominent c...,[{'explanation': 'Entities such as TechCorp an...,"{\n ""title"": ""Tech Innovators Network - Sil...",f9536baa-a451-46c7-8e9b-7e428bf4e94e
2,0,# Tech Innovators Network - Silicon Valley\n\n...,0,7.5,Tech Innovators Network - Silicon Valley,The community's significant influence on globa...,Tech Innovators Network (TIN) is a prominent c...,[{'explanation': 'Entities such as TechCorp an...,"{\n ""title"": ""Tech Innovators Network - Sil...",9805d58e-1d04-4b94-9475-5acf581f4cf2
3,1,# Tech Innovators Network - Silicon Valley\n\n...,0,7.5,Tech Innovators Network - Silicon Valley,The community's significant influence on globa...,Tech Innovators Network (TIN) is a prominent c...,[{'explanation': 'Entities such as TechCorp an...,"{\n ""title"": ""Tech Innovators Network - Sil...",645806cd-a5b0-4537-9423-624cb864b1c5


In [21]:
# Load and display the create_final_text_units artifact.
df22 = pd.read_parquet(f"{path_prefix}create_final_text_units.parquet")
df22

Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,1fae6f3d55b5cb0dba1f89f668218cd2,鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,600,[129084b0fa1815780605fa5c38c85b77],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[e2bf260115514fb3b252fd879fb3e7be, 17ed1d92075..."
1,355e705881fd7c83e813a83d24db74b7,了这事。幸亏荐头的情面大，辞退不得，便改为专管温酒的一种无聊职务了。\n\n我从此便整天的站...,600,[129084b0fa1815780605fa5c38c85b77],"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...","[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
2,dee20f36e11c4185c682fd582d02ed69,东西了!”孔乙己睁大眼睛说，“你怎么这样凭空污人清白……”“什么清白?我前天亲眼见你偷了何家...,600,[129084b0fa1815780605fa5c38c85b77],"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...","[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
3,5d3d851e7c8a5cea251f64785f55435d,渐复了原，旁人便又问道，“孔乙己，你当真认识字么?”孔乙己看着问他的人，显出不屑置辩的神气。...,600,[129084b0fa1815780605fa5c38c85b77],"[4119fd06010c494caa07f439b333f4c5, 19a7f254a5d...","[6fae5ee1a831468aa585a1ea09095998, b785a902506..."
4,07730a19b8110138a785adc20fce8020,�的答他道，“谁要你教，不是草头底下一个来回的回字么?”孔乙己显出极高兴的样子，将两个指头的...,600,[129084b0fa1815780605fa5c38c85b77],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[b462b94ce47a4b8c8fffa33f7242acec, de6fa244808..."
5,b2c9b122d0e796bb26d85487fdaae53a,…他打折了腿了。”掌柜说，“哦!”“他总仍旧是偷。这一回，是自己发昏，竟偷到丁举人家里去了。...,600,[129084b0fa1815780605fa5c38c85b77],"[19a7f254a5d64566ab5cc15472df02de, e7ffaee9d31...","[b785a9025069417f94950ad231bb1441, 3b6cd96a273..."
6,138b67752488e94c5e5a9d788efce181,��要好。”掌柜仍然同平常一样，笑着对他说，“孔乙己，你又偷了东西了!”但他这回却不十分分辩...,408,[129084b0fa1815780605fa5c38c85b77],"[19a7f254a5d64566ab5cc15472df02de, 9646481f66c...","[b785a9025069417f94950ad231bb1441, 56d0e5ebe79..."


In [22]:
# Load and display the create_base_documents artifact.
df11 = pd.read_parquet(f"{path_prefix}create_base_documents.parquet")
df11

Unnamed: 0,id,text_units,raw_content,title
0,129084b0fa1815780605fa5c38c85b77,"[1fae6f3d55b5cb0dba1f89f668218cd2, 355e705881f...",鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,kongyiji.txt


In [23]:
# Load and display the create_final_documents artifact.
df12 = pd.read_parquet(f"{path_prefix}create_final_documents.parquet")
df12

Unnamed: 0,id,text_unit_ids,raw_content,title
0,129084b0fa1815780605fa5c38c85b77,"[1fae6f3d55b5cb0dba1f89f668218cd2, 355e705881f...",鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,kongyiji.txt
