In [93]:
# For the contents of the knowledge-base files and the order in which they
# are generated, see:
# https://www.bilibili.com/video/BV1u6iFeAEx9?p=8
# The bare string below is this cell's last expression, so the notebook echoes
# it as the cell output; it lists the artifact names in the order they are
# inspected in the cells that follow.
"""
├── Loading Input (text)
├── create_base_text_units
├── create_base_extracted_entities
├── create_summarized_entities
├── create_base_entity_graph
├── create_final_entities
├── create_final_nodes
├── create_final_communities
├── join_text_units_to_entity_ids
├── create_final_relationships
├── join_text_units_to_relationship_ids
├── create_final_community_reports
├── create_final_text_units
├── create_base_documents
└── create_final_documents
"""

'\n├── Loading Input (text)\n├── create_base_text_units\n├── create_base_extracted_entities\n├── create_summarized_entities\n├── create_base_entity_graph\n├── create_final_entities\n├── create_final_nodes\n├── create_final_communities\n├── join_text_units_to_entity_ids\n├── create_final_relationships\n├── join_text_units_to_relationship_ids\n├── create_final_community_reports\n├── create_final_text_units\n├── create_base_documents\n└── create_final_documents\n'

In [94]:
import networkx as nx
from pyvis.network import Network
import io

def visualize_graph(graphml_str):
    """Parse a GraphML document and mirror it into a pyvis network.

    The GraphML text itself is the argument (rather than a DataFrame) because
    the column holding it is named differently in each artifact. Both the
    parsed graph and the pyvis network are returned so the caller can choose
    the output file name when rendering.

    Args:
        graphml_str: GraphML document as a string.

    Returns:
        A ``(graph, net_vis)`` tuple: the graph produced by
        ``nx.read_graphml`` and a ``pyvis.network.Network`` containing the
        same nodes and edges.
    """

    def _describe(attrs):
        # One "key: value" entry per attribute, separated by blank lines.
        return "\n\n".join(f"{key}: {value}" for key, value in attrs.items())

    graph = nx.read_graphml(io.StringIO(graphml_str))
    net_vis = Network(notebook=True, cdn_resources='remote')

    # Every node carries all of its attributes in `title` (not only
    # 'description'); the node id doubles as the label, but any other label
    # could be used instead.
    for node_id, attrs in graph.nodes(data=True):
        node_title = f"Node: {node_id}\n\n{_describe(attrs)}"
        net_vis.add_node(node_id, title=node_title, label=node_id)

    # Edges get the same treatment: all edge attributes go into `title`.
    for src, dst, attrs in graph.edges(data=True):
        edge_title = f"Edge from {src} to {dst}\n\n{_describe(attrs)}"
        net_vis.add_edge(src, dst, title=edge_title)

    return graph, net_vis


In [95]:
import pandas as pd

# Directory containing the parquet artifacts. The trailing slash matters:
# later cells build file paths by plain string concatenation with this prefix.
path_prefix = "./kongyiji/output/kongyiji/artifacts/"

In [96]:
# Load the create_base_text_units artifact; the bare name on the last line
# makes the notebook display the DataFrame.
df21 = pd.read_parquet(path_prefix + "create_base_text_units.parquet")
df21

Unnamed: 0,id,chunk,chunk_id,document_ids,n_tokens
0,1fae6f3d55b5cb0dba1f89f668218cd2,鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,1fae6f3d55b5cb0dba1f89f668218cd2,[129084b0fa1815780605fa5c38c85b77],600
1,355e705881fd7c83e813a83d24db74b7,了这事。幸亏荐头的情面大，辞退不得，便改为专管温酒的一种无聊职务了。\n\n我从此便整天的站...,355e705881fd7c83e813a83d24db74b7,[129084b0fa1815780605fa5c38c85b77],600
2,dee20f36e11c4185c682fd582d02ed69,东西了!”孔乙己睁大眼睛说，“你怎么这样凭空污人清白……”“什么清白?我前天亲眼见你偷了何家...,dee20f36e11c4185c682fd582d02ed69,[129084b0fa1815780605fa5c38c85b77],600
3,5d3d851e7c8a5cea251f64785f55435d,渐复了原，旁人便又问道，“孔乙己，你当真认识字么?”孔乙己看着问他的人，显出不屑置辩的神气。...,5d3d851e7c8a5cea251f64785f55435d,[129084b0fa1815780605fa5c38c85b77],600
4,07730a19b8110138a785adc20fce8020,�的答他道，“谁要你教，不是草头底下一个来回的回字么?”孔乙己显出极高兴的样子，将两个指头的...,07730a19b8110138a785adc20fce8020,[129084b0fa1815780605fa5c38c85b77],600
5,b2c9b122d0e796bb26d85487fdaae53a,…他打折了腿了。”掌柜说，“哦!”“他总仍旧是偷。这一回，是自己发昏，竟偷到丁举人家里去了。...,b2c9b122d0e796bb26d85487fdaae53a,[129084b0fa1815780605fa5c38c85b77],600
6,138b67752488e94c5e5a9d788efce181,��要好。”掌柜仍然同平常一样，笑着对他说，“孔乙己，你又偷了东西了!”但他这回却不十分分辩...,138b67752488e94c5e5a9d788efce181,[129084b0fa1815780605fa5c38c85b77],408


In [97]:
# Load the create_base_extracted_entities artifact and display it. Its
# "entity_graph" column is read as a GraphML string by the next cell.
df31 = pd.read_parquet(path_prefix + "create_base_extracted_entities.parquet")
df31

Unnamed: 0,entity_graph
0,"<graphml xmlns=""http://graphml.graphdrawing.or..."


In [98]:
# Build the networkx graph and pyvis network from the first row's
# "entity_graph" value, then display the graph object's repr.
graph31,net_vis31 = visualize_graph(df31["entity_graph"][0])
graph31

<networkx.classes.graph.Graph at 0x1d312572440>

In [99]:
# Render the extracted-entities network to an HTML file in the artifacts directory.
net_vis31.show(path_prefix + "create_base_extracted_entities.html")

./kongyiji/output/kongyiji/artifacts/create_base_extracted_entities.html


In [100]:
# Load the create_summarized_entities artifact and display it. Its
# "entity_graph" column is read as a GraphML string by the next cell.
df32 = pd.read_parquet(path_prefix + "create_summarized_entities.parquet")
df32

Unnamed: 0,entity_graph
0,"<graphml xmlns=""http://graphml.graphdrawing.or..."


In [101]:
# Build the networkx graph and pyvis network from the first row's
# "entity_graph" value, then display the graph object's repr.
graph32,net_vis32 = visualize_graph(df32["entity_graph"][0])
graph32

<networkx.classes.graph.Graph at 0x1d31253f7c0>

In [102]:
# Render the summarized-entities network to an HTML file in the artifacts directory.
net_vis32.show(path_prefix + "create_summarized_entities.html")

./kongyiji/output/kongyiji/artifacts/create_summarized_entities.html


In [103]:
# Load the create_base_entity_graph artifact and display it. Each row's
# "clustered_graph" value is visualized separately in the cells below.
df33 = pd.read_parquet(path_prefix + "create_base_entity_graph.parquet")
df33

Unnamed: 0,level,clustered_graph
0,0,"<graphml xmlns=""http://graphml.graphdrawing.or..."
1,1,"<graphml xmlns=""http://graphml.graphdrawing.or..."


In [104]:
# Visualize the "clustered_graph" value at index label 0 (shown as level 0 in
# the saved table output), then display the graph object's repr.
graph330,net_vis330 = visualize_graph(df33["clustered_graph"][0])
graph330

<networkx.classes.graph.Graph at 0x1d31249e350>

In [105]:
# Render the index-0 clustered graph to its own HTML file (suffix "_0").
net_vis330.show(path_prefix + "create_base_entity_graph_0.html")

./kongyiji/output/kongyiji/artifacts/create_base_entity_graph_0.html


In [106]:
# Visualize the "clustered_graph" value at index label 1 (shown as level 1 in
# the saved table output), then display the graph object's repr.
graph331,net_vis331 = visualize_graph(df33["clustered_graph"][1])
graph331

<networkx.classes.graph.Graph at 0x1d312587910>

In [107]:
# Render the index-1 clustered graph to its own HTML file (suffix "_1").
net_vis331.show(path_prefix + "create_base_entity_graph_1.html")

./kongyiji/output/kongyiji/artifacts/create_base_entity_graph_1.html


In [108]:
# Load the create_final_entities artifact and display it.
df34 = pd.read_parquet(path_prefix + "create_final_entities.parquet")
df34

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,"""鲁镇""","""GEO""",鲁镇是一个充满浓厚地方特色和独特社会结构的背景小镇。在这个地方，酒店的布局独具一格，反映了当...,0,,"[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.055016759783029556, 0.031515203416347504, ..."
1,4119fd06010c494caa07f439b333f4c5,"""咸亨酒店""","""ORGANIZATION""",咸亨酒店是鲁镇一家著名的酒店，既是故事发生的具体地点，也是背景场所。它是一个充满人间百态和市...,1,,"[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.012898093089461327, 0.013879787176847458, ..."
2,d3835bf3dda84ead99deadbeac5d0d7d,"""短衣帮""","""PERSON""","""短衣帮""是指一群做工的普通劳动者，他们通常在酒店外面站着喝酒，消费较为节俭。这些劳动者与穿...",2,,"[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.06994222104549408, 0.003092781640589237, -..."
3,077d2820ae1845bcbb1803379a3d1eae,"""长衫主顾""","""PERSON""","""长衫主顾""是指那些穿着长衫的顾客，他们通常较为富裕，具有较高的消费能力。在酒店环境中，长衫...",3,,"[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.0612790621817112, -0.025796692818403244, -..."
4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,"""我""","""PERSON""","""我是咸亨酒店的一名伙计，从十二岁起在酒店工作，负责温酒等事务。""",4,,[1fae6f3d55b5cb0dba1f89f668218cd2],"[-0.024990806356072426, 0.0028743341099470854,..."
5,19a7f254a5d64566ab5cc15472df02de,"""掌柜""","""PERSON""",掌柜是咸亨酒店的老板或管理者，负责监督酒店的运营、员工的工作以及日常管理事务，包括结账等。他...,5,,"[07730a19b8110138a785adc20fce8020, 138b6775248...","[-0.017183413729071617, -0.029239201918244362,..."
6,e7ffaee9d31d4d3c96e04f911d0a8f9e,"""丁举人""","""PERSON""",丁举人是鲁镇一个有地位、有权势且富有影响力的人物。他通过科举考试获得了功名，成为社会上层的一...,6,,"[07730a19b8110138a785adc20fce8020, 1fae6f3d55b...","[-0.028294086456298828, 0.04944520816206932, -..."
7,f7e11b0e297a44a896dc67928368f600,"""孩子们""","""PERSON""","""孩子们是鲁镇的一部分居民，他们有时会在咸亨酒店外面聚集，观察或参与酒店内外的活动。""",7,,[1fae6f3d55b5cb0dba1f89f668218cd2],"[-0.013658163137733936, 0.014908626675605774, ..."
8,1fd3fa8bb5a2408790042ab9573779ee,"""酒客""","""PERSON""","""酒客是指在咸亨酒店喝酒的顾客，包括短衣帮和长衫主顾等不同群体。""",8,,[1fae6f3d55b5cb0dba1f89f668218cd2],"[-0.05934860184788704, -0.005974501371383667, ..."
9,27f9fbe6ad8c4a8b9acee0d3596ed57c,"""碗碟""","""OBJECT""","""碗碟是咸亨酒店内用于盛放食物和酒水的器皿。""",9,,[1fae6f3d55b5cb0dba1f89f668218cd2],"[0.005624593701213598, 0.017494555562734604, -..."


In [109]:
# Load the create_final_nodes artifact and display it.
df35 = pd.read_parquet(path_prefix + "create_final_nodes.parquet")
df35

Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,top_level_node_id,x,y
0,0,"""鲁镇""","""GEO""",鲁镇是一个充满浓厚地方特色和独特社会结构的背景小镇。在这个地方，酒店的布局独具一格，反映了当...,"07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,2,0,b45241d70f0e43fca764df95b2b81f77,2,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,"""咸亨酒店""","""ORGANIZATION""",咸亨酒店是鲁镇一家著名的酒店，既是故事发生的具体地点，也是背景场所。它是一个充满人间百态和市...,"07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,14,1,4119fd06010c494caa07f439b333f4c5,14,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,"""短衣帮""","""PERSON""","""短衣帮""是指一群做工的普通劳动者，他们通常在酒店外面站着喝酒，消费较为节俭。这些劳动者与穿...","07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,3,2,d3835bf3dda84ead99deadbeac5d0d7d,3,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,"""长衫主顾""","""PERSON""","""长衫主顾""是指那些穿着长衫的顾客，他们通常较为富裕，具有较高的消费能力。在酒店环境中，长衫...","07730a19b8110138a785adc20fce8020,1fae6f3d55b5c...",1,2,3,077d2820ae1845bcbb1803379a3d1eae,2,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,"""我""","""PERSON""","""我是咸亨酒店的一名伙计，从十二岁起在酒店工作，负责温酒等事务。""",1fae6f3d55b5cb0dba1f89f668218cd2,1,2,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,2,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,1,"""笔墨纸砚""","""ITEM""","""笔墨纸砚是文人常用的书写工具，孔乙己曾用它们写字教人知识。""",07730a19b8110138a785adc20fce8020,,1,32,85c79fd84f5e4f918471c386852204c5,1,,85c79fd84f5e4f918471c386852204c5,0,0
70,1,"""秀才""","""PERSON""","""秀才是科举制度中的一种功名，孔乙己曾经努力考取但未成功。""",07730a19b8110138a785adc20fce8020,,1,33,eae4259b19a741ab9f9f6af18c4a0470,1,,eae4259b19a741ab9f9f6af18c4a0470,0,0
71,1,"""初冬""","""GEO""","""初冬是一个季节描述，表示故事发生的季节背景。""",b2c9b122d0e796bb26d85487fdaae53a,,1,34,3138f39f2bcd43a69e0697cd3b05bc4d,1,,3138f39f2bcd43a69e0697cd3b05bc4d,0,0
72,1,"""年关""","""EVENT""","""年关是指每年年底的时期，在文中提到两次，用于描述掌柜对孔乙己欠款的提及。""",138b67752488e94c5e5a9d788efce181,,1,35,dde131ab575d44dbb55289a6972be18f,1,,dde131ab575d44dbb55289a6972be18f,0,0


In [110]:
# Load the create_final_communities artifact and display it.
df41 = pd.read_parquet(path_prefix + "create_final_communities.parquet")
df41

Unnamed: 0,id,title,level,raw_community,relationship_ids,text_unit_ids
0,1,Community 1,0,1,"[e2bf260115514fb3b252fd879fb3e7be, b462b94ce47...","[07730a19b8110138a785adc20fce8020,1fae6f3d55b5..."
1,0,Community 0,0,0,"[b785a9025069417f94950ad231bb1441, 3b6cd96a273...","[07730a19b8110138a785adc20fce8020,138b67752488..."
2,3,Community 3,1,3,"[e2bf260115514fb3b252fd879fb3e7be, b462b94ce47...","[07730a19b8110138a785adc20fce8020,1fae6f3d55b5..."
3,2,Community 2,1,2,"[17ed1d92075643579a712cc6c29e8ddb, 3ce7c210a21...","[07730a19b8110138a785adc20fce8020,1fae6f3d55b5..."


In [111]:
# Load the join_text_units_to_entity_ids artifact and display it.
df23 = pd.read_parquet(path_prefix + "join_text_units_to_entity_ids.parquet")
df23

Unnamed: 0,text_unit_ids,entity_ids,id
0,07730a19b8110138a785adc20fce8020,"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...",07730a19b8110138a785adc20fce8020
1,1fae6f3d55b5cb0dba1f89f668218cd2,"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...",1fae6f3d55b5cb0dba1f89f668218cd2
2,355e705881fd7c83e813a83d24db74b7,"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...",355e705881fd7c83e813a83d24db74b7
3,5d3d851e7c8a5cea251f64785f55435d,"[4119fd06010c494caa07f439b333f4c5, 19a7f254a5d...",5d3d851e7c8a5cea251f64785f55435d
4,dee20f36e11c4185c682fd582d02ed69,"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...",dee20f36e11c4185c682fd582d02ed69
5,138b67752488e94c5e5a9d788efce181,"[19a7f254a5d64566ab5cc15472df02de, 9646481f66c...",138b67752488e94c5e5a9d788efce181
6,b2c9b122d0e796bb26d85487fdaae53a,"[19a7f254a5d64566ab5cc15472df02de, e7ffaee9d31...",b2c9b122d0e796bb26d85487fdaae53a


In [112]:
# Load the create_final_relationships artifact and display it.
df36 = pd.read_parquet(path_prefix + "create_final_relationships.parquet")
df36

Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""鲁镇""","""咸亨酒店""",1.0,"""咸亨酒店位于鲁镇，是当地居民日常生活中的一部分，反映了鲁镇的社会结构。""",[1fae6f3d55b5cb0dba1f89f668218cd2],e2bf260115514fb3b252fd879fb3e7be,0,2,14,16
1,"""鲁镇""","""酒店""",1.0,"""酒店位于鲁镇，是镇上的一个重要社交场所。""",[07730a19b8110138a785adc20fce8020],b462b94ce47a4b8c8fffa33f7242acec,1,2,4,6
2,"""咸亨酒店""","""短衣帮""",1.0,"""短衣帮是咸亨酒店的主要顾客之一，他们在酒店外面喝酒，形成了酒店的一种典型景象。""",[1fae6f3d55b5cb0dba1f89f668218cd2],17ed1d92075643579a712cc6c29e8ddb,2,14,3,17
3,"""咸亨酒店""","""长衫主顾""",1.0,"""长衫主顾是咸亨酒店的重要顾客，他们在店内享受更高档次的服务，受到酒店掌柜的特别关注。""",[1fae6f3d55b5cb0dba1f89f668218cd2],3ce7c210a21b4deebad7cc9308148d86,3,14,2,16
4,"""咸亨酒店""","""我""",1.0,"""我在咸亨酒店担任伙计，负责温酒等事务，经历了酒店内外的不同顾客群体。""",[1fae6f3d55b5cb0dba1f89f668218cd2],d64ed762ea924caa95c8d06f072a9a96,4,14,2,16
5,"""咸亨酒店""","""掌柜""",1.0,"""掌柜是咸亨酒店的管理者，负责监督酒店的运营和员工的工作。""",[1fae6f3d55b5cb0dba1f89f668218cd2],adf4ee3fbe9b4d0381044838c4f889c8,5,14,3,17
6,"""咸亨酒店""","""孩子们""",1.0,"""孩子们常常在咸亨酒店外面玩耍或围观，增加了酒店周围的活力。""",[1fae6f3d55b5cb0dba1f89f668218cd2],32ee140946e5461f9275db664dc541a5,6,14,1,15
7,"""咸亨酒店""","""酒客""",1.0,"""酒客们构成了咸亨酒店的主要顾客群体，他们的行为和习惯反映了鲁镇的社会风貌。""",[1fae6f3d55b5cb0dba1f89f668218cd2],c160b9cb27d6408ba6ab20214a2f3f81,7,14,1,15
8,"""咸亨酒店""","""碗碟""",1.0,"""碗碟是咸亨酒店日常运营中不可或缺的用具，用于服务顾客。""",[1fae6f3d55b5cb0dba1f89f668218cd2],23527cd679ff4d5a988d52e7cd056078,8,14,1,15
9,"""咸亨酒店""","""茴香豆""",1.0,"""茴香豆是咸亨酒店提供给顾客的一种受欢迎的小吃。""",[1fae6f3d55b5cb0dba1f89f668218cd2],f1c6eed066f24cbdb376b910fce29ed4,9,14,2,16


In [113]:
# Load the join_text_units_to_relationship_ids artifact and display it.
df24 = pd.read_parquet(path_prefix + "join_text_units_to_relationship_ids.parquet")
df24

Unnamed: 0,id,relationship_ids
0,1fae6f3d55b5cb0dba1f89f668218cd2,"[e2bf260115514fb3b252fd879fb3e7be, 17ed1d92075..."
1,07730a19b8110138a785adc20fce8020,"[b462b94ce47a4b8c8fffa33f7242acec, de6fa244808..."
2,355e705881fd7c83e813a83d24db74b7,"[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
3,dee20f36e11c4185c682fd582d02ed69,"[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
4,5d3d851e7c8a5cea251f64785f55435d,"[6fae5ee1a831468aa585a1ea09095998, b785a902506..."
5,138b67752488e94c5e5a9d788efce181,"[b785a9025069417f94950ad231bb1441, 56d0e5ebe79..."
6,b2c9b122d0e796bb26d85487fdaae53a,"[b785a9025069417f94950ad231bb1441, 3b6cd96a273..."


In [114]:
# Load the create_final_community_reports artifact and display it.
# NOTE(review): in the saved output every report is titled "Tech Innovators
# Community ...", which does not match the kongyiji.txt input shown in the
# other tables. Presumably the report-generation step produced off-topic
# content; verify the indexing prompts/model before relying on these reports.
df42 = pd.read_parquet(path_prefix + "create_final_community_reports.parquet")
df42

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,2,# Tech Innovators Community: Key Entities and ...,1,7.3,Tech Innovators Community: Key Entities and Re...,The community's significant influence on the t...,The Tech Innovators Community is a network of ...,[{'explanation': 'StartUp Hub serves as an inc...,"{\n ""title"": ""Tech Innovators Community: Ke...",245d133b-86be-4fdf-a4c7-7752fe3f63b3
1,3,# Tech Innovators Community: Key Entities and ...,1,7.8,Tech Innovators Community: Key Entities and Th...,The rating reflects the community's significan...,The Tech Innovators Community is a network of ...,[{'explanation': 'Several startups within the ...,"{\n ""title"": ""Tech Innovators Community: Ke...",5198f845-da01-4d4c-9220-3cfa569adcf5
2,0,# Tech Innovators Community: Key Entities and ...,0,7.8,Tech Innovators Community: Key Entities and Th...,The rating reflects the community's significan...,The Tech Innovators Community is a network of ...,[{'explanation': 'Several startups within the ...,"{\n ""title"": ""Tech Innovators Community: Ke...",34c1436b-6ed9-4ccc-ab83-669e1fdd6f28
3,1,# Tech Innovators Community: Key Entities and ...,0,7.8,Tech Innovators Community: Key Entities and Th...,The rating reflects the community's significan...,The Tech Innovators Community is a network of ...,[{'explanation': 'Several startups within the ...,"{\n ""title"": ""Tech Innovators Community: Ke...",4173888b-23b3-4d09-a636-b44b0fe8e340


In [115]:
# Load the create_final_text_units artifact and display it.
df22 = pd.read_parquet(path_prefix + "create_final_text_units.parquet")
df22

Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,1fae6f3d55b5cb0dba1f89f668218cd2,鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,600,[129084b0fa1815780605fa5c38c85b77],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[e2bf260115514fb3b252fd879fb3e7be, 17ed1d92075..."
1,355e705881fd7c83e813a83d24db74b7,了这事。幸亏荐头的情面大，辞退不得，便改为专管温酒的一种无聊职务了。\n\n我从此便整天的站...,600,[129084b0fa1815780605fa5c38c85b77],"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...","[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
2,dee20f36e11c4185c682fd582d02ed69,东西了!”孔乙己睁大眼睛说，“你怎么这样凭空污人清白……”“什么清白?我前天亲眼见你偷了何家...,600,[129084b0fa1815780605fa5c38c85b77],"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...","[de6fa24480894518ab3cbcb66f739266, ef32c4b208d..."
3,5d3d851e7c8a5cea251f64785f55435d,渐复了原，旁人便又问道，“孔乙己，你当真认识字么?”孔乙己看着问他的人，显出不屑置辩的神气。...,600,[129084b0fa1815780605fa5c38c85b77],"[4119fd06010c494caa07f439b333f4c5, 19a7f254a5d...","[6fae5ee1a831468aa585a1ea09095998, b785a902506..."
4,07730a19b8110138a785adc20fce8020,�的答他道，“谁要你教，不是草头底下一个来回的回字么?”孔乙己显出极高兴的样子，将两个指头的...,600,[129084b0fa1815780605fa5c38c85b77],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[b462b94ce47a4b8c8fffa33f7242acec, de6fa244808..."
5,b2c9b122d0e796bb26d85487fdaae53a,…他打折了腿了。”掌柜说，“哦!”“他总仍旧是偷。这一回，是自己发昏，竟偷到丁举人家里去了。...,600,[129084b0fa1815780605fa5c38c85b77],"[19a7f254a5d64566ab5cc15472df02de, e7ffaee9d31...","[b785a9025069417f94950ad231bb1441, 3b6cd96a273..."
6,138b67752488e94c5e5a9d788efce181,��要好。”掌柜仍然同平常一样，笑着对他说，“孔乙己，你又偷了东西了!”但他这回却不十分分辩...,408,[129084b0fa1815780605fa5c38c85b77],"[19a7f254a5d64566ab5cc15472df02de, 9646481f66c...","[b785a9025069417f94950ad231bb1441, 56d0e5ebe79..."


In [116]:
# Load the create_base_documents artifact and display it.
df11 = pd.read_parquet(path_prefix + "create_base_documents.parquet")
df11

Unnamed: 0,id,text_units,raw_content,title
0,129084b0fa1815780605fa5c38c85b77,"[1fae6f3d55b5cb0dba1f89f668218cd2, 355e705881f...",鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,kongyiji.txt


In [117]:
# Load the create_final_documents artifact and display it.
df12 = pd.read_parquet(path_prefix + "create_final_documents.parquet")
df12

Unnamed: 0,id,text_unit_ids,raw_content,title
0,129084b0fa1815780605fa5c38c85b77,"[1fae6f3d55b5cb0dba1f89f668218cd2, 355e705881f...",鲁镇的酒店的格局，是和别处不同的：都是当街一个尺形的大柜台，柜里面预备着热水，可以随时温酒。...,kongyiji.txt
