In [7]:
GRAPHRAG_FOLDER="ragtest/output"

In [2]:
import pandas as pd
from neo4j import GraphDatabase
import time

In [3]:
NEO4J_URI="bolt://localhost"
NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="password"
NEO4J_DATABASE="neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [4]:
def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a batched approach.
    Parameters: statement is the Cypher query to execute, df is the dataframe to import, and batch_size is the number of rows to import in each batch.
    """
    total = len(df)
    start_s = time.time()
    for start in range(0,total, batch_size):
        batch = df.iloc[start: min(start+batch_size,total)]
        result = driver.execute_query("UNWIND $rows AS value " + statement, 
                                      rows=batch.to_dict('records'),
                                      database_=NEO4J_DATABASE)
        print(result.summary.counters)
    print(f'{total} rows in { time.time() - start_s} s.')    
    return total

In [5]:
# create constraints, idempotent operation

statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint entity_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.name is unique;
create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


In [8]:
doc_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_documents.parquet', columns=["id", "title"])
doc_df.head(2)

Unnamed: 0,id,title
0,c305886e4aa2f6efcf64b57762777055,book.txt


In [9]:
# import documents
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

{'_contains_updates': True, 'labels_added': 1, 'nodes_created': 1, 'properties_set': 2}
1 rows in 0.6029026508331299 s.


1

In [10]:
text_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_text_units.parquet',
                          columns=["id","text","n_tokens","document_ids"])
text_df.head(2)

Unnamed: 0,id,text,n_tokens,document_ids
0,d6583840046247f428a9f02738842a7c,﻿The Project Gutenberg eBook of A Christmas Ca...,1200,[c305886e4aa2f6efcf64b57762777055]
1,10730234d6ccc7cee08f3cfc58d8a9a1,and thither in\n restless haste and moanin...,1200,[c305886e4aa2f6efcf64b57762777055]


In [11]:
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

{'_contains_updates': True, 'labels_added': 42, 'relationships_created': 42, 'nodes_created': 42, 'properties_set': 126}
42 rows in 0.3099355697631836 s.


42

In [12]:
entity_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_entities.parquet',
                            columns=["name","type","description","human_readable_id","id","description_embedding","text_unit_ids"])
entity_df.head(2)

Unnamed: 0,name,type,description,human_readable_id,id,description_embedding,text_unit_ids
0,BOB CRATCHIT,PERSON,"Bob Cratchit, an employee and clerk of Scrooge...",0,f9ff0bdf1ee5499f804930917bd49647,"[0.011679230257868767, 0.013168442994356155, -...","[04e5c071e4ee5496d5380662e1339f45, 6c362d3f8d0..."
1,MRS. CRATCHIT,PERSON,"Mrs. Cratchit, Bob's wife, is shown as a carin...",1,a008afeb4ec44a979d36a5b54c9bc0e4,"[0.041016917675733566, 0.016917483881115913, -...",[b4dec8fbe9f2a2c6a79d09c9484d15ae]


In [13]:
entity_statement = """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, .description, name:replace(value.name,'"','')}
WITH e, value
CALL db.create.setNodeVectorProperty(e, "description_embedding", value.description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

batched_import(entity_statement, entity_df)

{'_contains_updates': True, 'labels_added': 33, 'relationships_created': 44, 'nodes_created': 33, 'properties_set': 132}
33 rows in 0.4090137481689453 s.


33

In [14]:
rel_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',
                         columns=["source","target","id","rank","weight","human_readable_id","description","text_unit_ids"])
rel_df.head(2)

Unnamed: 0,source,target,id,rank,weight,human_readable_id,description,text_unit_ids
0,BOB CRATCHIT,MRS. CRATCHIT,dda00bff9e6c47df93ec2ed4ada803f2,8,9.0,0,Bob and Mrs. Cratchit are married and share th...,[b4dec8fbe9f2a2c6a79d09c9484d15ae]
1,BOB CRATCHIT,TINY TIM,c7a14df2ed8e499386aea5305d9a2be7,9,9.0,1,"Tiny Tim is Bob Cratchit's youngest son, and B...",[b4dec8fbe9f2a2c6a79d09c9484d15ae]


In [15]:
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

{'_contains_updates': True, 'relationships_created': 39, 'properties_set': 234}
39 rows in 0.16939187049865723 s.


39

In [16]:
community_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_communities.parquet', 
                     columns=["id","level","title","text_unit_ids","relationship_ids"])

community_df.head(2)

Unnamed: 0,id,level,title,text_unit_ids,relationship_ids
0,0,0,Community 0,"[04e5c071e4ee5496d5380662e1339f45,6c362d3f8d01...","[dda00bff9e6c47df93ec2ed4ada803f2, c7a14df2ed8..."
1,1,0,Community 1,"[4cf4deeb7f61acb7b7db4ce0e57fb1e6, 04e5c071e4e...","[a29bb480decf451e82f86b8b9898dca2, 6dee3660366..."


In [17]:
statement = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURN count(distinct c) as createdCommunities
"""

batched_import(statement, community_df)

{'_contains_updates': True, 'labels_added': 3, 'relationships_created': 26, 'nodes_created': 3, 'properties_set': 9}
3 rows in 0.14153146743774414 s.


3

In [18]:
community_report_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_community_reports.parquet',
                               columns=["id","community","level","title","summary", "findings","rank","rank_explanation","full_content"])
community_report_df.head(2)

Unnamed: 0,id,community,level,title,summary,findings,rank,rank_explanation,full_content
0,e90b3470-37ce-404b-ba62-c3b7c26d9eb7,0,0,The Cratchit Family and Their Christmas Spirit,"The Cratchit family, central to the narrative ...","[{'explanation': 'Tiny Tim, despite his frailt...",7.5,The impact severity rating is relatively high ...,# The Cratchit Family and Their Christmas Spir...
1,296ce131-e8d7-4305-9444-0e78c0a4bdeb,1,0,Scrooge's Transformation and the Spirit of Chr...,This report delves into the profound transform...,[{'explanation': 'Scrooge's character undergoe...,8.5,The high impact severity rating reflects the p...,# Scrooge's Transformation and the Spirit of C...


In [19]:
# import communities
community_statement = """
MERGE (c:__Community__ {community:value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id:finding_idx})
SET f += finding
"""
batched_import(community_statement, community_report_df)

{'_contains_updates': True, 'labels_added': 15, 'relationships_created': 15, 'nodes_created': 15, 'properties_set': 63}
3 rows in 0.11243891716003418 s.


3

In [20]:
"""
# cov_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_covariates.parquet')
# cov_df.head(2)
"""

"\n# cov_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_covariates.parquet')\n# cov_df.head(2)\n"

In [21]:
cov_statement = """
MERGE (c:__Covariate__ {id:value.id})
SET c += apoc.map.clean(value, ["text_unit_id", "document_ids", "n_tokens"], [NULL, ""])
WITH c, value
MATCH (ch:__Chunk__ {id: value.text_unit_id})
MERGE (ch)-[:HAS_COVARIATE]->(c)
"""
# batched_import(cov_statement, cov_df)