# **Neo4j 데이터베이스 연결**

In [None]:
!pip install neo4j

from google.colab import drive
import os

# Mount Google Drive so the notebook can read files stored there.
drive.mount('/content/drive')

# Change the working directory.
# The literal below is a placeholder ("specify the working directory");
# replace it with a real path before running.
os.chdir('워킹 디렉토리 지정')

# Folder that holds the GraphRAG parquet outputs read by the import cells.
# The literal is a placeholder ("specify the output directory").
GRAPHRAG_FOLDER = '아웃풋 디렉토리 지정'

In [None]:
from neo4j import GraphDatabase
import pandas as pd
import time

# Enter the connection details of your actual Neo4j instance here.
NEO4J_URI = "neo4j+s://..."
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

# Create the shared driver. batched_import and the constraint setup both call
# driver.execute_query, so this object must exist before those cells run.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

# **배치 임포트 함수와 제약 조건**

In [None]:
def batched_import(statement, df, batch_size=1000):
    """
    Push the rows of a dataframe into Neo4j one slice at a time.

    Every slice is passed as the ``$rows`` query parameter and unwound into a
    per-row ``value`` variable that ``statement`` can refer to. The update
    counters of each batch are printed, followed by the total elapsed time.

    Parameters: statement is the Cypher fragment run for every row, df is the
    dataframe holding the rows, and batch_size is the maximum number of rows
    sent per query.

    Returns the number of rows in df.
    """
    row_count = len(df)
    began_at = time.time()

    for offset in range(0, row_count, batch_size):
        # iloc slicing clips at the end of the frame, so the last slice may
        # simply be shorter than batch_size.
        records = df.iloc[offset : offset + batch_size].to_dict("records")
        cypher = "UNWIND $rows AS value " + statement
        outcome = driver.execute_query(
            cypher,
            rows=records,
            database_=NEO4J_DATABASE,
        )
        print(outcome.summary.counters)

    elapsed = time.time() - began_at
    print(f"{row_count} rows in {elapsed} s.")
    return row_count

# Create the uniqueness constraints before importing data.
# Each statement uses "if not exists", so re-running this cell is harmless.
statements = [
    "\ncreate constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique",
    "\ncreate constraint document_id if not exists for (d:__Document__) require d.id is unique",
    "\ncreate constraint community_id if not exists for (c:__Community__) require c.community is unique",
    "\ncreate constraint entity_id if not exists for (e:__Entity__) require e.id is unique",
    "\ncreate constraint entity_title if not exists for (e:__Entity__) require e.name is unique",
    "\ncreate constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique",
    "\ncreate constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique",
    "\n",
]

for statement in statements:
    # Skip empty entries such as the trailing "\n" element.
    if (statement or "").strip():
        print(statement)
        # Target the same database as batched_import; without database_ the
        # constraints would be created in the user's home database instead.
        driver.execute_query(statement, database_=NEO4J_DATABASE)

# **GraphRAG Neo4j Aura 연동**

In [None]:
# Load the document table from the GraphRAG output, keeping only the columns
# that the Cypher statement below reads.
doc_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_documents.parquet", columns=["id", "title"]
)

# Merge document nodes: one __Document__ node per id, with the row's title
# copied onto it. `value` is the per-row variable bound by the
# "UNWIND $rows AS value" prefix that batched_import prepends.
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

In [None]:
# Import text units (chunks).
text_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_text_units.parquet',
                          columns=["id", "text", "n_tokens", "document_ids"])

# Merge one __Chunk__ node per id, copy text and n_tokens onto it, then link
# the chunk with PART_OF to every __Document__ whose id is in document_ids.
# The MATCH means ids without an existing document node create no link.
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

In [None]:
# Import entities.
entity_df = pd.read_parquet(
    f'{GRAPHRAG_FOLDER}/create_final_entities.parquet',
    columns=["title", "type", "description", "human_readable_id", "id", "text_unit_ids"]
)

# Cypher query run once per row (`value` is bound by batched_import's UNWIND):
# - merge one __Entity__ node per id and set its properties; the name is the
#   title with double quotes stripped, falling back to 'Unknown' when null
# - add the entity type (quotes stripped, UpperCamelCase) as an extra label,
#   unless the type is missing or empty
# - link every existing __Chunk__ listed in text_unit_ids with HAS_ENTITY
statement = """
MERGE (e:__Entity__ {id: value.id})
SET e.human_readable_id = value.human_readable_id,
    e.description = value.description,
    e.name = coalesce(replace(value.title, '"', ''), 'Unknown')
WITH e, value
CALL apoc.create.addLabels(e, CASE WHEN coalesce(value.type, "") = "" THEN [] ELSE
[apoc.text.upperCamelCase(replace(value.type, '"', ''))] END) YIELD node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id: text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

# Run the import.
batched_import(statement, entity_df)

In [None]:
# Import relationships.
rel_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',
                         columns=["source", "target", "id", "combined_degree", "weight",
                                  "human_readable_id", "description", "text_unit_ids"])

# The Cypher below reads `.rank`, so expose combined_degree under that name.
# rename returns a new dataframe, so the result must be assigned back.
rel_df = rel_df.rename(columns={"combined_degree": "rank"})

# Look up both endpoint entities by name (double quotes stripped, matching how
# entity names are stored), merge a RELATED relationship keyed by id, and copy
# the listed row properties onto it. Rows whose source or target entity does
# not exist produce no relationship because of the MATCH clauses.
rel_statement = """
  MATCH (source: __Entity__ {name:replace(value.source, '"', '')})
  MATCH (target: __Entity__ {name:replace(value.target, '"', '')})
  MERGE (source)-[rel:RELATED {id: value.id}]->(target)
  SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
  RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

In [None]:
# Import communities.
community_df = pd.read_parquet(
    f'{GRAPHRAG_FOLDER}/create_final_communities.parquet',
    columns=["id", "level", "title", "text_unit_ids", "relationship_ids"]
)

# Per row (`value` is bound by batched_import's UNWIND):
# - merge one __Community__ node keyed by the row's title and set title/level
# - link the community to every existing __Chunk__ in text_unit_ids (HAS_CHUNK)
# - for every RELATED relationship whose id is in relationship_ids, link both
#   endpoint entities to the community with IN_COMMUNITY
# "WITH distinct c, value" collapses the rows fanned out by the first UNWIND
# before the second UNWIND runs.
statement = """
MERGE (c:__Community__ {community: value.title})
SET c.title = value.title,
    c.level = value.level
WITH c, value
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id: text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id: rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURN count(distinct c) as createdCommunities
"""

batched_import(statement, community_df)