In [None]:
# Install modules
# Installs the packages listed in requirements.txt, upgrading any that are
# already present (-U). The %pip magic (rather than !pip) is used so the
# install targets the environment of the running kernel.
%pip install -U -r requirements.txt

In [None]:
# Configure import paths.
# Appends the directory two levels above the working directory to the module
# search path; this must happen before the `utils` import below.
# NOTE(review): assumes `utils` lives in that directory — confirm.
import sys
sys.path.append("../../")

# Initialize environment variables.
# NOTE(review): what `initialize_environment` sets is not visible here;
# presumably the credentials/settings the later cells rely on — confirm in utils.
from utils import initialize_environment
initialize_environment()

In [None]:
#@ GraphVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.graph_vectorstores.cassandra import CassandraGraphVectorStore
import cassio

# Initialize cassio before the store is constructed.
# NOTE(review): auto=True presumably picks up connection settings from the
# environment initialized in the previous cell — confirm against the cassio docs.
cassio.init(auto=True)
# Graph vector store shared by the later cells (the data-loading cell writes
# into it). Documents are embedded with OpenAI embeddings.
store = CassandraGraphVectorStore(
    embedding = OpenAIEmbeddings(),
    # Name passed as the node table for this notebook's data.
    node_table="neighborhood_nodes",
    # NOTE(review): units of this timeout are not visible here — confirm.
    insert_timeout = 1000.0,
)


In [None]:
#@ Load Data Into the Graph VectorStore
# Loading is opt-in: any answer other than "y" / "Y" skips it.
if input("load data (y/N): ").lower() != "y":
    print("Skipped loading data")
else:
    print("Loading data...")
    # Imported only on this branch, so the dataset loader is not required
    # when loading is skipped.
    from datasets.wikimultihop.load import load_2wikimultihop
    load_2wikimultihop(store)

In [None]:
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import get_links
from typing import Iterable

def communities(documents: Iterable[Document]) -> Iterable[Iterable[Document]]:
    """Group documents by community inferred from the links.

    A graph is built with one node per document ID. Two documents are
    connected when one has an outgoing (or bidirectional) link whose
    ``(kind, tag)`` matches an incoming (or bidirectional) link of the other.
    The graph is then partitioned with the Girvan-Newman algorithm.

    Args:
        documents: The documents to group. Any iterable is accepted,
            including a one-shot generator; it is materialized once up front.

    Returns:
        A list of communities, each a list of the documents belonging to it.
        The partition returned is the first level of the Girvan-Newman
        hierarchy. If no two documents are linked, every document forms its
        own community; an empty input yields an empty list.
    """
    import networkx as nx
    from networkx.algorithms.community.centrality import girvan_newman

    # The input is traversed twice below; materialize it so that a one-shot
    # iterator (e.g. a generator) is not already exhausted on the second pass.
    documents = list(documents)

    # Girvan-Newman partitions the undirected form of the graph, so build an
    # undirected graph directly. This also keeps the no-edges case on the
    # connected-components path, which is not available for directed graphs.
    graph = nx.Graph()

    # First pass -- map from (kind, tag) to the IDs of the nodes that have an
    # incoming link with that kind and tag.
    documents_by_id = {}
    ids_by_incoming = {}
    for document in documents:
        # Add the node to the graph, even if it ends up with no edges.
        graph.add_node(document.id)
        documents_by_id[document.id] = document

        # Record the incoming edges.
        for link in get_links(document):
            if link.direction in ("in", "bidir"):
                ids_by_incoming.setdefault((link.kind, link.tag), set()).add(document.id)

    # Second pass -- add an edge for each outgoing link that matches a
    # recorded incoming link.
    for document in documents:
        for link in get_links(document):
            if link.direction in ("out", "bidir"):
                for target in ids_by_incoming.get((link.kind, link.tag), ()):
                    # A bidirectional link always matches its own document.
                    # Self-loops carry no community information, and a graph
                    # whose only edges are self-loops would make Girvan-Newman
                    # yield no partition at all, so skip them.
                    if target != document.id:
                        graph.add_edge(document.id, target)

    # girvan_newman yields successive partitions (each a tuple of node sets),
    # not a single partition; take the first level and map node IDs back to
    # their documents.
    partition = next(girvan_newman(graph))
    return [
        [documents_by_id[node_id] for node_id in community]
        for community in partition
    ]

# Next Steps

1. Fetch many chunks, group them by community, map a summarization step over each community, then reduce the results to a single summary.
   (See https://python.langchain.com/v0.2/docs/tutorials/summarization/#orchestration-via-langgraph)
2. Demonstrate on a dataset and write the accompanying text.