In [34]:
# Install modules
# Installs every package listed in requirements.txt, upgrading (-U) those that
# are already present. If pip reports updated packages, restart the kernel
# before running the remaining cells so the new versions are the ones imported.
%pip install -U -r requirements.txt

Collecting langgraph (from -r requirements.txt (line 3))
  Downloading langgraph-0.2.14-py3-none-any.whl.metadata (13 kB)
Collecting langgraph-checkpoint<2.0.0,>=1.0.2 (from langgraph->-r requirements.txt (line 3))
  Downloading langgraph_checkpoint-1.0.6-py3-none-any.whl.metadata (4.5 kB)
Downloading langgraph-0.2.14-py3-none-any.whl (87 kB)
Downloading langgraph_checkpoint-1.0.6-py3-none-any.whl (15 kB)
Installing collected packages: langgraph-checkpoint, langgraph
Successfully installed langgraph-0.2.14 langgraph-checkpoint-1.0.6
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Configure import paths.
# Appends the directory two levels above the current working directory to the
# module search path so that the `utils` import below can be resolved.
# NOTE(review): presumably that directory is the repository root -- confirm.
import sys
sys.path.append("../../")

# Initialize environment variables.
# NOTE(review): `initialize_environment` is defined outside this notebook; it
# presumably provides the credentials used by the OpenAI and Cassandra clients
# created in later cells -- confirm against `utils`.
from utils import initialize_environment
initialize_environment()

In [3]:
#@ GraphVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.graph_vectorstores.cassandra import CassandraGraphVectorStore
import cassio

# Initialize cassio before the store is created. `auto=True` lets cassio
# discover its connection settings by itself.
# NOTE(review): presumably these come from environment variables -- confirm.
cassio.init(auto=True)
# Graph vector store used by every later cell: content is embedded with OpenAI
# embeddings and nodes are kept in the "neighborhood_nodes" table.
store = CassandraGraphVectorStore(
    embedding = OpenAIEmbeddings(),
    node_table="neighborhood_nodes",
    # NOTE(review): units of the timeout are not visible here -- confirm.
    insert_timeout = 1000.0,
)


In [4]:
#@ Load Data Into the Graph VectorStore
# Loading is opt-in: anything other than an explicit "y"/"Y" answer skips it.
# The loader is imported lazily so it is only needed when data is loaded.
if input("load data (y/N): ").lower() != "y":
    print("Skipped loading data")
else:
    print("Loading data...")
    from datasets.wikimultihop.load import load_2wikimultihop
    load_2wikimultihop(store)

Skipped loading data


In [76]:
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import get_links
from typing import Iterable, List, Set, Tuple
import networkx as nx

def best_communities(graph: nx.DiGraph) -> Tuple[Set[str], ...] | None:
    """Return the Girvan-Newman partition of ``graph`` with the best modularity.

    Girvan-Newman yields successively finer partitions of the graph. They are
    scanned in order and the scan stops at the first partition whose
    modularity does not improve on the best one seen so far, so the result is
    the first local maximum of the modularity, not necessarily the global one.

    Args:
        graph: Directed graph whose nodes are the items to group.

    Returns:
        A tuple of node sets, one set per community. When the graph has no
        edge between two distinct nodes there is nothing to split, and every
        node is returned as its own single-node community (an empty tuple for
        an empty graph).
    """
    # Girvan-Newman works by removing edges between distinct nodes and ignores
    # self-loops. Without any such edge it cannot produce a partition for a
    # directed graph (it either fails or yields nothing), so answer directly.
    if all(source == target for source, target in graph.edges()):
        return tuple({node} for node in graph.nodes())

    top_modularity = float("-inf")
    top_partition = None
    for partition in nx.algorithms.community.girvan_newman(graph):
        partition_modularity = nx.algorithms.community.modularity(graph, partition)
        if partition_modularity > top_modularity:
            top_modularity = partition_modularity
            top_partition = partition
        else:
            # Modularity stopped increasing: keep the previous partition.
            break
    return top_partition

def communities(documents: Iterable[Document]) -> List[List[Document]]:
    """Group documents by community inferred from the links.

    A directed graph is built with one node per document ID. An edge goes from
    document A to document B whenever A carries an outgoing (or bidirectional)
    link whose ``(kind, tag)`` equals that of an incoming (or bidirectional)
    link on B. Communities are then detected on that graph.

    Args:
        documents: Documents to group. The iterable is consumed exactly once.

    Returns:
        One list of documents per community. When no community structure can
        be computed (no documents, no edges between documents, or no partition
        found), each document is returned in a list of its own.
    """
    graph = nx.DiGraph()

    # First pass -- register every document as a node and map each
    # (kind, tag) pair to the IDs of the documents it leads into.
    documents_by_id = {}
    documents_by_incoming = {}
    for document in documents:
        graph.add_node(document.id)
        documents_by_id[document.id] = document

        for link in get_links(document):
            if link.direction in ("in", "bidir"):
                documents_by_incoming.setdefault((link.kind, link.tag), set()).add(document.id)

    # Second pass -- add an edge from each document to every document that
    # one of its outgoing links leads into.
    for document in documents_by_id.values():
        for link in get_links(document):
            if link.direction in ("out", "bidir"):
                for target in documents_by_incoming.get((link.kind, link.tag), ()):
                    graph.add_edge(document.id, target)

    # Community detection needs edges to work with; skip it for an edge-less
    # graph, and tolerate it not finding any partition.
    partition = best_communities(graph) if graph.number_of_edges() > 0 else None
    if partition is None:
        return [[document] for document in documents_by_id.values()]

    return [
        [documents_by_id[node_id] for node_id in community]
        for community in partition
    ]

# Next Steps

1. Fetch many chunks, group by community, map summarization on communities, reduce to a single summary.
   (See https://python.langchain.com/v0.2/docs/tutorials/summarization/#orchestration-via-langgraph)
2. Demonstrate on a dataset / write text

In [9]:
# Create the retriever.
# For summarization, we request a higher `k`
# NOTE(review): `fetch_k` and `depth` presumably control the size of the
# candidate pool and how many link hops are followed -- confirm against the
# graph vector store documentation.
retriever = store.as_retriever(
    search_type = "mmr_traversal",
    search_kwargs = {
        "k": 20,
        "fetch_k": 50,
        "depth": 2,
        # "score_threshold": 0.2,
    },
)

# Smoke test: run one query and display the retrieved documents.
retriever.invoke("Tell me about Russia")

[Document(id='18521670', metadata={'content_id': '18521670', 'links': [Link(kind='href', direction='in', tag='18521670')]}, page_content='The environment of Russia'),
 Document(id='17447040', metadata={'content_id': '17447040', 'links': [Link(kind='href', direction='out', tag='25391'), Link(kind='href', direction='out', tag='21485652'), Link(kind='href', direction='out', tag='31750'), Link(kind='href', direction='in', tag='17447040'), Link(kind='href', direction='out', tag='21487063'), Link(kind='href', direction='out', tag='3457')]}, page_content='Russia-Sweden relations date back to the 10th century; when Swedish Vikings called Varangians founded new states that later evolved into Russia, Ukraine and Belarus.'),
 Document(id='3457', metadata={'content_id': '3457', 'links': [Link(kind='href', direction='out', tag='7884711'), Link(kind='href', direction='out', tag='380252'), Link(kind='href', direction='out', tag='2530445'), Link(kind='href', direction='out', tag='275297'), Link(kind='

In [88]:
# Create the communities and summarize

import operator
from typing import Annotated, List, Literal, TypedDict

from langchain.chains.combine_documents.reduce import (
    acollapse_docs,
    split_list_of_docs,
)
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langgraph.constants import Send
from langgraph.graph import END, START, StateGraph

# Token budget: `should_collapse` compares the summaries' total token count
# against it, and `collapse_summaries` uses it when splitting into batches.
token_max = 1000

# Chat model shared by the map chain, the reduce chain and `length_function`.
llm = ChatOpenAI(model="gpt-4o", temperature=0)


def length_function(documents: List[Document]) -> int:
    """Return the combined token count of the documents' page contents."""
    total_tokens = 0
    for document in documents:
        total_tokens += llm.get_num_tokens(document.page_content)
    return total_tokens


# This will be the overall state of the main graph.
# It will contain the question, the input documents grouped by community,
# the corresponding summaries, and a final summary.
class OverallState(TypedDict):
    # The question the summaries should answer.
    question: str
    # Input documents, one inner list per community.
    communities: List[List[Document]]
    # Notice here we use the operator.add
    # This is because we want to combine all the summaries we generate
    # from individual nodes back into one list - this is essentially
    # the "reduce" part
    summaries: Annotated[list, operator.add]
    # The summaries wrapped as Documents; overwritten by each collapse step.
    collapsed_summaries: List[Document]
    # Result produced by `generate_final_summary`.
    final_summary: str


# This will be the state of the node that we will "map" each
# community to in order to generate summaries
class SummaryState(TypedDict):
    # Page contents of the documents belonging to one community.
    content: List[str]
    # The question the summary should answer.
    question: str

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the "map" step: describe and summarize one community's documents
# with respect to the question. `{content}` and `{question}` are filled from
# the SummaryState passed to `generate_summary`.
map_template = """
The following is a set of documents:
{content}

First, describe the common topic in the documents.

Then, write a concise summary of that topic based on the documents related to the question:
{question}
"""

# The whole template is sent as a single human message.
map_prompt = ChatPromptTemplate.from_messages(
    [("human", map_template)]
)

# Prompt -> chat model -> plain string output.
map_chain = map_prompt | llm | StrOutputParser()

# Map step: summarize the documents of a single community.
async def generate_summary(state: SummaryState):
    """Run the map chain on one community's state and return its summary.

    The summary is returned as a one-element list under "summaries".
    """
    return {"summaries": [await map_chain.ainvoke(state)]}


# Fan-out logic for the map step; used as a conditional edge in the graph.
def map_summaries(state: OverallState):
    """Build one `Send` per community, addressed to the "generate_summary" node.

    Each `Send` carries the page contents of the community's documents
    together with the question.
    """
    sends = []
    for community in state["communities"]:
        payload = {
            "content": [doc.page_content for doc in community],
            "question": state["question"],
        }
        sends.append(Send("generate_summary", payload))
    return sends


def collect_summaries(state: OverallState):
    """Wrap every generated summary in a Document under "collapsed_summaries"."""
    summary_documents = []
    for summary in state["summaries"]:
        summary_documents.append(Document(summary))
    return {"collapsed_summaries": summary_documents}

# Prompt for the "reduce" step: merge a set of summaries into one consolidated
# answer to the question. `{docs}` and `{question}` are supplied by
# `collapse_summaries` and `generate_final_summary`.
reduce_template = """
The following is a set of summaries:
{docs}
Take these and distill it into a final, consolidated summary
of the main themes answering the following question:
{question}
"""

# Built with `from_messages`, like the map prompt, so both prompts are created
# the same way and do not depend on the positional constructor form.
reduce_prompt = ChatPromptTemplate.from_messages(
    [("human", reduce_template)]
)

# Prompt -> chat model -> plain string output.
reduce_chain = reduce_prompt | llm | StrOutputParser()

# Node that shrinks the current summaries by reducing them batch by batch.
async def collapse_summaries(state: OverallState):
    """Split the collapsed summaries into batches and reduce each batch.

    Batches are built with `split_list_of_docs` using `length_function` and
    `token_max`; each batch is collapsed through the reduce chain, one batch
    after the other, and the results replace "collapsed_summaries".
    """
    def reduce_batch(docs):
        # Hands back the chain's coroutine for `acollapse_docs` to consume.
        return reduce_chain.ainvoke({
            "docs": docs,
            "question": state["question"],
        })

    batches = split_list_of_docs(
        state["collapsed_summaries"], length_function, token_max
    )
    collapsed = [await acollapse_docs(batch, reduce_batch) for batch in batches]
    return {"collapsed_summaries": collapsed}


# Conditional edge: keep collapsing while the summaries exceed the token
# budget, otherwise move on to the final summary.
def should_collapse(
    state: OverallState,
) -> Literal["collapse_summaries", "generate_final_summary"]:
    """Return the name of the next node based on the summaries' token count."""
    within_budget = length_function(state["collapsed_summaries"]) <= token_max
    return "generate_final_summary" if within_budget else "collapse_summaries"


# Final reduce step: turn the collapsed summaries into a single answer.
async def generate_final_summary(state: OverallState):
    """Run the reduce chain over the collapsed summaries and the question."""
    reduce_inputs = {
        "docs": state["collapsed_summaries"],
        "question": state["question"],
    }
    final_summary = await reduce_chain.ainvoke(reduce_inputs)
    return {"final_summary": final_summary}


# Construct the graph
# Nodes:
graph = StateGraph(OverallState)
graph.add_node("generate_summary", generate_summary)
graph.add_node("collect_summaries", collect_summaries)
graph.add_node("collapse_summaries", collapse_summaries)
graph.add_node("generate_final_summary", generate_final_summary)

# Edges:
# START fans out to one "generate_summary" run per community (see `map_summaries`).
graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
graph.add_edge("generate_summary", "collect_summaries")
# After collecting, and again after every collapse, `should_collapse` picks
# between another collapse and the final summary.
graph.add_conditional_edges("collect_summaries", should_collapse)
graph.add_conditional_edges("collapse_summaries", should_collapse)
graph.add_edge("generate_final_summary", END)

# Compiled, runnable form of the graph.
community_summarizer = graph.compile()

from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
# End-to-end chain: the input string is passed through both as the question
# and as the retriever query; the retrieved documents are grouped into
# communities, summarized by the compiled graph, and only the "final_summary"
# entry of the resulting state is returned.
summary_retriever = {
    "question": RunnablePassthrough(),
    "communities": RunnablePassthrough() | retriever | RunnableLambda(communities),
    } | community_summarizer | itemgetter("final_summary")
# Example run; the returned summary is displayed as the cell output.
await summary_retriever.ainvoke("Tell me about Russia")

'Russia, the largest country in the world, is known for its vast and diverse natural landscapes, ranging from expansive forests and tundras to mountain ranges and extensive river systems. The climate varies significantly from the frigid Arctic conditions in the north to more temperate climates in the south. Russia\'s ecosystems host a wide variety of flora and fauna, some unique to the region, but face significant environmental challenges such as pollution, deforestation, and climate change. Efforts are ongoing to address these issues and preserve the country\'s rich natural heritage.\n\nHistorically, Russia\'s origins trace back to the 10th century with the establishment of Kievan Rus\' by Swedish Vikings known as Varangians. Over the centuries, Russia expanded significantly, becoming a major part of the Russian Empire and later the Soviet Union, which existed from 1922 to 1991. The Soviet Union was a highly centralized one-party state governed by the Communist Party, playing a crucia