In [3]:
!pip install langchain langchain-google-genai langchain-text-splitters langchain-community langchain-core langchain-huggingface sentence-transformers bs4 pypdf requests

Collecting langchain-google-genai
  Downloading langchain_google_genai-4.0.0-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.1.0-py3-none-any.whl.metadata (2.8 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting pypdf
  Downloading pypdf-6.4.1-py3-none-any.whl.metadata (7.1 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting langchain-core
  Downloading langchain_core-1.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests
  Down

In [2]:
import os
import bs4
import requests
from langchain.tools import tool
from langchain.agents import AgentState
from langchain.chat_models import init_chat_model
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.messages import MessageLikeRepresentation
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
# Source document ("PP No. 40 Th 1996") hosted on peraturan.bpk.go.id,
# and the local filename it is cached under.
pdf_url = "https://peraturan.bpk.go.id/Download/46205/PP%20No.%2040%20Th%201996.pdf"
pdf_path = "PP_No_40_Th_1996.pdf"

# Download only once; later runs reuse the local copy.
if not os.path.exists(pdf_path):
    print(f"Downloading PDF from {pdf_url}...")
    # Bound the wait so a stalled server cannot hang the notebook indefinitely.
    response = requests.get(pdf_url, timeout=60)
    # Fail *before* touching the disk: otherwise an HTTP error page would be
    # written to `pdf_path`, and the exists-check above would keep reusing
    # that broken file on every subsequent run.
    response.raise_for_status()
    with open(pdf_path, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded successfully to {pdf_path}")
else:
    print(f"PDF already exists at {pdf_path}")

Downloading PDF from https://peraturan.bpk.go.id/Download/46205/PP%20No.%2040%20Th%201996.pdf...
PDF downloaded successfully to PP_No_40_Th_1996.pdf


In [7]:
# --- Load ---------------------------------------------------------------
# Parse the cached PDF into LangChain documents (reported below as pages).
loader = PyPDFLoader(pdf_path)
docs = loader.load()
print(f"Loaded {len(docs)} pages from PDF")

# --- Split --------------------------------------------------------------
# Chunks are capped at chunk_size=1000 with chunk_overlap=200 shared between
# neighbouring chunks, so text cut at a boundary still appears whole in one.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(docs)

# --- Embed & index ------------------------------------------------------
# NOTE(review): the query used later in this notebook is Indonesian, while
# all-MiniLM-L6-v2 is presumably English-centric — confirm retrieval quality
# or consider a multilingual embedding model.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vector_store = InMemoryVectorStore(embeddings)

# Bind the return value to `_` so it is not echoed as the cell's output.
_ = vector_store.add_documents(documents=all_splits)

Loaded 64 pages from PDF


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
@tool(response_format="content_and_artifact")
def retrieve_context(query: str) -> tuple[str, list]:
    """Retrieve passages from the indexed regulation PDF that help answer a query.

    Args:
        query: Natural-language question or search phrase.

    Returns:
        A ``(content, artifact)`` pair, matching
        ``response_format="content_and_artifact"``: ``content`` is a text
        rendering of the hits (metadata and page text of each), and
        ``artifact`` is the list of retrieved documents themselves.
    """
    # Only the two closest chunks are returned to keep the tool output short.
    retrieved_docs = vector_store.similarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\nContent: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [9]:
def prompt_with_context(state: AgentState) -> list[MessageLikeRepresentation]:
    """Build the model input: a context-laden system message plus the history.

    The most recent user message is used as the search query against
    ``vector_store``; the page text of the hits is appended to the system
    instruction, which is then placed in front of the conversation messages.

    Args:
        state: Agent state; only ``state["messages"]`` is read.

    Returns:
        A list whose first element is the system message dict, followed by
        every message currently in ``state["messages"]``.

    Raises:
        IndexError: If ``state["messages"]`` is empty.
    """
    messages = list(state["messages"])

    # Default to the final message, then prefer the latest *human* turn.
    # After a tool call the final message is the tool's output rather than
    # the user's question, and searching on that text would retrieve context
    # for the wrong thing.
    last_query = messages[-1].text
    for message in reversed(messages):
        if getattr(message, "type", None) == "human":
            last_query = message.text
            break

    retrieved_docs = vector_store.similarity_search(last_query)

    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

    system_message = (
        "Anda adalah asisten hukum dan ahli geospasial yang membantu menjawab pertanyaan berdasarkan"
        f"\n\n{docs_content}"
    )

    return [{"role": "system", "content": system_message}, *messages]

# Tools offered to the agent (the explicit retrieval tool defined earlier).
tools = [retrieve_context]

In [11]:
# Resolve the Gemini API key without hardcoding it.
# `userdata` (Colab's secrets store) is imported here because no other cell
# imports it, so this cell would raise NameError on a fresh kernel. Outside
# Colab, fall back to the GOOGLE_API_KEY environment variable.
try:
    from google.colab import userdata
except ImportError:
    google_api_key = os.environ["GOOGLE_API_KEY"]
else:
    google_api_key = userdata.get('GOOGLE_API_KEY')

llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai", api_key=google_api_key)

In [12]:
# Checkpointer handed to the agent below; the streaming cell selects a
# conversation via `thread_id`. Presumably state is held in process memory
# only and is lost when the kernel restarts — confirm if persistence matters.
checkpointer = MemorySaver()

In [13]:
# Assemble the agent from the chat model, the tool list, the context-injecting
# prompt function, and the checkpointer.
# NOTE(review): this cell's recorded output shows a LangGraphDeprecatedSinceV10
# warning — `create_react_agent` has moved to `langchain.agents.create_agent`.
# Migrating also means checking how that API accepts a prompt; TODO confirm.
agent = create_react_agent(
    model=llm,
    tools=tools,
    prompt=prompt_with_context,  # called with the agent state to build the model input
    checkpointer=checkpointer
)

/tmp/ipython-input-1421255084.py:1: LangGraphDeprecatedSinceV10: create_react_agent has been moved to `langchain.agents`. Please update your import to `from langchain.agents import create_agent`. Deprecated in LangGraph V1.0 to be removed in V2.0.
  agent = create_react_agent(


In [14]:
# Test question (Indonesian). Rough English gloss: "Land that may be granted
# under Hak Guna Usaha is State land as referred to in which article?"
query = "Tanah yang dapat diberikan dengan Hak Guna Usaha adalah tanah Negara tersebut dalam pasal berapa?"

In [15]:
# The checkpointer keys conversations by thread_id; "1" is this demo's thread.
config = {"configurable": {"thread_id": "1"}}

# A single user turn carrying the question defined above.
user_turn = {"messages": [{"role": "user", "content": query}]}

# Each streamed item exposes a "messages" list; print its newest entry so the
# run can be followed message by message.
for snapshot in agent.stream(user_turn, config=config, stream_mode="values"):
    newest_message = snapshot["messages"][-1]
    newest_message.pretty_print()


Tanah yang dapat diberikan dengan Hak Guna Usaha adalah tanah Negara tersebut dalam pasal berapa?

[{'type': 'text', 'text': 'Tanah yang dapat diberikan dengan Hak Guna Usaha adalah tanah Negara tersebut dalam Pasal 4 ayat (1).', 'extras': {'signature': 'CtoDAXLI2nxjjySA0gCl4r2QnmJOeOXdxOGXoUq+geMhzXNhoUsLI9/ECb0IGD+eg7rHsWo+OQCzyEPzXECSaaYLJs60p+ffLVUtRsmlVtXiDs1thQXk6jiqjEM7nr6r6e7PLZADF6f7/AqfrSC1MqDQ8tiYax1j7kO++X8Ox6o8CD5Bm01RJJpadjxmWRTF/aWSBIEyUaZq0qSub1EPbPHIoB72e4+QH5L8lmnYyS3iDS6ozqxWqJxKlfkIhyP6Wox7Q5cP9Uh3kPMaigXXgSpJ5kwd8hoRhf3GYBS9d2CDsp6fgyVGMlw46qOjzBjvCYGvaadJ15a3uRhbFLQjWaok+C4ubNzrEbeJHU7bPNw+AZpjWCdASU+8+WubQZKRro++9D3hlIb5f/glJN8gsjkO1wMisF+LSUtVEfRVIzhR5+FJPG2x4ognGo+x0TxY3poF1Gbty20kfITEsMp4PYIDK3zm7DIHi0EHad/xil+flbbXGg+SYAj6yPFWZroSFepv8Vdty29EoGiAUkGkA+25I7WIGc3KxCfjVqIn0wfFbZAbf6UzDYvViu8ctuucitY/G7HAN9uXQPbAGNiNrM6KAFReepty3hYSxde4X8kxYz59FJzvBgYHrUJo'}}]
