# **RAG for KBLI**

## Import Libraries

In [None]:
import os
import json
import pandas as pd
from langchain.schema import Document, HumanMessage, AIMessage, SystemMessage
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers import MergerRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors.base import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_core.vectorstores import VectorStore  # for type hint
from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.embeddings import Embeddings


# Load API credentials from the environment instead of embedding them in the
# notebook. SECURITY: the literal keys that used to be assigned on these lines
# were exposed by being saved in source -- revoke and rotate them.
import getpass

# OPENAI_API_KEY is picked up by the OpenAI embedding and chat clients created
# further down. Ask for it (input hidden) only if it is not already exported.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

# PINECONE_API_KEY is not referenced by the Chroma-based pipeline visible in
# this notebook; export it in the shell/environment if a Pinecone-backed store
# is used elsewhere.

## Directory Setup

In [2]:
# Working directories: "vector_db" holds the persisted Chroma index and
# "chat_history" holds the JSON transcript of the conversation.
for _dirname in ("vector_db", "chat_history"):
    os.makedirs(_dirname, exist_ok=True)
history_file = os.path.join("chat_history", "history.json")

## History Utilities

In [3]:
# %%
def load_history():
    """Return the saved chat transcript, or an empty list if no file exists.

    The transcript is whatever JSON value is stored in ``history_file``
    (``save_history`` writes a list of ``{"type", "content"}`` dicts).
    """
    if not os.path.exists(history_file):
        return []
    with open(history_file, mode="r", encoding="utf-8") as fh:
        return json.load(fh)

# %%
def save_history(messages):
    """Write ``messages`` to ``history_file`` as pretty-printed UTF-8 JSON.

    Args:
        messages: JSON-serializable transcript (a list of
            ``{"type": ..., "content": ...}`` dicts as built by ``ask``).

    Raises:
        TypeError: if ``messages`` contains a value ``json`` cannot encode.
            The existing history file is left untouched in that case.
    """
    # Serialize BEFORE opening the file: open(..., "w") truncates immediately,
    # so encoding straight into the handle would wipe the stored history
    # whenever serialization fails part-way through.
    payload = json.dumps(messages, ensure_ascii=False, indent=2)
    with open(history_file, "w", encoding="utf-8") as f:
        f.write(payload)

# %%
def reset_history():
    """Replace the stored transcript with an empty list and confirm on stdout."""
    empty_transcript = []
    save_history(empty_transcript)
    print("Chat history has been reset.")

## Load and Embed Documents

In [None]:
# %%
# Read CSVs
# dtype=str keeps every code exactly as written in the file: with pandas'
# numeric inference a code such as "0111" becomes the integer 111 and loses
# its leading zero. keep_default_na=False returns "" (not NaN) for empty
# cells, so the .strip() calls below cannot fail on a float.
# df_kbji = pd.read_csv("dataset/kbji2014.csv", delimiter=",")
df_kbli = pd.read_csv(
    "dataset/kbli2020.csv",
    delimiter=",",
    dtype=str,
    keep_default_na=False,
)

# Build Document list
docs = []
# KBJI 2014 (currently disabled)
# for _, r in df_kbji.iterrows():
#     meta = {"digit": str(r.digit), "kode": str(r.kode), "judul": r.judul.strip()}
#     meta_str = "; ".join(f"{k}={v}" for k, v in meta.items())
#     content = f"METADATA: {meta_str}\nCONTENT: {str(r.deskripsi).strip()}"
#     docs.append(Document(page_content=content, metadata=meta))
# KBLI 2020: one Document per CSV row. The metadata is attached to the
# Document and also repeated inside the page text, so the code and title are
# embedded together with the description.
for r in df_kbli.itertuples(index=False):
    meta = {
        "kategori": r.kategori,
        "digit": r.digit.strip(),
        "kode": r.kode.strip(),
        "judul": r.judul.strip(),
    }
    meta_str = "; ".join(f"{k}={v}" for k, v in meta.items())
    content = f"METADATA: {meta_str}\nCONTENT: {r.deskripsi.strip()}"
    docs.append(Document(page_content=content, metadata=meta))

# %%
# Persist vector store
embed_model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)
# Chroma.from_documents() adds to whatever is already stored under
# persist_directory, so running it on every execution re-embeds the whole
# corpus (paid API calls) and stores duplicate copies that crowd distinct
# hits out of a small top-k. Open the existing store first and ingest only
# when it is still empty.
# NOTE: after changing the dataset, delete the "vector_db" directory to
# force a rebuild.
vectordb = Chroma(persist_directory="vector_db", embedding_function=embed_model)
if not vectordb.get(limit=1)["ids"]:
    vectordb = Chroma.from_documents(docs, embedding=embed_model, persist_directory="vector_db")
    # Explicit flush for older Chroma releases; newer ones persist
    # automatically and only emit a deprecation warning here.
    vectordb.persist()

  vectordb.persist()


## Custom Prompt Templates

In [6]:
# 4) --- Prompt templates ------------------------
# Answer format the model is told to follow; embedded at the top of the
# answering prompt below.
system_instructions = "\n".join([
    "Answer with:",
    "- **Kode:** <kode KBLI>",
    "- **Nama:** <classification name>",
    "- **Deskripsi:** <detailed description>",
    "If the question is unrelated to KBLI, say you cannot answer.",
])

# Rewrites a follow-up question into a standalone one using the chat history.
_DISCERN_TEMPLATE = """Rephrase follow‑up question as standalone.
Conversation:
{chat_history}
Follow‑up: {question}
Standalone:"""

discern_prompt = PromptTemplate(
    template=_DISCERN_TEMPLATE,
    input_variables=["chat_history", "question"],
)

# Answering prompt: instructions, then the retrieved context, then the question.
_COMBINE_TEMPLATE = (
    system_instructions
    + "\n\nContext:\n{context}"
    + "\n\nQuestion: {question}"
    + "\n\nAnswer:"
)

combine_prompt = PromptTemplate(
    template=_COMBINE_TEMPLATE,
    input_variables=["context", "question"],
)

## Setup Retrieval Chain (Merged + Dedup)

In [25]:
def get_kbli_retriever(
    vector_store: VectorStore,
    llm_model: BaseLanguageModel,
    embed_model: Embeddings,
    top_k: int = 3,
    use_self_query: bool = False,
) -> ContextualCompressionRetriever:
    """Build the merged, de-duplicated retriever used by the RAG chain.

    Results from a plain similarity search and an MMR search over
    ``vector_store`` are merged, then near-duplicate hits are dropped with an
    embedding-based redundancy filter.

    Args:
        vector_store: Store holding the KBLI documents.
        llm_model: LLM used to translate a question into a metadata query.
            Only used when ``use_self_query`` is true.
        embed_model: Embedding model used by the redundancy filter.
        top_k: Number of documents each underlying retriever returns; the
            MMR-based retrievers consider ``top_k * 3`` candidates.
        use_self_query: Also merge in a metadata-aware self-query retriever
            (placed first). Off by default, which matches the previous
            behavior where that retriever was left out of the merge.

    Returns:
        A ``ContextualCompressionRetriever`` wrapping the merged retrievers.
    """
    # 1) plain similarity retriever
    retriever_sim = vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": top_k}
    )
    # 2) MMR retriever
    retriever_mmr = vector_store.as_retriever(
        search_type="mmr", search_kwargs={"k": top_k, "fetch_k": top_k * 3}
    )
    retrievers = [retriever_mmr, retriever_sim]

    # 3) Optional self-query retriever. It used to be constructed on every
    #    call and then never used; it is now built only on request.
    if use_self_query:
        metadata_info = [
            AttributeInfo(name="kategori", description="Primary KBLI category (A, B, ...)", type="string"),
            AttributeInfo(name="digit",    description="Number of digits in code level",        type="string"),
            AttributeInfo(name="kode",     description="Full KBLI code, e.g. '0111' or '95230'",    type="string"),
            AttributeInfo(name="judul",    description="KBLI classification title",                type="string"),
        ]
        retriever_self = SelfQueryRetriever.from_llm(
            llm=llm_model,
            vectorstore=vector_store,
            # A short free-text description of what the documents contain.
            document_contents="metadata and content",
            metadata_field_info=metadata_info,
            search_type="mmr",
            search_kwargs={"k": top_k, "fetch_k": top_k * 3},
        )
        retrievers.insert(0, retriever_self)

    # 4) Merge them
    merged = MergerRetriever(retrievers=retrievers)

    # 5) Deduplicate similar hits (the same document is often returned by
    #    more than one of the merged retrievers)
    redundancy_filter = EmbeddingsRedundantFilter(embeddings=embed_model)
    compressor = DocumentCompressorPipeline(transformers=[redundancy_filter])
    filtered = ContextualCompressionRetriever(
        base_retriever=merged,
        base_compressor=compressor
    )

    return filtered

In [None]:
# 6) --- Build RAG chain --------------------------
# Chat model (temperature 0) used for both question condensing and answering.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, verbose=True)

# Merged + de-duplicated retriever over the persisted KBLI store.
retriever = get_kbli_retriever(
    vector_store=vectordb,
    llm_model=llm,
    embed_model=embed_model,
    top_k=3,
)

# Chain configuration: custom condense/answer prompts, "stuff" all retrieved
# documents into one prompt, and return the sources next to the answer.
_chain_options = {
    "retriever": retriever,
    "chain_type": "stuff",
    "condense_question_prompt": discern_prompt,
    "combine_docs_chain_kwargs": {"prompt": combine_prompt},
    "return_source_documents": True,
}
rag_chain = ConversationalRetrievalChain.from_llm(llm=llm, **_chain_options)

In [27]:
# Smoke test: query the retriever directly (no LLM answer) with a title lookup.
retriever.invoke('kode dari KBLI yang berjudul "Angkutan Sewa Khusus"?')

[_DocumentWithState(metadata={'digit': '5', 'kode': '50142', 'kategori': 'H', 'judul': 'Angkutan Laut Luar Negeri untuk Barang Khusus'}, page_content='METADATA: kategori=H; digit=5; kode=50142; judul=Angkutan Laut Luar Negeri untuk Barang Khusus\nCONTENT: Kelompok ini mencakup usaha angkutan laut internasional untuk barang khusus, contohnya angkutan barang berbahaya, limbah bahan berbahaya dan beracun, termasuk ikan dan sejenisnya. Angkutan laut khusus dengan menggunakan kapal berbendera Indonesia dengan kondisi dan persyaratan kapalnya disesuaikan dengan jenis kegiatan usaha pokoknya serta untuk melayani trayek tidak tetap dan tidak teratur atau tramper antarpelabuhan di Indonesia dengan pelabuhan di luar negeri. Termasuk usaha persewaan angkutan laut berikut operatornya.', state={'embedded_doc': [0.019157411157712866, -0.01738005859991591, 0.016091139972024104, -0.008907111253543015, -0.02301060139834945, 0.012631409258861434, -0.027338656200562497, 0.009782219095435267, -0.023024168

## Chat Function

In [15]:
def ask(query: str) -> dict:
    """Answer ``query`` with the RAG chain and append the turn to the history.

    The stored transcript is replayed to the chain as chat messages, the
    question is answered, and the new question/answer pair is saved back to
    disk.

    Returns:
        A dict with ``"context"`` (retrieved passages joined by ``---``
        lines), ``"answer"`` (the model's reply) and ``"history"`` (the
        updated transcript).
    """
    history = load_history()

    # No system message is added here: combine_prompt already carries the
    # system instructions. Any entry whose type is not "human" is replayed as
    # an AI message.
    msgs = [
        (HumanMessage if turn["type"] == "human" else AIMessage)(content=turn["content"])
        for turn in history
    ]

    result = rag_chain.invoke({"question": query, "chat_history": msgs})
    answer = result["answer"]
    retrieved = result.get("source_documents", [])
    context = "\n---\n".join(doc.page_content for doc in retrieved)

    # Record this turn and persist the transcript for the next call.
    history.append({"type": "human", "content": query})
    history.append({"type": "ai", "content": answer})
    save_history(history)

    return {"context": context, "answer": answer, "history": history}

In [16]:
# Reset chat history if needed: overwrites the stored transcript with an
# empty list so the next ask() starts a fresh conversation.
reset_history()

Chat history has been reset.


In [34]:
# Example query (Indonesian): asks for the title of the KBLI entry with code 74202.
resp = ask("Judul KBLI dengan kode 74202?")

In [35]:
# Show the retrieved passages (source-document texts joined by "---") that
# were available to the model for this answer.
print(resp['context'])

METADATA: kategori=H; digit=4; kode=5224; judul=Penanganan Kargo (Bongkar Muat Barang)
CONTENT: Subgolongan ini mencakup:
- Kegiatan memuat dan membongkar barang atau bagasi (barang penumpang ) terlepas dari moda transportasi yang digunakan untuk pengangkutan
- Kegiatan bongkar muat kapal
- Kegiatan bongkar muat gerbong kereta api barang

Subgolongan ini tidak mencakup:
- Pengoperasian fasilitas terminal, lihat 5221, 5222 dan 5223
---
METADATA: kategori=M; digit=5; kode=74142; judul=Aktivitas Desain Konten Game
CONTENT: Kelompok ini mencakup kegiatan perencanaan konten kreatif game antara lain: desain logika mekanik permainan; desain cerita; desain artistic seperti desain visual karakter, desain user interface, desain level dan lain-lain; desain  teknis terkait teknologi yang digunakan; pembuatan dokumen desain; riset dan pengembangan; dan aktivitas penunjang lainnya. Kegiatan produksi alat permainan masuk dalam kelompok 32401 dan pengembangan video game 62011.
---
METADATA: kategori=J

In [37]:
# Show the model's formatted answer.
# NOTE(review): in the recorded run, none of the context passages visible
# above is code 74202 -- verify the printed answer against the dataset.
print(resp["answer"])

**Kode:** 74202
**Nama:** Kegiatan Penyelenggaraan Game Online
**Deskripsi:** Kelompok ini mencakup kegiatan penyelenggaraan game online yang dapat dimainkan melalui internet, termasuk game yang dimainkan melalui media sosial. Kegiatan ini meliputi pengelolaan server game, pengembangan fitur game, dan aktivitas lain yang terkait dengan operasional game online.
