Berikut contoh alur RAG lokal:
rag_local_llm.py
Retriever: Hybrid BM25 + SBERT (seperti sebelumnya)
Generator: LLM lokal, misal Llama-3, Gemma, atau Mistral (via llama-cpp-python atau transformers)

In [1]:
import json
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from llama_cpp import Llama

2025-06-07 06:52:36.330752: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# === LOAD KNOWLEDGE BASE ===
with open("data/pertanyaan_jawaban.json", "r", encoding="utf-8") as f:
    kb_data = json.load(f)

kb_questions = [item["pertanyaan"] for item in kb_data]
kb_answers = [item["jawaban"] for item in kb_data]


In [3]:
# === INISIALISASI BM25 DAN SBERT ===
tokenized_questions = [q.lower().split() for q in kb_questions]
bm25 = BM25Okapi(tokenized_questions)
sbert_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
kb_embeddings = sbert_model.encode(kb_questions)


In [4]:
# === INISIALISASI LLM LOKAL (llama.cpp) ===
# Ganti path ke model GGUF kamu (misal: "llama-3-8b-instruct.Q4_K_M.gguf")
llm = Llama(model_path="llama-3-8b-instruct.Q4_K_M.gguf", n_ctx=2048, verbose=False)

def hybrid_retrieve(user_question, bm25_top_n=5, sbert_top_k=3):
    tokenized_query = user_question.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:bm25_top_n]

    candidate_questions = [kb_questions[i] for i in bm25_top_indices]
    candidate_answers = [kb_answers[i] for i in bm25_top_indices]
    candidate_embeddings = [kb_embeddings[i] for i in bm25_top_indices]
    user_embedding = sbert_model.encode([user_question])[0]
    cos_sims = util.cos_sim(user_embedding, candidate_embeddings)[0].cpu().tolist()

    pairs = list(zip(bm25_top_indices, candidate_questions, candidate_answers, cos_sims))
    pairs = sorted(pairs, key=lambda x: x[3], reverse=True)  # sort by SBERT similarity

    return pairs[:sbert_top_k]

def generate_answer_llama(query, contexts):
    context_text = "\n".join([f"Q: {q}\nA: {a}" for _, q, a, _ in contexts])
    prompt = (
        f"Konteks berikut berisi informasi dari knowledge base:\n{context_text}\n\n"
        f"Pertanyaan: {query}\n"
        f"Jawablah pertanyaan ini secara lengkap dan jelas berdasarkan informasi di atas."
    )
    # Llama.cpp API
    response = llm(prompt=prompt, max_tokens=256, temperature=0.2, stop=["Q:", "\n\n"])
    # Ambil hasil
    return response["choices"][0]["text"].strip()

if __name__ == "__main__":
    print("Chatbot RAG Lokal (Hybrid Retrieval + LLM Lokal)")
    print("-----------------------------------------------")
    while True:
        user_question = input("\nTulis pertanyaanmu (atau ketik 'selesai' untuk keluar): ")
        if user_question.lower() == "selesai":
            break

        # Step 1: Hybrid Retrieve
        contexts = hybrid_retrieve(user_question)
        print("\nDokumen paling relevan (knowledge base):")
        for i, (_, q, a, s) in enumerate(contexts, 1):
            print(f"{i}. Q: {q}\n   A: {a}\n   Skor kemiripan: {s:.4f}")

        # Step 2: Generasi Jawaban
        answer = generate_answer_llama(user_question, contexts)
        print("\n=== Jawaban Generatif (LLM Lokal) ===")
        print(answer)

ValueError: Model path does not exist: llama-3-8b-instruct.Q4_K_M.gguf