In [13]:
import os
import fitz  # PyMuPDF untuk PDF
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OpenAIEmbeddings

FAISS_INDEX_PATH = "faiss_index"

# Fungsi membaca PDF
def read_pdf(file_path):
    text = ""
    with fitz.open(file_path) as pdf:
        for page in pdf:
            text += page.get_text() + "\n"
    return text

# Fungsi membaca DOCX
def read_docx(file_path):
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

# Fungsi untuk memuat data dari folder
def load_knowledge(role):
    role_folders = {"Laws": "regulation", "Engineering": "engineering"}
    BASE_DIR = os.getcwd()
    data_folder = os.path.join(BASE_DIR, "data", role_folders.get(role, "data"))

    combined_text = ""
    if not os.path.exists(data_folder):
        print(f"⚠️ Folder {data_folder} tidak ditemukan.")
        return ""

    for file_name in os.listdir(data_folder):
        file_path = os.path.join(data_folder, file_name)
        if not os.path.isfile(file_path):
            continue
        
        if file_name.endswith(".pdf"):
            combined_text += read_pdf(file_path) + "\n"
        elif file_name.endswith(".docx"):
            combined_text += read_docx(file_path) + "\n"

    return combined_text

# Load data hukum dan engineering, lalu split menjadi chunks
for role in ["Laws", "Engineering"]:
    print(f"🔹 Memproses data untuk {role}...")
    knowledge_base = load_knowledge(role)
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(knowledge_base)

    # Buat embedding dan FAISS Index
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_texts(chunks, embedding=embeddings)

    # Simpan FAISS Index ke disk
    save_path = f"{FAISS_INDEX_PATH}_{role.lower()}"
    vector_store.save_local(save_path)
    print(f"✅ FAISS Index untuk {role} disimpan di {save_path}")

print("✅ Semua FAISS Index telah dibuat.")


🔹 Memproses data untuk Laws...
⚠️ Folder d:\01. Bapak\Github\HazChat\test\data\regulation tidak ditemukan.


ValidationError: 1 validation error for OpenAIEmbeddings
  Value error, Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. [type=value_error, input_value={'model_kwargs': {}, 'cli...20, 'http_client': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/value_error

In [9]:
print(knowledge_base)




In [None]:
import os
import streamlit as st
import pickle
from openai import OpenAI
import google.generativeai as genai
import anthropic
from langchain_community.vectorstores import FAISS

# API Keys
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
ANTHROPIC_API_KEY = st.secrets["ANTHROPIC_API_KEY"]
GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]

# Fungsi untuk memuat FAISS yang sudah ada
def load_faiss(role):
    file_path = f"faiss_{role}.pkl"
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            return pickle.load(f)
    else:
        return None

# Fungsi untuk menjalankan precompute_embeddings.py
def run_precompute_embeddings():
    os.system("python precompute_embeddings.py")
    st.success("✅ Embedding selesai! Silakan refresh halaman.")

# Fungsi memuat prompt dari file
def load_prompts():
    prompt_dir = os.path.join(os.getcwd(), "add_prompt")

    prompt_engineering_path = os.path.join(prompt_dir, "prompt_engineering.txt")
    prompt_laws_path = os.path.join(prompt_dir, "prompt_laws.txt")

    prompt_engineering = open(prompt_engineering_path, "r", encoding="utf-8").read() if os.path.exists(prompt_engineering_path) else "Tidak ada prompt engineering tersedia."
    prompt_laws = open(prompt_laws_path, "r", encoding="utf-8").read() if os.path.exists(prompt_laws_path) else "Tidak ada prompt laws tersedia."

    return prompt_engineering, prompt_laws

# Memuat prompt
prompt_engineering, prompt_laws = load_prompts()

# Streamlit UI
st.title("HazChat")
role = st.selectbox("Pilih Role", ["Laws", "Engineering"])
provider = st.selectbox("Pilih Provider API", ["OpenAI", "Anthropic", "Gemini"])

# **Tombol untuk melakukan embedding ulang**
if st.button("🔄 Jalankan Embedding (Jika Ada Data Baru)"):
    run_precompute_embeddings()

# Load FAISS
vector_store = load_faiss(role)
if vector_store:
    st.success(f"✅ Knowledge base untuk {role} berhasil dimuat!")
else:
    st.warning(f"⚠️ Tidak ada knowledge base untuk {role}. Chatbot tetap bisa berjalan hanya dengan prompt bawaan.")

# Fungsi untuk memilih provider
def set_provider(provider):
    if provider == "OpenAI":
        return OpenAI(api_key=OPENAI_API_KEY)
    elif provider == "Anthropic":
        return anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    elif provider == "Gemini":
        genai.configure(api_key=GEMINI_API_KEY)
        return genai
    return None

# Fungsi untuk mendapatkan respons + menghitung token usage
def get_response(provider, client, prompt, role, vector_store, prompt_laws, prompt_engineering):
    # Jika FAISS tersedia, gunakan retrieval
    if vector_store:
        retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
        relevant_docs = retriever.get_relevant_documents(prompt)
        context = "\n".join([doc.page_content for doc in relevant_docs])
    else:
        context = ""

    # Jika FAISS tidak ada, hanya gunakan prompt default
    if role == "Laws":
        augmented_prompt = f"Gunakan informasi berikut jika relevan:\n{prompt_laws}\n\n{context}\n\nPertanyaan: {prompt}"
    elif role == "Engineering":
        augmented_prompt = f"Gunakan informasi berikut jika relevan:\n{prompt_engineering}\n\n{context}\n\nPertanyaan: {prompt}"
    else:
        return "Peran tidak dikenali."

    try:
        token_usage = 0  # Default token usage

        if provider == "OpenAI":
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": augmented_prompt}]
            )
            token_usage = response.usage.total_tokens  # OpenAI API memberikan jumlah token
            return response.choices[0].message.content, token_usage

        elif provider == "Anthropic":
            response = client.messages.create(
                model="claude-2",
                max_tokens=1024,
                messages=[{"role": "user", "content": augmented_prompt}]
            )
            token_usage = 1024  # Estimasi karena Anthropic tidak memberikan token usage
            return response.content, token_usage

        elif provider == "Gemini":
            model = client.GenerativeModel("gemini-pro")
            response = model.generate_content(augmented_prompt)
            token_usage = "Tidak tersedia untuk Gemini"  # Gemini tidak menyediakan token usage
            return response.text, token_usage

    except Exception as e:
        return f"Terjadi kesalahan: {str(e)}", 0

# Chat Input
prompt = st.chat_input("Masukkan prompt...")
if prompt:
    client = set_provider(provider)
    if client:
        response, token_usage = get_response(provider, client, prompt, role, vector_store, prompt_laws, prompt_engineering)
    else:
        response, token_usage = "Provider belum diatur.", 0
    
    st.chat_message("user").markdown(prompt)
    st.chat_message("assistant").markdown(response)

    # Menampilkan token usage
    if isinstance(token_usage, int):
        st.info(f"📊 Token digunakan: **{token_usage}**")
    else:
        st.info(f"📊 Token digunakan: **{token_usage}**")


🔄 Memproses embedding untuk role: Laws


  embeddings = OpenAIEmbeddings()


ValidationError: 1 validation error for OpenAIEmbeddings
  Value error, Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. [type=value_error, input_value={'model_kwargs': {}, 'cli...20, 'http_client': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/value_error