In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

GROQ_API_KEY   = os.getenv("GROQ_API_KEY")
GROQ_MODEL     = os.getenv("GROQ_MODEL")
EMBEDDING_MODEL= os.getenv("EMBEDDING_MODEL")
FAISS_INDEX_DIR= os.getenv("FAISS_INDEX_DIR")
DATASET_CSV    = os.getenv("DATASET_CSV")
CHUNK_SIZE      = int(os.getenv("CHUNK_SIZE", "300"))
CHUNK_OVERLAP   = int(os.getenv("CHUNK_OVERLAP", "50"))

print("GROQ_MODEL:", GROQ_MODEL)
print("EMBEDDING_MODEL:", EMBEDDING_MODEL)
print("FAISS_INDEX_DIR:", FAISS_INDEX_DIR)
print("DATASET_CSV:", DATASET_CSV)


GROQ_MODEL: llama3-70b-8192
EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2
FAISS_INDEX_DIR: faiss_index
DATASET_CSV: dataset_assignment.csv


In [2]:
import re
import pandas as pd
from pathlib import Path
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def basic_clean(text: str) -> str:
    if not text:
        return ""
    text = text.replace("\u00a0", " ")
    return re.sub(r"\s+", " ", text).strip()

def load_from_csv_chunked(path=DATASET_CSV, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    df = pd.read_csv(path)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", ".", " "]
    )
    docs = []
    for i, row in df.iterrows():
        # gabungkan semua kolom jadi 1 teks
        full_text = " | ".join([f"{col}: {str(row[col])}" for col in df.columns])
        full_text = basic_clean(full_text)
        chunks = splitter.split_text(full_text)
        for j, ch in enumerate(chunks):
            docs.append(Document(
                page_content=ch,
                metadata={"row": int(i), "chunk_id": int(j), "source": f"csv:{Path(path).name}"}
            ))
    return docs, df

docs, df = load_from_csv_chunked(DATASET_CSV)
print(f"Total chunks created: {len(docs)}")
print("Example chunk preview:", docs[0].page_content[:300], "...")


Total chunks created: 62100
Example chunk preview: prompt: are you llama? | response: *ahem* I'm not a real llama, but I can certainly channel my inner llama for you! *puts on virtual llama ears* Hiiii! Spitting distance, please! How can I assist you, human friend? ...


In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Build FAISS (this may take several minutes depending on size)
vectorstore = FAISS.from_documents(docs, embedder)
vectorstore.save_local(FAISS_INDEX_DIR)
print("Saved FAISS index to:", FAISS_INDEX_DIR)


  embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
  from .autonotebook import tqdm as notebook_tqdm


Saved FAISS index to: faiss_index


In [None]:

from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS as FAISSStore

SYSTEM_PROMPT = (
    "You are a helpful and concise customer assistant.\n"
    "Always answer ONLY using the CONTEXT below.\n"
    "If the answer is not found in the context, say you don't know.\n"
)

PROMPT_TEMPLATE = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("human", "Question: {question}\n\nContext:\n{context}\n\nAnswer in English.")
])

candidate_prompt = ChatPromptTemplate.from_messages([
    ("system", "You generate rephrasings/variations of questions to improve retrieval."),
    ("human", "Generate 3 different rephrased questions that have the same intent:\n\n{q}")
])


def generate_candidates(q, n=3):
    chain = candidate_prompt 
    out = chain.invoke({"q": q})

    # pastikan hasilnya string
    out_text = out.content if hasattr(out, "content") else str(out)

    # parse jadi list
    lines = [line.strip("-• ").strip() for line in out_text.split("\n") if line.strip()]
    uniq = []
    for c in [q] + lines:
        if c and c not in uniq:
            uniq.append(c)
        if len(uniq) >= n + 1:
            break
    return uniq


def build_chain_with_candidates(k=5):
    store = FAISSStore.load_local(FAISS_INDEX_DIR, embedder, allow_dangerous_deserialization=True)
    llm = ChatGroq(api_key=GROQ_API_KEY, model=GROQ_MODEL, temperature=0.2)

    def retrieve_with_candidates(q):
        qs = generate_candidates(q)
        all_docs = []
        for cand in qs:
            all_docs.extend(store.similarity_search(cand, k=k))
        # deduplicate by (source,row,chunk_id)
        uniq = {}
        for d in all_docs:
            key = (d.metadata.get("source"), d.metadata.get("row"), d.metadata.get("chunk_id"))
            uniq[key] = d
        return list(uniq.values())

    chain = (
        {"context": retrieve_with_candidates, "question": RunnablePassthrough()}
        | PROMPT_TEMPLATE
        | llm
        | StrOutputParser()
    )
    return chain

qa_chain = build_chain_with_candidates(k=5)
print("RAG chain ready (with question candidates).")


RAG chain ready (with question candidates).


In [None]:
question = "Bagaimana prosedur refund jika barang rusak?"
print("Question:", question)

cands = generate_candidates(question)
print("\nCandidates generated:")
for c in cands:
    print("-", c)

print("\nAnswer:")
print(qa_chain.invoke(question))


Question: Bagaimana prosedur refund jika barang rusak?

Candidates generated:
- Bagaimana prosedur refund jika barang rusak?
- messages=[SystemMessage(content='You generate rephrasings/variations of questions to improve retrieval.', additional_kwargs={}, response_metadata={}), HumanMessage(content='Generate 3 different rephrased questions that have the same intent:\n\nBagaimana prosedur refund jika barang rusak?', additional_kwargs={}, response_metadata={})]

Answer:
I don't know. The context does not provide information about the refund procedure for damaged goods.


In [5]:
# !pip install langchain
# !pip install pandas
# !pip install python-dotenv
# !pip install --upgrade langchain langchain-community
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install langchain-groq




In [None]:
import random

def evaluate_simple(df, qa_chain, n_samples=5):
    samples = df.sample(min(n_samples, len(df)), random_state=42)
    results = []
    for _, row in samples.iterrows():
        # use first text-like column as proxy question if you want; here we use concatenated row as query
        q = " ".join([str(row[c]) for c in df.columns[:1]])  # crude: use first column content as query
        ans = qa_chain.invoke(q)
        results.append({"query": q, "answer": ans})
    return results

res = evaluate_simple(df, qa_chain, n_samples=5)
for r in res:
    print("Query:", r["query"][:150], "...")
    print("Answer:", r["answer"][:300], "...\n---\n")
# =========================

Query: Persona: [Character( "yuka")

{Age("25" + "twenty five  years")

Full name("yuka haru")

Gender("Female" + "Woman")

Sexuality("straight" + "Attracted ...
Answer: I'm Yuka, your older Japanese step sister. I'm 25 years old, 165 cm tall, and I have a curvy body with fair skin, smooth skin, wide hips, narrow waist, thick thighs, soft thighs, big breasts, soft breasts, round ass, and long, well-kept black hair. My eyes are blue and vibrant. I'm smart, horny, dir ...
---

Query: Let’s start with the selfie ...
Answer: The conversation starts with the selfie. The response is: "You're a sly one, aren't you? Okay, fine. Here's a silly selfie just for you." (csv:dataset_assignment.csv, row 2485, chunk_id 0) ...
---

Query: The Role of Deliberate Practice in Achieving Mastery
John has been practicing piano for several years. Initially, he struggled with basic pieces and o ...
Answer: Based on the context, John's journey illustrates the power of deliberate practice in achieving mastery, c