# <h3><b>Libraries

In [26]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter
import pymupdf4llm
import glob

from typing import List, Dict, Tuple
from langchain_core.documents import Document
from sentence_transformers import CrossEncoder
import time

# <h3><b>Environment

In [2]:
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# <h3><b>LLM

In [None]:
llm = ChatGroq(
    groq_api_key=KEY,
    model_name="openai/gpt-oss-120b",
    temperature=0
)
print("✅ LLM initialized!")

✅ LLM initialized!


# <h3><b>Chunking

In [4]:
# # Ekstrak PDF ke Markdown
# md_text = pymupdf4llm.to_markdown(r"D:\Portfolio\rag-llm-education\data\documents\Machine Learning _ Model Klasifikasi pdf.pdf")

# # Simpan hasilnya
# with open("output.md", "w", encoding="utf-8") as f:
#     f.write(md_text)

# Muat dokumen Markdown
# print(f"Jumlah total chunk hierarkis: {len(hierarchical_chunks)}\n")

# for i, chunk in enumerate(hierarchical_chunks):
#     print(f"--- Chunk #{i+1} ---")
#     print(f"Metadata: {chunk.metadata}")
#     # Cetak 100 karakter pertama dari konten untuk pratinjau
#     print(f"Konten: {chunk.page_content[:100]}...\n")

In [5]:

def pdf_chunking_hierarchical(pdf_path: str):
    md_text = pymupdf4llm.to_markdown(pdf_path)

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    )
    
    hierarchical_chunks = markdown_splitter.split_text(md_text)
    return hierarchical_chunks

# <h3><b>Embeddings

In [6]:
!pip install -U sentence-transformers transformers torch ml-dtypes



ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'c:\\users\\saita\\anaconda3\\envs\\portfolio\\lib\\site-packages\\charset_normalizer-3.4.3.dist-info\\METADATA'



In [7]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
    # model_name="Qwen/Qwen3-Embedding-0.6B"
)
print("✅ Embeddings model loaded")

  embeddings = HuggingFaceEmbeddings(


✅ Embeddings model loaded


# <h3><b>Vector DB

In [8]:
# vectorstore = Chroma.from_documents(
#     documents=splits,
#     embedding=embeddings,
#     persist_directory="../chroma_db"
# )
# print("✅ Vector store created")

In [9]:
# retriever = vectorstore.as_retriever(
#     search_type="similarity",
#     search_kwargs={"k": 3}
# )
# print("✅ Retriever ready")

In [10]:
hierarchical_chunks = pdf_chunking_hierarchical(pdf_path="D:/Portfolio/rag-llm-education/data/documents/Machine Learning _ Model Klasifikasi pdf.pdf")

CHROMA_PERSIST_DIRECTORY = "../chroma_db"
COLLECTION_NAME = "machine_learning_modules"

try:
    vector_store = Chroma.from_documents(
        documents=hierarchical_chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PERSIST_DIRECTORY,
        collection_name=COLLECTION_NAME
    )

    print(f"Berhasil menyimpan {len(hierarchical_chunks)} dokumen ke database Chroma di direktori '{CHROMA_PERSIST_DIRECTORY}'")

except Exception as e:
    print(f"Gagal menyimpan ke ChromaDB: {e}")
    exit()

Berhasil menyimpan 104 dokumen ke database Chroma di direktori '../chroma_db'


In [11]:
vector_store = Chroma(
    persist_directory=CHROMA_PERSIST_DIRECTORY,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings
)
print("Database berhasil dimuat.")

Database berhasil dimuat.


  vector_store = Chroma(


In [12]:
retriever = vector_store.as_retriever(
    # search_type="similarity_score_threshold",
    search_type="similarity",
    # search_kwargs={"k": 3, "score_threshold": 0.1}
    search_kwargs={"k": 5}
)

In [13]:
m

NameError: name 'm' is not defined

# <h3><b>Reranker

In [20]:
# ========== CELL 8: Load Reranker Model ==========
print("\n" + "="*70)
print("🎯 LOADING RERANKER MODEL")
print("="*70)

# Load cross-encoder untuk reranking
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

print("✅ Reranker model loaded!")
print("   • Model: cross-encoder/ms-marco-MiniLM-L-6-v2")
print("   • Type: Cross-encoder (BERT-based)")


🎯 LOADING RERANKER MODEL


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ Reranker model loaded!
   • Model: cross-encoder/ms-marco-MiniLM-L-6-v2
   • Type: Cross-encoder (BERT-based)


In [27]:
# ========== CELL 1: Enhanced Reranking Function dengan Hierarki ==========
def rerank_documents_hierarchical(
    query: str, 
    documents: List[Document], 
    top_n: int = 5,
    use_parent: bool = True,
    parent_boost: float = 0.1
) -> List[Document]:
    """
    Rerank documents using cross-encoder dengan support untuk hierarchical chunks
    
    Args:
        query: Search query
        documents: List of retrieved documents (bisa child atau parent chunks)
        top_n: Number of top documents to return
        use_parent: Jika True, akan mencoba retrieve parent chunk jika ada
        parent_boost: Boost score untuk parent chunks (0.1 = +10%)
    
    Returns:
        List of reranked documents with scores and hierarchy info
    """
    if not documents:
        return []
    
    print(f"🔍 Reranking {len(documents)} documents...")
    
    # Prepare pairs for cross-encoder
    pairs = []
    doc_info = []
    
    for doc in documents:
        # Check if document has hierarchy metadata
        chunk_type = doc.metadata.get('chunk_type', 'unknown')
        has_parent = doc.metadata.get('parent_id') is not None
        
        pairs.append([query, doc.page_content])
        doc_info.append({
            'doc': doc,
            'chunk_type': chunk_type,
            'has_parent': has_parent
        })
    
    # Get scores from cross-encoder
    scores = reranker.predict(pairs)
    
    # Apply hierarchy-aware scoring
    adjusted_scores = []
    for score, info in zip(scores, doc_info):
        adjusted_score = float(score)
        
        # Boost parent chunks (biasanya lebih informatif)
        if info['chunk_type'] == 'parent':
            adjusted_score += parent_boost
        
        # Store in metadata
        info['doc'].metadata['relevance_score'] = adjusted_score
        info['doc'].metadata['raw_score'] = float(score)
        
        adjusted_scores.append(adjusted_score)
    
    # Sort by adjusted score (descending)
    ranked_items = sorted(
        zip(doc_info, adjusted_scores),
        key=lambda x: x[1],
        reverse=True
    )
    
    # Return top N
    top_docs = [item[0]['doc'] for item in ranked_items[:top_n]]
    
    print(f"✅ Reranking complete! Top score: {ranked_items[0][1]:.4f}")
    
    return top_docs


# ========== CELL 2: Function to Expand to Parent Chunks ==========
def expand_to_parent_chunks(
    documents: List[Document],
    vector_store
) -> List[Document]:
    """
    Expand child chunks ke parent chunks jika tersedia
    
    Args:
        documents: List of documents (might be child chunks)
        vector_store: ChromaDB vector store instance
    
    Returns:
        List of documents with parent chunks included
    """
    expanded_docs = []
    parent_ids_seen = set()
    
    for doc in documents:
        # Add original document
        expanded_docs.append(doc)
        
        # Check if has parent
        parent_id = doc.metadata.get('parent_id')
        
        if parent_id and parent_id not in parent_ids_seen:
            try:
                # Retrieve parent chunk by ID
                parent_results = vector_store.get(ids=[parent_id])
                
                if parent_results and parent_results['documents']:
                    # Create Document object for parent
                    parent_doc = Document(
                        page_content=parent_results['documents'][0],
                        metadata=parent_results['metadatas'][0] if parent_results['metadatas'] else {}
                    )
                    expanded_docs.append(parent_doc)
                    parent_ids_seen.add(parent_id)
                    
            except Exception as e:
                print(f"⚠️ Could not retrieve parent {parent_id}: {e}")
    
    return expanded_docs


# ========== CELL 3: Advanced Retrieval with Hierarchical Reranking ==========
def retrieve_and_rerank_hierarchical(
    query: str,
    vector_store,
    k_initial: int = 10,
    top_n_final: int = 5,
    expand_parents: bool = True,
    use_parent_boost: bool = True
) -> Tuple[List[Document], Dict]:
    """
    Complete retrieval pipeline dengan hierarchical support dan reranking
    
    Returns:
        Tuple of (reranked_documents, metrics_dict)
    """
    metrics = {}
    
    print("\n" + "="*70)
    print("🔍 HIERARCHICAL RETRIEVAL + RERANKING")
    print("="*70)
    
    # Step 1: Initial retrieval
    print(f"\n📥 Step 1: Initial retrieval (k={k_initial})")
    start = time.time()
    initial_results = vector_store.similarity_search(query, k=k_initial)
    retrieval_time = time.time() - start
    metrics['retrieval_time'] = retrieval_time
    metrics['initial_count'] = len(initial_results)
    
    print(f"   ✅ Retrieved {len(initial_results)} chunks")
    print(f"   ⏱️  Time: {retrieval_time*1000:.2f}ms")
    
    # Analyze chunk types
    chunk_types = {}
    for doc in initial_results:
        chunk_type = doc.metadata.get('chunk_type', 'unknown')
        chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
    
    print(f"   📊 Chunk types: {chunk_types}")
    
    # Step 2: Expand to parents (optional)
    if expand_parents:
        print(f"\n🔼 Step 2: Expanding to parent chunks")
        start = time.time()
        expanded_results = expand_to_parent_chunks(initial_results, vector_store)
        expand_time = time.time() - start
        metrics['expand_time'] = expand_time
        metrics['expanded_count'] = len(expanded_results)
        
        print(f"   ✅ Expanded to {len(expanded_results)} chunks (includes parents)")
        print(f"   ⏱️  Time: {expand_time*1000:.2f}ms")
        
        documents_to_rerank = expanded_results
    else:
        documents_to_rerank = initial_results
        metrics['expand_time'] = 0
        metrics['expanded_count'] = len(initial_results)
    
    # Step 3: Rerank
    print(f"\n🎯 Step 3: Reranking to top {top_n_final}")
    start = time.time()
    reranked_results = rerank_documents_hierarchical(
        query=query,
        documents=documents_to_rerank,
        top_n=top_n_final,
        use_parent=True,
        parent_boost=0.1 if use_parent_boost else 0
    )
    rerank_time = time.time() - start
    metrics['rerank_time'] = rerank_time
    metrics['final_count'] = len(reranked_results)
    
    print(f"   ⏱️  Time: {rerank_time*1000:.2f}ms")
    
    # Total time
    total_time = retrieval_time + metrics['expand_time'] + rerank_time
    metrics['total_time'] = total_time
    
    print(f"\n⏱️  TOTAL TIME: {total_time*1000:.2f}ms")
    print("="*70)
    
    return reranked_results, metrics


# ========== CELL 4: Display Results Function ==========
def display_reranked_results(documents: List[Document], query: str = None):
    """
    Display reranked results dengan hierarchy information
    """
    if query:
        print(f"\n❓ Query: '{query}'")
    
    print(f"\n📚 Top {len(documents)} Results:\n")
    print("="*70)
    
    for i, doc in enumerate(documents, 1):
        # Extract metadata
        page = doc.metadata.get('page', 'N/A')
        chunk_type = doc.metadata.get('chunk_type', 'unknown')
        chunk_id = doc.metadata.get('chunk_id', 'N/A')
        parent_id = doc.metadata.get('parent_id', None)
        relevance_score = doc.metadata.get('relevance_score', 0)
        raw_score = doc.metadata.get('raw_score', 0)
        
        # Header
        print(f"\n{i}. 📄 Page {page if page == 'N/A' else page+1} | Type: {chunk_type.upper()}")
        print(f"   ID: {chunk_id}")
        if parent_id:
            print(f"   Parent ID: {parent_id}")
        
        # Scores
        print(f"   🎯 Relevance Score: {relevance_score:.4f} (raw: {raw_score:.4f})")
        
        # Content preview
        content_preview = doc.page_content[:200] if len(doc.page_content) > 200 else doc.page_content
        print(f"   📝 Content:\n   {content_preview}...")
        
        # Hierarchy indicator
        if chunk_type == 'parent':
            print(f"   🔼 [PARENT CHUNK - Contains broader context]")
        elif chunk_type == 'child':
            print(f"   🔽 [CHILD CHUNK - Focused content]")
        
        print("-"*70)

In [28]:
# ========== CELL 5: Test Hierarchical Retrieval + Reranking ==========
print("\n" + "="*70)
print("🧪 TEST: HIERARCHICAL RETRIEVAL + RERANKING")
print("="*70)

# Test query
test_query = "Apa itu overfitting?"

# Retrieve and rerank
reranked_docs, metrics = retrieve_and_rerank_hierarchical(
    query=test_query,
    vector_store=vector_store,
    k_initial=10,           # Retrieve 10 chunks initially
    top_n_final=5,          # Return top 5 after reranking
    expand_parents=True,    # Include parent chunks
    use_parent_boost=True   # Boost parent chunk scores
)

# Display results
display_reranked_results(reranked_docs, query=test_query)

# Display metrics
print("\n📊 METRICS:")
print("="*70)
for key, value in metrics.items():
    if 'time' in key:
        print(f"   • {key}: {value*1000:.2f}ms")
    else:
        print(f"   • {key}: {value}")
print("="*70)


🧪 TEST: HIERARCHICAL RETRIEVAL + RERANKING

🔍 HIERARCHICAL RETRIEVAL + RERANKING

📥 Step 1: Initial retrieval (k=10)
   ✅ Retrieved 10 chunks
   ⏱️  Time: 22.10ms
   📊 Chunk types: {'unknown': 10}

🔼 Step 2: Expanding to parent chunks
   ✅ Expanded to 10 chunks (includes parents)
   ⏱️  Time: 0.00ms

🎯 Step 3: Reranking to top 5
🔍 Reranking 10 documents...
✅ Reranking complete! Top score: 0.5135
   ⏱️  Time: 901.71ms

⏱️  TOTAL TIME: 923.81ms

❓ Query: 'Apa itu overfitting?'

📚 Top 5 Results:


1. 📄 Page N/A | Type: UNKNOWN
   ID: N/A
   🎯 Relevance Score: 0.5135 (raw: 0.5135)
   📝 Content:
   Normalisasi data: JST sangat sensitif terhadap perbedaan skala antar fitur, oleh karena itu,
sangat disarankan untuk menormalisasi data sebelum melatih model
Inisialisasi bobot: Inisialisasi bobot yan...
----------------------------------------------------------------------

2. 📄 Page N/A | Type: UNKNOWN
   ID: N/A
   🎯 Relevance Score: -1.4964 (raw: -1.4964)
   📝 Content:
   Ensemble methods di

## <h4><b>Delete Collection

In [None]:
# import chromadb

# try:
#     # Inisialisasi klien Chroma yang terhubung ke folder persistensi
#     client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIRECTORY)

#     # Hapus koleksi berdasarkan nama
#     client.delete_collection(name=COLLECTION_NAME)
    
#     print(f"✅ Koleksi '{COLLECTION_NAME}' berhasil dihapus dari database.")

# except Exception as e:
#     print(f"❌ Gagal menghapus koleksi: {e}")
#     print("Mungkin koleksi tidak ditemukan atau database belum ada.")

✅ Koleksi 'machine_learning_modules' berhasil dihapus dari database.


# <h3><b>Prompt

In [None]:
template = """Kamu adalah asisten AI yang membantu menjawab pertanyaan berdasarkan konteks yang diberikan.

Konteks:
{context}

Pertanyaan: {question}

Instruksi:
- Jawab berdasarkan konteks yang diberikan
- Jika tidak ada informasi di konteks, katakan "Maaf, saya tidak menemukan informasi tersebut dalam dokumen"
- Berikan jawaban yang jelas dan ringkas
- Gunakan bahasa Indonesia yang baik

Jawaban:"""

prompt = ChatPromptTemplate.from_template(template)
print("✅ Prompt template created")

✅ Prompt template created


# <h3><b>RAG Chain

In [None]:
def format_docs(docs):
    """Format dokumen untuk context"""
    return "\n\n".join(doc.page_content for doc in docs)

# Chain menggunakan LCEL (LangChain Expression Language)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("✅ RAG Chain created!")

✅ RAG Chain created!


# <h3><b>Query Function

In [None]:
def ask_question(question):
    """Fungsi untuk bertanya ke chatbot"""
    print("\n" + "="*60)
    print(f"❓ Pertanyaan: {question}")
    print("="*60)
    
    # Get answer
    answer = rag_chain.invoke(question)
    print(f"\n💬 Jawaban:\n{answer}")
    
    # Get source documents
    docs = retriever.invoke(question)
    print("\n" + "="*60)
    print("📚 Sumber Dokumen:")
    for i, doc in enumerate(docs, 1):
        print(f"\n{i}. {doc.page_content[:150]}...")
    print("="*60 + "\n")
    
    return answer


# <h3><b>Test

In [None]:
# --- 3. LAKUKAN PENCARIAN ---
query = "Apa itu overfitting?"
print(f"\nMelakukan pencarian untuk query: '{query}'")

relevant_docs = retriever.invoke(query)


Melakukan pencarian untuk query: 'Apa itu overfitting?'


In [None]:
# --- 4. TAMPILKAN HASIL ---
if relevant_docs:
    print(f"\nDitemukan {len(relevant_docs)} dokumen relevan:")
    for i, doc in enumerate(relevant_docs):
        print(f"\n--- Dokumen #{i+1} ---")
        print(f"Metadata: {doc.metadata}")
        # print(f"Konten: {doc.page_content[:300]}...")
        print(f"Konten: {doc.page_content}...")
else:
    print("Tidak ada dokumen relevan ditemukan.")


Ditemukan 5 dokumen relevan:

--- Dokumen #1 ---
Metadata: {'Header 3': '**9.5 Kasus-kasus yang Direkomendasikan**', 'Header 1': '**Bab 9: Ensemble Methods:** **Meningkatkan Performa dengan** **Metode Gabungan**'}
Konten: Ensemble methods direkomendasikan dalam situasi berikut:  
40  
Ketika model dasar mengalami overfitting atau underfitting
Ketika model dasar memiliki kecenderungan yang berbeda dan varians
Ketika menghadapi masalah klasifikasi yang kompleks atau data yang sangat tidak seimbang...

--- Dokumen #2 ---
Metadata: {'Header 3': '**8.7 Hal-hal yang Harus Diperhatikan**', 'Header 1': '**Bab 8: Neural Networks - Klasifikasi** **dengan Menggunakan Jaringan Saraf** **Tiruan**'}
Konten: Normalisasi data: JST sangat sensitif terhadap perbedaan skala antar fitur, oleh karena itu,
sangat disarankan untuk menormalisasi data sebelum melatih model
Inisialisasi bobot: Inisialisasi bobot yang baik bisa membantu konvergensi model lebih  
cepat
Overfitting: JST cenderung mudah overfittin

In [None]:
ask_question("Apa itu Model Klasifikasi?")


❓ Pertanyaan: Apa itu Model Klasifikasi?

💬 Jawaban:
Model klasifikasi adalah jenis model pembelajaran mesin yang digunakan untuk memetakan data input ke dalam satu atau beberapa kategori (kelas) yang telah ditentukan. Model ini mempelajari pola atau hubungan antara fitur‑fitur data (baik numerik maupun yang telah dikonversi menjadi numerik) dan label target, sehingga dapat memprediksi kelas dari data baru. Contohnya termasuk SVM, decision tree, logistic regression, dan lain‑lain, yang dapat diterapkan pada masalah klasifikasi binomial maupun multi‑kelas.

📚 Sumber Dokumen:

1. Setelah mempelajari berbagai model klasifikasi dan metrik evaluasi yang relevan, kita akan
membahas tentang praktik terbaik dalam menerapkan model kla...

2. SVM direkomendasikan untuk kasus-kasus berikut:  
1. Klasifikasi binomial atau multi-kelas.  
20  
2. Data dengan fitur numerik atau kategorikal (yang...

3. Berikut adalah tiga contoh kasus nyata yang menunjukkan bagaimana model klasifikasi bisa
diterapka

'Model klasifikasi adalah jenis model pembelajaran mesin yang digunakan untuk memetakan data input ke dalam satu atau beberapa kategori (kelas) yang telah ditentukan. Model ini mempelajari pola atau hubungan antara fitur‑fitur data (baik numerik maupun yang telah dikonversi menjadi numerik) dan label target, sehingga dapat memprediksi kelas dari data baru. Contohnya termasuk SVM, decision tree, logistic regression, dan lain‑lain, yang dapat diterapkan pada masalah klasifikasi binomial maupun multi‑kelas.'

In [None]:
ask_question("Siapa presiden Indonesia?")


❓ Pertanyaan: Siapa presiden Indonesia?

💬 Jawaban:
Maaf, saya tidak menemukan informasi tersebut dalam dokumen.

📚 Sumber Dokumen:

1. Akurasi adalah metrik yang paling sederhana dan intuitif. Akurasi menggambarkan
seberapa sering model membuat prediksi yang benar.  
Akurasi = (Jumlah...

2. Contoh: Mengklasifikasikan jenis kelamin berdasarkan tinggi, berat, dan ukuran sepatu.  
24...

3. JST cocok untuk digunakan pada kasus-kasus berikut:  
1. Klasifikasi gambar dan pengenalan pola
2. Analisis teks dan pemrosesan bahasa alami  
36  
3....



'Maaf, saya tidak menemukan informasi tersebut dalam dokumen.'