RAG
notes:
- conda env: rag-ta, requirements-rag.txt

# Import Library

In [2]:
import json
import os
from typing import List
import numpy as np


from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from sentence_transformers import SentenceTransformer


from langchain_community.document_loaders import JSONLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pymilvus import Collection, AnnSearchRequest, WeightedRanker
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage
from langchain_community.chat_message_histories import ChatMessageHistory
import re

  from .autonotebook import tqdm as notebook_tqdm


# Initiate variable

In [3]:
dataset_path = "dataset_3sample_json"

model_name = "gpt-4o-mini"
temperature = 0.1

# Load Dataset XML to JSON

In [4]:
# import os
# import xml.etree.ElementTree as ET
# import json
# from tqdm import tqdm

# # Fungsi untuk mengonversi elemen XML ke dictionary
# def xml_to_dict(element):
#     result = {}
#     # Tambahkan atribut elemen ke dictionary
#     result.update(element.attrib)
#     # Tambahkan elemen anak ke dictionary
#     for child in element:
#         if len(child):  # Jika elemen memiliki anak
#             result[child.tag] = xml_to_dict(child)
#         else:  # Jika elemen tidak memiliki anak
#             result[child.tag] = child.text.strip() if child.text else None
#     return result

# # Fungsi untuk mengonversi file XML ke JSON dengan progress bar
# def convert_xml_to_json(input_folder, output_folder):
#     # Pastikan folder output ada
#     if not os.path.exists(output_folder):
#         os.makedirs(output_folder)

#     # Dapatkan daftar file XML di folder input
#     xml_files = [f for f in os.listdir(input_folder) if f.endswith(".xml")]

#     # Inisialisasi progress bar
#     with tqdm(total=len(xml_files), desc="Processing Files", unit="file") as pbar:
#         for filename in xml_files:
#             file_path = os.path.join(input_folder, filename)
#             try:
#                 # Parse file XML
#                 tree = ET.parse(file_path)
#                 root = tree.getroot()

#                 # Konversi ke dictionary
#                 data_dict = xml_to_dict(root)

#                 # Nama file JSON
#                 json_filename = os.path.splitext(filename)[0] + ".json"
#                 json_path = os.path.join(output_folder, json_filename)

#                 # Simpan sebagai file JSON
#                 with open(json_path, 'w', encoding='utf-8') as json_file:
#                     json.dump(data_dict, json_file, indent=4, ensure_ascii=False)

#                 pbar.set_postfix({"Current File": filename})
#             except Exception as e:
#                 pbar.set_postfix({"Error": str(e)})
            
#             # Update progress bar
#             pbar.update(1)


In [5]:
# # Folder input dan output
# input_folder = "/Users/auva/Documents/GitHub/indo-law-rag/dataset"  # Ganti dengan path folder input
# output_folder = "/Users/auva/Documents/GitHub/indo-law-rag/dataset_json"  # Ganti dengan path folder output

# # Jalankan konversi
# convert_xml_to_json(input_folder, output_folder)

# Chunk docs

In [6]:
def load_json_documents(folder_path: str) -> List[Document]:
    """
    Load all JSON files from a folder recursively using JSONLoader
    
    Args:
        folder_path (str): Path to the folder containing JSON files
        
    Returns:
        List[Document]: List of loaded documents
    """
    all_documents = []
    
    # Walk through all files in directory and subdirectories
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                try:
                    # Initialize JSONLoader for each file
                    loader = JSONLoader(
                        file_path=file_path,
                        jq_schema='.',  # You might need to adjust this based on your JSON structure
                        text_content=False
                    )
                    
                    # Load documents from the file
                    documents = loader.load()
                    all_documents.extend(documents)
                    
                    print(f"Successfully loaded: {file_path}")
                except Exception as e:
                    print(f"Error loading {file_path}: {str(e)}")
    
    return all_documents


In [7]:
documents = load_json_documents(dataset_path)
# Print total number of documents loaded
print(f"\nTotal documents loaded: {len(documents)}")

Successfully loaded: dataset_3sample_json/0a0b31d91c24aa455fcc6848702b507c.json
Successfully loaded: dataset_3sample_json/0a0e54aa47236640faa91000e271aa0d.json

Total documents loaded: 2


In [8]:
documents[1].page_content

'{"amar": "pidana", "amar_lainnya": "hukum", "id": "0a0e54aa47236640faa91000e271aa0d", "klasifikasi": "pidana-khusus", "lama_hukuman": "720", "lembaga_peradilan": "pn-cirebon", "provinsi": "jabar", "status": "berkekuatan-hukum-tetap", "sub_klasifikasi": "narkotika-dan-psikotropika", "url": "https://putusan3.mahkamahagung.go.id/direktori/putusan/0a0e54aa47236640faa91000e271aa0d.html", "kepala_putusan": "putusan\\nnomor 110 pid sus 2018 pn cbn\\ndemi keadilan berdasarkan ketuhanan yang maha esa\\npengadilan negeri cirebon yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara terdakwa", "identitas": "nama lengkap 1 adi daswadi bin wasdam\\ntempat lahir 2 indramayu\\numur tanggal lahir 3 30 tahun 20 november 1988\\njenis kelamin 4 laki laki\\nkebangsaan 5 indonesia\\ntempat tinggal 6 desa santing blok ujung jaya rt 11 rw 02 kecamatan losarang kabupaten indramayu\\nagama 7 islam\\npekerjaan 8 buruh", "riwayat_pen

In [9]:
documents

[Document(metadata={'source': '/Users/auva/Documents/GitHub/indo-law-rag/dataset_3sample_json/0a0b31d91c24aa455fcc6848702b507c.json', 'seq_num': 1}, page_content='{"amar": "pidana", "amar_lainnya": "hukum", "id": "0a0b31d91c24aa455fcc6848702b507c", "klasifikasi": "pidana-umum", "lama_hukuman": "420", "lembaga_peradilan": "pn-klaten", "provinsi": "jateng", "status": "berkekuatan-hukum-tetap", "sub_klasifikasi": "penggelapan", "url": "https://putusan3.mahkamahagung.go.id/direktori/putusan/0a0b31d91c24aa455fcc6848702b507c.html", "kepala_putusan": "putusan\\nnomor 28 pid b 2017 pn kln\\ndemi keadilan berdasarkan ketuhanan yang maha esa\\npengadilan negeri klaten yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara terdakwa", "identitas": "1 nama lengkap ahmad samsudin bin marsadi\\n2 tempat lahir klaten", "riwayat_penahanan": "terdakwa ditangkap pada tanggal 6 desember 2016 oleh polres klaten\\nterdakwa ditahan

In [10]:
# text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, #max num of char 
    chunk_overlap=100,#overlap between chunks to maintain context 
    length_function=len,
    is_separator_regex=False,
)

chunk_docs = text_splitter.split_documents(documents)

# Define LLM

In [11]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

llm = OpenAI(model=model_name, temperature=temperature)

  llm = OpenAI(model=model_name, temperature=temperature)


In [12]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embedding_dim = 1536

# Milvus Preparation

## Connect to Milvus

In [14]:
def connect_to_milvus():
    """Connect to Milvus standalone server"""
    try:
        connections.connect(
            alias="default",
            host='127.0.0.1',
            port='19530',
            db_name='database_ta_testing'
        )
        print("Successfully connected to Milvus")
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")
        raise
    
connect_to_milvus()

Successfully connected to Milvus


In [None]:
# # only run once, for creating the db 
# # create database 
# database = db.create_database("database_ta_testing")

In [None]:
from pymilvus import FieldSchema, DataType, CollectionSchema, Collection, Index
from tqdm import tqdm
import json

## Create Schema

In [None]:
def create_legal_document_schema():
    """Create schema for legal document collection"""
    
    # Define fields
    id_field = FieldSchema(
        name="id", 
        dtype=DataType.INT64, 
        is_primary=True, 
        description="primary id"
    )
    
    content_vector_field = FieldSchema(
        name="content_vector", 
        dtype=DataType.FLOAT_VECTOR, 
        dim=1536,  # Assuming using OpenAI embeddings
        description="document content vector"
    )
    
    identitas_field = FieldSchema(
        name="identitas", 
        dtype=DataType.VARCHAR, 
        max_length=5000,
        description="defendant identity information"
    )
    
    riwayat_perkara_field = FieldSchema(
        name="riwayat_perkara",
        dtype=DataType.VARCHAR,
        max_length=10000, 
        description="case history"
    )
    
    amar_putusan_field = FieldSchema(
        name="amar_putusan",
        dtype=DataType.VARCHAR,
        max_length=5000,
        description="court decision"
    )
    
    pertimbangan_hukum_field = FieldSchema(
        name="pertimbangan_hukum",
        dtype=DataType.VARCHAR, 
        max_length=10000,
        description="legal considerations"
    )
    
    url_field = FieldSchema(
        name="url",
        dtype=DataType.VARCHAR,
        max_length=500,
        description="source document URL"
    )
    
    # Create schema
    schema = CollectionSchema(
        fields=[
            id_field,
            content_vector_field, 
            identitas_field,
            riwayat_perkara_field,
            amar_putusan_field,
            pertimbangan_hukum_field,
            url_field
        ],
        auto_id=False,
        enable_dynamic_field=True,
        description="Collection for legal documents"
    )
    
    return schema

## Create Collection and Index

In [None]:
def create_collection_and_index(collection_name="legal_documents"):
    """Create collection and index"""
    
    # Create schema
    schema = create_legal_document_schema()
    
    # Create collection
    collection = Collection(
        name=collection_name, 
        schema=schema, 
        using='default'
    )
    
    # Create index parameters
    index_params = {
        "index_type": "IVF_FLAT",
        "metric_type": "COSINE",
        "params": {"nlist": 128}
    }
    
    # Create index
    index = Index(collection, "content_vector", index_params)
    
    # Flush collection
    collection.flush()
    
    return collection

## Load Documents to DB

In [None]:
def process_and_insert_documents(documents, embeddings_model, collection_name="legal_documents"):
    """Process documents and insert into collection"""
    
    data_db = []
    
    def embed_text(text):
        """Generate embeddings for text"""
        return embeddings_model.embed_query(text)
    
    # Process documents
    for i, doc in tqdm(enumerate(documents), total=len(documents), desc="Processing Documents"):
        content = doc.page_content
        metadata = doc.metadata
        
        # Parse JSON content
        doc_data = json.loads(content)
        
        # Create combined text for embedding
        combined_text = f"{doc_data.get('identitas', '')} {doc_data.get('riwayat_perkara', '')} {doc_data.get('amar_putusan', '')} {doc_data.get('pertimbangan_hukum', '')}"
        
        # Generate embedding
        content_vector = embed_text(combined_text)
        
        # Create document entry
        data_entry = {
            "id": i,
            "content_vector": content_vector,
            "identitas": doc_data.get('identitas', ''),
            "riwayat_perkara": doc_data.get('riwayat_perkara', ''),
            "amar_putusan": doc_data.get('amar_putusan', ''),
            "pertimbangan_hukum": doc_data.get('pertimbangan_hukum', ''),
            "url": doc_data.get('url', '')
        }
        
        data_db.append(data_entry)
    
    # Get collection
    collection = Collection(collection_name)
    
    # Insert data
    insert_result = collection.insert(data=data_db)
    
    # Flush to ensure data is written
    collection.flush()
    
    # Load
    collection.load()
    
    # Save as JSON backup
    with open('legal_documents_vector.json', 'w') as f:
        json.dump(data_db, f, indent=2)
        
    return insert_result

In [None]:
# Create collection and index
collection = create_collection_and_index()

# Process and insert documents
result = process_and_insert_documents(documents, embeddings)

Processing Documents: 100%|██████████| 2/2 [00:00<00:00,  2.35it/s]


# Retriever

In [15]:
def retrieve_from_milvus(query, collection_name, embeddings, k=5):
    """Retrieve relevant documents from Milvus using hybrid search"""
    
    # Embed the query
    embedded_query = embeddings.embed_query(query)
    collection = Collection(name=collection_name)

    # Set up ANN search parameters
    search_param = {
        "data": [embedded_query],
        "anns_field": "content_vector",
        "param": {
            "metric_type": "COSINE",
            "params": {"nprobe": 10}
        },
        "limit": k
    }
    request = AnnSearchRequest(**search_param)

    # Perform hybrid search
    results = collection.hybrid_search(
        reqs=[request],
        rerank=WeightedRanker(0.8),
        limit=k,
        output_fields=["identitas", "riwayat_perkara", "amar_putusan", "pertimbangan_hukum", "url"]
    )

    # Extract relevant information from results
    documents = []
    if results:
        for hit in results[0]:
            # Access fields using the fields property of the hit object
            fields = hit.fields
            doc_info = {
                "identitas": fields.get("identitas", ""),
                "riwayat_perkara": fields.get("riwayat_perkara", ""),
                "amar_putusan": fields.get("amar_putusan", ""),
                "pertimbangan_hukum": fields.get("pertimbangan_hukum", ""),
                "url": fields.get("url", "")
            }
            documents.append(doc_info)

    return documents

# Example usage:
"""
collection_name = "legal_documents"
query = "apa rangkumannya"
retrieved_docs = retrieve_from_milvus(query, collection_name, embeddings, k=5)
pprint(retrieved_docs)
"""

'\ncollection_name = "legal_documents"\nquery = "apa rangkumannya"\nretrieved_docs = retrieve_from_milvus(query, collection_name, embeddings, k=5)\npprint(retrieved_docs)\n'

In [16]:
from pprint import pprint
collection_name = "legal_documents"
query = "siapa adi daswadi bin wasdam"
retrieved_docs = retrieve_from_milvus(query, collection_name, embeddings, k=5)
pprint(retrieved_docs)

RPC error: [hybrid_search], <MilvusException: (code=503, message=failed to search: channel not subscribed: channel lacks[channel=by-dev-rootcoord-dml_0_454756620526950040v0]: channel not available[channel=by-dev-rootcoord-dml_0_454756620526950040v0])>, <Time:{'RPC start': '2024-12-23 00:54:02.900458', 'RPC error': '2024-12-23 00:54:23.999899'}>


MilvusException: <MilvusException: (code=503, message=failed to search: channel not subscribed: channel lacks[channel=by-dev-rootcoord-dml_0_454756620526950040v0]: channel not available[channel=by-dev-rootcoord-dml_0_454756620526950040v0])>

# Generation

In [19]:
# COSTAR-based prompt template
legal_prompt = """
Anda adalah asisten hukum yang dirancang untuk membantu pengguna menganalisis dokumen hukum Indonesia, khususnya putusan pengadilan Mahkamah Agung (MA). Ikuti panduan CO-STAR berikut:

Context:
- Gunakan HANYA informasi dari dokumen putusan pengadilan MA yang disediakan.
- Jika informasi tidak ditemukan dalam dokumen tersebut, jawab "Maaf, saya tidak dapat menemukan informasi tersebut dalam dokumen yang tersedia."
- Fokus pada fakta hukum, seperti identitas terdakwa, riwayat perkara, dan pertimbangan hukum.

Objective:
- Bantu pengguna dalam menganalisis dokumen hukum secara akurat dan efisien menggunakan pendekatan Retrieval-Augmented Generation (RAG).
- Jelaskan terminologi hukum dengan bahasa yang mudah dipahami.
- Jangan memberikan nasihat hukum, tetapi sediakan interpretasi data dari dokumen.
- Gunakan format yang jelas untuk menampilkan hasil analisis.

Style:
- Gunakan gaya bahasa yang formal dan teknis.

Tone:
- Nada profesional dan netral untuk menjaga keakuratan dan kredibilitas respons.

Audience:
- Peneliti hukum, praktisi hukum, atau pengguna umum yang membutuhkan bantuan analisis dokumen putusan MA.

Response:
- Berikan respons dalam format berikut:
  Jawaban: [jawaban]
  URL Sumber: [informasi]
- Jika informasi tidak ditemukan, sebutkan "Tidak tersedia."

Question: {messages}
Context: {context}

Berikan jawaban dalam bahasa Indonesia yang jelas dan terstruktur.
"""


question_answering_prompt = ChatPromptTemplate.from_messages([
    ("system", legal_prompt),
    MessagesPlaceholder(variable_name="messages"),
])

# Document class for handling legal documents
class LegalDocument:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

# Response generator function
def generate_response(query, llm, collection_name, embeddings):
    """Generate responses to queries using RAG"""
    
    # Initialize chat history
    message_history = ChatMessageHistory()
    
    # Retrieve relevant documents
    retrieved_docs = retrieve_from_milvus(query, collection_name, embeddings, k=5)
    
    # Create document chain
    document_chain = create_stuff_documents_chain(llm, question_answering_prompt)
    
    # Format documents
    docs = [LegalDocument(page_content=str(doc)) for doc in retrieved_docs]
    
    # Add query to message history
    message_history.add_message(HumanMessage(content=query))
    
    # Generate response
    response = document_chain.invoke({
        "context": docs,
        "messages": [HumanMessage(content=query)]
    })
    
    return response

In [23]:
# Initialize necessary components
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI()

# Generate response to a query
query = "siapa sih adi daswadi bin wasdam? dia terlibat kasus apa?"
response = generate_response(
    query=query,
    llm=llm,
    collection_name="legal_documents",
    embeddings=embeddings
)
print(response)

Jawaban: Adi Daswadi bin Wasdam adalah seorang terdakwa yang terlibat dalam kasus penyalahgunaan narkotika golongan I bagi diri sendiri. Ia terbukti melakukan tindak pidana tersebut berdasarkan dakwaan alternatif kedua. Adi Daswadi bin Wasdam dijatuhi pidana penjara selama 2 tahun dan ditetapkan tetap ditahan. Barang bukti berupa narkotika jenis daun ganja kering yang dimilikinya juga dimusnahkan. Selain itu, Adi Daswadi bin Wasdam juga diwajibkan membayar biaya perkara sejumlah Rp 5.000,00. 
URL Sumber: https://putusan3.mahkamahagung.go.id/direktori/putusan/0a0e54aa47236640faa91000e271aa0d.html


# Evaluation