# **Store Data to Vector Store (OJK)**

Ini cara untuk storing ke Redis, tapi untuk [Load](#load) Document beda-beda untuk tiap data BI, OJK, dan SIKEPO. Jadi buat sendiri function `extract_all_documents_in_directory` nya

## **Setup**

In [98]:
# import nest_asyncio
# nest_asyncio.apply()

## **Config**

In [2]:
from utils.config import get_config
from utils.models import ModelName, get_model

config = get_config()

## **Define Model**

In [3]:
from utils.models import ModelName, LLMModelName, EmbeddingModelName, get_model

model_name = ModelName.AZURE_OPENAI
llm_model, embed_model = get_model(model_name=model_name, config=config, llm_model_name=LLMModelName.GPT_35_TURBO, embedding_model_name=EmbeddingModelName.EMBEDDING_3_SMALL)

## **Indexing (All)**

In [4]:
# from utils.documents_extractor.documents_extract_ojk import extract_all_documents_in_directory
# from utils.documents_split import document_splitter
# import pickle
# import os

# documents_dirs = ['./data/documents1/', './data/documents2/', './data/documents3/']
# pickle_path = './data/pickles/'
# metadata_path = './data/metadata/files_metadata.csv'

# LOAD_PICKLE = False

# for dir in documents_dirs:
#     if not LOAD_PICKLE:
#         documents = extract_all_documents_in_directory(dir, metadata_path, treshold=0.98)

#     if not LOAD_PICKLE:
#         all_splits = document_splitter(docs=documents)
#         all_splits_sorted = sorted(all_splits, key=lambda x: (x.metadata['doc_id'], x.metadata.get('page_number', '0')))

#         # Determine the pickle file name based on the directory
#         dir_name = os.path.basename(os.path.normpath(dir))
#         file_pickle_name = f'{dir_name}.pkl'

#         # Save the sorted splits to a pickle file
#         with open(os.path.join(pickle_path, file_pickle_name), 'wb') as file:
#             pickle.dump(all_splits_sorted, file)

## **Indexing**

In [5]:
documents_dir = './data/documents1/'
pickle_path = './data/pickles/'
metadata_path = './data/metadata/files_metadata.csv'

LOAD_PICKLE = True

### **Load**

Untuk SIKEPO dan BI beda cara extract documentsnya, file document_extractor buat sendiri :D.

In [6]:
from utils.documents_extractor.documents_extract_ojk import extract_all_documents_in_directory

if not LOAD_PICKLE:
    documents = extract_all_documents_in_directory(documents_dir, metadata_path, treshold=0.98)

### **Split**

In [7]:
def clean_document_content(content):
    return content.replace('\x00', '')  # Remove NUL characters

In [8]:
from utils.documents_split import document_splitter
import pickle

if not LOAD_PICKLE:
    all_splits = document_splitter(docs=documents)
    all_splits1 = sorted(all_splits, key=lambda x: (x.metadata['doc_id'], x.metadata.get('page_number', '0')))
    for split in all_splits1:
        split.page_content = clean_document_content(split.page_content)
    # Open a file and use dump() 
    with open(pickle_path + 'documents1.pkl', 'wb') as file:
        # A new file will be created
        pickle.dump(all_splits1, file)

# Open the file in binary mode 
with open(pickle_path + 'documents3.pkl', 'rb') as file:
    # Call load method to deserialze 
    all_splits = pickle.load(file)
    for split in all_splits:
        split.page_content = clean_document_content(split.page_content)

In [9]:
len(all_splits)

49253

### **Storing**

In [10]:
from database.vector_store.vector_store import PostgresIndexManager

# vector_store_manager = RedisIndexManager(index_name='ojk', embed_model=embed_model, config=config, db_id=0)
vector_store_manager = PostgresIndexManager(index_name='ojk', embed_model=embed_model, config=config)

# vector_store_manager.delete_index() # WARNING: This will delete the index
# vector_store_manager.store_vector_index(docs=all_splits, batch_size=5000)
vector_store = vector_store_manager.load_vector_index()

Database 'vector_store' already exists.
Vector extension created successfully (if it didn't exist).
Loaded collection 'ojk_collection'.


## **NYOBA2**

In [11]:
# vector_store.as_retriever().invoke("Berapa SWDKLLJ dari buldozer?")

[Document(metadata={'title': 'Peraturan Menteri Keuangan Nomor 36/PMK.010/2008 tentang Besar Santunan dan Sumbangan Wajib Dana Kecelakaan Lalu Lintas Jalan', 'doc_id': 1718, 'sector': 'IKNB', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/PMK-Nomor-36PMK.010-Tahun-2008-tentang-Besar-Santunan-dan-Sumbangan-Wajib-Dana-Kecelakaan-Lalu-Lintas-Jalan/menas13_1389258036.pdf', 'subsector': 'Asuransi', 'page_number': 3, 'effective_date': '2008/02/26', 'regulation_type': 'Klasifikasi Bapepam', 'regulation_number': '36/PMK.010/2008'}, page_content='b. Traktor, buidozer, forklift, mobil derek, excavator, crane dan\nsejenisnya sebesar Rp 20.000,00 (dua puluh ribu rupiah):\n Sepeda motor, sepeda kumbang dan scooter di atas 50 cc sampai\n250 cc dan kendaraan bermotor roda tiga sebesar Rp 32.000,00\n(tiga puluh dua ribu rupiah).\nd.\nPick up/mobil barang sampai dengan 2400 cc, sedan, jeep dan\n mobil penumpang bukan angkutan umum sebesar Rp140.000,00\n(seratus empat puluh ribu rupiah).

In [12]:
from langchain_community.vectorstores.pgvector import PGVector
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)
from langchain_core.language_models.base import BaseLanguageModel
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_core.vectorstores import VectorStore
from langchain.retrievers.self_query.pgvector import PGVectorTranslator
from langchain_core.prompts import PromptTemplate
from langchain_core.structured_query import Operator, Comparator

# Define metadata field information
metadata_field_info = [
    AttributeInfo(
        name="title",
        description="The title of the document of regulation",
        type="string",
    ),
    AttributeInfo(
        name="sector",
        description="""The sector of the regulation""",
        type="string",
    ),
    AttributeInfo(
        name="subsector",
        description="The subsector of the regulation",
        type="string",
    ),
    AttributeInfo(
        name="regulation_type",
        description="""The type of the regulation""",
        type="string",
    ),
    AttributeInfo(
        name="regulation_number",
        description="The number of the regulation",
        type="string",
    ),
    AttributeInfo(
        name="effective_date",
        description="The effective date of the regulation in string format 'YYYY/MM/DD'",
        type="string",
    ),
]

# Define document content description
document_content_description = "The content of the document"

# Define prompt
SCHEMA = """\
<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

```json
{{{{
    "query": string \\ text string to compare to document contents
    "filter": string \\ logical condition statement for filtering documents
}}}}
```

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: `comp(attr, val)`:
- `comp` ({allowed_comparators}): comparator
- `attr` (string):  name of attribute to apply the comparison to
- `val` (string): is the comparison value

A logical operation statement takes the form `op(statement1, statement2, ...)`:
- `op` ({allowed_operators}): logical operator
- `statement1`, `statement2`, ... (comparison statements or logical operation statements): one or more statements to apply the operation to

Make sure that you only use the comparators and logical operators listed above and no others.
Make sure that filters only refer to attributes that exist in the data source.
Make sure that filters only use the attributed names with its function names if there are functions applied on them.
Make sure that filters take into account the descriptions of attributes and only make comparisons that are feasible given the type of data being stored.
Make sure that filters are only used as needed. If there are no filters that should be applied return "NO_FILTER" for the filter value.
Make sure that date attributes are compared using ASCII comparison operators.
"""

SCHEMA_PROMPT = PromptTemplate.from_template(SCHEMA)

# prompt = get_query_constructor_prompt(
#     document_contents=document_content_description,
#     attribute_info=metadata_field_info,
#     schema_prompt=SCHEMA_PROMPT,
    
# )
# output_parser = StructuredQueryOutputParser.from_components()
# query_constructor = prompt | llm_model | output_parser

# query_constructor.invoke("Berikan dokumen yang berlaku pada tanggal 1 Januari 2023 hingga 1 Januari 2024")



# # Create query constructor
# def self_query_ojk(llm_model: BaseLanguageModel, vector_store: VectorStore, search_type: str = "similarity") -> SelfQueryRetriever:
#     retriever = SelfQueryRetriever.from_llm(
#         document_contents=document_content_description,
#         # enable_limit=False,
#         use_original_query=True,
#         llm=llm_model,
#         vectorstore=vector_store,
#         metadata_field_info=metadata_field_info,
#         structured_query_translator=PGVectorTranslator(),
#     )

#     return retriever



def self_query_ojk(llm_model: BaseLanguageModel, vector_store: VectorStore, search_type: str = "similarity") -> SelfQueryRetriever:
    prompt = get_query_constructor_prompt(
        document_contents=document_content_description,
        attribute_info=metadata_field_info,
        schema_prompt=SCHEMA_PROMPT,
        allowed_operators = [Operator.AND, Operator.OR],
        # """Subset of allowed logical operators."""
        allowed_comparators = [
            Comparator.EQ,
            Comparator.NE,
            Comparator.GT,
            Comparator.LT,
            Comparator.IN,
            Comparator.NIN,
            Comparator.CONTAIN,
            Comparator.LIKE,
        ]
    )
    output_parser = StructuredQueryOutputParser.from_components()
    query_constructor = prompt | llm_model | output_parser

    retriever = SelfQueryRetriever(
        query_constructor=query_constructor,
        vectorstore=vector_store,
        search_type=search_type,
        structured_query_translator=PGVectorTranslator(),
        verbose=True,
        
    )

    return retriever

retriever = self_query_ojk(llm_model=llm_model, vector_store=vector_store, search_type="similarity")

In [16]:
# 2024-07-01
from langchain.globals import set_debug

set_debug(True)

context = retriever.invoke('Berikan dokumen pada subsektor Asuransi dan tanggal berlaku pada 1992/02/11')
context

[32;1m[1;3m[chain/start][0m [1m[retriever:Retriever > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "query": "Berikan dokumen pada subsektor Asuransi dan tanggal berlaku pada 1992/02/11"
}
[32;1m[1;3m[chain/start][0m [1m[retriever:Retriever > chain:RunnableSequence > prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "Berikan dokumen pada subsektor Asuransi dan tanggal berlaku pada 1992/02/11"
}
[36;1m[1;3m[chain/end][0m [1m[retriever:Retriever > chain:RunnableSequence > prompt:FewShotPromptTemplate] [1ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[retriever:Retriever > chain:RunnableSequence > llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\

[]