# **Store Data to Vector Store (OJK)**

Notebook ini menunjukkan cara menyimpan dokumen ke vector store (Redis atau Postgres). Tahap [Load](#load) berbeda untuk tiap sumber data (BI, OJK, dan SIKEPO), jadi buat sendiri fungsi `extract_all_documents_in_directory` untuk masing-masing sumber.

## **Setup**

In [1]:
# import nest_asyncio
# nest_asyncio.apply()

## **Config**

In [2]:
from utils.config import get_config
from utils.models import ModelName, get_model

# Load the project configuration once; it is passed to get_model() and to the
# vector-store manager in later cells.
config = get_config()

## **Define Model**

In [3]:
from utils.models import ModelName, LLMModelName, EmbeddingModelName, get_model

# Pick the provider and the concrete chat / embedding models. get_model
# returns the (LLM, embedding model) pair used by the cells below.
model_name = ModelName.OPENAI
llm_model, embed_model = get_model(
    model_name=model_name,
    config=config,
    llm_model_name=LLMModelName.GPT_35_TURBO,
    embedding_model_name=EmbeddingModelName.EMBEDDING_3_SMALL,
)

## **Indexing**

In [4]:
# Directory handed to the extractor as the source of documents.
documents_dir = './data/documents1/'
# Directory where the pickled document splits are written and read back.
pickle_path = './data/pickles/'
# Metadata CSV handed to the extractor together with documents_dir.
metadata_path = './data/metadata/files_metadata.csv'

# When True, the extraction and splitting cells are skipped and the splits
# are read from the existing pickle file instead.
LOAD_PICKLE = True

### **Load**

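Cara mengekstrak dokumen untuk SIKEPO dan BI berbeda dengan OJK, jadi buat sendiri modul `documents_extractor` untuk masing-masing sumber (modul OJK yang diimpor di bawah bisa dijadikan contoh).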

In [5]:
from utils.documents_extractor.documents_extract_ojk import extract_all_documents_in_directory

# Extraction only runs when the splits are NOT being loaded from the pickle.
# NOTE(review): `treshold` (sic) is the keyword spelling used by this call;
# its meaning is defined by the extractor — confirm there before changing 0.98.
if not LOAD_PICKLE:
    documents = extract_all_documents_in_directory(documents_dir, metadata_path, treshold=0.98)

### **Split**

In [6]:
from utils.documents_split import document_splitter
import pickle


# Build the pickle location once so the write and the read below can never
# point at different files.
splits_pickle_file = pickle_path + 'documents1.pkl'

if not LOAD_PICKLE:
    all_splits = document_splitter(docs=documents)
    # Order the chunks by document, then by page. The fallback for a missing
    # page number is the integer 0 (not the string '0'): the page numbers in
    # the metadata are ints, and Python 3 raises TypeError when it has to
    # order an int against a str.
    all_splits1 = sorted(
        all_splits,
        key=lambda split: (split.metadata['doc_id'], split.metadata.get('page_number', 0)),
    )
    # Persist the sorted splits; the file is created or overwritten.
    with open(splits_pickle_file, 'wb') as file:
        pickle.dump(all_splits1, file)

# Always (re)load from disk so `all_splits` is the sorted, persisted list
# regardless of LOAD_PICKLE.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that were produced by this notebook.
with open(splits_pickle_file, 'rb') as file:
    all_splits = pickle.load(file)

In [7]:
# Number of document splits loaded from the pickle.
len(all_splits)

132966

### **Storing**

In [8]:
from database.vector_store.vector_store import RedisIndexManager, PostgresIndexManager

# Postgres is the active backend. To target Redis instead, swap in:
#   RedisIndexManager(index_name='ojk', embed_model=embed_model, config=config, db_id=0)
vector_store_manager = PostgresIndexManager(
    index_name='ojk',
    embed_model=embed_model,
    config=config,
)

# WARNING: uncommenting the next line deletes the existing index.
# vector_store_manager.delete_index()

# Store the splits (batch_size=5000), then load the index as `vector_store`
# for the retrieval cells below.
vector_store_manager.store_vector_index(docs=all_splits, batch_size=5000)
vector_store = vector_store_manager.load_vector_index()

Database 'vector_store' already exists.
Vector extension created successfully (if it didn't exist).
Start loading from idx: 0


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************6qd9. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

## **Uji Coba (Eksperimen)**

In [37]:
# Smoke test: query the store's default retriever with a trivial greeting.
vector_store.as_retriever().invoke("Halo")

[Document(metadata={'doc_id': 4, 'title': 'Tata Cara dan Mekanisme Penyampaian Data Transaksi Pendanaan dan Pelaporan Penyelenggara Layanan Pendanaan Bersama Berbasis Teknologi Informasi (LPBBTI)', 'sector': 'IKNB', 'subsector': 'Peraturan Lainnya', 'regulation_type': 'Surat Edaran OJK', 'regulation_number': '1/SEOJK.06/2024', 'effective_date': '1 Juli 2024', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/Tata-Cara-dan-Mekanisme-Penyampaian-Data-Transaksi-Pendanaan-dan-Pelaporan-Penyelenggara-Layanan-Pendanaan-Bersama-Berbasis/SEOJK%201-SEOJK.06-2024%20Tata%20Cara%20dan%20Mekanisme%20Penyampaian%20Data%20Transaksi%20Pendanaan%20dan%20Pelaporan%20Penyelenggara%20LPBBTI.pdf', 'page_number': 1}, page_content='LPBBTI. \n4. \nSistem Elektronik adalah serangkaian perangkat dan prosedur \nelektronik \nyang \nberfungsi \nmempersiapkan, \nmengumpulkan, \nmengolah, \nmenganalisis, \nmenyimpan, \nmenampilkan, \nmengumumkan, mengirimkan, dan/atau menyebarkan informasi \nelektronik 

In [56]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)
from langchain_core.language_models.base import BaseLanguageModel
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_core.vectorstores import VectorStore
from langchain.retrievers.self_query.pgvector import PGVectorTranslator

# Filterable metadata attributes exposed to the self-query constructor.
# Each row is (attribute name, description given to the LLM); every
# attribute is declared with type "string".
_ATTRIBUTE_DESCRIPTIONS = (
    ("title", "The title of the document of regulation"),
    ("sector", "The sector of the regulation"),
    ("subsector", "The subsector of the regulation"),
    ("regulation_type", "The type of the regulation"),
    ("regulation_number", "The number of the regulation"),
    (
        "effective_date",
        "The effective date of the regulation in format DD Month YYYY, e.g. 1 Januari 2021",
    ),
)

metadata_field_info = [
    AttributeInfo(name=attr_name, description=attr_description, type="string")
    for attr_name, attr_description in _ATTRIBUTE_DESCRIPTIONS
]

# Natural-language description of the document body; passed to the query
# constructor prompt as `document_contents`.
document_content_description = "The content of the document"

# Extra instructions that restrict the structured query to the six metadata
# attributes and map Indonesian phrases (judul, sektor, ...) onto them.
# NOTE(review): currently unused — its only reference, the `schema_prompt=`
# argument inside self_query_ojk, is commented out. Presumably that argument
# expects a prompt-template object rather than a plain str — verify against
# the LangChain API before enabling it.
schema_prompt = """
Please provide the schema of the structured query. Only the following attributes are allowed:
- title
- sector
- subsector
- regulation_type
- regulation_number
- effective_date

Ensure that user queries are interpreted correctly by mapping common phrases to the corresponding attributes:
- "judul" or any mention of a title should be interpreted as the attribute 'title'
- "sektor" or any mention of a sector should be interpreted as the attribute 'sector'
- "subsektor" or any mention of a subsector should be interpreted as the attribute 'subsector'
- "tipe regulasi" or any mention of a regulation type should be interpreted as the attribute 'regulation_type'
- "nomor regulasi" or any mention of a regulation number should be interpreted as the attribute 'regulation_number'
- "tanggal berlaku" or any mention of an effective date should be interpreted as the attribute 'effective_date'
"""

# Create query constructor
def self_query_ojk(llm_model: BaseLanguageModel, vector_store: VectorStore, search_type: str = "similarity") -> SelfQueryRetriever:
    """Build a self-querying retriever over the OJK vector store.

    The LLM turns a natural-language question into a structured query
    (search text plus a metadata filter) based on the module-level
    ``document_content_description`` and ``metadata_field_info``. The filter
    is then translated with ``PGVectorTranslator``.

    Args:
        llm_model: Language model that generates the structured query.
        vector_store: Vector store to search. Filters are translated with
            ``PGVectorTranslator``, so a PGVector-backed store is expected.
        search_type: Search type forwarded to ``SelfQueryRetriever``.

    Returns:
        A ``SelfQueryRetriever`` bound to ``vector_store``.
    """
    translator = PGVectorTranslator()

    # Limit both the prompt and the parser to the comparators/operators the
    # translator can express. Otherwise the LLM may produce a filter that
    # parses fine but cannot be translated for the store. A None value means
    # "no restriction declared", in which case the library defaults apply.
    constructor_kwargs = {}
    if translator.allowed_comparators is not None:
        constructor_kwargs["allowed_comparators"] = translator.allowed_comparators
    if translator.allowed_operators is not None:
        constructor_kwargs["allowed_operators"] = translator.allowed_operators

    prompt = get_query_constructor_prompt(
        document_contents=document_content_description,
        attribute_info=metadata_field_info,
        # schema_prompt=schema_prompt,
        **constructor_kwargs,
    )
    output_parser = StructuredQueryOutputParser.from_components(**constructor_kwargs)
    # Runnable pipeline: prompt -> LLM -> structured-query parser.
    query_constructor = prompt | llm_model | output_parser

    return SelfQueryRetriever(
        query_constructor=query_constructor,
        vectorstore=vector_store,
        search_type=search_type,
        structured_query_translator=translator,
    )


# Build the self-query retriever and run a sample metadata-style question
# (Indonesian for "Give documents with the subsector Dana Pensiun").
retriever = self_query_ojk(llm_model=llm_model, vector_store=vector_store)
context = retriever.invoke("Berikan dokumen dengan subsektor Dana Pensiun")

In [58]:
# Preview the first 100 splits.
all_splits[:100]

[Document(metadata={'doc_id': 1, 'title': 'Dasar Penilaian Investasi Dana Pensiun', 'sector': 'IKNB', 'subsector': 'Dana Pensiun', 'regulation_type': 'Surat Edaran OJK', 'regulation_number': '4/SEOJK.05/2024', 'effective_date': '1 Juli 2024', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/Dasar-Penilaian-Investasi-Dana-Pensiun/SEOJK%204-SEOJK.05-2024%20Dasar%20Penilaian%20Investasi%20Dana%20Pensiun.pdf', 'page_number': 1}, page_content="metadata={'doc_id': 1, 'title': 'Dasar Penilaian Investasi Dana Pensiun', 'sector': 'IKNB', 'subsector': 'Dana Pensiun', 'regulation_type': 'Surat Edaran OJK', 'regulation_number': '4/SEOJK.05/2024', 'effective_date': '1 Juli 2024', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/Dasar-Penilaian-Investasi-Dana-Pensiun/SEOJK%204-SEOJK.05-2024%20Dasar%20Penilaian%20Investasi%20Dana%20Pensiun.pdf', 'page_number': 1}\n \n \n \n \n \nYth.  \nPengurus Dana Pensiun \ndi tempat. \n \nSALINAN \nSURAT EDARAN OTORITAS JASA KEUANGAN \

In [57]:
# Display the documents returned by the self-query retriever.
context

[Document(metadata={'doc_id': 1, 'title': 'Dasar Penilaian Investasi Dana Pensiun', 'sector': 'IKNB', 'subsector': 'Dana Pensiun', 'regulation_type': 'Surat Edaran OJK', 'regulation_number': '4/SEOJK.05/2024', 'effective_date': '1 Juli 2024', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/Dasar-Penilaian-Investasi-Dana-Pensiun/SEOJK%204-SEOJK.05-2024%20Dasar%20Penilaian%20Investasi%20Dana%20Pensiun.pdf', 'page_number': 1}, page_content='OTORITAS\nJASA\nKEUANGAN'),
 Document(metadata={'doc_id': 3, 'title': 'Dasar Penilaian Investasi Dana Pensiun', 'sector': 'IKNB', 'subsector': 'Dana Pensiun', 'regulation_type': 'Surat Edaran OJK', 'regulation_number': '4/SEOJK.05/2024', 'effective_date': '1 Juli 2024', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/Dasar-Penilaian-Investasi-Dana-Pensiun/FAQ%20SEOJK%204-SEOJK.05-2024%20Dasar%20Penilaian%20Investasi%20Dana%20Pensiun.pdf', 'page_number': 1}, page_content='OTORITAS\nJASA\nKEUANGAN'),
 Document(metadata={'doc