# Import Libraries

In [1]:
import os
import json
import pandas as pd
from dotenv import load_dotenv
from uuid import uuid4

# Vector DB
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from langchain_core.documents import Document

# Embedding Model
from langchain_ollama import OllamaEmbeddings

# Document Preparation

In [2]:
def get_json_document(file_path):
    '''
    Load and parse a JSON file.

    Args:
        file_path (str): Path to the JSON file containing the data

    Returns:
        data (list | dict): Parsed JSON content, exactly as returned by json.load

    Raises:
        FileNotFoundError: If file_path does not exist
        json.JSONDecodeError: If the file content is not valid JSON
    '''

    # Pin the encoding: without it open() falls back to the platform locale,
    # which can corrupt or reject non-ASCII text in a UTF-8 JSON file.
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)

def get_documents(data):
    '''
    Wrap every JSON record in a Document.

    Args:
        data (iterable): Records, each providing the keys 'content', 'topic',
            'subtopic' and 'url'

    Returns:
        documents (list): One Document per record, in input order
    '''

    def _to_document(record):
        # Each call draws a fresh random UUID; 'vector' starts as an empty placeholder
        return Document(
            page_content=record['content'],
            metadata=dict(
                id=str(uuid4()),
                topic=record['topic'],
                subtopic=record['subtopic'],
                source=record['url'],
                vector=[],
            ),
        )

    return [_to_document(record) for record in data]


# Store Document to Qdrant

### Embedding Model

In [5]:
def get_embedding_model(model_name):
    '''
    Build the Ollama embedding object for the requested model.

    Args:
        model_name (str): Name of the model to be used for embeddings

    Returns:
        OllamaEmbeddings: Embedding model object created with model=model_name
    '''

    return OllamaEmbeddings(model=model_name)

def encode_text(embeddings, text):
    '''
    Embed a single piece of text.

    Args:
        embeddings: Embedding model object exposing embed_query
            (an OllamaEmbeddings instance elsewhere in this notebook)
        text (str): Text to be encoded

    Returns:
        Whatever embeddings.embed_query(text) returns, unchanged
    '''

    # Run the embedding on the text
    vector = embeddings.embed_query(text)
    return vector

def add_vector(documents, embeddings):
    '''
    Attach an embedding vector to every document.

    Args:
        documents (list): List of Document objects; each one is modified in place
        embeddings: Embedding model object passed through to encode_text

    Returns:
        documents (list): The same list that was passed in, with
            metadata['vector'] set on every element
    '''

    # Embed each document's text and store the result under metadata['vector']
    for document in documents:
        content = document.page_content
        document.metadata['vector'] = encode_text(embeddings, content)

    return documents


def get_documents_with_vector(data, model_name):
    '''
    Build Document objects from the JSON records and embed each of them.

    Args:
        data (iterable): JSON records accepted by get_documents
        model_name (str): Name of the embedding model handed to get_embedding_model

    Returns:
        documents (list): List of Document objects with vectors added
    '''

    # Arguments are evaluated left to right: documents are built first, then the model
    return add_vector(get_documents(data), get_embedding_model(model_name))

### Creating Database

In [6]:
def instantiate_database(collection_name, embedding_model):
    '''
    Connect to Qdrant and make sure the target collection exists.

    Args:
        collection_name (str): Name of the collection to create (or reuse)
        embedding_model: Embedding model object; used only to probe the
            vector size when the collection has to be created

    Returns:
        client (QdrantClient): QdrantClient object
    '''

    # Read QDRANT_URL / QDRANT_API_KEY from the .env file (overriding the current environment)
    load_dotenv(override=True)
    client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))

    # create_collection raises when the collection already exists, which made the
    # notebook fail on every run after the first. Reuse an existing collection as-is;
    # its configuration and any points already stored in it are left untouched.
    if client.collection_exists(collection_name):
        return client

    # Derive the vector size from the model by embedding a short probe string
    embedding_size = len(encode_text(embedding_model, 'test'))

    # Create the collection in Qdrant
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_size, distance=Distance.COSINE),
    )

    return client

### Store Text and Vector to Database

In [None]:
def add_document(client, collection_name, doc):
    '''
    Upsert a single document as one point in a Qdrant collection.

    Args:
        client: Client object exposing upsert (the QdrantClient returned by
            instantiate_database elsewhere in this notebook)
        collection_name (str): Name of the collection to store the document in
        doc: Document object whose metadata holds 'id', 'vector', 'topic',
            'subtopic' and 'source'
    '''

    # Only these metadata fields go into the payload
    stored_fields = ('topic', 'subtopic', 'source')
    metadata = {field: doc.metadata[field] for field in stored_fields}

    # Build the point, then write it to Qdrant
    point = PointStruct(
        id=doc.metadata['id'],
        vector=doc.metadata['vector'],
        payload={"text": doc.page_content, "metadata": metadata},
    )
    client.upsert(collection_name=collection_name, points=[point])

def store_documents(client, collection_name, documents):
    '''
    Store every document in the collection, one add_document call per document.

    Args:
        client: Client object forwarded to add_document
        collection_name (str): Name of the collection to store the documents in
        documents (list): List of Document objects
    '''

    # Store the documents one by one, in order
    for document in documents:
        add_document(client, collection_name, document)

    print("Documents stored successfully")
        

# Indexing Pipeline

In [14]:
def indexing_pipeline(file_path, model_name, document_name, collection_name, backup_dir='../data/backup'):
    '''
    Run the full indexing flow: load the JSON records, embed them, write a
    metadata backup to disk, and store the embedded documents in Qdrant.

    Args:
        file_path (str): Path to the JSON file containing the data
        model_name (str): Name of the embedding model
        document_name (str): File name of the metadata backup
        collection_name (str): Name of the Qdrant collection to store into
        backup_dir (str): Directory the backup file is written to

    Returns:
        None
    '''

    data = get_json_document(file_path)
    documents = get_documents_with_vector(data, model_name)

    # Save a backup of the document metadata (without the vectors).
    # The backup is derived from the documents that are actually stored: calling
    # get_documents again would draw new random ids that match nothing in the database.
    # 'vector' is kept as an empty list so the backup file keeps its existing schema.
    backup_records = [{**doc.metadata, 'vector': []} for doc in documents]
    doc_path = os.path.join(backup_dir, document_name)
    # Create the target directory on a fresh checkout instead of failing in open()
    os.makedirs(os.path.dirname(doc_path) or '.', exist_ok=True)
    with open(doc_path, 'w', encoding='utf-8') as f:
        json.dump(backup_records, f, indent=4)
    print("Documents exported to", document_name)

    # Instantiate the database
    client = instantiate_database(collection_name, get_embedding_model(model_name))
    print("Database instantiated successfully")

    # Store the documents in the database
    store_documents(client, collection_name, documents)

# Pipeline Execution

In [15]:
# First version: uses the 'nomic-embed-text' embedding model

file_path = '../data/cvd_prepared.json'
document_name = 'documents_v1.json'
model_name = 'nomic-embed-text'
collection_name = 'cvd_collection_v1'

indexing_pipeline(
    file_path=file_path,
    model_name=model_name,
    document_name=document_name,
    collection_name=collection_name,
)

Documents exported to documents_v1.json
Database instantiated successfully
Documents stored successfully


In [16]:
# Second version: uses the 'mxbai-embed-large' embedding model

file_path = '../data/cvd_prepared.json'
document_name = 'documents_v2.json'
model_name = 'mxbai-embed-large'
collection_name = 'cvd_collection_v2'

indexing_pipeline(
    file_path=file_path,
    model_name=model_name,
    document_name=document_name,
    collection_name=collection_name,
)

Documents exported to documents_v2.json
Database instantiated successfully
Documents stored successfully
