## For Statutes
### This section is for statutes, all of which are searchable PDFs

In [None]:
!pip install -q pymupdf4llm qdrant-client llama-index-llms-groq llama-index-vector-stores-qdrant llama-index-embeddings-openai

In [None]:
import os
import pymupdf4llm
from google.colab import userdata
from qdrant_client import QdrantClient, models
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.groq import Groq

# Environment setup: copy the OpenAI and Groq API keys from Colab's
# `userdata` into environment variables of the same name.
for _secret_name in ("OPENAI_API_KEY", "GROQ_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

# Qdrant connection settings are kept as plain module-level values.
QDRANT_URL = userdata.get("QDRANT_URL")
QDRANT_API_KEY = userdata.get("QDRANT_API_KEY")

In [None]:
# Create a 'data' directory and upload your PDF files there.
DATA_DIR = "data"
# exist_ok=True keeps this cell idempotent and avoids a check-then-create race.
os.makedirs(DATA_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort the names to make the
# order of `documents` (and any positional indexing in later cells)
# reproducible. The extension test is case-insensitive so files such as
# "ACT.PDF" are not silently skipped.
pdf_filenames = sorted(
    name for name in os.listdir(DATA_DIR) if name.lower().endswith(".pdf")
)

documents = []
for filename in pdf_filenames:
    pdf_path = os.path.join(DATA_DIR, filename)
    print(f"Processing {pdf_path}...")
    # Extract text as Markdown for better structural representation
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # Create a LlamaIndex Document object, keeping the source file name
    # as metadata.
    doc = Document(
        text=md_text,
        metadata={"file_name": filename}
    )
    documents.append(doc)

print(f"\nSuccessfully loaded and processed {len(documents)} PDF documents.")

In [None]:
# Sanity-check the extraction by previewing one document's Markdown.
# The index is clamped so the cell does not raise IndexError when fewer than
# PREVIEW_INDEX + 1 PDFs were loaded, and the text is truncated so the cell
# output stays readable.
PREVIEW_INDEX = 99
PREVIEW_CHARS = 2000
(
    documents[min(PREVIEW_INDEX, len(documents) - 1)].text[:PREVIEW_CHARS]
    if documents
    else "No documents loaded; nothing to preview."
)

In [None]:
# Connect to Qdrant using the URL and API key loaded during environment setup
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

collection_name = "material_boq"

# Create the collection (with binary quantization) only when it is missing
if client.collection_exists(collection_name=collection_name):
    print(f"Collection '{collection_name}' already exists.")
else:
    # Original (full-precision) vectors
    dense_vectors = models.VectorParams(
        size=1536,  # For OpenAI embedding model text-embedding-3-small
        distance=models.Distance.COSINE,
        on_disk=True,  # Move original vectors to disk
    )
    # Binary-quantized copies of the vectors
    binary_quantization = models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(
            always_ram=True,  # Store only quantized vectors in RAM
        )
    )
    client.create_collection(
        collection_name=collection_name,
        vectors_config=dense_vectors,
        quantization_config=binary_quantization,
    )
    print(f"Collection '{collection_name}' created.")

In [None]:
# OpenAI embedding model (text-embedding-3-small) used when building the index
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# Vector store that reads from and writes to the Qdrant collection
vector_store = QdrantVectorStore(collection_name=collection_name, client=client)

In [None]:
# Wrap the Qdrant vector store in a storage context so the index uses it
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index: the documents are embedded with `embed_model` and the
# resulting vectors are stored in Qdrant. A progress bar is shown.
index_options = {
    "storage_context": storage_context,
    "embed_model": embed_model,
    "show_progress": True,
}
index = VectorStoreIndex.from_documents(documents, **index_options)

print("\nIndexing complete.")

Parsing nodes:   0%|          | 0/693 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/739 [00:00<?, ?it/s]


Indexing complete.


In [None]:
from llama_index.core import Document, VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever

# Groq-hosted LLM that writes the final answer
llm = Groq(model="llama-3.3-70b-versatile")

# Retriever: fetches the 5 most similar entries from the index for each query
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

# Response synthesizer: turns the retrieved context into an answer using the
# LLM in "compact" response mode
response_synthesizer = get_response_synthesizer(response_mode="compact", llm=llm)

# Wire retriever and synthesizer together into a single query engine
query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

print("Query engine is ready.")

Query engine is ready.


In [None]:
# Query the indexed documents and print the synthesized answer.
#
# No embedding override is applied here. The index was built with the
# text-embedding-3-small model, and the retriever created from that index is
# expected to embed queries with the same model.
# NOTE(review): VectorIndexRetriever appears to keep its model in the private
# `_embed_model` attribute (taken from the index), so assigning
# `query_engine.retriever.embed_model` would have no effect -- confirm against
# the installed llama-index version. OpenAI embeddings also have no
# 'retrieval_query' task type; that concept belongs to other providers.
query_text = "what is the summary of constitution of pakistan"
response = query_engine.query(query_text)

print("Query:", query_text)
print("\nResponse:")
print(response)

Query: what is the summary of constitution of pakistan

Response:
The Constitution of Pakistan is a foundational document that outlines the principles, structures, and powers of the government, as well as the rights and responsibilities of citizens. It establishes Pakistan as an Islamic republic, with Islam as the state religion, and sets out the framework for the country's governance. The Constitution emphasizes the responsibility of each organ and authority of the State, as well as individuals performing functions on behalf of the State, to act in accordance with the Principles of Policy. These principles are guidelines that outline the responsibilities of the State and individuals to promote social justice, protect minority rights, and eliminate exploitation.

The Constitution also enshrines fundamental rights, including the right to life, liberty, and property, freedom of speech and assembly, and the right to education and healthcare. It provides for the protection of minority righ