### Import Handling

In [None]:

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    SummaryExtractor,
)

from llama_index.core.node_parser import (
    SemanticDoubleMergingSplitterNodeParser,
    LanguageConfig,
)

#from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.llms.groq import Groq
from llama_index.llms.ollama import Ollama
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.vector_stores.postgres import PGVectorStore

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openrouter import OpenRouter
from llama_index.core.llms import ChatMessage
import os 
from dotenv import load_dotenv
from llama_index.retrievers.bm25 import BM25Retriever


# Merge variables from a local .env file into the process environment before
# any of the lookups below run.
load_dotenv()

# Both keys come from the environment; a missing variable yields None.
OPEN_API_KEY = os.environ.get('OPENAI_API_KEY')
api_key = os.environ.get('LLM_API_KEY')

# Base URL of the OpenRouter HTTP API.
openai_api_base = "https://openrouter.ai/api/v1"


### Embeddings

In [None]:
# Alternative embedding model, kept for reference:
# embed_model = HuggingFaceEmbedding(model_name="Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)

# The 1024 requested here is the same value given as embed_dim to the
# PGVectorStore created later in this notebook; keep the two in sync.
embed_model = OpenAIEmbedding(
    dimensions=1024,
    model="text-embedding-3-large",
)

### Document Ingestion

In [None]:
# Point a SimpleDirectoryReader at "data", load it, and echo the result so
# the loaded documents can be inspected in the notebook output.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()
print(documents)

### LLM Initialization

In [None]:
# LLM reached through OpenRouter, authenticated with the LLM_API_KEY value
# read from the environment above.
llm = OpenRouter(
    model="meta-llama/llama-3.3-70b-instruct",
    api_key=api_key,
    context_window=4096,
    max_tokens=256,
)

### Prompt Initialization

In [None]:
# groq_api_key="groq_api_key"
# llm = Groq(model="llama3-8b-8192", api_key=groq_api_key)

# Instruction template asking for a one-line description of a text chunk.
# The {context_str} placeholder marks where the chunk text goes. This string
# is handed to TitleExtractor as its node_template further down.
prompt = """ You are a chunk analysis assistant. Your task is to examine a chunk of text—typically extracted from a PDF document—and generate a **clear, concise one-line description** that accurately summarizes the key information contained in the chunk.

Instructions:
- Focus only on what is explicitly present in the chunk. Do not infer or interpret beyond the given content.
- Include visible structural clues such as section headers, bullet points, or table data if they help contextualize the description.
- Write in plain language suitable for downstream use in semantic retrieval or indexing.
- The output should be a single sentence that captures the main idea or purpose of the chunk.

**chunk content:**
{context_str}
"""

In [None]:
from llama_index.core.schema import MetadataMode
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)

# --- Chunking strategies -------------------------------------------------
# Three alternative splitters are constructed here; only
# `semantic_text_splitter` is placed in `transformations` at the bottom of
# this cell. The others are kept so they can be swapped in.

# Fixed-size splitter: 512-unit chunks with a 50-unit overlap.
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)

# Embedding-driven splitter using the notebook's embedding model.
semantic_text_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)

# spaCy-based double-merging splitter (needs the en_core_web_md model).
config = LanguageConfig(language="english", spacy_model="en_core_web_md")
double_semantic_merging_splitter = SemanticDoubleMergingSplitterNodeParser(
    language_config=config,
    initial_threshold=0.4,
    appending_threshold=0.6,
    merging_threshold=0.6,
    max_chunk_size=5000,
)
print("double_semantic_merging_splitter_content:", double_semantic_merging_splitter)

# --- Metadata extractors -------------------------------------------------
# Every extractor is bound explicitly to the notebook's `llm`.
summary_extractors = [
    SummaryExtractor(summaries=["prev", "self", "next"], llm=llm),
]

qa_extractors = [
    QuestionsAnsweredExtractor(
        questions=3, llm=llm, metadata_mode=MetadataMode.EMBED
    ),
]

# Uses the custom `prompt` as the per-node template.
title_extractor = TitleExtractor(nodes=5, llm=llm, node_template=prompt)
print("Title extractor:", title_extractor)

# --- Pipeline steps ------------------------------------------------------
# Fix: KeywordExtractor was the only extractor created without `llm=llm`, so
# it did not use the configured `llm`, unlike its siblings. Pass it
# explicitly so all extraction goes through the same model.
transformations = [
    semantic_text_splitter,
    KeywordExtractor(keywords=10, llm=llm),
]

In [None]:
from llama_index.core.ingestion import IngestionPipeline
import nest_asyncio

# Patch asyncio via nest_asyncio before running the pipeline
# (NOTE(review): presumably needed because the notebook already runs an
# event loop - confirm).
nest_asyncio.apply()

# Build the ingestion pipeline from the configured steps and run it over the
# loaded documents, showing a progress bar.
pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=documents, in_place=True, show_progress=True)


In [None]:
# Show one sample node for inspection. The index is guarded so that a small
# corpus (fewer than 12 nodes) prints a notice instead of aborting the
# notebook with an IndexError.
sample_index = 11
if sample_index < len(nodes):
    print(f"Node {sample_index}: {nodes[sample_index]}")
else:
    print(
        f"Node {sample_index} is unavailable: "
        f"only {len(nodes)} nodes were produced"
    )

In [None]:
# Report how many nodes the ingestion pipeline produced.
node_count = len(nodes)
print(f"Number of nodes: {node_count}")

In [None]:
import psycopg2
from psycopg2 import sql

# Placeholder DSN: replace YOURPASSWORD and YOURPORT before running.
connection_string = "postgresql://postgres:YOURPASSWORD@localhost:YOURPORT"
db_name = "vector_custom_retrieval"

# Recreate the target database from scratch. WARNING: this drops any
# existing database with the same name.
conn = psycopg2.connect(connection_string)
try:
    # The DDL statements below are issued with autocommit enabled, i.e.
    # outside an explicit transaction.
    conn.autocommit = True
    # Quote the database name as an SQL identifier rather than splicing it
    # into the statement text.
    db_identifier = sql.Identifier(db_name)
    with conn.cursor() as c:
        c.execute(sql.SQL("DROP DATABASE IF EXISTS {}").format(db_identifier))
        c.execute(sql.SQL("CREATE DATABASE {}").format(db_identifier))
finally:
    # The cursor's `with` block closes only the cursor; close the connection
    # explicitly so the admin session is not leaked.
    conn.close()

In [None]:
from sqlalchemy import make_url

# Split the DSN into its parts so they can be passed individually.
url = make_url(connection_string)

# HNSW index settings forwarded to the vector store.
hnsw_settings = {
    "hnsw_m": 16,
    "hnsw_ef_construction": 64,
    "hnsw_ef_search": 40,
    "hnsw_dist_method": "vector_cosine_ops",
}

# Vector store backed by the "hybrid_search" table with hybrid search
# enabled. embed_dim is 1024, the same value requested from the embedding
# model.
vector_store = PGVectorStore.from_params(
    host=url.host,
    port=url.port,
    user=url.username,
    password=url.password,
    database=db_name,
    table_name="hybrid_search",
    embed_dim=1024,
    hybrid_search=True,
    text_search_config="english",
    hnsw_kwargs=hnsw_settings,
)

In [None]:
from llama_index.core import StorageContext

# Wrap the vector store in a storage context, then register the ingested
# nodes in that context's docstore.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
docstore = storage_context.docstore
docstore.add_documents(nodes)

In [None]:
# Sanity check: node count followed by the storage context's repr.
total_nodes = len(nodes)
print(f"Number of nodes: {total_nodes}")
print(storage_context)

In [None]:
from llama_index.core import SimpleKeywordTableIndex, VectorStoreIndex

# Build the vector index over the ingested nodes, using the shared storage
# context and the notebook's embedding model.
vector_index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
    show_progress=True,
)

In [None]:
# Build the keyword-table index over the same nodes and storage context,
# passing the notebook's LLM.
keyword_index = SimpleKeywordTableIndex(
    nodes,
    storage_context=storage_context,
    llm=llm,
    show_progress=True,
)

In [None]:
# import QueryBundle
from llama_index.core import QueryBundle

# import NodeWithScore
from llama_index.core.schema import NodeWithScore

# Retrievers
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)
from llama_index.core.indices.keyword_table import (
    KeywordTableGPTRetriever,
)
from typing import List

In [None]:
class CustomRetriever(BaseRetriever):
    """Retriever that combines a vector retriever with a keyword retriever.

    Both underlying retrievers are queried and their hits are merged by
    node id:

    * ``"AND"`` keeps only nodes returned by *both* retrievers.
    * ``"OR"`` keeps nodes returned by *either* retriever.

    Results come back in a deterministic order: vector hits first, in the
    order the vector retriever returned them, followed (in ``"OR"`` mode) by
    keyword-only hits in the keyword retriever's order. When a node is
    returned by both retrievers, the vector retriever's ``NodeWithScore`` is
    the one kept.
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: BaseRetriever,
        mode: str = "AND",
    ) -> None:
        """Store the two retrievers and the combination mode.

        Args:
            vector_retriever: Retriever used for the vector side.
            keyword_retriever: Retriever used for the keyword side. Only its
                ``retrieve`` method is called, so any retriever works here.
            mode: ``"AND"`` (intersection) or ``"OR"`` (union).

        Raises:
            ValueError: If ``mode`` is neither ``"AND"`` nor ``"OR"``.
        """
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")

        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        self._mode = mode
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes for ``query_bundle`` from both retrievers and merge.

        Args:
            query_bundle: The query to run against both retrievers.

        Returns:
            The merged hits, ordered as described in the class docstring.
        """
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)

        # A dict keeps insertion order, which gives a stable result order
        # (iterating a set of ids would not). setdefault keeps the first
        # entry seen per id, so vector hits win over keyword duplicates.
        merged = {}
        for hit in vector_nodes:
            merged.setdefault(hit.node.node_id, hit)

        if self._mode == "AND":
            keyword_ids = {hit.node.node_id for hit in keyword_nodes}
            return [
                hit for node_id, hit in merged.items() if node_id in keyword_ids
            ]

        # "OR": append keyword-only hits after the vector hits.
        for hit in keyword_nodes:
            merged.setdefault(hit.node.node_id, hit)
        return list(merged.values())

In [None]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine


# --- Retrievers ----------------------------------------------------------
# Vector side: top 4 hits from the vector index.
vector_retriever = VectorIndexRetriever(
    index=vector_index,
    similarity_top_k=4,
    verbose=True,
)
# Keyword side: retriever over the keyword-table index.
keyword_retriever = KeywordTableGPTRetriever(
    index=keyword_index,
    verbose=True,
)
# Combination of the two; no mode is given, so the default "AND" applies.
custom_retriever = CustomRetriever(vector_retriever, keyword_retriever)

# --- Response synthesizer (shared by all three engines) ------------------
response_synthesizer = get_response_synthesizer(
    llm=llm,
    response_mode="tree_summarize",
)

# --- Query engines -------------------------------------------------------
# Combined (vector + keyword) engine.
custom_query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=custom_retriever,
)

# Vector-only engine.
vector_query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=vector_retriever,
)

# Keyword-only engine.
keyword_query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=keyword_retriever,
)

In [None]:
# Define your query
query = "Give me the authors in the paper?"
# query = "who are the authors of the paper ?"

# Extract keywords using the keyword retriever's internal method.
# NOTE: _get_keywords and _index_struct are private members of the
# retriever; they are used here for inspection only.
extracted_keywords = keyword_retriever._get_keywords(query)

# Display the extracted keywords
print(f"> Starting query: {query}")
print(f"INFO:llama_index.indices.keyword_table.retrievers:query keywords: {extracted_keywords}")
print(f"query keywords: {extracted_keywords}")

# Filter keywords to those present in the index structure. The index's
# keyword collection is looked up once, outside the comprehension, instead
# of being re-evaluated for every candidate keyword.
index_keywords = keyword_retriever._index_struct.keywords
filtered_keywords = [k for k in extracted_keywords if k in index_keywords]

# Display the filtered keywords
print(f"INFO:llama_index.indices.keyword_table.retrievers:> Extracted keywords: {filtered_keywords}")
print(f"> Extracted keywords: {filtered_keywords}")

# Proceed with the query using the combined (vector + keyword) query engine
response = custom_query_engine.query(query)

# Display the final response
print("Final Response:")
print(response.response)