### Import Handling

In [None]:

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    SummaryExtractor,
)

from llama_index.core.node_parser import (
    SemanticDoubleMergingSplitterNodeParser,
    LanguageConfig,
)

#from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.llms.groq import Groq
from llama_index.llms.ollama import Ollama
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.vector_stores.postgres import PGVectorStore

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openrouter import OpenRouter
from llama_index.core.llms import ChatMessage
import os 
from dotenv import load_dotenv

# Load variables from a .env file (if any) before the keys are read.
load_dotenv()

# Base URL of the OpenRouter API.
openai_api_base = "https://openrouter.ai/api/v1"

# Both lookups yield None when the variable is not set.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")
api_key = os.environ.get("LLM_API_KEY")  # handed to the OpenRouter LLM


In [None]:
# Embedding model shared by the semantic splitter and the vector index.
# Vectors are requested with 1024 dimensions, the same width the pgvector
# table is created with (embed_dim=1024).
#
# Local alternative:
# embed_model = HuggingFaceEmbedding(model_name="Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)
embed_model = OpenAIEmbedding(
    dimensions=1024,
    model="text-embedding-3-large",
)


### Document Ingestion

In [None]:
# Load the contents of the "data" directory and print the parsed documents
# for a quick visual check.
documents = SimpleDirectoryReader(input_dir="data").load_data()
print(documents)

### LLM

In [None]:
# Chat model reached through OpenRouter. The same instance is passed to the
# metadata extractors and to the auto-retriever.
llm = OpenRouter(
    model="meta-llama/llama-3.3-70b-instruct",
    api_key=api_key,
    context_window=4096,
    max_tokens=256,
)

In [None]:
# Alternative LLM back-ends, kept for reference. Uncomment one of them to
# replace the OpenRouter model configured above.
# groq_api_key="groq_api_key"

# llm = Groq(model="llama3-8b-8192", api_key=groq_api_key)

# llm = Ollama(model="llama3.2:latest", request_timeout=120.0)
# print(llm.complete("What is the Capital of France"))

# Template handed to TitleExtractor as `node_template`. It asks the model for
# a one-line description of a chunk; `{context_str}` marks where the chunk
# text is expected to be substituted.
prompt = """ You are a chunk analysis assistant. Your task is to examine a chunk of text—typically extracted from a PDF document—and generate a **clear, concise one-line description** that accurately summarizes the key information contained in the chunk.

Instructions:
- Focus only on what is explicitly present in the chunk. Do not infer or interpret beyond the given content.
- Include visible structural clues such as section headers, bullet points, or table data if they help contextualize the description.
- Write in plain language suitable for downstream use in semantic retrieval or indexing.
- The output should be a single sentence that captures the main idea or purpose of the chunk.

**chunk content:**
{context_str}

"""


### Chunking + Metadata Extraction

In [None]:
from llama_index.core.schema import MetadataMode

# Three alternative chunkers are prepared here; only the double-merging one
# is placed in the ingestion transformations.

# 1) Sentence splitter: 512-sized chunks with an overlap of 50.
text_splitter = SentenceSplitter(chunk_overlap=50, chunk_size=512)

# 2) Semantic splitter driven by the embedding model.
semantic_text_splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    breakpoint_percentile_threshold=95,
    buffer_size=1,
)

# 3) Semantic double-merging splitter configured with a spaCy English model.
config = LanguageConfig(spacy_model="en_core_web_md", language="english")
double_semantic_merging_splitter = SemanticDoubleMergingSplitterNodeParser(
    max_chunk_size=5000,
    merging_threshold=0.6,
    appending_threshold=0.6,
    initial_threshold=0.4,
    language_config=config,
)
# print("double_semantic_merging_splitter_content:",double_semantic_merging_splitter)

# Metadata extractors, all driven by the same LLM.

# Bundled pair (summary + questions) kept as a list.
extractors = [
    SummaryExtractor(llm=llm, summaries=["prev", "self", "next"]),
    QuestionsAnsweredExtractor(
        llm=llm,
        metadata_mode=MetadataMode.EMBED,
        questions=3,
    ),
]

# Stand-alone instances; each of these is a single extractor object.
summary_extractors = SummaryExtractor(
    llm=llm,
    summaries=["prev", "self", "next"],
)
question_extractors = QuestionsAnsweredExtractor(
    llm=llm,
    metadata_mode=MetadataMode.EMBED,
    questions=3,
)

# Title extraction uses the custom one-line-description template.
title_extractor = TitleExtractor(node_template=prompt, llm=llm, nodes=5)
# print("Title extractor:", title_extractor)

# Only chunking and title extraction run inside the pipeline.
# transformations = [double_semantic_merging_splitter,title_extractor] + summary_extractors
transformations = [
    double_semantic_merging_splitter,
    title_extractor,
]
print(transformations)

### Ingestion Pipeline + Transformations

In [None]:
from llama_index.core.ingestion import IngestionPipeline
import nest_asyncio

# Patch asyncio so event loops can be nested (the notebook already runs one).
nest_asyncio.apply()

# Build the pipeline from the transformations chosen above, then run it over
# the loaded documents to produce the nodes.
pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(
    show_progress=True,
    in_place=True,
    documents=documents,
)


In [None]:
# Quick sanity check: how many nodes were produced, and what metadata the
# first one carries.
node_count = len(nodes)
print(f"Number of nodes: {node_count}")

first_node_metadata = nodes[0].metadata
print(f"content: {first_node_metadata}")

In [None]:
# Optional step: run summary extraction outside the ingestion pipeline.
# Use this when running the extractor inside the pipeline hits the model's
# token limit (and keep it out of `transformations` in that case).
#
# `summary_extractors` is a single SummaryExtractor, not a list. Looping over
# it would iterate the model's (field, value) pairs rather than extractors,
# so the extractor is called directly on the nodes.
nodes = summary_extractors(nodes)

In [None]:
# Optional step: add "questions this excerpt can answer" metadata.
# Skip this cell if it runs into the model's token limit.
#
# `question_extractors` is a single QuestionsAnsweredExtractor, not a list.
# Looping over it would iterate the model's (field, value) pairs rather than
# extractors, so the extractor is called directly on the nodes.
nodes = question_extractors(nodes)

In [None]:
# Inspect the second node: its full metadata followed by each summary field.
# The summary keys are expected to come from the optional summary-extraction
# step, so they are looked up with a fallback instead of raising KeyError
# when that step was skipped. The index is guarded as well, because a very
# small corpus may produce fewer than two nodes.
separator = '**********************'

if len(nodes) < 2:
    print(f"Only {len(nodes)} node(s) available; no node at index 1 to inspect.")
else:
    sample_metadata = nodes[1].metadata
    print(f"content: {sample_metadata}")
    print(separator)
    for key in ("prev_section_summary", "next_section_summary", "section_summary"):
        print(f"{key}: {sample_metadata.get(key, '<not available>')}")
        print(separator)


### Database Setup

In [None]:
import psycopg2

# Recreate the target database from scratch.
# WARNING: any existing database named `db_name` is dropped.
from psycopg2 import sql

# Replace YOURPASSWORD / YOURPORT with real values before running.
connection_string = "postgresql://postgres:YOURPASSWORD@localhost:YOURPORT"
db_name = "vector_autoretrieval"

conn = psycopg2.connect(connection_string)
# DROP/CREATE DATABASE cannot run inside a transaction block.
conn.autocommit = True

try:
    with conn.cursor() as c:
        # Compose the database name as a quoted identifier rather than
        # interpolating it into the SQL text.
        c.execute(
            sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(db_name))
        )
        c.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name)))
finally:
    # This admin connection is only needed for the two statements above;
    # close it so it does not stay open for the rest of the session.
    conn.close()

In [None]:
from sqlalchemy import make_url

# Parse the connection string once and reuse its parts to point the vector
# store at the freshly created database.
url = make_url(connection_string)
vector_store = PGVectorStore.from_params(
    host=url.host,
    port=url.port,
    user=url.username,
    password=url.password,
    database=db_name,
    table_name="vector_store",
    embed_dim=1024,  # same width the embedding model is configured with
    text_search_config="english",
    hybrid_search=True,
)

# Build the index over the ingested nodes, backed by the Postgres vector
# store and using the shared embedding model.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embed_model,
    storage_context=storage_context,
    show_progress=True,
)

In [None]:
# Confirmation message followed by the index object itself.
print(
    "Index created successfully.",
    index,
)

In [None]:
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo


# Description of the store's content and metadata fields, handed to the
# auto-retriever. Each MetadataInfo names one metadata key and describes it
# in plain language.
vector_store_info = VectorStoreInfo(
    content_info="Answer questions only fom the vector store anything outside of it is unnecessary",
    metadata_info=[
        MetadataInfo(
            name="section_summary",
            type="str",
            description=(
                "summary of the chunks"
            ),
        ),
        MetadataInfo(
            name="prev_section_summary",
            type="str",
            description=(
                "summry of the prev chunk"
            ),
        ),
        MetadataInfo(
            name="next_section_summary",
            type="str",
            # Fixed: this description was copy-pasted from the "prev" entry
            # and wrongly described the field as the previous chunk.
            description=(
                "summary of the next chunk"
            ),
        ),
        MetadataInfo(
            name="questions_this_excerpt_can_answer",
            type="str",
            description=(
                "3 questions that this chunk can answer"
            ),
        ),
    ],
)


In [None]:
from llama_index.core.prompts import PromptTemplate

# Prompt template later passed to the auto-retriever as `retrieval_prompt`.
# The text is the chunk-description instruction (same wording as the title
# extraction template) with a `{context_str}` placeholder.
# NOTE(review): the template body ends with a literal "# " just before the
# closing quotes; that looks like a leftover rather than intended prompt
# text - confirm before relying on it.
# NOTE(review): this is a summarisation prompt, not a query/filter prompt -
# confirm it is the template the retriever is meant to receive.
custom_prompt = PromptTemplate(
    """ You are a chunk analysis assistant. Your task is to examine a chunk of text—typically extracted from a PDF document—and generate a **clear, concise one-line description** that accurately summarizes the key information contained in the chunk.

Instructions:
- Focus only on what is explicitly present in the chunk. Do not infer or interpret beyond the given content.
- Include visible structural clues such as section headers, bullet points, or table data if they help contextualize the description.
- Write in plain language suitable for downstream use in semantic retrieval or indexing.
- The output should be a single sentence that captures the main idea or purpose of the chunk.

**chunk content:**
{context_str}



# """
)

# Auto-retriever over the index, described by `vector_store_info` and driven
# by the shared LLM.
# NOTE(review): verify that `retrieval_prompt` is a keyword this retriever
# actually honours, and that the chunk-description template is the intended
# prompt for it.
retriever = VectorIndexAutoRetriever(
    index,
    llm=llm,
    retrieval_prompt=custom_prompt,
    vector_store_info=vector_store_info,
)

# Sample query; as the last expression of the cell its result is displayed.
retriever.retrieve(
    "Give me the total number of references in the paper?"
)