### Implementing sentence window retrieval with llama index
This notebook shows the implementation of sentence window retrieval with llama index.

In [1]:
import os
import openai 
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
from llama_index.readers.file.base import SimpleDirectoryReader

In [7]:
documents = SimpleDirectoryReader(input_files=["./MIV2 - LLM paper.pdf"]).load_data()

In [8]:
len(documents)

17

In [9]:
from llama_index.schema import Document

document = Document(text="\n\n".join([doc.text for doc in documents]))
print(document.text[:100])

Human-Robot interaction through joint robot planning
with Large Language Models
Kosi Asuzu1*
1*Birmi


In [23]:
from llama_index.node_parser import SentenceWindowNodeParser

# create sentence window node parser with the default settings
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [24]:
text = "hello. how are you? I am fine!  "

nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [25]:
len(nodes)

3

In [26]:
print([x.text for x in nodes])

['hello. ', 'how are you? ', 'I am fine!  ']


In [27]:
print(nodes[2].metadata["window"])

hello.  how are you?  I am fine!  


In [28]:
text = "hello. foo bar. cat dog. mouse"

nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [29]:
print([x.text for x in nodes])

['hello. ', 'foo bar. ', 'cat dog. ', 'mouse']


In [30]:
print(nodes[0].metadata["window"])

hello.  foo bar.  cat dog. 


#### Building the index

In [31]:
from llama_index.llms import OpenAI

llm = OpenAI(model="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.1)

In [32]:
from llama_index import ServiceContext

sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    # embed_model="local:BAAI/bge-large-en-v1.5"
    node_parser=node_parser,
)

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
from llama_index import VectorStoreIndex

sentence_index = VectorStoreIndex.from_documents(
    [document], service_context=sentence_context
)

In [38]:
sentence_index.storage_context.persist(persist_dir="./.cache")

In [40]:
# This block of code is optional to check
# if an index file exist, then it will load it
# if not, it will rebuild it

import os
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index import load_index_from_storage

if not os.path.exists("./sentence_index"):
    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )

    sentence_index.storage_context.persist(persist_dir="./.cache")
else:
    sentence_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./cache"),
        service_context=sentence_context
    )

### Building the postprocessor
What are the post processors used for since the index has already been created

In [41]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

postproc = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)

In [42]:
from llama_index.schema import NodeWithScore
from copy import deepcopy

scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
nodes_old = [deepcopy(n) for n in nodes]

In [43]:
nodes_old[1].text

'foo bar. '

In [44]:
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [45]:
print(replaced_nodes[1].text)

hello.  foo bar.  cat dog.  mouse


##### Addding the ReRanker
What is the reranker used for 

In [47]:
from llama_index.indices.postprocessor import SentenceTransformerRerank

# BAAI/bge-reranker-base
# link: https://huggingface.co/BAAI/bge-reranker-base
rerank = SentenceTransformerRerank(
    top_n=2, model="BAAI/bge-small-reranker-base"
)

OSError: BAAI/bge-small-reranker-base is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [48]:
from llama_index import QueryBundle
from llama_index.schema import TextNode, NodeWithScore

query = QueryBundle("I want a dog.")

scored_nodes = [
    NodeWithScore(node=TextNode(text="This is a cat"), score=0.6),
    NodeWithScore(node=TextNode(text="This is a dog"), score=0.4),
]

In [None]:
reranked_nodes = rerank.postprocess_nodes(
    scored_nodes, query_bundle=query
)

#### Runing the query engine


In [None]:
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postproc, rerank]
)

In [None]:
window_response = sentence_window_engine.query(
    "What are the keys to building a career in AI?"
)

In [None]:
from llama_index.response.notebook_utils import display_response

display_response(window_response)

#### Putting it all together
We will be adding the postprocessing and the reranking to build a full sentence window retrieval system

In [49]:
from llama_index.readers.file.base import SimpleDirectoryReader
from llama_index.service_context import ServiceContext
from llama_index.storage import StorageContext
from llama_index.schema import Document
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank

from llama_index import load_index_from_storage


In [51]:
from typing import List, Any

In [52]:
def build_sentence_window_index(
    documents: List[Document],
    llm: Any,
    embed_model: str = "local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="./.cache",
):
    """
    Code for building sentence window context
    """
    # create the node parser
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )
    
    if not os.path.exists(save_dir):
        sentence_index = VectorStoreIndex.from_documents(
            documents, service_context=sentence_context
        )
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    return sentence_index
    

In [53]:
def get_sentence_window_query_engine(
    sentence_index: VectorStoreIndex, similarity_top_k=6, rerank_top_n=2
):
    # define postprocessors
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine

In [55]:
from llama_index.llms import OpenAI

index = build_sentence_window_index(
    [document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
)


In [56]:
engine = get_sentence_window_query_engine(index)

model.safetensors:   0%|          | 0.00/1.11G [00:05<?, ?B/s]


KeyboardInterrupt: 

### TruEra Evaluation
Using the TruEra to perform RAG triad evaluations

In [57]:
eval_questions = []
with open('generated_questions.text', 'r') as file:
    for line in file:
        # Remove newline character and convert to integer
        item = line.strip()
        eval_questions.append(item)

FileNotFoundError: [Errno 2] No such file or directory: 'generated_questions.text'

In [None]:
from trulens_eval import Tru

def run_evals(eval_questions, tru_recorder, query_engine):
    for question in eval_questions:
        with tru_recorder as recording:
            response = query_engine.query(question)

In [None]:
from utils import get_prebuilt_trulens_recorder

from trulens_eval import Tru

Tru().reset_database()