In [1]:
# Install a blanket "ignore" warning filter before the remaining imports run,
# so warnings do not clutter the cell output.
import warnings
warnings.filterwarnings('ignore')

# Project-local helper module; provides get_openai_api_key() used below.
from scripts import utils

import os
import openai
# The key is obtained through the helper instead of being written into the notebook.
# NOTE(review): where the helper reads the key from is not visible here — confirm.
openai.api_key = utils.get_openai_api_key()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [2]:
from llama_index import SimpleDirectoryReader

# Load the single source PDF into `documents`.
pdf_reader = SimpleDirectoryReader(
    input_files=["pdfs/eBook-How-to-Build-a-Career-in-AI.pdf"],
)
documents = pdf_reader.load_data()

In [3]:
from llama_index import Document

# Merge the text of every loaded piece into one Document,
# separating the pieces with a blank line.
merged_text = "\n\n".join(piece.text for piece in documents)
document = Document(text=merged_text)

### Sentence-window retrieval setup

In [32]:
from llama_index.node_parser import SentenceWindowNodeParser

# Parser that yields one node per sentence and stores extra context in each
# node's metadata under the keys named below.
# NOTE(review): window_size presumably counts neighbouring sentences on each
# side of the node — confirm against the library docs.
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

In [33]:
# Tiny three-sentence sample for inspecting what the parser produces.
text = "hello. how are you? I am fine!  "

sample_docs = [Document(text=text)]
nodes = node_parser.get_nodes_from_documents(sample_docs)

In [34]:
# Text of each parsed node, in order.
print([node.text for node in nodes])

['hello. ', 'how are you? ', 'I am fine!  ']


In [35]:
# Context window stored in the metadata of the node at index 1.
window_text = nodes[1].metadata["window"]
print(window_text)

hello.  how are you?  I am fine!  


In [31]:
# Optional experiment, left disabled: the same parser with window_size=1 applied
# to a short sample string, printing the node texts and the window stored on the
# node at index 3.
# NOTE(review): uncommenting this as-is rebinds `node_parser` and `nodes`, both of
# which later cells read (service context, post-processor demo). Use different
# names here, or re-run the earlier cells afterwards.
# node_parser = SentenceWindowNodeParser.from_defaults(
#     window_size=1,
#     window_metadata_key="window",
#     original_text_metadata_key="original_text",
# )

# text = "hello. foo bar. cat dog. mouse"

# nodes = node_parser.get_nodes_from_documents([Document(text=text)])

# print([x.text for x in nodes])

# print("\n")

# print(nodes[3].metadata["window"])

### Building the index

In [36]:
from llama_index.llms import OpenAI

# LLM handed to the service context; configured with temperature=0.1.
llm = OpenAI(
    temperature=0.1,
    model="gpt-3.5-turbo",
)

In [37]:
from llama_index import ServiceContext

# Bundle the LLM, the "local:" embedding model and the sentence-window parser
# into one service context.
# Alternative embedding model, kept for reference: "local:BAAI/bge-large-en-v1.5"
sentence_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    embed_model="local:BAAI/bge-small-en-v1.5",
    llm=llm,
)

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [38]:
from llama_index import VectorStoreIndex

# Build the vector index over the merged document with the sentence-window
# service context.
sentence_index = VectorStoreIndex.from_documents(
    [document],
    service_context=sentence_context,
)

In [39]:
# Write the index to disk under ./sentence_index.
index_storage = sentence_index.storage_context
index_storage.persist(persist_dir="./sentence_index")


In [40]:
# Optional: reuse a previously persisted index instead of rebuilding it.
# If the persist directory already exists, the index is loaded from it;
# otherwise the index is built from `document` and then written to disk.

import os
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage

# On-disk location of the persisted index, shared by the build and load branches.
SENTENCE_INDEX_DIR = "./sentence_index"

if os.path.exists(SENTENCE_INDEX_DIR):
    # A persisted copy exists: load it rather than rebuilding.
    sentence_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=SENTENCE_INDEX_DIR),
        service_context=sentence_context,
    )
else:
    # Nothing on disk yet: build the index, then persist it for later runs.
    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )
    sentence_index.storage_context.persist(persist_dir=SENTENCE_INDEX_DIR)

### Building the postprocessor

In [41]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Post-processor keyed on "window", the same metadata key the node parser was
# configured to store its sentence window under.
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

In [42]:
# Node texts before post-processing, for comparison.
print([node.text for node in nodes])

['hello. ', 'how are you? ', 'I am fine!  ']


In [43]:
from llama_index.schema import NodeWithScore
from copy import deepcopy

# Stand-in for a similarity-search result: every parsed node paired with a
# fixed score of 1.0.
scored_nodes = [NodeWithScore(node=candidate, score=1.0) for candidate in nodes]

# Independent deep copies of the parsed nodes, kept for a before/after comparison.
nodes_old = list(map(deepcopy, nodes))

In [44]:
# Texts held by the deep-copied nodes.
print([copied.text for copied in nodes_old])

['hello. ', 'how are you? ', 'I am fine!  ']


In [45]:
# Text of the deep-copied node at index 1, shown as the cell's result.
nodes_old[1].text

'how are you? '

In [46]:
# Run the scored nodes through the metadata-replacement post-processor.
# NOTE(review): this may rewrite the wrapped node objects in place, which is
# presumably why deep copies were taken into `nodes_old` beforehand — confirm
# against the library.
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [47]:
# Text of the post-processed node at index 1; compare with the single sentence
# held by nodes_old[1].
print(replaced_nodes[1].text)

hello.  how are you?  I am fine!  
