In [1]:
from opensearch_interface import OpenSearchClient
from preprocessing import Vectorizor
from llama_index.evaluation import RetrieverEvaluator
from llama_index.vector_stores import OpensearchVectorStore, OpensearchVectorClient
from llama_index import VectorStoreIndex, StorageContext, SimpleDirectoryReader, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.llms import OpenAI
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)

  from .autonotebook import tqdm as notebook_tqdm


In [67]:
import nest_asyncio

nest_asyncio.apply()

In [46]:
# OpensearchVectorClient stores text in this field by default
text_field = "content"
# OpensearchVectorClient stores embeddings in this field by default
embedding_field = "content_embedding"
sem_index = 'semantic-impact-theory-gte'
# OpensearchVectorClient encapsulates logic for a
# single opensearch index with vector search enabled
client = OpensearchVectorClient(
     "http://localhost:9200", sem_index, 768, embedding_field=embedding_field, text_field=text_field,
     http_auth=('admin', 'admin'),
     use_ssl = True,
     verify_certs = False,
     ssl_assert_hostname = False,
     ssl_show_warn = False,
     timeout=30)

In [4]:
data_path = '/home/elastic/notebooks/vector_search_applications/data/paul_graham.txt'

In [5]:
documents = SimpleDirectoryReader('./data/paul_graham/').load_data()

In [6]:
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)

In [7]:
vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# initialize an index using our sample data and the client we just created
index = VectorStoreIndex.from_documents(documents=documents, show_progress=True)

Parsing documents into nodes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.11it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:02<00:00,  7.81it/s]


In [8]:
retriever = index.as_retriever(similarity_top_k=5)

In [9]:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)
llm = OpenAI()

In [33]:
qa_dataset = generate_question_context_pairs(nodes, llm=llm)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [01:09<00:00,  1.92s/it]


In [82]:
qa_dataset.save_json(path='./data/qa_dataset.json')

In [10]:
# qa_dataset = EmbeddingQAFinetuneDataset.from_json('./data/qa_dataset.json')

In [34]:
sample_id, sample_query = list(qa_dataset.queries.items())[10]
sample_expected = qa_dataset.relevant_docs[sample_id]
# sample_expected = qa_dataset.corpus[sample_expected[0]]

In [35]:
sample_id, sample_query, sample_expected

('09fbb4e2-6ee7-4ae5-9d63-480879df787f',
 "In the context of the passage, what factors influenced the protagonist's decision to write their dissertation on applications of continuations instead of macros and embedded languages?",
 ['6e8070a4-4c55-4d22-a1bc-09c32157a4e5'])

In [36]:
eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
eval_result

RetrievalEvalResult(query="In the context of the passage, what factors influenced the protagonist's decision to write their dissertation on applications of continuations instead of macros and embedded languages?", expected_ids=['6e8070a4-4c55-4d22-a1bc-09c32157a4e5'], retrieved_ids=['1e811809-1b7d-45ee-a3f3-2bf93fb0ed0c', 'b33a0d1b-9a14-4cbd-a0ad-12ab298ea685', '2499e887-b272-41e3-8505-a7f7a0e01c26', '2420f677-0307-47b6-9de5-78feecd5d5e3', '76d9a861-2264-4433-a6f3-13b277ddeafb'], metric_dict={'mrr': RetrievalMetricResult(score=0.0, metadata={}), 'hit_rate': RetrievalMetricResult(score=0.0, metadata={})})

In [80]:
osclient = OpenSearchClient()
osclient.show_indexes()

GET https://localhost:9200/_cat/indices?v=true [status:200 request:0.043s]
health status index                              uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   semantic-538-testrun               DjBPg6CdQwKbOGhJrI4YIQ   3   1        284            0      2.9mb          2.9mb
yellow open   kw-impact-theory                   2MjMun4bQYOoeUpv5UsJxg   3   1      33164            0     29.4mb         29.4mb
yellow open   security-auditlog-2023.10.10       rg-NByyNTvW0jEQQdYWdKA   1   1       1757            0      3.5mb          3.5mb
yellow open   test-kw-index                      6EF4Q2xDT9Gz1wua5a2IpQ   3   1        158            0      5.6mb          5.6mb
yellow open   security-auditlog-2023.10.11       OHTeNLijSHCkPzDXbzx7Gg   1   1       1731            0        2mb            2mb
yellow open   kw-full                            uNhdaqbnRVuyJci_L1Om8Q   3   1       6678            0     12.1mb         12.1mb
yellow open   p

### BM25 Retriever Setup

In [64]:
model = Vectorizor(model_name_or_path='./models/gte-base/').model

In [70]:
index = 'paul-graham3'

In [54]:
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

In [71]:
client = OpensearchVectorClient("http://localhost:9200", index, 1536, 
                                 embedding_field=embedding_field, text_field=text_field,
                                 http_auth=('admin', 'admin'),
                                 use_ssl = True,
                                 verify_certs = False,
                                 ssl_assert_hostname = False,
                                 ssl_show_warn = False,
                                 timeout=30)

GET https://localhost:9200/paul-graham3 [status:404 request:0.038s]
PUT https://localhost:9200/paul-graham3 [status:200 request:0.174s]
POST https://localhost:9200/paul-graham3/_refresh [status:200 request:0.002s]


In [79]:
osclient.indices.get_alias('paul*')

GET https://localhost:9200/paul*/_alias [status:200 request:0.004s]


{'paul-graham3': {'aliases': {}}}

In [None]:

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.retrievers import BM25Retriever
from llama_index.indices.vector_store.retrievers.retriever import VectorIndexRetriever
from llama_index.llms import OpenAI

In [69]:
llm = OpenAI()
service_context = ServiceContext.from_defaults(chunk_size=1024, llm=llm)
nodes = service_context.node_parser.get_nodes_from_documents(documents)

In [73]:
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)
# initialize vector store
storage_context = StorageContext.from_defaults(vector_store=OpensearchVectorStore(client))
# initialize an index using our sample data and the client we just created
# new_index = VectorStoreIndex.from_documents(documents=documents, storage_context=storage_context, show_progress=True)
storage_context.docstore.add_documents(nodes=nodes)

In [74]:
new_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    service_context=service_context,
)

GET https://localhost:9200/paul-graham3 [status:200 request:0.002s]
POST https://localhost:9200/_bulk [status:200 request:0.115s]
POST https://localhost:9200/paul-graham3/_refresh [status:200 request:0.015s]


In [82]:
retriever = BM25Retriever.from_defaults(new_index, similarity_top_k=5)

In [103]:
from llama_index.response.notebook_utils import display_source_node

# will retrieve all context from the author's life
results = retriever.retrieve(
    "Roy"
)
for node in results:
    display_source_node(node)

**Node ID:** 23755319-8812-4cf1-a60e-98fa57137179<br>**Similarity:** 0.0<br>**Text:** What I Worked On

February 2021

Before college the two main things I worked on, outside of schoo...<br>

**Node ID:** 4b6c25f5-dbb2-4a39-b849-ea65e2311140<br>**Similarity:** 0.0<br>**Text:** All you had to do was teach SHRDLU more words.

There weren't any classes in AI at Cornell then, ...<br>

**Node ID:** 070362e4-471d-4b48-8abf-fb9b0b76e8f7<br>**Similarity:** 0.0<br>**Text:** I was briefly tempted, but they were so slow by present standards; what was the point? No one els...<br>

**Node ID:** 591aaad2-89d0-4c1b-876b-041e2ab51275<br>**Similarity:** 0.0<br>**Text:** Now all I had to do was learn Italian.

Only stranieri (foreigners) had to take this entrance exa...<br>

**Node ID:** fcc3ae0a-56d4-4198-895f-c940c087ddd7<br>**Similarity:** 0.0<br>**Text:** I wanted to go back to RISD, but I was now broke and RISD was very expensive, so I decided to get...<br>

In [105]:
from llama_index.tools import RetrieverTool

vector_retriever = VectorIndexRetriever(new_index, similarity_top_k=5)

retriever_tools = [
    RetrieverTool.from_defaults(
        retriever=vector_retriever,
        description="Useful in most cases",
    ),
    RetrieverTool.from_defaults(
        retriever=retriever,
        description="Useful if searching about specific information",
    ),
]

In [107]:
from llama_index.retrievers import RouterRetriever

retriever = RouterRetriever.from_defaults(
    retriever_tools=retriever_tools,
    service_context=service_context,
    select_multi=True,
)

In [119]:
results = retriever.retrieve(
    "How much did Roy pay for the painting on his wall?"
)
for node in results:
    display_source_node(node)

Selecting retriever 1: Searching for specific information.


**Node ID:** bf06330b-85ef-4b86-a710-6e93ee00d398<br>**Similarity:** 8.000460324747431<br>**Text:** That's not always why artists have a signature style, but it's usually why buyers pay a lot for s...<br>

**Node ID:** fcc3ae0a-56d4-4198-895f-c940c087ddd7<br>**Similarity:** 7.60773078902128<br>**Text:** I wanted to go back to RISD, but I was now broke and RISD was very expensive, so I decided to get...<br>

**Node ID:** b42e9877-0dbc-4292-9d84-f60fd8635a3a<br>**Similarity:** 7.083600149727588<br>**Text:** (I still talk to alumni and to new startups working on things I'm interested in, but that only ta...<br>

**Node ID:** 070362e4-471d-4b48-8abf-fb9b0b76e8f7<br>**Similarity:** 6.885972725074988<br>**Text:** I was briefly tempted, but they were so slow by present standards; what was the point? No one els...<br>

**Node ID:** eb8a0408-7664-4acb-b99a-f9f009d8a322<br>**Similarity:** 6.686594148227136<br>**Text:** So in the summer of 1995, after I submitted the camera-ready copy of ANSI Common Lisp to the publ...<br>

In [124]:
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    StorageContext,
    SimpleDirectoryReader,
)
from llama_index.llms import OpenAI

# load documents
documents = SimpleDirectoryReader(
    input_files=["./data/IPCC_AR6_WGII_Chapter03.pdf"]
).load_data()

# initialize service context (set chunk size)
# -- here, we set a smaller chunk size, to allow for more effective re-ranking
llm = OpenAI(model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(chunk_size=256, llm=llm)
nodes = service_context.node_parser.get_nodes_from_documents(documents)
client = OpensearchVectorClient("http://localhost:9200", 'climate-report', 1536, 
                                 embedding_field=embedding_field, text_field=text_field,
                                 http_auth=('admin', 'admin'),
                                 use_ssl = True,
                                 verify_certs = False,
                                 ssl_assert_hostname = False,
                                 ssl_show_warn = False,
                                 timeout=30)
storage_context = StorageContext.from_defaults(vector_store=OpensearchVectorStore(client))
storage_context.docstore.add_documents(nodes=nodes)

GET https://localhost:9200/climate-report [status:404 request:0.039s]
PUT https://localhost:9200/climate-report [status:200 request:0.218s]
POST https://localhost:9200/climate-report/_refresh [status:200 request:0.002s]


In [125]:
climate_index = VectorStoreIndex(nodes, storage_context=storage_context, service_context=service_context)

GET https://localhost:9200/climate-report [status:200 request:0.003s]
POST https://localhost:9200/_bulk [status:200 request:0.121s]
POST https://localhost:9200/_bulk [status:200 request:0.055s]
POST https://localhost:9200/_bulk [status:200 request:0.050s]
POST https://localhost:9200/_bulk [status:200 request:0.049s]
POST https://localhost:9200/_bulk [status:200 request:0.052s]
POST https://localhost:9200/_bulk [status:200 request:0.050s]
POST https://localhost:9200/_bulk [status:200 request:0.049s]
POST https://localhost:9200/_bulk [status:200 request:0.049s]
POST https://localhost:9200/_bulk [status:200 request:0.049s]
POST https://localhost:9200/_bulk [status:200 request:0.047s]
POST https://localhost:9200/_bulk [status:200 request:0.051s]
POST https://localhost:9200/_bulk [status:200 request:0.051s]
POST https://localhost:9200/_bulk [status:200 request:0.051s]
POST https://localhost:9200/_bulk [status:200 request:0.050s]
POST https://localhost:9200/_bulk [status:200 request:0.053s]


In [138]:
# retireve the top 10 most similar nodes using embeddings
vector_retriever = climate_index.as_retriever(similarity_top_k=10)

# retireve the top 10 most similar nodes using bm25
bm25_retriever = BM25Retriever.from_defaults(climate_index, similarity_top_k=10)

In [139]:
from llama_index.retrievers import BaseRetriever


class HybridRetriever(BaseRetriever):
    def __init__(self, vector_retriever, bm25_retriever):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever

    def _retrieve(self, query, **kwargs):
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        # combine the two lists of nodes
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(n.node.node_id)
        return all_nodes

In [140]:
# index.as_retriever(similarity_top_k=5)

hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

In [154]:
res = hybrid_retriever.retrieve("What is the impact of climate change on the ocean?")

POST https://localhost:9200/climate-report/_search [status:200 request:0.009s]


In [156]:
res[1].node.text

'The ‘observed impact’ indicates the total effect of all climate-induced drivers on a specific ecosystem service, using expert judgement based on summary statements \nthroughout Section\xa03.5.  Tick marks represent the presence of co-occurring drivers non-climate drivers that affect the service. No assessment indicates that not enough evidence is \navailable to assess the direction of impact.\nThis section builds on the SROCC assessment of the portfolio \nof available solutions, their applicability and their effectiveness \nin reducing climate-change-induced risks to ocean and coastal \necosystems. Section\xa0 3.6.2 assesses the set of planned adaptation \nmeasures. Section\xa0 3.6.3 assesses implementation of adaptation \nsolutions and the enablers, barriers and limitations that affect their \nfeasibility.'

Bad pipe message: %s [b'\xda\x11\x1f\xf8>RtrM\'J\xa7\x1a\xa6\xbadqB \xa0\xf7\x0b\t\x19\x8b\xa1[\xeb\xb9\x98\xe9\xd8\x053\x04\x10R#\x8f\xad\xcd,\x1e\xefk\xf1\x9e"=\xab\xa4\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00']
Bad pipe message: %s [b"9\xf2\xc3\xc3\x16Y\xd7r'jP\xd7ww\x90\xf02\xa4 \xe8\xf5\x8f]\x9e\xfekF\xd44\x9a\xa6\x9eJ\x17b]3/\x8a\x07\xcd&\xa9\x1c\xfd=h\x10\xc9!*\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04", b'\x01\x02']
Bad pipe message: %s [b"\x9a\xc9\xd1\xd7}\x7f'\x9c\x1b\x89\xc4[b\x00\x0b\xb7\xa5#\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x0