In [6]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
# Download the Thoughtworks article and load it as LlamaIndex documents,
# asking the reader to convert the page's HTML to text.
article_urls = [
    "https://www.thoughtworks.com/en-in/insights/blog/data-strategy/building-an-amazon-com-for-your-data-products"
]
web_reader = SimpleWebPageReader(html_to_text=True)
documents = web_reader.load_data(article_urls)

In [3]:
# Display the first loaded document to inspect its id, metadata and text.
documents[0]

Document(id_='https://www.thoughtworks.com/en-in/insights/blog/data-strategy/building-an-amazon-com-for-your-data-products', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in "Thoughtworks")\n\nMenu\n\nClose\n\n  * [What we do  ](/en-in/what-we-do "What we do")\n\n    * [ Go to overview ](/en-in/what-we-do)\n      * ### Services\n\n        * [ Artificial Intelligence  ](/en-in/what-we-do/ai)\n        * [ Cloud  ](/en-in/what-we-do/cloud)\n        * [ Customer Experience and Products  ](/en-in/what-we-do/customer-experience-product-design)\n        * [ Data and Analytics  ](/en-in/what-we-do/data)\n        * [ Managed Services  ](/en-in/what-we-do/digital-application-management-and-operations)\n        * [ Modernization  ](/en-in/what-we-do/modernization)\n        * [ Platforms  ](/en-in/what-we-do/

In [5]:
# Display the first loaded document again.
# NOTE(review): the output recorded beneath this cell shows a `llama2.pdf`
# document with a local file path, which does not match the web page loaded
# at the top of this notebook — presumably stale output from an earlier
# session; re-run from a fresh kernel to refresh it.
documents[0]

Document(id_='83e760c3-aa01-433a-a603-019d12eee223', embedding=None, metadata={'page_label': '1', 'file_name': 'llama2.pdf', 'file_path': '/Users/samvardhan/Desktop/DataEngineer/opensearch_rag/data_pdf/llama2.pdf', 'file_type': 'application/pdf', 'file_size': 13661300, 'creation_date': '2024-04-21', 'last_modified_date': '2024-04-21'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Llama 2 : Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗Louis Martin†Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goya

In [7]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Connect to a Qdrant server expected at http://localhost:6333.
# NOTE(review): `client` is reassigned to an in-memory instance in a later
# cell, so this server connection looks unused — confirm whether it is needed.

client = QdrantClient(url="http://localhost:6333")

In [8]:
from llama_index.core.node_parser import SentenceSplitter
# Split every document's text with a SentenceSplitter (chunk_size=1024) and
# record, for each chunk, the position of the document it came from.
text_parser = SentenceSplitter(chunk_size=1024)

text_chunks, doc_idxs = [], []
for doc_position, document in enumerate(documents):
    pieces = text_parser.split_text(document.text)
    text_chunks += pieces
    doc_idxs += [doc_position] * len(pieces)

In [9]:
from llama_index.core.schema import TextNode, IndexNode
# Wrap each text chunk in a TextNode that carries the metadata of the
# document it was split from (`doc_idxs` is parallel to `text_chunks`).
nodes = []
for text_chunk, doc_idx in zip(text_chunks, doc_idxs):
    src_doc = documents[doc_idx]
    node = TextNode(text=text_chunk)
    # Copy the dict: assigning `src_doc.metadata` directly would make every
    # node share one mutable object with the document, so editing one
    # node's metadata would silently change all the others.
    node.metadata = dict(src_doc.metadata)
    nodes.append(node)

In [10]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model used in this notebook for both node and query embeddings.
# NOTE(review): the Qdrant collection created later declares dense vectors of
# size 768 — confirm that matches this model's output dimension.
embed_model = HuggingFaceEmbedding(model_name="avsolatorio/GIST-Embedding-v0")

In [11]:
# Embed each node's content (requested with metadata_mode="all") and store
# the resulting vector on the node itself.
for node in nodes:
    content_with_metadata = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(content_with_metadata)

In [12]:
from llama_index.llms.ollama import Ollama
# LLM accessed through Ollama: model "llama2", request timeout 30.0
# (presumably seconds — confirm against the Ollama integration docs).
llm = Ollama(model="llama2", request_timeout=30.0)

In [13]:
from llama_index.core import Settings
from llama_index.core import ServiceContext, set_global_service_context

# Bundle the LLM and the embedding model into a ServiceContext; later cells
# pass it to the index and to the query engines.
# NOTE(review): the stray line printed under this cell looks like the tail of
# a deprecation warning for `ServiceContext.from_defaults` — presumably newer
# LlamaIndex releases expect `Settings` instead; confirm against the
# installed version.
service_context = ServiceContext.from_defaults(
  llm=llm, embed_model=embed_model
)

  service_context = ServiceContext.from_defaults(


In [14]:
import qdrant_client
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
from qdrant_client import models
# In-memory Qdrant instance: nothing is persisted between kernel restarts.
client = qdrant_client.QdrantClient(location=":memory:")

# One collection holding a named dense vector (768 dims, cosine distance)
# and a named sparse vector, so the same points can serve dense, sparse and
# hybrid queries.
collection_name = "my_collection"
dense_vectors = {
    "text-dense": models.VectorParams(
        size=768,
        distance=models.Distance.COSINE,
    )
}
sparse_vectors = {
    "text-sparse": models.SparseVectorParams(
        index=models.SparseIndexParams()
    )
}
# NOTE(review): `recreate_collection` is, as far as I recall, flagged as
# deprecated in recent qdrant-client releases — confirm before upgrading.
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=dense_vectors,
    sparse_vectors_config=sparse_vectors,
)

# Expose the collection to LlamaIndex with hybrid search switched on, and
# wrap it in a storage context for index construction.
vector_store = QdrantVectorStore(
    collection_name=collection_name,
    client=client,
    enable_hybrid=True,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [15]:
# Build a VectorStoreIndex from the loaded documents, using the Qdrant-backed
# storage context and the LLM/embedding model held by `service_context`.
index = VectorStoreIndex.from_documents(
 documents, storage_context=storage_context, service_context=service_context
)

In [16]:
# Insert the manually chunked and embedded nodes into the vector store; the
# list displayed below is presumably the ids of the stored nodes.
# NOTE(review): `VectorStoreIndex.from_documents(...)` in the previous cell
# was given a storage context built on this same vector store, so the
# documents were presumably already ingested once — adding these nodes as
# well may store the article twice. Confirm which ingestion path is intended.
vector_store.add(nodes)

['b0052b8f-5349-4113-9341-dea28a8d639d',
 '9afd200b-904d-4e3b-bc61-5fb1a2a13604',
 '85bb9d30-39a5-4253-aad9-261788f57e6b',
 'a078d100-e6ca-469f-83ab-eeaf3770dcb7',
 'c978814f-6496-4012-a090-d0bcd125c670',
 'c4d39fe1-753a-419d-a32d-8b3ae7bd9b70',
 '21a2f676-25c2-4e24-8cfe-0a2aaf4399dc',
 'c2168d3c-ac69-492e-ac2f-790dc048a2d8',
 'd8567608-3d53-4887-a026-362b3db112be',
 '60a8462a-3b24-4eeb-a03a-681ea00359c5',
 '89834f7f-d02f-4aa3-b178-f6038b33c556']

In [17]:
# Embed a sample question with the same model that embedded the nodes.
# NOTE(review): the question reads "how is the author ..." — probably meant
# "who is the author ..."; left unchanged because the recorded output of the
# next cell was produced with this exact string.
query_str = "how is the author of the article Building An “Amazon.com” For Your Data Products"
query_embedding = embed_model.get_query_embedding(query_str)

In [18]:
from llama_index.core.vector_stores import (
    VectorStoreQuery,
    VectorStoreQueryResult,
)
# Retrieval mode for this manual query; "sparse" and "hybrid" are the other
# modes exercised later in this notebook.
query_mode = "default"

# Ask the vector store for the two nodes closest to the query embedding.
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding,
    similarity_top_k=2,
    mode=query_mode,
)
query_result = vector_store.query(vector_store_query)

# Print the text of the first node returned.
first_node = query_result.nodes[0]
print(first_node.get_content())

Amazon provides an
incredible amount of detail to help consumers purchase products from unknown
third-parties. Take the example of something as simple as a wrench:

[
![](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_3.png)
![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg) I’d buy this wrench.
![](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_3.png)
![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg) I’d buy this wrench.
](https://www.amazon.com/Amazon-Brand-Denali-8-Inch-
Adjustable/dp/B091BLK385/ref=sr_1_1_ffob_sspa?crid=39GIJHE50YBB1&keywords=wrench&qid=1681395714&sprefix=wrench%2Caps%2C70&sr=8-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUE1RDdMRD

In [19]:
from llama_index.core.schema import NodeWithScore, TextNode

In [20]:
from typing import Optional
# Pair every retrieved node with its similarity score (None when the store
# returned no similarities).
nodes_with_scores = []
similarities = query_result.similarities
# The loop counter is deliberately not called `index`, which would rebind the
# VectorStoreIndex created earlier in the notebook to an int.
for position, node in enumerate(query_result.nodes):
    score: Optional[float] = None
    if similarities is not None:
        score = similarities[position]
    # Append outside the `if` so nodes are kept even without a score.
    nodes_with_scores.append(NodeWithScore(node=node, score=score))

In [21]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List

class VectorDBRetriever(BaseRetriever):
    """Retriever that embeds the query and looks it up in a Qdrant vector store."""

    def __init__(
        self,
        vector_store: 'QdrantVectorStore',
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Remember the retrieval configuration.

        Args:
            vector_store: Store that is queried in `_retrieve`.
            embed_model: Object providing `get_query_embedding(str)`.
            query_mode: Mode forwarded to `VectorStoreQuery` (this notebook
                uses "default", "sparse" and "hybrid").
            similarity_top_k: Number of results requested from the store.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List['NodeWithScore']:
        """Embed the query string, query the store, and return scored nodes.

        Each returned node is wrapped in a `NodeWithScore`; the score is
        None when the store result carries no similarities.
        """
        embedded_query = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        result = self._vector_store.query(
            VectorStoreQuery(
                query_embedding=embedded_query,
                similarity_top_k=self._similarity_top_k,
                mode=self._query_mode,
            )
        )
        return [
            NodeWithScore(
                node=found_node,
                score=(
                    None
                    if result.similarities is None
                    else result.similarities[position]
                ),
            )
            for position, found_node in enumerate(result.nodes)
        ]


# Compare Results

In [22]:
# Two retrievers over the same vector store and embedding model, differing
# only in query mode ("hybrid" vs "sparse"); each requests the top 2 nodes.
hybrid_retriever = VectorDBRetriever(vector_store, embed_model, query_mode="hybrid", similarity_top_k=2)
sparse_retriever = VectorDBRetriever(vector_store, embed_model, query_mode="sparse", similarity_top_k=2)

In [23]:
def execute_and_compare(query_str: str):
    """Run one query through both retrievers and print the results side by side.

    Uses the notebook-level `hybrid_retriever` and `sparse_retriever`. Both
    retrievals run before anything is printed; the hybrid section is printed
    first, then the sparse section.

    Args:
        query_str: The natural-language question to retrieve for.
    """
    hybrid_hits = hybrid_retriever.retrieve(QueryBundle(query_str=query_str))
    sparse_hits = sparse_retriever.retrieve(QueryBundle(query_str=query_str))

    sections = (
        ("Hybrid Results:", hybrid_hits),
        ("\nSparse Results:", sparse_hits),
    )
    for heading, hits in sections:
        print(heading)
        for hit in hits:
            print(f"Text: {hit.node.get_content()}, Score: {hit.score}")

In [29]:
# Compare hybrid and sparse retrieval for a definition-style question.
query_str =  "what is Data products?"
execute_and_compare(query_str)

Hybrid Results:
Text: [Customer 360 Data
Product](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_1.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

![Customer 360 Data
Product](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_1.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Have you ever come across an internal [data
product](https://www.thoughtworks.com/en-us/what-we-do/data-and-ai/modern-
data-engineering-playbook/data-as-a-product) and side-eyed it like it’s your
kid’s prom date? While it _seems_ like it fits the requirements, you don’t
quite trust it — who knows where the data in this shifty table has been. Will
it be reliab

In [34]:
# Compare hybrid and sparse retrieval for a how-to question.
query_str =  "How to Create data product SLOs?"
execute_and_compare(query_str)

Hybrid Results:
Text: This helps teams collectively brainstorm and understand usage, expectations,
trade-offs and business impact. The outcomes of the exercise are then used to
determine the various SLOs that need to be set for individual products.

![Product usage pattern exercise template. Courtesy of
Thoughtworks.](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_6.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Product usage pattern exercise template. Courtesy of Thoughtworks.

![Product usage pattern exercise template. Courtesy of
Thoughtworks.](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_6.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/image

# Sparse_retriever

In [25]:
from llama_index.core.query_engine import RetrieverQueryEngine
# Query engine built on the sparse retriever, configured with the
# LLM/embedding model held by `service_context`.
query_engine = RetrieverQueryEngine.from_args(
 sparse_retriever, service_context=service_context
)

In [28]:
# Ask the sparse-retriever query engine and print its answer as text.
query_str =  "what is Data products?"
response = query_engine.query(query_str)
print(str(response))

Based on the context information provided, a data product can be defined as a centralized marketplace or catalog of internal data assets that are discoverable, addressable, trustworthy, self-describing, interoperable, and secure. The data product is designed to raise awareness and convince skeptical data consumers to actually start using internal data products. By providing an incredible amount of detail, such as information quality metrics and performance promises, data products can help build trust with data consumers and encourage the reuse of data products.

In other words, a data product is a curated collection of internal data assets that are designed to be easily discoverable, accessible, and reusable across different domains and use cases. It provides a clear and consistent understanding of the data assets, their characteristics, and how they can be used, which helps build trust with data consumers and encourages them to adopt the data products for their own use cases.


In [32]:
# Print the text of the first source node attached to the response above.
print(response.source_nodes[0].get_content())

[Customer 360 Data
Product](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_1.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

![Customer 360 Data
Product](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_1.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Have you ever come across an internal [data
product](https://www.thoughtworks.com/en-us/what-we-do/data-and-ai/modern-
data-engineering-playbook/data-as-a-product) and side-eyed it like it’s your
kid’s prom date? While it _seems_ like it fits the requirements, you don’t
quite trust it — who knows where the data in this shifty table has been. Will
it be reliable and safe even after

In [35]:
# Second question for the sparse-retriever query engine; print its answer.
response_1 = query_engine.query("How to Create data product SLOs?")
print(str(response_1))

To create SLOs for a data product, follow these steps:

1. Identify the purpose of the data product: What is the main goal of the data product? What problems does it solve? Who are its target users?
2. Determine the metrics that will be used to measure success: Based on the purpose of the data product, identify the key performance indicators (KPIs) that will be used to evaluate its success. For example, freshness, accuracy, availability, completeness, etc.
3. Set specific and measurable targets for each metric: Define specific target values for each metric, such as 95% freshness rate or 99.9% accuracy rate.
4. Establish a monitoring and alert system: Implement a system to continuously monitor the metrics and alert stakeholders when targets are not met.
5. Review and adjust SLOs regularly: Regularly review the SLOs and adjust them as necessary based on changes in the business or technology landscape.
6. Communicate SLOs to stakeholders: Share the SLOs with stakeholders, including develo

In [36]:
# Print the text of the first source node attached to `response_1`.
print(response_1.source_nodes[0].get_content())

This helps teams collectively brainstorm and understand usage, expectations,
trade-offs and business impact. The outcomes of the exercise are then used to
determine the various SLOs that need to be set for individual products.

![Product usage pattern exercise template. Courtesy of
Thoughtworks.](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_6.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Product usage pattern exercise template. Courtesy of Thoughtworks.

![Product usage pattern exercise template. Courtesy of
Thoughtworks.](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_6.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Prod

# Hybrid Retrieval

In [39]:
from llama_index.core.query_engine import RetrieverQueryEngine
# Query engine built on the hybrid retriever, configured with the same
# `service_context` as the sparse one.
hybrid_query_engine = RetrieverQueryEngine.from_args(
 hybrid_retriever, service_context=service_context
)

In [40]:
# Ask the hybrid query engine the same question as the sparse run above.
# Note: this rebinds `response`, replacing the sparse engine's result.
query_str = "what is Data products?"
response =  hybrid_query_engine.query(query_str)
print(str(response))

Based on the provided context, a data product can be defined as a centralized marketplace or catalog of internal data assets that are discoverable, addressable, trustworthy, self-describing, interoperable, and secure. It is important to create a data product that addresses the characteristics originally defined by Zhamak Dehghani, such as discoverability, addressability, trustworthiness, self-description, interoperability, and security. By doing so, data teams can build trust with business stakeholders and encourage the reuse of data products across different domains. Additionally, creating a central marketplace or catalog of internal data products can help raise awareness and convince skeptical data consumers to start using them.


In [41]:
# Print the text of the first source node attached to the hybrid response.
print(response.source_nodes[0].get_content())

[Customer 360 Data
Product](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_1.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

![Customer 360 Data
Product](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_1.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Have you ever come across an internal [data
product](https://www.thoughtworks.com/en-us/what-we-do/data-and-ai/modern-
data-engineering-playbook/data-as-a-product) and side-eyed it like it’s your
kid’s prom date? While it _seems_ like it fits the requirements, you don’t
quite trust it — who knows where the data in this shifty table has been. Will
it be reliable and safe even after

In [42]:
# Second question for the hybrid query engine; print its answer.
# Note: this rebinds `response_1`, replacing the sparse engine's result.
response_1 = hybrid_query_engine.query("How to Create data product SLOs?")
print(str(response_1))

Creating SLOs (Service Level Objectives) for a data product involves several steps:

1. Identify the usage patterns of the data product: Understand how the data product is being used and what are the key metrics that are important to measure. This can be done through surveys, interviews, or by analyzing usage patterns.
2. Define the SLOs: Based on the usage patterns identified in step 1, define the SLOs that are relevant to the data product. These could include things like freshness, accuracy, completeness, and availability.
3. Express the SLOs as percentages: Once the SLOs have been defined, express them as percentages over a period of time. For example, "The data product will be 95% fresh within an hourly interval."
4. Monitor and measure the SLOs: Use monitoring tools and techniques to track the SLOs and measure their compliance. This can be done through automated checks or manual audits.
5. Adjust the SLOs as needed: Based on the measurements taken, adjust the SLOs as needed to ens

In [43]:
# Print the text of the first source node attached to `response_1`.
print(response_1.source_nodes[0].get_content())

This helps teams collectively brainstorm and understand usage, expectations,
trade-offs and business impact. The outcomes of the exercise are then used to
determine the various SLOs that need to be set for individual products.

![Product usage pattern exercise template. Courtesy of
Thoughtworks.](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_6.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Product usage pattern exercise template. Courtesy of Thoughtworks.

![Product usage pattern exercise template. Courtesy of
Thoughtworks.](/content/dam/thoughtworks/images/infographic/Tw_illustration_blog_montecarlo_6.png)

![Pause](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/pause-icon.svg)
![Play](/etc.clientlibs/thoughtworks/clientlibs/clientlib-
site/resources/images/play-icon.svg)

Prod