In [1]:
# import for preprocessing
import pandas as pd
from rich import print
import logging
import sys
import sqlite3

# llama index import to create sql DB
from llama_index import (
    SQLDatabase,
    SimpleDirectoryReader,
    Document,
    StorageContext,
    ServiceContext,
    VectorStoreIndex,
    OpenAIEmbedding,
    set_global_service_context,
)
from llama_index.indices.struct_store import (
    NLSQLTableQueryEngine,
    SQLTableRetrieverQueryEngine,
)
from llama_index.llms import OpenAI

# import for text 2 SQl query engine
from llama_index.indices.struct_store.sql_query import (
    NLSQLTableQueryEngine,
    SQLTableRetrieverQueryEngine,
)

# iport if we do not know the name of table ahead of time
from llama_index.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)

# import to set up query tool
from llama_index.tools.query_engine import QueryEngineTool

# import to set up query router
from llama_index.query_engine.router_query_engine import RouterQueryEngine
from llama_index.selectors.llm_selectors import LLMSingleSelector

# import to set up weaviate
from llama_index.vector_stores import WeaviateVectorStore

import weaviate
import openai

from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())
import os
openai.api_key = os.getenv("OPENA_AI_KEY")

from IPython.display import Markdown, display


from sqlalchemy import engine, MetaData, create_engine, text, select

# Import modules for self correcting
from llama_index.query_engine import RetryQueryEngine
from llama_index.evaluation import RelevancyEvaluator
from llama_index.query_engine import RetrySourceQueryEngine

# Import modules to customize guidelines for what response shou look like
from llama_index.evaluation.guideline import (
    GuidelineEvaluator,
    DEFAULT_GUIDELINES,
)
from llama_index.response.schema import Response
from llama_index.indices.query.query_transform.feedback_transform import (
    FeedbackQueryTransformation,
)
from llama_index.query_engine.retry_query_engine import (
    RetryGuidelineQueryEngine,
)

# import to display llama index response
from llama_index.response.notebook_utils import display_response
# Needed for running async functions in Jupyter Notebook
import nest_asyncio

nest_asyncio.apply()

# Tru Lens Evaluation
# import sys
# sys.path.append(os.path.dirname(find_dotenv()))
# from utils import get_prebuilt_trulens_recorder
# from trulens_eval import Tru

# Import for senetnce window
from wo_utils import build_sentence_window_index
from wo_utils import  build_sentence_window_index_vector_DB
from wo_utils import get_sentence_window_query_engine

from weaviate.auth import AuthApiKey


In [6]:
# # import weaviate
# # import os

# client = weaviate.connect_to_wcs(
#     cluster_url=os.getenv("WCS_DEMO_URL"),  # Replace with your WCS URL
#     auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCS_DEMO_RO_KEY"))  # Replace with your WCS key
# )

In [2]:

auth_config = AuthApiKey(api_key=os.getenv("WCS_DEMO_RO_KEY"))

client = weaviate.Client(
  url=os.getenv("WCS_DEMO_URL"),
  auth_client_secret=auth_config
)

TypeError: Either url or embedded options must be present.

# 1. INGESTION

## Load the product data from file

In [3]:
filepath = "/teamspace/studios/Data_Studio/product_listing/product_listing.csv"
data = pd.read_csv(filepath)
data.columns = ['Product_Name', 'Price', 'Rating', 'Description', 'Features']
print(data.columns)
data.head(3)

Unnamed: 0,Product_Name,Price,Rating,Description,Features
0,Ultimate Wireless Bluetooth Earbuds,79.99,4.4,Elevate your music experience with our Ultimat...,- High-fidelity sound with deep bass and clear...
1,SmartHome Security Camera System,199.99,4.6,Keep your home safe and secure with our SmartH...,- 1080p HD cameras for crystal-clear video qua...
2,Professional Espresso Machine,549.99,4.5,Brew barista-quality coffee at home with our P...,- 15-bar pump for optimal espresso extraction....


## Create an sqlite DB for the product data

In [4]:
# Create a connection to an SQLite database file
conn = sqlite3.connect('./data/product_sqlite.db')

# Write the DataFrame to a table in the SQLite database
data.to_sql('product_table', conn, if_exists='replace', index=False)

12

In [5]:
# View the sqlite DB
conn = sqlite3.connect('./data/product_sqlite.db')
result = conn.execute("SELECT * FROM product_table limit 1").fetchall()

# Process the result
for row in result:
    print(row)

## Create SQLDatabase Object

In [7]:
# SQLITE
# engine = create_engine("sqlite:///" +"./data/product_sqlite.db")
# connection = engine.connect()
# result = connection.execute(text("SELECT * FROM product_table limit 1"))
# for row in result:
#     print(row)


## Define SQL Database

In [6]:
llm = OpenAI(temperature=0.1, model = "gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
service_context = ServiceContext.from_defaults(llm =llm, embed_model=embed_model,)
set_global_service_context(service_context)
# service_context = ServiceContext.from_defaults(llm =llm)
sql_database = SQLDatabase(engine, include_tables=["product_table"])

NoInspectionAvailable: No inspection system is available for object of type <class 'module'>

## Text to SQL Query Engine

In [10]:
query_engine = NLSQLTableQueryEngine(
    sql_database = sql_database,
    tables = ["product_table"])
query_str = "Which product was sold for 79.99?"
response = query_engine.query(query_str)
display_response(response)

## Part 2: Query-Time Retrieval of Tables for Text-to-SQL
If we don’t know ahead of time which table we would like to use, and the total size of the table schema overflows your context window size, we should store the table schema in an index so that during query time we can retrieve the right schema.

The way we can do this is using the SQLTableNodeMapping object, which takes in a SQLDatabase and produces a Node object for each SQLTableSchema object passed into the ObjectIndex constructor.

In [11]:

# set Logging to DEBUG for more detailed outputs
table_node_mapping = SQLTableNodeMapping(sql_database)
table_schema_objs = [
    (SQLTableSchema(table_name="product_table"))
]  # add a SQLTableSchema for each table

obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
)
query_engine = SQLTableRetrieverQueryEngine(
    sql_database, obj_index.as_retriever(similarity_top_k=1)
)

In [12]:
query_str = "What is the Rating for the product whose name is Portable Solar Charger?"
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

<b>The rating for the product named Portable Solar Charger is 4.0.</b>

In [11]:
# you can also fetch the raw result from SQLAlchemy!
response.metadata["result"]

[(4.0,)]

## Create embedding for the columns to embed

In [13]:
## With PANDAS DATAFRAME
# Assuming you have already loaded the data into a Pandas DataFrame df
columns_to_embed = ["Description", "Features"]
columns_to_metadata = ["Product Name", "Price", "Rating", "Description", "Features"]

docs = []
for i, row in data.iterrows():
    to_metadata = {col: row[col] for col in columns_to_metadata if col in row}
    values_to_embed = {k: row[k] for k in columns_to_embed if k in row}
    to_embed = '\n'.join(f"{k.strip()}: {v.strip()}" for k, v in values_to_embed.items())
    newDoc = Document(text=to_embed, metadata=to_metadata)
    docs.append(newDoc)

# Create a single document from a list of Documents
# this is what we will chunk up and store with its embedding in Weaviate
document = Document(text="\n\n".join([doc.text for doc in docs]))

## Create Weaviate Schema

In [11]:
# # Create Schema
# product_schema = {
#    "classes": [
#        {
#            "class": "Product",
#            "description": "Listing of Products",
#            "vectorizer": "text2vec-openai",
#            "properties": [
#                {
#                   "name": "Product_Name",
#                   "dataType": ["text"],
#                   "description": "Name of the product.",
#                },
#                {
#                   "name": "Price",
#                   "dataType": ["number"],
#                   "description": "Price of the product.",
#                },
#                {
#                   "name": "Rating",
#                   "dataType": ["number"],
#                   "description": "Rating of the product.",
#                },
#                {
#                   "name": "Description",
#                   "dataType": ["text"],
#                   "description": "Description of the product.",
#                },
#                {
#                   "name": "Features",
#                   "dataType": ["text"],
#                   "description": "Feature of the product.",
#                }
#             ]
#         }
#     ]
# }

# client.schema.create(product_schema)
# print("Product schema was created.")

Product schema was created.


In [None]:
# view the schme just created
print(client.schema.get())

## Sentence Window Index and Store in Vectore DB

In [15]:
product_index = build_sentence_window_index_vector_DB(
    document = document,
    client = client,
    llm = llm,
    embed_model=embed_model,
    prefix = "Product_sent_win_index"
)
# sentence_index = build_sentence_window_index(
#     document = document,
#     llm = llm,
#     embed_model=embed_model,
#     save_dir="sentence_index"
# )



## Build Weaviate Index

In [18]:
# vector_store = WeaviateVectorStore(weaviate_client=client, class_prefix="Product_index")
# storage_context = StorageContext.from_defaults(vector_store=vector_store)


# product_index = VectorStoreIndex.from_documents([document], storage_context=storage_context, 
#                 service_context = service_context)



# 2. RETRIEVAL

## Build SQL Query Engine

In [16]:
# set up text2SQL prompt
sql_query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["product_table"],
)

## Build Vector Query Engine

In [17]:
vector_query_engine  = get_sentence_window_query_engine(sentence_index = product_index,
                     similarity_top_k=6,
                    rerank_top_n=2,)

# vector_query_engine = product_index.as_query_engine()

## Tell llama index what tool to use

In [18]:
# from llama_index.tools.query_engine import QueryEngineTool

sql_tool = QueryEngineTool.from_defaults(
    query_engine = sql_query_engine,
    description=(
        "Useful for translating a natural language query into a SQL query over a product listing table containing: "
        "Price , Rating, Description, and Features of each product"
    ),
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description="Useful for answering semantic questions about product listing",
)

## Create a router


In [19]:
query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=([sql_tool] + [vector_tool]),
)

## Queries


In [20]:
response = query_engine.query("What is the Rating for the product whose name is Portable Solar Charger?")
# print(str(response))
display_response(response)

**`Final Response:`** The rating for the product named Portable Solar Charger is 4.0.

In [23]:
response = query_engine.query("Tell me about Wireless \
Bluetooth Earbuds products in the product listing ")
display_response(response)

**`Final Response:`** The Ultimate Wireless Bluetooth Earbuds are a top choice for audiophiles who value both sound quality and convenience. With advanced Bluetooth 5.0 technology, these earbuds offer seamless connectivity and crystal-clear audio. They provide high-fidelity sound with deep bass and clear treble, and also feature noise-canceling technology for an immersive listening experience. With 20 hours of playtime on a single charge, you can enjoy your music all day long. The ergonomic design ensures a secure and comfortable fit, and the touch controls make it easy to control playback and take calls. Additionally, these earbuds have an IPX7 waterproof rating, making them suitable for outdoor activities.

In [22]:
print(response)

In [26]:
response = query_engine.query("How many product cost 79.99 in the product listing?")
display_response(response)

**`Final Response:`** There are 2 products in the product listing that cost 79.99.

In [24]:
print(response)

In [27]:
response = query_engine.query("How many product are similar to Wireless Bluetooth Earbuds products?")
display_response(response)

**`Final Response:`** There are 2 products similar to Wireless Bluetooth Earbuds.

In [None]:
rprint(response)

In [28]:
response = query_engine.query("list the products whose descriptions are similar.")
display_response(response)

**`Final Response:`** The products whose descriptions are similar are:
1. Organic Bamboo Bed Sheets
2. Virtual Reality Headset

In [25]:
print(response.metadata)

In [31]:
rprint(data.loc[data.Product_Name.isin(('Organic Bamboo Bed Sheets', 'Virtual Reality Headset')), "Description"].values)

In [29]:
response = query_engine.query("Which product has 'heart rate' in its description")
display_response(response)

**`Final Response:`** The product that has 'heart rate' in its description is the Fitness Tracker Smartwatch.

In [84]:
rprint(response)

In [26]:
response = query_engine.query("What is the sum of all prices for all products whose price is $79.99?")
display_response(response)

**`Final Response:`** The sum of all prices for all products whose price is $79.99 is $159.98.

In [27]:
print(response.metadata)

## Self Correcting query

### Retry Query Engine

In [28]:
query_response_evaluator = RelevancyEvaluator()
retry_query_engine = RetryQueryEngine(
    query_engine, query_response_evaluator
)
retry_response = retry_query_engine.query("What is the sum of all prices for all products whose price is $79.99?")
display_response(retry_response)

**`Final Response:`** The sum of all prices for all products whose price is $79.99 is $159.98.

### Retry Source Query Engine

In [29]:
retry_source_query_engine = RetrySourceQueryEngine(
    query_engine, query_response_evaluator
)
retry_source_response = retry_source_query_engine.query("list the products whose descriptions are similar.")
display_response(retry_source_response)


**`Final Response:`** The products whose descriptions are similar are:
1. Organic Bamboo Bed Sheets
2. Virtual Reality Headset

## Customize Retry Guideline Query Engine

In [30]:
# Guideline eval
guideline_eval = GuidelineEvaluator(
    guidelines=DEFAULT_GUIDELINES
    + "\nThe response should not be overly long.\n"
    "The response should try to summarize where possible.\n"
    "First, answer the question\n"
    "Second provide the reason, why you choose that answer.\n"
)  # just for example

In [31]:
query = "list the products whose descriptions are similar."
typed_response = (
    retry_source_response if isinstance(retry_source_response, Response) else retry_source_response.get_response()
)
eval = guideline_eval.evaluate_response(query, typed_response)
print(f"Guideline eval evaluation result: {eval.feedback}")

feedback_query_transform = FeedbackQueryTransformation(resynthesize_query=True)
transformed_query = feedback_query_transform.run(query, {"evaluation": eval})
print(f"Transformed query: {transformed_query.query_str}")

In [64]:
retry_guideline_query_engine = RetryGuidelineQueryEngine(
    query_engine, guideline_eval, resynthesize_query=True, 
     max_retries=2,)
retry_guideline_response = retry_guideline_query_engine.query(query)
# rprint(retry_guideline_response.response)
display_response(retry_guideline_response)

**`Final Response:`** The products with similar descriptions are the Organic Bamboo Bed Sheets, Virtual Reality Headset, and Organic Herbal Tea Collection.

In [65]:
rprint(retry_guideline_response.response)

In [68]:
rprint(data.loc[data.Product_Name == "Organic Herbal Tea Collection", "Description"].values)

In [69]:
rprint(data.loc[data.Product_Name == "Organic Bamboo Bed Sheets", "Description"].values)

In [70]:
rprint(data.loc[data.Product_Name == "Virtual Reality Headset", "Description"].values)

## Another example of self correcting query

In [72]:
# test another query

query = "What is the sum of all prices for all products whose price is $79.99?"
response = query_engine.query(query)
rprint(str(response))
typed_response = (
    response if isinstance(response, Response) else response.get_response()
)
eval = guideline_eval.evaluate_response(query, typed_response)
print(f"Guideline eval evaluation result: {eval.feedback}")

feedback_query_transform = FeedbackQueryTransformation(resynthesize_query=True)
transformed_query = feedback_query_transform.run(query, {"evaluation": eval})
print(f"Transformed query: {transformed_query.query_str}")

In [73]:
retry_guideline_query_engine = RetryGuidelineQueryEngine(
    query_engine, guideline_eval, resynthesize_query=True,
    max_retries=2,
)
retry_guideline_response = retry_guideline_query_engine.query(query)
rprint(retry_guideline_response.response)