In [None]:
from google.colab import userdata

# Look up the OpenAI secret once up front (presumably as an early check that the
# notebook has access to it -- confirm). The trailing `;` stops IPython from
# rendering the returned key into the saved cell output.
userdata.get('OPENAI_API_TOKEN');

In [None]:
# Attach Google Drive to the Colab runtime; every path configured below
# lives under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Project locations on the mounted Drive. Everything is derived from BASE_DIR,
# so relocating the project only requires changing this one value.
BASE_DIR = "/content/drive/MyDrive/Databricks_genai_hackathon_jan2024"

# Source CSV with the grocery product catalogue.
DATA_FILE_PATH = "/".join([BASE_DIR, "datasets", "WMT_Grocery_202209.csv"])
# Directory in which the Chroma vector store is persisted.
VECTORDB_PATH = "/".join([BASE_DIR, "chromadb1"])
# Directory holding the locally saved sentence-transformers model.
EMBEDDING_MODEL_PATH = "/".join([BASE_DIR, "embedding_model"])

# Installs

In [None]:
!pip install --upgrade --quiet langchain langchain-community langchainhub langchain-openai openai transformers chromadb gradio tiktoken sentence-transformers

In [None]:
# Adds langsmith; langchain and langchainhub are also named in the previous
# install cell. NOTE(review): `--quiet` is passed twice -- confirm this is intended.
%pip install --upgrade --quiet  langchain langsmith langchainhub --quiet

# Index

In [None]:
from langsmith import Client

# LangSmith client; the API key is read from Colab's secret store at call time
# so it never appears in the notebook source.
client = Client(
    api_key=userdata.get('LANGCHAIN_API_KEY'),
)

In [None]:
import pandas
from langchain import hub
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_core.messages import SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.tracers.context import tracing_v2_enabled
from langchain_openai import ChatOpenAI
from sentence_transformers import SentenceTransformer

In [None]:
# Read the product data and reduce it to a small (id, text) sample for indexing.
#
# Only the two columns used downstream are parsed. This avoids loading and
# type-inferring the rest of the CSV (the recorded run shows a warning raised by
# `read_csv`, presumably a mixed-dtype warning on an unused column -- TODO confirm).
df = (
    pandas.read_csv(DATA_FILE_PATH, usecols=["index", "PRODUCT_NAME"])
    # `usecols` keeps file order, so select explicitly to fix the column order.
    .loc[:, ["index", "PRODUCT_NAME"]]
    # Keep the first occurrence of every distinct product name.
    .drop_duplicates(subset=["PRODUCT_NAME"])
    # Work on the first 100 unique products only.
    .head(100)
    # `rename` returns a new frame, so no in-place mutation is needed.
    .rename(columns={'index': 'id', 'PRODUCT_NAME': 'text'})
)
print(df.shape)
df.head()

  df = pandas.read_csv(DATA_FILE_PATH)


(100, 2)


Unnamed: 0,id,text
0,0,"Marketside Roasted Red Pepper Hummus, 10 Oz"
1,1,"Marketside Roasted Garlic Hummus, 10 Oz"
2,2,"Marketside Classic Hummus, 10 Oz"
3,3,"Marketside Everything Hummus, 10 oz"
4,4,"Price's Jalapeno Dip, 12 Oz."


In [None]:
!rm -rf /content/drive/MyDrive/Databricks_genai_hackathon_jan2024/chromadb1
!mkdir -p /content/drive/MyDrive/Databricks_genai_hackathon_jan2024/chromadb1

In [None]:
!rm -rf /content/drive/MyDrive/Databricks_genai_hackathon_jan2024/embedding_model
!mkdir -p /content/drive/MyDrive/Databricks_genai_hackathon_jan2024/embedding_model

In [None]:
# Build the Chroma vector store for the product sample.

# Step 1: fetch the sentence-transformers model and save a copy under
# EMBEDDING_MODEL_PATH.
original_model = SentenceTransformer('all-MiniLM-L12-v2')
original_model.save(EMBEDDING_MODEL_PATH)

# Step 2: load that saved copy through LangChain's embedding wrapper.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)
# Alternative backend, currently disabled:
# embedding_model = OpenAIEmbeddings(openai_api_key=userdata.get('OPENAI_API_TOKEN'))

# Step 3: turn the dataframe rows into documents; the "text" column is used as
# the page content.
product_loader = DataFrameLoader(df, page_content_column="text")
documents = product_loader.load()

# Step 4: embed the documents into a Chroma store rooted at VECTORDB_PATH and
# persist it.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=VECTORDB_PATH,
)
vectordb.persist()

# Simple Bot

A simple RAG bot

In [None]:
# Building blocks of the RAG pipeline.

# Retriever over the vector store, configured with k=10.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 10},
)

# Community RAG prompt pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Chat model; temperature is set to 0.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-1106",
    temperature=0,
    openai_api_key=userdata.get('OPENAI_API_TOKEN'),
)

In [None]:
# Sanity-check the retriever on its own. `invoke` is the Runnable entry point
# (the same one the chains below use) and replaces the deprecated
# `get_relevant_documents`.
retriever.invoke("Hummus")

[Document(page_content='Marketside Classic Hummus, 10 Oz', metadata={'id': 2}),
 Document(page_content='Marketside Pine Nut Hummus, 10 oz', metadata={'id': 11}),
 Document(page_content='Marketside Roasted Red Pepper Hummus, 10 Oz', metadata={'id': 0}),
 Document(page_content='Marketside Everything Hummus, 10 oz', metadata={'id': 3}),
 Document(page_content='Marketside Spicy Hummus, 10 oz', metadata={'id': 10}),
 Document(page_content='Marketside Roasted Garlic Hummus, 10 Oz', metadata={'id': 1}),
 Document(page_content='Fresh Cravings Classic Hummus 10oz', metadata={'id': 16}),
 Document(page_content='Fresh Cravings Roasted Red Pepper Hummus 10oz', metadata={'id': 8}),
 Document(page_content='Fresh Cravings Roasted Garlic Hummus 10 oz', metadata={'id': 14}),
 Document(page_content='Fresh Cravings Everything Bagel Hummus 10oz', metadata={'id': 13})]

In [None]:
# Display the prompt pulled from the hub to inspect its template text and
# input variables.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [None]:
# First RAG chain: the incoming question is passed through unchanged and also
# sent to the retriever to fill "context"; both feed the hub prompt, then the
# chat model, and the model output is parsed to a plain string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Preview the first five rows of the product sample.
df.head(n=5)

Unnamed: 0,id,text
0,0,"Marketside Roasted Red Pepper Hummus, 10 Oz"
1,1,"Marketside Roasted Garlic Hummus, 10 Oz"
2,2,"Marketside Classic Hummus, 10 Oz"
3,3,"Marketside Everything Hummus, 10 oz"
4,4,"Price's Jalapeno Dip, 12 Oz."


In [None]:
# Ask the hub-prompt chain a listing question. In the recorded run the answer
# names fewer products than the retriever returned for "Hummus".
hummus_question = "List all the available hummus items?"
rag_chain.invoke(hummus_question)

'The available hummus items are Marketside Everything Hummus, Marketside Classic Hummus, Marketside Pine Nut Hummus, Marketside Roasted Garlic Hummus, and Marketside Roasted Red Pepper Hummus.'

In [None]:
# Rows whose product text contains "Hummus" (case-sensitive), taken directly
# from the dataframe for comparison with the chain's answer.
df.loc[df["text"].str.contains('Hummus')]

Unnamed: 0,id,text
0,0,"Marketside Roasted Red Pepper Hummus, 10 Oz"
1,1,"Marketside Roasted Garlic Hummus, 10 Oz"
2,2,"Marketside Classic Hummus, 10 Oz"
3,3,"Marketside Everything Hummus, 10 oz"
8,8,Fresh Cravings Roasted Red Pepper Hummus 10oz
10,10,"Marketside Spicy Hummus, 10 oz"
11,11,"Marketside Pine Nut Hummus, 10 oz"
13,13,Fresh Cravings Everything Bagel Hummus 10oz
14,14,Fresh Cravings Roasted Garlic Hummus 10 oz
15,15,Fresh Cravings Honey Jalapeno Hummus 10oz


In [None]:
# Second RAG chain with a custom prompt that tells the model to use every
# relevant finding from the context and to answer with product names only.
# `PromptTemplate` is imported in the notebook's import cell.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use all the relevant findings from the context in your answer. Once you find the answer, just say the product names.

{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template)

# Same pipeline shape as the first chain; only the prompt differs.
# NOTE: this rebinds `rag_chain`, replacing the hub-prompt chain defined earlier.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

print(rag_chain.invoke("Hummus"))

Marketside Classic Hummus, Marketside Pine Nut Hummus, Marketside Roasted Red Pepper Hummus, Marketside Everything Hummus, Marketside Spicy Hummus, Marketside Roasted Garlic Hummus, Fresh Cravings Classic Hummus, Fresh Cravings Roasted Red Pepper Hummus, Fresh Cravings Roasted Garlic Hummus, Fresh Cravings Everything Bagel Hummus


In [None]:
# To Dos:

# 1. Experiment with the prompts so that the LLM returns all the available options.
# 2. How can we make the LLM behave like an order-fulfilling agent?
# 2.1. Define the desired flow in the prompt, e.g. "greeting" -> "receive the order" -> "show the options (if needed)" -> "clarify the options (how should this work when there are multiple products, each with different options?)" -> "follow up (i.e. confirm the basket)" -> "finalize (e.g. do you need anything else?)" -> "create the basket": FINISH

# 3. Optimizations:
    # 3.1. Latency
    # 3.2. LLM model choice
    # 3.3. Vector DB  choice
    # 3.4. Embedding model choice
    # 3.5. Prompt experiments -> which prompts give the best answers
    # 3.6. Document loading -> i.e. chunking
    # 3.7. What's the best way to index tabular data?

# 4. Implementation
# 4.1. Databricks tools (once we get the access)
# 4.2. LlamaIndex