In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
from groq import Groq

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

In [3]:
# Notebook-wide settings.
# db_name: folder where the Chroma vector store is persisted.
# MODEL: chat model id; a low-cost model is chosen because price matters for our company.

db_name = "vector_db"
MODEL = "llama-3.3-70b-versatile"

In [4]:
# Load variables from the .env file, letting them override anything already set in the process
load_dotenv(override=True)

# NOTE(review): the variable is spelled GROK_API_KEY although the client library is Groq —
# confirm the .env file uses this exact spelling. The value is None when the variable is unset.
api_key = os.environ.get('GROK_API_KEY')

In [5]:
# Groq SDK client, authenticated with the key read from the environment
client = Groq(api_key=os.getenv("GROK_API_KEY"))

In [17]:
# Read in documents using LangChain's loaders
# Load every Markdown file found (recursively) under the knowledge-base folder

folders = glob.glob("knowledge-base/")

# With thanks to CG and Jon R, students on the course, for this fix needed for some users
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    # The glob pattern ends with a separator, so os.path.basename(folder) alone would be ""
    # and every document would be tagged with an empty doc_type. normpath strips the
    # trailing separator first, so the folder's own name is used as the tag.
    doc_type = os.path.basename(os.path.normpath(folder))
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        # Tag each document with the folder it was loaded from
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [18]:
# Split the loaded documents into chunks of up to 1000 characters with a 200-character overlap
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

In [8]:
# Number of chunks produced by the text splitter
len(chunks)

3

In [19]:
# Store the chunks in a vector store that keeps a vector embedding alongside each chunk.
# Chroma is a popular open source vector database based on SQLite.

# Embeddings come from the free HuggingFace sentence-transformers model below.
# To use OpenAI embeddings instead, replace the assignment with: embeddings = OpenAIEmbeddings()
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# If a persisted store already exists, drop its collection so the rebuild starts clean
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

# Build the vector store from the chunks and persist it under db_name
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 3 documents


In [20]:
# Fetch a single stored embedding and report its length (the vector dimensionality)

collection = vectorstore._collection  # underlying collection object (private attribute)
sample_embedding = collection.get(
    include=["embeddings"],
    limit=1,
)["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


In [21]:
# Chat model served through Groq; the model id comes from MODEL
llm = ChatGroq(
    model_name=MODEL,
    temperature=0.7,
    groq_api_key=os.environ.get("GROK_API_KEY"),
)

# Conversation memory, stored under the 'chat_history' key as message objects
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
)

# The retriever is an abstraction over the vector store that will be used during RAG
retriever = vectorstore.as_retriever()

# Putting it together: a conversational retrieval chain over the Groq LLM, the retriever and the memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [22]:
# Smoke test: send one question through the chain and print the answer text
query = "i have a query regarding health insurance"
result = conversation_chain.invoke(
    {"question": query},
)
print(result["answer"])

Hello! This is **Pristine** calling about your health insurance needs.  
I’ll just ask you a few quick questions to connect you to the right agent.  
Please press the number on your keypad.  
Please select your type:  
1. New Customer  
2. Renew an existing policy  
3. Claim or Complaint  
4. Other


In [23]:
# System prompt that constrains the assistant to the call flow described in company.md.
# The JSON-flow rule used to be cut off mid-sentence ("...previous answer by the ");
# it now states whose answer drives the next question.
system_message = """
You are a call flow assistant for Pristine Health Insurance.
- Use ONLY the flow in company.md file to talk to the customer.
- Do NOT explain or repeat the flow.
- use the json flow given in the company.md to reply dynamically to the users answers. the next question depends on the previous answer by the customer.
- Do NOT ask the same question again if the customer already answered.
- If the user's choice leads to an END agent (Renewals, Claims, or Other), IMMEDIATELY say the final message and stop.
- If the user's choice is New Customer, continue only with Step 2, then Step 3 if needed.
- Never invent extra steps or questions outside company.md.
- Once the correct agent is determined, say the final message and stop. """

# Prompt template for the answering step: the fixed system rules, a second system
# message carrying the {context} placeholder, then the user's {question}.

prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_message),
    SystemMessagePromptTemplate.from_template("Context:\n{context}"),
    HumanMessagePromptTemplate.from_template("{question}"),
])

In [24]:
# Start from a fresh conversation memory for the prompted chain
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
)

# Rebuild the chain over the same Groq LLM and retriever, this time passing the
# custom prompt to the document-combining step
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt},
)

In [25]:
def chat(message, history):
    """Answer one user message through the conversation chain.

    Args:
        message: The text the user just sent.
        history: The chat history supplied by the UI. The chain keeps its own
            memory, so this is only used to detect the start of a conversation.

    Returns:
        The "answer" entry of the chain's result.
    """
    # An empty history means the UI is starting a new (or cleared) conversation.
    # Reset the chain's memory so the previous call flow does not carry over.
    chain_memory = getattr(conversation_chain, "memory", None)
    if not history and chain_memory is not None:
        chain_memory.clear()

    result = conversation_chain.invoke({"question": message})
    return result["answer"]

In [26]:
# Build the Gradio chat UI around chat() and launch it, opening a browser tab.
# Note: view is bound to whatever launch() returns, not to the ChatInterface object.
view = gr.ChatInterface(
    fn=chat,
    type="messages",
).launch(inbrowser=True)


* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
