In [1]:
# Import necessary libraries and modules
import os
import glob
from dotenv import load_dotenv
import gradio as gr

# Imports for Langchain, Plotly and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.callbacks import StdOutCallbackHandler

In [2]:
# Configuration and environment settings for the notebook
# MODEL: name of the chat model handed to ChatOpenAI further down
# DB_NAME: directory in which the Chroma vector store is persisted
MODEL = "gpt-4o-mini"
DB_NAME = "vector_db"

# Load environment variables from a local .env file
# (presumably where the OpenAI API key lives - confirm for your setup)
load_dotenv()

# Every sub-folder of knowledge-base/ is treated as one document type.
# Keep only real directories: a stray file matched by the wildcard would be
# handed to DirectoryLoader, which expects a directory. Sorting makes the
# load order independent of the filesystem's listing order.
folders = sorted(
    path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)
)

def add_metadata(doc, doc_type):
    """Tag a document with its type and hand the same object back.

    Args:
        doc: Object exposing a mutable ``metadata`` mapping; it is modified
            in place.
        doc_type: Value stored under the ``'doc_type'`` metadata key.

    Returns:
        The same ``doc`` instance that was passed in.
    """
    doc.metadata.update({'doc_type': doc_type})
    return doc

# Load every Markdown file under each knowledge-base folder and tag it with
# the folder's name, which serves as the document type.
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={'encoding': 'utf-8'},
    )
    folder_docs = loader.load()
    # add_metadata mutates and returns each document, so map() yields the
    # same tagged objects the loader produced.
    documents.extend(map(lambda doc: add_metadata(doc, doc_type), folder_docs))


In [3]:
# Split the loaded documents into overlapping chunks for embedding.
# chunk_size / chunk_overlap are the splitter's target size and overlap; as the
# warning in this cell's output shows, a chunk can still exceed the target.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

# Summarize what was produced: chunk count and the distinct document types
found_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Total Chunks : {len(chunks)}")
print(f"Document Types found : {found_types}")

Created a chunk of size 1088, which is longer than the specified 1000


Total Chunks : 123
Document Types found : {'products', 'employees', 'company', 'contracts'}


In [4]:
# Build the vector store: each chunk is stored alongside its embedding vector.
# Chroma is a popular open source Vector DB based on SQLite.

embeddings = OpenAIEmbeddings()     # OpenAI

# embeddings_hf = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # Free Vector Embeddings from HF Sentence Transformers

# Start from a clean slate: if a persisted store already exists on disk,
# drop its collection so re-running this cell does not stack duplicates.
if os.path.exists(DB_NAME):
    stale_store = Chroma(persist_directory=DB_NAME, embedding_function=embeddings)
    stale_store.delete_collection()

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=DB_NAME,
)
stored_count = vectorstore._collection.count()
print(f"VectorStore created with {stored_count} Documents.")

VectorStore created with 123 Documents.


In [5]:
# Inspect the vector store: how many vectors it holds and their dimensionality
collection = vectorstore._collection
count = collection.count()

# Fetch a single record, embeddings only, and measure the vector's length
sample = collection.get(limit=1, include=['embeddings'])
sample_embedding = sample['embeddings'][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store.")

There are 123 vectors with 1,536 dimensions in the vector store.


In [6]:
# Visualize the Vector Store - Prework
# Pull every embedding, chunk text and metadata record out of the collection
# and derive one marker colour per chunk from its document type.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` from the loaded Document objects to the raw
# chunk strings returned by the collection. The plotting cells read this name,
# so it is kept; re-run the loading cell before re-running the splitting cell.
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# Explicit type -> colour table. A dict lookup with a fallback replaces the
# previous list.index() call, which raised ValueError as soon as the knowledge
# base contained a folder other than these four.
DOC_TYPE_COLORS = {
    'products': 'blue',
    'employees': 'green',
    'contracts': 'red',
    'company': 'orange',
}
FALLBACK_COLOR = 'gray'  # used for any document type not listed above
colors = [DOC_TYPE_COLORS.get(t, FALLBACK_COLOR) for t in doc_types]

In [7]:
# Reduce the dimensionality of vectors to 2D using T-SNE
# (random_state is fixed so the projection is seeded the same way on each run)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create 2D scatter plot: one marker per chunk, coloured by document type,
# with the type and the first 100 characters of the chunk shown on hover
fig = go.Figure(data=[go.Scatter(
    x = reduced_vectors[:, 0],
    y = reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type : {t}<br>Text: {d[:100]}" for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # `scene` only configures 3D subplots; the axes of a 2D scatter are
    # titled through the cartesian xaxis / yaxis layout attributes.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [8]:
# 3D variant of the T-SNE projection of the stored vectors

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# One hover label per point: document type plus the first 100 characters of text
hover_labels_3d = [
    f"Type: {t}<br>Text: {d[:100]}..."
    for t, d in zip(doc_types, documents)
]

# Build the marker trace first, then wrap it in a figure
points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels_3d,
    hoverinfo='text',
)
fig = go.Figure(data=[points_3d])

# Title, axis labels for the 3D scene, and overall figure geometry
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=800,
    height=600,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [9]:
# Use Langchain to bring everything together:
# chat model + conversation memory + vector-store retriever -> one RAG chain
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Alternatively, can use OLLAMA
# llm = ChatOpenAI(temperature=0.7, model_name='mistral:7b', base_url='http://localhost:11434/v1', api_key='ollama')

# Conversation memory, stored under the 'chat_history' key as message objects
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
)

# Ask the retriever for k=25 results per query
search_options = {'k': 25}
retriever = vectorstore.as_retriever(search_kwargs=search_options)

# The stdout callback prints chain entry and the formatted prompt while the
# chain runs (visible in the output of the chat-launch cell).
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    callbacks=[StdOutCallbackHandler()],
)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [10]:
# Use Gradio for Chat Interface
def chat(question, history):
    """Answer one chat message by running it through the conversation chain.

    Args:
        question: The user's latest message.
        history: Prior messages supplied by the chat UI. Not read here; the
            chain is constructed with its own memory object.

    Returns:
        The value stored under ``'answer'`` in the chain's result.
    """
    return conversation_chain.invoke({'question': question})['answer']

In [11]:
# Build the Gradio chat UI around `chat`, then launch it in a browser tab.
# `view` holds whatever launch() returns, as before.
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Alex Chen continues to be a vital asset at Insurellm, contributing significantly to innovative backend solutions that help shape the future of insurance technology.

- **Professional Development Goals**:  
  - Emily Tran aims to become a Marketing Manager within the next two years, focusing on leading larger campaigns and developing junior team members.

- **Hobbies**:  
  - Emily enjoys photography and regularly contributes to Insurellm's social media content with her own high-quality images.
  - She is also passionate about sustainability and organizes monthly team volunteer events for environmental awarenes