In [None]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
# Chat model name handed to ChatOpenAI in the cells below.
MODEL = "gpt-4o-mini"
# Directory passed to Chroma as persist_directory for the local vector store.
db_name = "vector_db"

In [6]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Keep the key in a variable; the literal is only a placeholder that is used
# when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [35]:
# Each sub-directory of the knowledge base is treated as one document type;
# its name is stored on every loaded document as metadata["doc_type"].
# Plain files sitting directly under ../knowledge-base are skipped, because
# DirectoryLoader only accepts directory paths.
folders = [path for path in glob.glob("../knowledge-base/*") if os.path.isdir(path)]

text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    # Recursively pick up every Markdown file below this folder.
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [36]:
# Show how many documents were loaded, plus the first one as a sample.
len(documents),documents[0]

(31,
 Document(metadata={'source': '../knowledge-base/company/about.md', 'doc_type': 'company'}, page_content="# About Insurellm\n\nInsurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. It's first product was Markellm, the marketplace connecting consumers with insurance providers.\nIt rapidly expanded, adding new products and clients, reaching 200 emmployees by 2024 with 12 offices across the US."))

In [37]:
# Break the loaded documents into chunks, configured for a chunk size of
# 1500 characters and an overlap of 200 characters.
text_splitter = CharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [38]:
# Number of chunks produced by the splitter.
len(chunks)

84

In [14]:
# Gather the distinct doc_type labels found on the chunks and report them.
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
print(f"There are {len(doc_types)} document types  : {', '.join(doc_types)}")

There are 4 document types  : employees, contracts, company, products


In [9]:
# Embedding client (library defaults) used when building the Chroma store and
# the Pinecone upload in the cells below.
embeddings = OpenAIEmbeddings()

In [16]:
# Drop any collection left over from an earlier run so that re-executing this
# cell rebuilds the store instead of appending a second copy of every chunk.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Embed all chunks and persist them under db_name.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with Chromadb {vectorstore._collection.count()} documents")

Vectorstore created with Chromadb 84 documents


In [17]:
# Fetch a single stored record and measure the length of its embedding.
collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [27]:
# Inspect one full record: id, embedding, document text and metadata.
collection.get(include=["metadatas",'embeddings','documents'],limit=1)

{'ids': ['5eaee9bd-b184-445c-b0d8-966b12ad96c2'],
 'embeddings': array([[-0.01778954,  0.00708062, -0.02661662, ..., -0.01271939,
         -0.00526646, -0.0275237 ]]),
 'documents': ["# About Insurellm\n\nInsurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. It's first product was Markellm, the marketplace connecting consumers with insurance providers.\nIt rapidly expanded, adding new products and clients, reaching 200 emmployees by 2024 with 12 offices across the US."],
 'uris': None,
 'included': ['metadatas', 'embeddings', 'documents'],
 'data': None,
 'metadatas': [{'doc_type': 'company',
   'source': '../knowledge-base/company/about.md'}]}

In [28]:
# Pull every stored vector together with its text and metadata for plotting.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: from here on `documents` holds the stored chunk texts (plain strings),
# replacing the list of Document objects loaded from disk earlier.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One marker colour per document type. A type without an entry falls back to
# gray instead of aborting the cell with a ValueError.
color_by_doc_type = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [color_by_doc_type.get(t, 'gray') for t in doc_types]

In [31]:
# Spot-check the plotting inputs: the vector array, its length, and the first
# text / doc type / colour.
vectors,len(vectors),documents[0],doc_types[0],colors[0]

(array([[-0.01778954,  0.00708062, -0.02661662, ..., -0.01271939,
         -0.00526646, -0.0275237 ],
        [-0.0153065 ,  0.00605007, -0.00880564, ...,  0.00327924,
         -0.011951  , -0.02517642],
        [-0.02097209, -0.02410661,  0.0059536 , ...,  0.00891878,
         -0.00972234, -0.01907278],
        ...,
        [ 0.00082623, -0.00383343, -0.02500761, ...,  0.00180967,
          0.00498413, -0.00151615],
        [-0.01270426,  0.00397226, -0.00321265, ...,  0.00042031,
         -0.00021277, -0.0134569 ],
        [-0.01124403, -0.0004906 ,  0.0016955 , ..., -0.00068949,
         -0.01923051, -0.0187104 ]]),
 84,
 "# About Insurellm\n\nInsurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. It's first product was Markellm, the marketplace connecting consumers with insurance providers.\nIt rapidly expanded, adding new products and clients, reaching 200 emmployees by 2024 with 12 offices acro

In [32]:
# Reduce the embeddings to two dimensions with t-SNE (fixed random_state).
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# One marker per chunk, coloured by document type; the hover text shows the
# type and the first 100 characters of the chunk.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.9),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# A 2D figure takes its axis titles from xaxis_title / yaxis_title. `scene`
# only configures 3D axes, so titles placed there never show on this plot.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [33]:
# Reduce the embeddings to three dimensions with t-SNE (fixed random_state).
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover text: document type plus the first 100 characters of each chunk.
hover_labels = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]

scatter_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[scatter_3d])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [10]:
# Open the Chroma store persisted under db_name instead of rebuilding it from
# the chunks.
vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

In [11]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Chat model that generates the answers.
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Conversation buffer exposed to the chain under the 'chat_history' key.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Retriever over the vector store, using its default search settings.
retriever = vectorstore.as_retriever()

# Combine model, retriever and memory into one conversational retrieval chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [17]:
# Chain.run is deprecated in current LangChain releases; invoke() is the
# supported entry point and returns a dict whose 'answer' key holds the reply.
chain_output = conversation_chain.invoke({'question': "What products does the company offer?"})
response = chain_output['answer']
print(response)

I don't know.


In [20]:
def chat_with_docs(message,history):
    """Answer ``message`` using documents retrieved from the module-level ``vectorstore``.

    Args:
        message: The user's latest question.
        history: Earlier turns as supplied by ``gr.ChatInterface``. Both
            ``[user, assistant]`` pairs and ``{"role", "content"}`` message
            dicts are accepted; ``None`` or an empty list means no context.

    Returns:
        The answer text produced by the retrieval chain.
    """
    # A new chain is built on every call, so conversation context has to come
    # from the UI history. Convert it to the (question, answer) tuples the
    # chain accepts as 'chat_history'; incomplete turns are skipped.
    chat_history = []
    pending_question = None
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role == "user":
                pending_question = content
            elif role == "assistant" and pending_question is not None:
                chat_history.append((str(pending_question), str(content)))
                pending_question = None
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            if turn[0] is not None and turn[1] is not None:
                chat_history.append((str(turn[0]), str(turn[1])))

    llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

    retriever = vectorstore.as_retriever()

    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

    # invoke() replaces the deprecated run(); the reply is under 'answer'.
    result = conversation_chain.invoke({'question': message, 'chat_history': chat_history})

    return result['answer']

In [21]:
# Build the chat UI around chat_with_docs, then launch it with a public share
# link and inbrowser=True. `view` holds whatever launch() returns.
chat_ui = gr.ChatInterface(
    fn=chat_with_docs,
    title="Chat with Company Documents",
    description="Ask questions about the company's products, employees, contracts, and more.",
)
view = chat_ui.launch(inbrowser=True, share=True)

  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://8960f12f399525b538.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [23]:
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

In [None]:
# Load .env, read PINECONE_API_KEY from the environment and create the
# Pinecone client with it.
load_dotenv()
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

# Name of the Pinecone index used by the cells below.
index_name = 'llm-pinecone-practice'

In [32]:
# List the indexes visible to this API key.
pc.list_indexes()

[
    {
        "name": "llm-pinecone-practice",
        "metric": "cosine",
        "host": "llm-pinecone-practice-f49vysr.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    }
]

In [33]:
# Handle to the index named by index_name (not referenced again in the cells
# shown here).
index = pc.Index(index_name)

In [39]:
# Embed every chunk and upload it to the Pinecone index named by index_name.
# No namespace is passed here.
# NOTE(review): a later cell opens the same index with namespace="default" —
# confirm that both cells target the same namespace.
vectorstore2 = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [40]:
# Display the retriever built from the Pinecone-backed store.
vectorstore2.as_retriever()

VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x7ea779913fa0>, search_kwargs={})

In [None]:
def chat_with_docs_with_pinecone(message,history):
    """Answer ``message`` using documents retrieved from the Pinecone store ``vectorstore2``.

    Args:
        message: The user's latest question.
        history: Earlier turns as supplied by ``gr.ChatInterface``. Both
            ``[user, assistant]`` pairs and ``{"role", "content"}`` message
            dicts are accepted; ``None`` or an empty list means no context.

    Returns:
        The answer text produced by the retrieval chain.
    """
    # A new chain is built on every call, so conversation context has to come
    # from the UI history. Convert it to the (question, answer) tuples the
    # chain accepts as 'chat_history'; incomplete turns are skipped.
    chat_history = []
    pending_question = None
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role == "user":
                pending_question = content
            elif role == "assistant" and pending_question is not None:
                chat_history.append((str(pending_question), str(content)))
                pending_question = None
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            if turn[0] is not None and turn[1] is not None:
                chat_history.append((str(turn[0]), str(turn[1])))

    llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

    retriever = vectorstore2.as_retriever()

    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

    # invoke() replaces the deprecated run(); the reply is under 'answer'.
    result = conversation_chain.invoke({'question': message, 'chat_history': chat_history})

    return result['answer']

In [43]:
# This UI is titled as the Pinecone variant, so it must call the
# Pinecone-backed handler rather than chat_with_docs (the Chroma one).
view = gr.ChatInterface(
    fn=chat_with_docs_with_pinecone,
    title="Chat with Company Documents with Pinecone as vector Store",
    description="Ask questions about the company's products, employees, contracts, and more.",

).launch(share=True,inbrowser=True)

  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://c3b156e4d5c829ba82.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [3]:
import os
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Connect to the already-populated Pinecone index without uploading anything.
load_dotenv()

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index_name = 'llm-pinecone-practice'

embeddings = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"))

# NOTE: this rebinds `vectorstore` (and `embeddings`) to the Pinecone-backed
# objects for everything that runs after this cell.
vectorstore = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
    namespace="default",  # change if you used a different namespace
)


In [4]:
# Retrieve the three closest chunks for a sample question and print each one
# followed by a divider line.
query = "who is ceo"
docs = vectorstore.similarity_search(query, k=3)

divider = "-" * 50
for i, doc in enumerate(docs, start=1):
    print(f"Result {i}:")
    print(doc.page_content)
    print(divider)


Result 1:
---

**Signatures:**

_________________________________  
**[Name]**  
**Title**: CEO  
**Insurellm, Inc.**

_________________________________  
**[Name]**  
**Title**: COO  
**GreenValley Insurance, LLC**  

---

This agreement represents the complete understanding of both parties regarding the use of the Homellm product and supersedes any prior agreements or communications.
--------------------------------------------------
Result 2:
# Avery Lancaster

## Summary
- **Date of Birth**: March 15, 1985  
- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  
- **Location**: San Francisco, California  

## Insurellm Career Progression
- **2015 - Present**: Co-Founder & CEO  
  Avery Lancaster co-founded Insurellm in 2015 and has since guided the company to its current position as a leading Insurance Tech provider. Avery is known for her innovative leadership strategies and risk management expertise that have catapulted the company into the mainstream insurance market.  

