In [1]:
import chromadb
CHROMA_HOST = '13.234.115.62'
CHROMA_PORT = 8000

chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)
collection_name = "whatsapp"
collection = chroma_client.get_or_create_collection(name=collection_name)

In [2]:
import os
import fitz

def read_pdf_files_from_folder(folder_path):
    file_data = []

    if not os.path.exists(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return file_data

    pdf_files = [f for f in os.listdir(folder_path) if f.endswith(".pdf")]
    if not pdf_files:
        print(f"No PDF files found in {folder_path}.")
        return file_data

    for file_name in pdf_files:
        try:
            doc = fitz.open(os.path.join(folder_path, file_name))
            text = ""
            for page in doc:
                text += page.get_text()
            doc.close()
            file_data.append({"file_name": file_name, "content": text})
        except Exception as e:
            print(f"Failed to read {file_name}: {e}")

    if not file_data:
        print("PDF files found, but none could be read.")
    return file_data

folder_path = "documents"  # Ensure this is the correct path to your documents
file_data = read_pdf_files_from_folder(folder_path)

if file_data:
    # Print the extracted data
    for data in file_data:
        print(f"File Name: {data['file_name']}")
        print(f"Content: {data['content'][:500]}...\n")
else:
    print("No data extracted from PDF files.")


File Name: Zappy-AI.pdf
Content:  
 
Business Information 
 
Business Name: Zappy 
Type/Industry: SaaS, IT services 
Website: www.heyzappy.com 
Email:info@heyzappy.com 
Phone Number: +94768708702 
WhatsApp Number: +94768708702 
Location:Sri Lanka office - Colombo two 
Malaysia Office - Bangsar, Kuala Lumpur 
Brief Description: Zappy, a venture backed homegrown business communication platform 
based in Malaysia and Sri Lanka that offers AI salesperson for your business. Zappy is the 
Salesperson you’ve been looking for, Reply ac...



In [3]:
import fitz  # Ensure this is PyMuPDF

try:
    doc = fitz.open('documents/Zappy-AI.pdf')
    print("Number of pages: ", len(doc))
    doc.close()
except Exception as e:
    print("An error occurred:", e)


Number of pages:  3


In [4]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
documents = []
embeddings = []
metadatas = []
ids = []
for index,data in enumerate(file_data):
  documents.append(data['content'])
  embeding = model.encode(data['content']).tolist()
  embeddings.append(embeding)
  metadatas.append({'source':data['file_name']})
  ids.append(str(index+1))

In [6]:
bot_emb = chroma_client.get_collection("whatsapp")
bot_emb.add(
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
)

In [7]:
query = "What is zappyAI"
input_em = model.encode(query).tolist()

results = bot_emb.query(
    query_embeddings=[input_em],
    n_results=1
)
results

{'ids': [['1']],
 'distances': [[9.07656585126657]],
 'embeddings': None,
 'metadatas': [[{'source': 'Zappy-AI.pdf'}]],
 'documents': [[" \n \nBusiness Information \n \nBusiness Name: Zappy \nType/Industry: SaaS, IT services \nWebsite: www.heyzappy.com \nEmail:info@heyzappy.com \nPhone Number: +94768708702 \nWhatsApp Number: +94768708702 \nLocation:Sri Lanka office - Colombo two \nMalaysia Office - Bangsar, Kuala Lumpur \nBrief Description: Zappy, a venture backed homegrown business communication platform \nbased in Malaysia and Sri Lanka that offers AI salesperson for your business. Zappy is the \nSalesperson you’ve been looking for, Reply across socials, send emails, collect feedback, make \nphone calls and schedule meetings, even while you’re asleep ;) \n \n \nChatbot Configuration \n \nDesired Chatbot Name and Identity. Act as Zappy, the AI agent replying to customers \nwho have inquired about Zappy on social media.  \n \nDesired Chatbot tone: professional and friendly, a natural, 

In [8]:
query = "Can I know contact details of ZappyAI"
input_em = model.encode(query).tolist()

results = bot_emb.query(
    query_embeddings=[input_em],
    n_results=2,
)
results

{'ids': [['1']],
 'distances': [[8.604428070179036]],
 'embeddings': None,
 'metadatas': [[{'source': 'Zappy-AI.pdf'}]],
 'documents': [[" \n \nBusiness Information \n \nBusiness Name: Zappy \nType/Industry: SaaS, IT services \nWebsite: www.heyzappy.com \nEmail:info@heyzappy.com \nPhone Number: +94768708702 \nWhatsApp Number: +94768708702 \nLocation:Sri Lanka office - Colombo two \nMalaysia Office - Bangsar, Kuala Lumpur \nBrief Description: Zappy, a venture backed homegrown business communication platform \nbased in Malaysia and Sri Lanka that offers AI salesperson for your business. Zappy is the \nSalesperson you’ve been looking for, Reply across socials, send emails, collect feedback, make \nphone calls and schedule meetings, even while you’re asleep ;) \n \n \nChatbot Configuration \n \nDesired Chatbot Name and Identity. Act as Zappy, the AI agent replying to customers \nwho have inquired about Zappy on social media.  \n \nDesired Chatbot tone: professional and friendly, a natural,

In [9]:
chroma_client.list_collections()

[Collection(name=whatsapp), Collection(name=bot)]

In [10]:
chroma_client.delete_collection("whatsappnew")

Exception: {"error":"ValueError('Collection whatsappnew does not exist.')"}

In [11]:
chroma_client.list_collections()

[Collection(name=whatsapp), Collection(name=bot)]

In [13]:
collection = chroma_client.get_collection(name="whatsapp")

In [14]:
collection.count()

1

In [15]:
collection.peek()

{'ids': ['1'],
 'embeddings': [[-0.05166195333003998,
   -0.1263730376958847,
   -0.01722763478755951,
   -0.23875747621059418,
   0.193697988986969,
   0.034656185656785965,
   0.25450244545936584,
   0.05561080574989319,
   0.005254101008176804,
   0.2757863998413086,
   -0.016979383304715157,
   0.05997149646282196,
   0.08879131078720093,
   0.09548243135213852,
   0.2015741467475891,
   0.007766691967844963,
   0.056731536984443665,
   -0.21488693356513977,
   -0.12418613582849503,
   0.012278859503567219,
   -0.013434664346277714,
   -0.014247418381273746,
   -0.010483130812644958,
   -0.1278628557920456,
   -0.08508609980344772,
   -0.06472435593605042,
   0.058843016624450684,
   0.059498630464076996,
   -0.02873319201171398,
   -0.0350230447947979,
   0.23460014164447784,
   0.05201917886734009,
   0.2719438970088959,
   0.09439072012901306,
   0.00957559421658516,
   -0.04441637545824051,
   0.07823091000318527,
   -0.09482244402170181,
   0.0007814238779246807,
   0.27325522

In [16]:
collection.get()

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'source': 'Zappy-AI.pdf'}],
 'documents': [" \n \nBusiness Information \n \nBusiness Name: Zappy \nType/Industry: SaaS, IT services \nWebsite: www.heyzappy.com \nEmail:info@heyzappy.com \nPhone Number: +94768708702 \nWhatsApp Number: +94768708702 \nLocation:Sri Lanka office - Colombo two \nMalaysia Office - Bangsar, Kuala Lumpur \nBrief Description: Zappy, a venture backed homegrown business communication platform \nbased in Malaysia and Sri Lanka that offers AI salesperson for your business. Zappy is the \nSalesperson you’ve been looking for, Reply across socials, send emails, collect feedback, make \nphone calls and schedule meetings, even while you’re asleep ;) \n \n \nChatbot Configuration \n \nDesired Chatbot Name and Identity. Act as Zappy, the AI agent replying to customers \nwho have inquired about Zappy on social media.  \n \nDesired Chatbot tone: professional and friendly, a natural, day-to-day conversational style \nthat ma

In [24]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2"
)

chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)

vector_db = Chroma(
    client=chroma_client,
    collection_name="bot",
    embedding_function=embedding_function
)

print("there are", vector_db._collection.get(), "in the collection")

there are {'ids': ['1'], 'embeddings': None, 'metadatas': [{'source': 'Zappy-AI.pdf'}], 'documents': [" \n \nBusiness Information \n \nBusiness Name: Zappy \nType/Industry: SaaS, IT services \nWebsite: www.heyzappy.com \nEmail:info@heyzappy.com \nPhone Number: +94768708702 \nWhatsApp Number: +94768708702 \nLocation:Sri Lanka office - Colombo two \nMalaysia Office - Bangsar, Kuala Lumpur \nBrief Description: Zappy, a venture backed homegrown business communication platform \nbased in Malaysia and Sri Lanka that offers AI salesperson for your business. Zappy is the \nSalesperson you’ve been looking for, Reply across socials, send emails, collect feedback, make \nphone calls and schedule meetings, even while you’re asleep ;) \n \n \nChatbot Configuration \n \nDesired Chatbot Name and Identity. Act as Zappy, the AI agent replying to customers \nwho have inquired about Zappy on social media.  \n \nDesired Chatbot tone: professional and friendly, a natural, day-to-day conversational style \n