In [1]:
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Download Required Files: Download the following files (usually found in the "Files and versions" section):

# config.json
# pytorch_model.bin (or multiple weight shards, like pytorch_model-00001-of-00002.bin, etc.)
# tokenizer.json
# special_tokens_map.json
# tokenizer_config.json
# configuration_falcon.py
# modeling_falcon.py
# pytorch_model.bin.index.json
# generation_config.json (optional)


In [2]:
# Local path to the downloaded model files
lllm_model_path = "./models/falcon-7b-instruct"
# offload_folder_path = "./offload"
# Load tokenizer and model from the local path
tokenizer = AutoTokenizer.from_pretrained(lllm_model_path)
llm_model = AutoModelForCausalLM.from_pretrained(lllm_model_path, device_map="cpu") # , offload_folder=offload_folder_path

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Initialize Sentence-Transformers model
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # A lightweight model for embeddings

# Save model to use it localy later
# embedding_model_path = "./models/all-MiniLM-L6-v2"
# embedding_model.save(embedding_model_path)

In [4]:
# Load model from path
embedding_model_path = "./models/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_path)

In [5]:
# Initialize ChromaDB client
client = chromadb.PersistentClient(path="./chromadb")

In [6]:
# Delete the collection
client.delete_collection("hello-world")
# Create or load a collection
collection = client.get_or_create_collection("hello-world")

In [7]:
# Documents, Embeddings and metadata
texts = ["ChromaDB is open-source.", "ChromaDB is a vector database.", "Sentence-Transformers generate embeddings."]
embeddings_texts = embedding_model.encode(texts)
ids = ["doc1","doc2","doc3"]

In [8]:
# Add documents to ChromaDB
collection.add(
    documents = texts,
    embeddings = embeddings_texts,
    ids = ids
)

In [9]:
query_text = "What is ChromaDB?"
query_embedding = embedding_model.encode(query_text)

In [10]:
results = collection.query(
    query_embeddings = query_embedding,
    n_results = 2
)

In [11]:
results

{'ids': [['doc2', 'doc1']],
 'embeddings': None,
 'documents': [['ChromaDB is a vector database.', 'ChromaDB is open-source.']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[0.525861771396575, 0.6587712137600167]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [12]:
def template_summary(customer_query,retrieved_document_1,retrieved_document_2):
    return f'''System Query: You are a Support Team Member. Help the customers.
    Customer Query: {customer_query}
    Retrieved Documents for context-awareness:
    {retrieved_document_1}
    {retrieved_document_2}
    Summarize the retrieved documents and answer:
    '''

In [13]:
input_text = template_summary(query_text,results['documents'][0][0],results['documents'][0][1])
input_text

'System Query: You are a Support Team Member. Help the customers.\n    Customer Query: What is ChromaDB?\n    Retrieved Documents for context-awareness:\n    ChromaDB is a vector database.\n    ChromaDB is open-source.\n    Summarize the retrieved documents and answer:\n    '

In [14]:
inputs = tokenizer(input_text, return_tensors="pt") #.to("cuda")  # Move to GPU if available

In [15]:
inputs

{'input_ids': tensor([[10721, 29213,    37,   781,   362,   241,  6624,  5888, 11103,    25,
          8032,   248,  2536,    25,   742,  9455, 29213,    37,  1634,   304,
         26433,    76,  9814,    42,   742, 18934, 30499,   312,  4436,    24,
         49606,    37,   742, 26433,    76,  9814,   304,   241, 12586,  6729,
            25,   742, 26433,    76,  9814,   304,  1314,    24,  8679,    25,
           742, 12753,   270,   907,   248, 34797,  5759,   273,  3173,    37,
           561]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
output_ids = llm_model.generate(inputs["input_ids"], max_length=100, temperature=0.2)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [17]:
output_ids

tensor([[10721, 29213,    37,   781,   362,   241,  6624,  5888, 11103,    25,
          8032,   248,  2536,    25,   742,  9455, 29213,    37,  1634,   304,
         26433,    76,  9814,    42,   742, 18934, 30499,   312,  4436,    24,
         49606,    37,   742, 26433,    76,  9814,   304,   241, 12586,  6729,
            25,   742, 26433,    76,  9814,   304,  1314,    24,  8679,    25,
           742, 12753,   270,   907,   248, 34797,  5759,   273,  3173,    37,
           561,    39,    91,    41,  1620,   397,    76,  9814,   304,   241,
         12586,  6729,   325,   304,  1314,    24,  8679,   273,   418,   314,
          1042,   312,  1211,  3549,   273, 29351,    25,   605,   304,   241,
          4452,  2119,   325,   418,   314,  1042,   271, 39147,   273, 13203]])

In [18]:
raw_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [19]:
if "Summarize the retrieved documents and answer:" in raw_summary:
    summary = raw_summary.split("Summarize the retrieved documents and answer:")[-1].strip()
else:
    summary = raw_summary.strip()

print(summary)


<p>ChromaDB is a vector database that is open-source and can be used for data analysis and visualization. It is a powerful tool that can be used to summarize and analyze
