In [3]:
import torch
import chromadb
from langchain import PromptTemplate
from transformers import AutoTokenizer, pipeline,AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFacePipeline

# Local path to the downloaded model files
lllm_model_path = "./models/falcon-7b-instruct"
# offload_folder_path = "./offload"
# Load tokenizer and model from the local path
tokenizer = AutoTokenizer.from_pretrained(lllm_model_path)
llm_model = AutoModelForCausalLM.from_pretrained(lllm_model_path, device_map="cpu") # , offload_folder=offload_folder_path
llm_model.config.pad_token_id = llm_model.config.eos_token_id

pipeline = pipeline(
    "text-generation", #task
    model=llm_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cpu",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})


template = """
You are an intelligent chatbot. Help the following question with brilliant answers.
Question: {question}
Answer:"""
prompt = PromptTemplate(template=template, input_variables=["question"])

llm_chain = prompt | llm

question = "Elon musk VS Donald trump for president election" 


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [4]:
result = llm_chain.invoke({"question" : question })


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



You are an intelligent chatbot. Help the following question with brilliant answers.
Question: Elon musk VS Donald trump for president election
Answer:
Elon Musk is an intelligent and highly successful entrepreneur. He is a pioneer in several fields, including electric cars and space exploration. On the other hand, Donald Trump has been a successful businessman but is not a professional inventor. In terms of presidential qualifications, both have been successful in their respective fields, which means they both have unique skillsets. Ultimately, it is up to the voters to decide which skillset is more important for the presidency.


In [5]:
if "Answer:" in result:
    clean_result = result.split("Answer:")[-1].strip()
else:
    clean_result = result.strip()

print(clean_result)


Elon Musk is an intelligent and highly successful entrepreneur. He is a pioneer in several fields, including electric cars and space exploration. On the other hand, Donald Trump has been a successful businessman but is not a professional inventor. In terms of presidential qualifications, both have been successful in their respective fields, which means they both have unique skillsets. Ultimately, it is up to the voters to decide which skillset is more important for the presidency.


In [None]:
# Download Required Files: Download the following files (usually found in the "Files and versions" section):

# config.json
# pytorch_model.bin (or multiple weight shards, like pytorch_model-00001-of-00002.bin, etc.)
# tokenizer.json
# special_tokens_map.json
# tokenizer_config.json
# configuration_falcon.py
# modeling_falcon.py
# pytorch_model.bin.index.json
# generation_config.json (optional)


In [None]:
# Initialize Sentence-Transformers model
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # A lightweight model for embeddings

# Save model to use it localy later
# embedding_model_path = "./models/all-MiniLM-L6-v2"
# embedding_model.save(embedding_model_path)

In [None]:
# Load model from path
embedding_model_path = "./models/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_path)

In [None]:
# Initialize ChromaDB client
client = chromadb.PersistentClient(path="./chromadb")

In [None]:
# Delete the collection
client.delete_collection("hello-world")
# Create or load a collection
collection = client.get_or_create_collection("hello-world")

In [None]:
# Documents, Embeddings and metadata
texts = ["ChromaDB is open-source.", "ChromaDB is a vector database.", "Sentence-Transformers generate embeddings."]
embeddings_texts = embedding_model.encode(texts)
ids = ["doc1","doc2","doc3"]

In [None]:
# Add documents to ChromaDB
collection.add(
    documents = texts,
    embeddings = embeddings_texts,
    ids = ids
)

In [None]:
query_text = "What is ChromaDB?"
query_embedding = embedding_model.encode(query_text)

In [None]:
results = collection.query(
    query_embeddings = query_embedding,
    n_results = 2
)

In [None]:
results

In [None]:
def template_summary(customer_query,retrieved_document_1,retrieved_document_2):
    return f'''System Query: You are a Support Team Member. Help the customers.
    Customer Query: {customer_query}
    Retrieved Documents for context-awareness:
    {retrieved_document_1}
    {retrieved_document_2}
    Summarize the retrieved documents and answer:
    '''

In [None]:
input_text = template_summary(query_text,results['documents'][0][0],results['documents'][0][1])
input_text

In [None]:
inputs = tokenizer(input_text, return_tensors="pt") #.to("cuda")  # Move to GPU if available

In [None]:
inputs

In [None]:
output_ids = llm_model.generate(inputs["input_ids"], max_length=100, temperature=0.2)


In [None]:
output_ids

In [None]:
raw_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
if "Summarize the retrieved documents and answer:" in raw_summary:
    summary = raw_summary.split("Summarize the retrieved documents and answer:")[-1].strip()
else:
    summary = raw_summary.strip()

print(summary)
