In [44]:
import torch
from transformers import pipeline
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings


# Small open-source chat LLM, wrapped in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Open-source embedding model and the options handed to it.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)  # set True to compute cosine similarity

# NOTE(review): the custom query instruction below has no trailing space;
# confirm how the embedding class joins it with the query text.
embeddings = HuggingFaceBgeEmbeddings(
    query_instruction='Generate a representation for this sentence that can be used to retrieve related sentences:',
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)



In [45]:
def load_docs(web_paths=None, chunk_size=1000, chunk_overlap=200):
    """Download web pages and split them into overlapping text chunks.

    Args:
        web_paths: URL or iterable of URLs to load. When omitted, the six
            Deltek pages this notebook was written against are used.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Returns:
        The split documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    if web_paths is None:
        web_paths = (
            "https://deltek.com/en",
            "https://www.deltek.com/en/about/contact-us",
            "https://www.deltek.com/en/small-business",
            "https://www.deltek.com/en/customers",
            "https://www.deltek.com/en/support",
            "https://www.deltek.com/en/partners",
        )
    elif isinstance(web_paths, str):
        # A lone URL would otherwise be iterated character by character.
        web_paths = (web_paths,)

    # Restrict HTML parsing to elements carrying the "main-content" class.
    bs4_strainer = bs4.SoupStrainer(class_=("main-content"))
    loader = WebBaseLoader(
        web_paths=tuple(web_paths),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
    )
    return text_splitter.split_documents(docs)











In [46]:

def getDocs(query, all_splits, embeddings, k=5):
    """Return the text of the chunks most similar to ``query`` as one string.

    A FAISS index is built from ``all_splits`` on every call, then queried
    for the ``k`` nearest chunks.

    Args:
        query: The user question to search for.
        all_splits: Documents (objects exposing ``page_content``) to index.
        embeddings: Embedding model passed to ``FAISS.from_documents``.
        k: Number of chunks to retrieve (defaults to 5).

    Returns:
        The ``page_content`` of the retrieved chunks joined by newlines, or
        an empty string when ``all_splits`` is empty.
    """
    if not all_splits:
        # Nothing to index, so there is no context to offer.
        return ""

    db = FAISS.from_documents(all_splits, embeddings)

    # Get the relevant docs from the vector store.
    relevant_docs = db.similarity_search(query, k=k)

    # The original built this string but never returned it, so callers
    # always received None as their context.
    return '\n'.join(doc.page_content for doc in relevant_docs)


In [47]:
# The prompt is rendered with the tokenizer's chat template - see https://huggingface.co/docs/transformers/main/en/chat_templating
def getoutput(query, context):
    """Generate the chat model's reply to ``query`` given ``context``.

    ``context`` is interpolated into the system message, ``query`` becomes
    the user message, and the module-level ``pipe`` produces the completion.

    Returns:
        The generated text with the leading ``len(prompt)`` characters
        (the rendered prompt) sliced off.
    """
    system_turn = {
        "role": "system",
        "content": f"You are a friendly chatbot who responds about queries related to Deltek from {context}. You will not make up answers.",
    }
    user_turn = {"role": "user", "content": query}

    prompt = pipe.tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    sampling = {
        "max_new_tokens": 256,
        "do_sample": True,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 0.95,
    }
    generated = pipe(prompt, **sampling)[0]["generated_text"]

    # Keep only what follows the prompt.
    return generated[len(prompt):]

In [49]:
# Sample questions to put to the chatbot.
query1, query2, query3 = (
    "Why is deltek trusted?",
    "What does Deltek do?",
    "How can I contact Deltek?",
)
queries = [query1, query2, query3]

# Load and chunk the source pages once; the same splits serve every query.
splits = load_docs()

# For each question: retrieve context, generate an answer, print both.
for query in queries:
    rel_docs = getDocs(query, splits, embeddings)
    output = getoutput(query, rel_docs)
    print(f"\n{query}: \n{output}")


Why is deltek trusted?: 
Deltek is a trusted brand in the industry for its expertise in software solutions for the accounting and finance needs of businesses. The company has been providing services for more than three decades and has a strong reputation for providing high-quality solutions, advanced technologies, and expertise. Deltek's commitment to innovation, customer satisfaction, and continuous improvement is a key factor in its success. Additionally, Deltek's focus on data-driven insights and analytics has contributed to its success in the industry.

The Deltek brand is trusted because it has earned the confidence of its customers by providing them with high-quality software solutions, reliable service, and excellent support. Deltek's commitment to customer satisfaction, data-driven insights, and innovation has helped it establish itself as a leader in the industry.

What does Deltek do?: 
Deltek is a comprehensive accounting software and business management solution that helps