In [4]:
import torch
from transformers import pipeline

# Text-generation pipeline around the TinyLlama 1.1B chat checkpoint.
# Weights are requested in bfloat16 and device placement is left to
# `device_map="auto"`.
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)



In [37]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings

## Load the Deltek pages and split them into overlapping chunks

# Restrict HTML parsing to elements carrying the "main-content" class.
bs4_strainer = bs4.SoupStrainer(class_="main-content")

# NOTE(review): the first URL omits the "www." prefix that the other five use —
# confirm this is intentional.
deltek_pages = (
    "https://deltek.com/en",
    "https://www.deltek.com/en/about/contact-us",
    "https://www.deltek.com/en/small-business",
    "https://www.deltek.com/en/customers",
    "https://www.deltek.com/en/support",
    "https://www.deltek.com/en/partners",
)
loader = WebBaseLoader(web_paths=deltek_pages, bs_kwargs={"parse_only": bs4_strainer})
docs = loader.load()

# Chunking parameters: chunk_size=1000 with chunk_overlap=200 between
# neighbouring chunks; add_start_index=True is forwarded to the splitter as-is.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)











In [38]:
# Sanity check: number of chunks produced by the text splitter.
len(all_splits)

46

In [39]:
## Open-source embedding model
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

# The query instruction is prepended verbatim to the query text before it is
# encoded, so it has to end with a separator. The previous custom instruction
# had no trailing space (it fused with the query as "...sentences:Why is ...")
# and was not the instruction published for the English BGE v1.5 models.
# Use the retrieval instruction from the model card instead.
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="Represent this sentence for searching relevant passages: ",
)
query = "Why is Deltek trusted?"

# Embed every chunk and index the vectors in an in-memory FAISS store.
db = FAISS.from_documents(all_splits, embeddings)

## Get the 5 chunks most similar to the query and join them into one context string
relevant_docs = db.similarity_search(query, k=5)
formatted_docs = '\n'.join(doc.page_content for doc in relevant_docs)



In [40]:
# Length, in characters, of the joined retrieval context string.
len(formatted_docs)

3335

In [41]:
# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
# NOTE(review): the retrieved context is interpolated in the middle of the
# system sentence; consider moving it into a clearly delimited section.
messages = [
    {
        "role": "system",
        "content": f"You are a friendly chatbot who responds about queries related to Deltek from {formatted_docs}. You will not make up answers."
    },
    {"role": "user", "content": query},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Generation below samples (do_sample=True), so seed torch's RNG first;
# otherwise a fresh "Restart & Run All" cannot reproduce the recorded answer.
# (Reproducibility is expected on the same hardware/library versions.)
torch.manual_seed(42)

outputs = pipe(
    prompt,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)

# The pipeline returns prompt + completion; skip the prompt so that only the
# model's answer is printed.
start_out = len(prompt)
print(outputs[0]["generated_text"][start_out:])

Deltek is trusted by many organizations due to its reputation for delivering high-quality, industry-leading ERP solutions and software. The company has earned this trust by being recognized as a leader in its industry and providing high-value solutions to businesses worldwide. Deltek is a recognized vendor in Gartner's Market Guide for Government and Public Sector ERP Systems, and its products have been recognized by industry analysts, such as G2 and Forrester, for their innovation and effectiveness in meeting customer needs. Additionally, Deltek is a G2 leader for 15 consecutive quarters, which indicates a high level of customer satisfaction and trust in the company's products. Overall, Deltek's reputation for delivering high-quality, innovative ERP solutions and software is a key factor in its success in the market.
