In [2]:
# !pip install langchain
# !pip install PyPDF2
# !pip install sentence_transformers
# !pip install ctransformers
# !pip install faiss-gpu

In [1]:
import os
import glob
from PyPDF2 import PdfReader
from langchain.chains import RetrievalQA
from langchain_community.llms import CTransformers
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
# Shared embedding model instance: create_embeddings() uses it to build the
# FAISS index and answer() passes it to FAISS.load_local() when reading it back.
embeddings = HuggingFaceBgeEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# !huggingface-cli download MaziyarPanahi/Calme-7B-Instruct-v0.2-GGUF Calme-7B-Instruct-v0.2.Q5_K_M.gguf --local-dir . --local-dir-use-symlinks False

In [3]:


def create_embeddings(pdf_file_or_folder_path,persist_directory):
    """Embed the text of one or more PDFs and save the FAISS index to disk.

    Args:
        pdf_file_or_folder_path (str): Path to a single ``.pdf`` file, or to a
            folder whose ``*.pdf`` files (non-recursive) are all indexed.
        persist_directory (str): Vectorstore directory.

    Raises:
        FileNotFoundError: If a folder is given and it contains no ``*.pdf`` file.
        ValueError: If no text chunk could be produced from the PDFs.
    """
    if pdf_file_or_folder_path.lower().endswith('.pdf'):
        # Use the argument itself; the previous code read the notebook-level
        # `pdf_path` global, which only worked by accident of kernel state.
        pdf_list = [pdf_file_or_folder_path]
    else:
        # os.path.join keeps a leading '/' (absolute folders stay absolute) and
        # tolerates a trailing '/'. Sorting makes the chunk order deterministic.
        pdf_list = sorted(glob.glob(os.path.join(pdf_file_or_folder_path, '*.pdf')))

    if not pdf_list:
        raise FileNotFoundError(
            "No PDF files found in: {}".format(pdf_file_or_folder_path)
        )

    # The splitter holds no per-document state, so build it once for all PDFs.
    splitter = CharacterTextSplitter(separator='\n', chunk_size=2000, chunk_overlap=500)

    context = []
    metadata = []
    for path in pdf_list:
        reader = PdfReader(path)
        # Guard against pages that yield no text so the join below cannot fail.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        doc_context = splitter.split_text(" ".join(page_texts))
        # One metadata entry per chunk so every chunk can be traced to its file.
        file_name = os.path.basename(path)
        metadata += [{"file_name": file_name} for _ in doc_context]
        context += doc_context

    if not context:
        raise ValueError(
            "No text could be extracted from: {}".format(pdf_file_or_folder_path)
        )

    db = FAISS.from_texts(texts=context, embedding=embeddings, metadatas=metadata)
    # NOTE(review): strip('/') also removes a leading '/', so an absolute
    # persist_directory is written relative to the current working directory.
    # Kept as-is because answer() resolves the directory the same way; change
    # both together if absolute paths must be honoured.
    db.save_local(persist_directory.strip('/'))




def answer(prompt, persist_directory, model_path="./Calme-7B-Instruct-v0.2.Q5_K_M.gguf", k=2):
    """From a question asked by the user, generate the answer based on the vectorstore.

    Args:
        prompt (str): Question asked by the user.
        persist_directory (str): Vectorstore directory.
        model_path (str): Path of the GGUF model file handed to CTransformers.
        k (int): Number of chunks retrieved from the vectorstore as context.

    Returns:
        dict: ``{'result': <text generated by the LLM>, 'sources': <file names>}``.
        ``sources`` is a set of the ``file_name`` metadata of the retrieved
        chunks, or an empty list when the model replied "no answer".
    """
    print("answer function call",prompt)

    prompt_temp = """
    Only use the following information to answer the question. If the answer is not present in the following information then return "no answer".
    {context}
    {question}
    """

    # NOTE(review): strip('/') also removes a leading '/', so an absolute path is
    # resolved relative to the current working directory. Kept because
    # create_embeddings() saves the index using the same transformation.
    # allow_dangerous_deserialization opts in to loading pickled data: only
    # point this at an index directory you created yourself.
    vectorstore = FAISS.load_local(persist_directory.strip('/'), embeddings, allow_dangerous_deserialization = True)
    prompt_template = PromptTemplate(template=prompt_temp, input_variables=["context", "question"])
    # NOTE(review): the stop sequence '(INST' may have been meant as '[INST' — confirm.
    config = {'max_new_tokens': 1000, 'top_p':1, 'temperature':0, 'context_length': 4000, 'gpu_layers':100, 'stop':['(INST']}

    doc_chain = RetrievalQA.from_chain_type(
        llm = CTransformers(model=model_path, config=config),
        chain_type = "stuff",
        retriever = vectorstore.as_retriever(search_kwargs = {"k":k}),
        return_source_documents = True,
        chain_type_kwargs = {"prompt" : prompt_template}
    )

    result = doc_chain(prompt)
    reply = result['result']

    # Compare on letters/digits only so replies such as '"No answer."' or
    # 'no answer\n' are still recognised as the sentinel from the prompt.
    normalized = "".join(ch for ch in reply.lower() if ch.isalnum())

    response = {'result': reply}
    if normalized == "noanswer":
        # The retrieved chunks did not contain the answer, so cite nothing.
        response['sources'] = []
    else:
        response['sources'] = {doc.metadata['file_name'] for doc in result['source_documents']}
    return response


In [4]:
# Input for create_embeddings(): a single PDF here (a folder of PDFs is also accepted).
pdf_path = '/content/drive/MyDrive/chatbot/Operation-and-Maintenance-Manual_SEBU8407-06 (1).pdf'
# Directory where the FAISS index is saved by create_embeddings() and loaded by answer().
persist_directory = '/content/drive/MyDrive/chatbot/embeddings'

In [6]:
# Build the FAISS index from the PDF text and save it under persist_directory.
create_embeddings(pdf_path,persist_directory)

In [25]:
# Ask a question against the saved index; `resp` is a dict with the generated
# 'result' text and the 'sources' (PDF file names) of the retrieved chunks.
question = "Steps to tilt the cab upward?"
resp = answer(question, persist_directory)
resp

answer function call Steps to tilt the cab upward?


KeyboardInterrupt: 

In [22]:
# Show only the generated answer text (without the source file names).
resp['result']

"1. Park the machine on level ground.\n    2. Lower the loader arms fully.\n    3. Turn the engine start switch key to the OFF position.\n    4. Remove the two front bolts for the ROPS. Close the cab door and ensure it's latched.\n    5. Tilt the cab upward, standing on the ground.\n    6. Engage the cab support lever on the right side of the machine in the ENGAGED position. "