In [17]:
import os
import logging
import click
import torch
import utils
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler  # for streaming response
from langchain.callbacks.manager import CallbackManager
from langchain.llms import ctransformers
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS, Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain import PromptTemplate, LLMChain
from langchain.chains import RetrievalQA
import uuid
# from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever

In [4]:
import constants
from constants import CHUNK_SIZE,CHUNK_OVERLAP, PARENT_CHUNK_SIZE, CHILD_CHUNK_SIZE,PERSIST_DIRECTORY,EMBEDDING_MODEL_NAME,CHROMA_SETTINGS

In [7]:
# Shared callback manager with a single stdout-streaming handler; it is handed
# to the RetrievalQA chains built in retrieval_qa_pipeline.
callback_manager = CallbackManager(handlers=[StreamingStdOutCallbackHandler()])

from prompt_template_utils import get_prompt_template

# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from transformers import (
    GenerationConfig,
    pipeline,
)

from load_models import (
    load_quantized_model_awq,
    load_quantized_model_gguf_ggml,
    load_quantized_model_qptq,
    load_full_model,
)

from constants import (
    EMBEDDING_MODEL_NAME,
    PERSIST_DIRECTORY,
    MODEL_ID,
    MODEL_BASENAME,
    MAX_NEW_TOKENS,
    MODELS_PATH,
    CHROMA_SETTINGS
)

In [19]:
# Show which model id / basename were imported from the constants module.
print(MODEL_ID, MODEL_BASENAME)

TheBloke/Llama-2-7b-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf


In [8]:
# !pip install auto-gptq==0.2.2

In [9]:
def load_model(model_id, model_basename=None, LOGGING=logging, device_type='cpu'):
    """
    Load a text-generation model and wrap it for use with LangChain.

    The concrete loader is selected from ``model_basename``:
    ``.gguf`` / ``.ggml`` -> ``load_quantized_model_gguf_ggml``,
    ``.awq`` -> ``load_quantized_model_awq``,
    any other basename -> ``load_quantized_model_qptq``,
    no basename -> ``load_full_model``.

    Args:
        model_id (str): Identifier of the model to load from HuggingFace's model hub.
        model_basename (str, optional): Basename of the quantized model file.
            Defaults to None, which selects the full (non-quantized) loader.
        LOGGING: Logger-like object exposing ``info``. It is used for this
            function's own log messages and forwarded to the loader helpers.
            Defaults to the ``logging`` module.
        device_type (str): Device to load the model on, e.g. "cuda" or "cpu".
            Defaults to 'cpu'.

    Returns:
        For ``.gguf`` basenames: the object returned by
        ``load_quantized_model_gguf_ggml``, returned as-is (no HuggingFace
        pipeline is built and no smoke-test query is run).
        Otherwise: a ``HuggingFacePipeline`` wrapping a transformers
        "text-generation" pipeline.
    """
    # Log through the injected logger so callers that pass their own LOGGING
    # object actually receive these messages.
    LOGGING.info(f"Loading Model: {model_id}, on: {device_type}")
    print(f"Loading Model: {model_id}, on: {device_type}")

    if model_basename is not None:
        basename = model_basename.lower()
        if ".gguf" in basename:
            # GGUF models are returned directly, bypassing the pipeline below.
            return load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
        elif ".ggml" in basename:
            # NOTE(review): the .gguf branch treats this helper's return value as a
            # single LLM object, while this branch unpacks it into two values --
            # confirm the helper's return shape for GGML files.
            model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
        elif ".awq" in basename:
            model, tokenizer = load_quantized_model_awq(model_id, LOGGING)
        else:
            model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
    else:
        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)

    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # NOTE(review): `max_length` is fed from a constant named MAX_NEW_TOKENS;
        # confirm whether `max_new_tokens` was intended.
        max_length=MAX_NEW_TOKENS,
        temperature=0.2,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)
    LOGGING.info("Local LLM Loaded")
    # Smoke test: run one query through the freshly built pipeline and print the reply.
    print("Local LLM Loaded - Querying How are you?")
    print(local_llm("How are you?"))

    return local_llm

In [20]:
def retrieval_qa_pipeline(use_history, promptTemplate_type="llama",device_type='cpu'):
    """
    Build a retrieval-based Question Answering (QA) chain.

    Steps performed:
    1. Create the embedding function (``HuggingFaceEmbeddings`` with
       ``EMBEDDING_MODEL_NAME``).
    2. Open the persisted Chroma vector store at ``PERSIST_DIRECTORY`` and
       derive a retriever from it.
    3. Obtain the prompt and memory from ``get_prompt_template``.
    4. Load the LLM via ``load_model`` using ``MODEL_ID`` / ``MODEL_BASENAME``.
    5. Assemble a "stuff"-type ``RetrievalQA`` chain that returns its source
       documents.

    Parameters:
    - use_history (bool): When truthy, the memory object returned by
      ``get_prompt_template`` is passed to the chain together with the prompt;
      otherwise only the prompt is passed.
    - promptTemplate_type (str): Forwarded to ``get_prompt_template``.
      Defaults to "llama".
    - device_type (str): Forwarded to ``load_model``. Defaults to 'cpu'.

    Returns:
    - RetrievalQA: An initialized retrieval-based QA system.
    """

    # Only one embedding model is instantiated: the one handed to Chroma.
    # To use instruction-based embeddings instead (they must match what was used
    # at ingestion time), swap this line for:
    #   HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME,
    #                                 model_kwargs={"device": device_type})
    # NOTE(review): device_type is not applied to this embedding model -- confirm
    # whether that is intended.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # load the vectorstore
    db = Chroma(
        persist_directory=PERSIST_DIRECTORY,
        embedding_function=embeddings,
        client_settings=CHROMA_SETTINGS
    )
    retriever = db.as_retriever()

    # get the prompt template and memory if set by the user.
    prompt, memory = get_prompt_template(promptTemplate_type=promptTemplate_type, history=use_history)

    # load the llm pipeline
    llm = load_model(model_id=MODEL_ID, model_basename=MODEL_BASENAME, LOGGING=logging,device_type=device_type)

    # The two variants of the chain differ only in whether memory is supplied.
    chain_type_kwargs = {"prompt": prompt}
    if use_history:
        chain_type_kwargs["memory"] = memory

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # we can try other chains types as well. refine, map_reduce, map_rerank
        retriever=retriever,
        return_source_documents=True,  # verbose=True,
        callbacks=callback_manager,
        chain_type_kwargs=chain_type_kwargs,
    )

    return qa

In [13]:
def main(show_sources, use_history, model_type, save_qa,device_type='cpu'):
    """
    Run the interactive question-answering loop for localGPT.

    Builds the QA chain with ``retrieval_qa_pipeline`` and then repeatedly reads
    a query from standard input, prints the answer, optionally prints the source
    documents, and optionally logs the question/answer pair.

    Parameters:
    - show_sources (bool): When truthy, print the source documents returned
      with each answer.
    - use_history (bool): Forwarded to ``retrieval_qa_pipeline`` to enable chat history.
    - model_type (str): Forwarded to ``retrieval_qa_pipeline`` as
      ``promptTemplate_type``.
    - save_qa (bool): When truthy, each query/answer pair is passed to
      ``utils.log_to_csv``.
    - device_type (str): Device the model will run on, e.g. 'cpu', 'mps', 'cuda'.
      Defaults to 'cpu'.

    Notes:
    - The device type, the show_sources flag and the use_history flag are logged.
    - The models directory (``MODELS_PATH``) is created if it does not exist.
    - The user can exit the interactive loop by entering "exit".
    """

    logging.info(f"Running on: {device_type}")
    logging.info(f"Display Source Documents set to: {show_sources}")
    logging.info(f"Use history set to: {use_history}")

    # Ensure the models directory exists. makedirs(exist_ok=True) also creates
    # missing parent directories and does not fail if the directory appears
    # between a check and the creation.
    os.makedirs(MODELS_PATH, exist_ok=True)

    qa = retrieval_qa_pipeline(use_history, promptTemplate_type=model_type,device_type=device_type)
    # Interactive questions and answers
    while True:
        query = input("\nEnter a query: ")
        if query == "exit":
            break
        # Get the answer from the chain
        res = qa(query)
        answer, docs = res["result"], res["source_documents"]

        # Print the result
        print("\n\n> Question:")
        print(query)
        print("\n> Answer:")
        print(answer)

        if show_sources:  # flag controlling whether the source documents are printed
            # Print the relevant sources used for the answer
            print("----------------------------------SOURCE DOCUMENTS---------------------------")
            for document in docs:
                # Fall back to a placeholder instead of raising KeyError when a
                # document carries no "source" metadata entry.
                source = document.metadata.get("source", "<unknown source>")
                print("\n> " + str(source) + ":")
                print(document.page_content)
            print("----------------------------------SOURCE DOCUMENTS---------------------------")

        # Log the Q&A to CSV only if save_qa is True
        if save_qa:
            utils.log_to_csv(query, answer)

In [21]:
# Session settings for the interactive run; enter "exit" at the prompt to stop.
show_sources = True
use_history = True
device_type = 'cpu'
save_qa = True
promptTemplate_type = "llama"

main(
    show_sources,
    use_history,
    model_type=promptTemplate_type,
    save_qa=save_qa,
    device_type=device_type,
)

load INSTRUCTOR_Transformer
max_seq_length  512
Loading Model: TheBloke/Llama-2-7b-Chat-GGUF, on: cpu


llama-2-7b-chat.Q4_K_M.gguf:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 



Enter a query: What is Carbon?


> Question:
What is Carbon?

> Answer:
 Carbon is the element that forms the fundamental components of many biological molecules found uniquely in living things. It has four electrons in its outer shell, which allows it to form four covalent bonds with other atoms or molecules. The simplest organic carbon molecule is methane (CH4), in which four hydrogen atoms bind to a carbon atom. Carbon can also form more complex structures using the bonding properties of its atoms.
----------------------------------SOURCE DOCUMENTS---------------------------

> ../SOURCE_DATA_DB/ConceptsofBiology-WEB_Chapter 1-2.pdf:
Carbon
It is often said that life is “carbon-based.” This means that carbon atoms, bonded to other carbon atoms or other
elements, form the fundamental components of many, if not most, of the molecules found uniquely in living things.
Other elements play important roles in biological molecules, but carbon certainly qualifies as the “foundation”
element

Llama.generate: prefix-match hit




> Question:
"what causes diversity of life on Earth?"

> Answer:
 "The source of the diversity of life on Earth is evolution, the process of gradual change during which new species arise from older species. Evolutionary biologists study the evolution of living things in everything from the microscopic world to ecosystems."
----------------------------------SOURCE DOCUMENTS---------------------------

> ../SOURCE_DATA_DB/ConceptsofBiology-WEB_Chapter 1-2.pdf:
INTRODUCTION
CHAPTER 1
Introduction to Biology
1.1 Themes and Concepts of Biology
1.2 The Process of Science
Viewed from space, Earth (Figure 1.1) offers few clues about the diversity of life
forms that reside there. The first forms of life on Earth are thought to have been microorganisms
that existed for billions of years before plants and animals appeared. The mammals, birds, and
flowers so familiar to us are all relatively recent, originating 130 to 200 million years ago. Humans
have inhabited this planet for only the last 2.5

Llama.generate: prefix-match hit




> Question:
The monomers that make up proteins are called?

> Answer:
 The monomers that make up proteins are called amino acids.
----------------------------------SOURCE DOCUMENTS---------------------------

> ../SOURCE_DATA_DB/ConceptsofBiology-WEB_Chapter 1-2.pdf:
metabolism, and reproduction. For example, insulin is a protein hormone that maintains blood glucose levels.
Proteins have different shapes and molecular weights; some proteins are globular in shape whereas others are
fibrous in nature. For example, hemoglobin is a globular protein, but collagen, found in our skin, is a fibrous protein.
Protein shape is critical to its function. Changes in temperature, pH, and exposure to chemicals may lead to
permanent changes in the shape of the protein, leading to a loss of function or denaturation (to be discussed in
more detail later). All proteins are made up of different arrangements of the same 20 kinds of amino acids.
Amino acids are the monomers that make up proteins. Each amin

In [None]:
# Same session settings as before, re-declared so this cell can run on its own.
show_sources = True
use_history = True
device_type = 'cpu'
save_qa = True
promptTemplate_type = "llama"

main(
    show_sources,
    use_history,
    model_type=promptTemplate_type,
    save_qa=save_qa,
    device_type=device_type,
)

load INSTRUCTOR_Transformer
max_seq_length  512
Loading Model: TheBloke/Llama-2-7b-Chat-GGUF, on: cpu


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 



Enter a query: Why are hydrogen bonds and van der Waals interactions necessary for cells?


> Question:
Why are hydrogen bonds and van der Waals interactions necessary for cells?

> Answer:
  Hydrogen bonds and van der Waals interactions are essential for the structure and function of cells for several reasons:
1. Stabilization of biomolecules: Hydrogen bonds and van der Waals interactions play a crucial role in stabilizing the three-dimensional structures of biomolecules such as DNA, RNA, proteins, and lipids. These interactions help maintain their shape and stability, which is essential for their proper functioning.
2. Protein folding: Hydrogen bonds and van der Waals interactions are responsible for the folding of proteins into their specific three-dimensional structures. This is crucial for protein function, as their shape and chemical properties determine their activity.
3. Cell signaling: Hydrogen bonds and van der Waals interactions play a role in cell signaling pathways, allow

In [None]:
if __name__ == "__main__":
    # Entry point: configure the session and start the interactive Q&A loop.
    show_sources = True
    use_history = True
    device_type = 'cpu'
    save_qa = True
    promptTemplate_type = "llama"

    main(
        show_sources,
        use_history,
        model_type=promptTemplate_type,
        save_qa=save_qa,
        device_type=device_type,
    )