In [1]:
!cd ..

D:\Learning\git


In [2]:
import logging
import os

import torch
from huggingface_hub import hf_hub_download
from langchain.callbacks import wandb_tracing_enabled
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import LlamaCpp
from langchain.vectorstores import Milvus

from constants import (EMBEDDING_MODEL_NAME, MODEL_ID, MODEL_BASENAME)

In [3]:
# Configure the root logger once for the whole notebook:
# INFO level, each record shown as "<time> - <level> - <file>:<line> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
)



In [4]:
def load_cpu_model(model_id, model_basename, n_ctx=2048, max_tokens=2048, temperature=0, repeat_penalty=1.15):
    """Fetch a model file from the Hugging Face Hub and wrap it in a LangChain ``LlamaCpp`` LLM.

    Args:
        model_id: Hub repository id, forwarded to ``hf_hub_download`` as ``repo_id``.
        model_basename: Name of the model file inside that repository.
        n_ctx: Value forwarded to ``LlamaCpp`` as ``n_ctx``.
        max_tokens: Value forwarded to ``LlamaCpp`` as ``max_tokens``.
        temperature: Value forwarded to ``LlamaCpp`` as ``temperature``.
        repeat_penalty: Value forwarded to ``LlamaCpp`` as ``repeat_penalty``.

    Returns:
        The constructed ``LlamaCpp`` instance, wired to a callback manager holding a
        ``StreamingStdOutCallbackHandler``.

    NOTE(review): with the defaults ``max_tokens`` equals ``n_ctx``; presumably that leaves no
    room for the prompt inside the context window - confirm this is intended.
    """
    model_path = hf_hub_download(repo_id=model_id, filename=model_basename)

    # Callbacks support token-wise streaming
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

    # Verbose is required to pass to the callback manager
    model = LlamaCpp(model_path=model_path, n_ctx=n_ctx, max_tokens=max_tokens, temperature=temperature,
                     repeat_penalty=repeat_penalty, callback_manager=callback_manager, verbose=True)

    return model

In [5]:
# Load the embedding model on the GPU when CUDA is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a CUDA device.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME,
                                           model_kwargs={"device": embedding_device})
logging.info('embedding model loaded.')

# Load text generator model
llm = load_cpu_model(MODEL_ID, MODEL_BASENAME)
logging.info('text generator model loaded.')


2023-07-10 14:18:31,038 - INFO - SentenceTransformer.py:66 - Load pretrained SentenceTransformer: hkunlp/instructor-large


load INSTRUCTOR_Transformer


2023-07-10 14:18:35,076 - INFO - 2111476880.py:3 - embedding model loaded.


max_seq_length  512


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
2023-07-10 14:18:36,339 - INFO - 2111476880.py:7 - text generator model loaded.


In [6]:
def get_llm_generation_langchain(question, return_sources=False):
    """Answer ``question`` with a "stuff" ``RetrievalQA`` chain backed by the 'PAN' Milvus collection.

    Relies on the notebook-level ``embeddings`` and ``llm`` objects. The Milvus store and the
    chain are rebuilt on every call.

    Args:
        question: Query passed to the chain.
        return_sources: When True, also return the documents the chain reports under
            ``"source_documents"``. Defaults to False, which returns the answer only.

    Returns:
        The chain's ``"result"`` value, or a ``(result, source_documents)`` tuple when
        ``return_sources`` is True.
    """
    # NOTE(review): "nprobe" is usually a search-time setting; confirm whether this dict was
    # meant to be passed as ``search_params`` rather than ``index_params``.
    vector_store = Milvus(embedding_function=embeddings,
                          connection_args={"host": "localhost", "port": "19530"},
                          collection_name='PAN',
                          index_params={"metric_type": "IP", "params": {"nprobe": 10}})

    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever(),
                                     return_source_documents=True)

    # Get the answer from the chain
    res = qa(question)

    if return_sources:
        return res["result"], res["source_documents"]
    return res["result"]


In [7]:
# W&B env setup: only the project name is configured through the environment.
os.environ["WANDB_PROJECT"] = "langchain-testing"

# Tracing is enabled with a context manager instead of the global switch, so make sure
# LANGCHAIN_WANDB_TRACING is not set (e.g. left over from an earlier run in this kernel).
os.environ.pop("LANGCHAIN_WANDB_TRACING", None)


In [8]:
# enable tracing using a context manager
with wandb_tracing_enabled():
    while True:
        # Strip the input so stray whitespace cannot hide the exit command.
        query = input("\nEnter a query: ").strip()
        # Compare case-insensitively so "Exit" / "EXIT" also leave the loop.
        if query.lower() == "exit":
            break
        if not query:
            # Nothing was asked; prompt again instead of running the chain on an empty string.
            continue
        # Get the answer from the chain
        answer = get_llm_generation_langchain(query)

[34m[1mwandb[0m: Streaming LangChain activity to W&B at https://wandb.ai/aditya/langchain-testing/runs/cavseyhs
[34m[1mwandb[0m: `WandbTracer` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `langchain`.


 What is the purpose of a PAN card?

Llama.generate: prefix-match hit


 Hello! How can I assist you today?