In [16]:
import os

from langchain import hub
from langchain.schema import (
    HumanMessage,
    SystemMessage,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from langchain_community.chat_models.huggingface import ChatHuggingFace
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

In [30]:
def format_docs(docs):
    """Merge the text of several documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by one blank
        line (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


def print_huggingface_result(text):
    """Extract the assistant's reply from a raw chat-completion string.

    The text is split on the ``<|assistant|>`` turn marker and the segment
    that follows the first marker (up to the next marker, if any) is
    returned. Despite the name, nothing is printed; the value is returned so
    the notebook can display it.

    Args:
        text: Raw generated text, possibly containing the echoed prompt
            followed by ``<|assistant|>`` and the reply.

    Returns:
        The segment after the first ``<|assistant|>`` marker, or ``text``
        unchanged when the marker is absent (previously this raised
        ``IndexError``).
    """
    marker = "<|assistant|>"
    segments = text.split(marker)
    if len(segments) < 2:
        # No marker present: the text is already the bare reply.
        return text
    return segments[1]

In [3]:
# Load the source PDFs from the local "documents/" directory (relative to the
# notebook's working directory).
pdf_directory = "documents/"
loader = PyPDFDirectoryLoader(pdf_directory)
docs = loader.load()


In [4]:
# Split the loaded documents into overlapping chunks for embedding.
# Sizes are measured with `len`, i.e. in characters: chunks of up to 1000
# characters, with 200 characters of overlap between neighbouring chunks so
# sentences cut at a boundary still appear whole in one of them.
# (RecursiveCharacterTextSplitter is imported in the notebook's import cell.)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

split_documents = text_splitter.split_documents(docs)

# Fail early with a clear message instead of an obscure error while indexing.
if not split_documents:
    raise ValueError(
        "No text chunks were produced; check that the PDF directory "
        "contains readable PDFs."
    )

In [5]:
db = await Qdrant.afrom_documents(split_documents, OpenAIEmbeddings())

In [53]:
# Similarity search over the vector store, returning 20 chunks per query.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 20},
)

# OpenAI chat model; temperature=0 to minimise sampling randomness.
model = ChatOpenAI(temperature=0)
output_parser = StrOutputParser()

# RAG prompt template fetched from the LangChain Hub; the chains in this
# notebook feed it a "context" and a "question" value.
prompt = hub.pull("rlm/rag-prompt")


In [54]:
# RAG pipeline for the OpenAI model:
#   question -> {retrieved context, original question} -> prompt -> model -> str
rag_inputs = {
    # Retrieve chunks for the question and flatten them into one string.
    "context": retriever | format_docs,
    # Forward the incoming question unchanged.
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt | model | StrOutputParser()

In [55]:
# Ask the OpenAI-backed chain a sample question; the cell displays the answer.
question = "What is public health surveillance?"
rag_chain.invoke(question)

'Public health surveillance is the ongoing systematic identification, collection, analysis, and interpretation of disease occurrence and public health event data for the purpose of taking timely and robust action. It is essential for planning, implementation, monitoring, and evaluation of public health practice. The IDSR strategy is used to achieve public health surveillance objectives.'

In [80]:
# Text-generation model served through the Hugging Face Hub.
# SECURITY: the access token is read from the HUGGINGFACEHUB_API_TOKEN
# environment variable instead of being stored in the notebook. The token that
# was previously committed here must be treated as leaked and revoked.
# A missing variable raises KeyError here rather than failing later at
# request time.
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 1024,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

# Same hub prompt as the OpenAI chain, pulled again so this cell does not
# depend on the earlier `prompt` variable.
_prompt = hub.pull("rlm/rag-prompt")

# Wrap the raw LLM in the chat-model interface.
chat_model = ChatHuggingFace(llm=llm)

# Same pipeline shape as `rag_chain`, with the Hugging Face chat model.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | _prompt
    | chat_model
    | StrOutputParser()
)


                    repo_id was transferred to model_kwargs.
                    Please confirm that repo_id is what you intended.
                    task was transferred to model_kwargs.
                    Please confirm that task is what you intended.
                    huggingfacehub_api_token was transferred to model_kwargs.
                    Please confirm that huggingfacehub_api_token is what you intended.


In [81]:
# Ask the Hugging Face-backed chain the same sample question, then keep only
# the text that follows the "<|assistant|>" marker in the raw output.
question = "What is public health surveillance?"
results = chain.invoke(question)
print_huggingface_result(results)

'\nAn alert in the context of disease surveillance is a threshold that suggests further investigation is needed for a suspected case or an unexplained increase in cases of a disease or unusual pattern seen over a period of time in weekly or monthly summary reporting. Depending on the disease or condition, an alert threshold is reached when there is one suspected case for epidemic-prone diseases or diseases targeted for elimination or eradication, or when there is an unexplained increase for any disease or'