In [263]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models.huggingface import ChatHuggingFace
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from qdrant_client import QdrantClient
from langchain.text_splitter import RecursiveCharacterTextSplitter

from dotenv import load_dotenv
import os

In [264]:
# Load environment variables from a .env file; later cells read their
# service settings (Qdrant URL, Hugging Face token) through os.environ.
load_dotenv()


True

In [147]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)


def print_huggingface_result(text):
    """Extract the assistant's reply from a chat-templated completion.

    Despite its name this function prints nothing; it returns the extracted
    string so a notebook cell can display it as its last expression.

    Args:
        text: Raw model output, expected to contain the ``<|assistant|>``
            marker that precedes the generated answer.

    Returns:
        The segment that follows the first ``<|assistant|>`` marker (up to the
        next marker, if any). When the marker is absent the text is returned
        unchanged instead of raising ``IndexError``.
    """
    segments = text.split("<|assistant|>")
    if len(segments) < 2:
        # No marker in the output: hand the text back rather than crash.
        return text
    return segments[1]

In [247]:
# Load the PDF files from the local "documents/" directory (path is relative
# to the notebook's working directory). The loaded `documents` are indexed by
# the retriever in a later cell.
loader = PyPDFDirectoryLoader("documents/")
documents = loader.load()

In [265]:
# Parent-document retrieval setup: child chunks (chunk_size=400) are embedded
# with OpenAI embeddings and stored in Qdrant, while the docstore that holds
# the source documents lives in process memory only.
store = InMemoryStore()
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The Qdrant endpoint comes from the QDRANT_URL environment variable.
client = QdrantClient(location=os.getenv("QDRANT_URL"), port=6333)
# NOTE: the misspelled name `colllection_name` is kept as-is because other
# cells may still reference it.
colllection_name = "idsr"
vectorstore = Qdrant(
    client=client,
    collection_name=colllection_name,
    embeddings=OpenAIEmbeddings(),
)

# MMR search over the child chunks, requesting k=10 hits from the vector store.
retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    child_splitter=child_splitter,
    search_type="mmr",
    search_kwargs={"k": 10},
)


In [266]:
# Index the loaded PDFs through the parent-document retriever.
# ids=None leaves document-id assignment to the retriever.
# NOTE(review): the vector store is an external Qdrant collection while the
# docstore is in memory, so re-running this cell presumably indexes the same
# chunks again — confirm before re-running.
retriever.add_documents(documents, ids=None)

In [253]:
# Chat model served through the Hugging Face Hub inference API.
repo_id = "HuggingFaceH4/zephyr-7b-beta"
llm = HuggingFaceHub(
    repo_id=repo_id,
    # The token used to be read only from the misspelled variable
    # "HUGGINGFAACE_API_TOKEN". Prefer the correctly spelled name and fall back
    # to the legacy spelling so existing .env files keep working.
    huggingfacehub_api_token=(
        os.environ.get("HUGGINGFACE_API_TOKEN")
        or os.environ.get("HUGGINGFAACE_API_TOKEN")
    ),
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 1024,
        "temperature": 0.1,  # low temperature for near-deterministic answers
        "top_k": 30,
        "repetition_penalty": 1.03,
    },
)
# Wrap the raw text-generation LLM in a chat-model interface.
chat_model = ChatHuggingFace(llm=llm)

# Converts the chat model's message output into a plain string.
output_parser = StrOutputParser()

# RAG prompt template pulled from the LangChain Hub (requires network access).
prompt = hub.pull("rlm/rag-prompt")


In [254]:

# RAG pipeline: the question is passed through unchanged and also sent to the
# retriever, whose hits are flattened into the prompt's "context"; the filled
# prompt goes to the chat model and the reply is parsed to a string.
chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | chat_model
    | output_parser
)

In [259]:
# Run the chain on a sample question. The last expression displays the text
# that follows the "<|assistant|>" marker in the model output.
results = chain.invoke("Who developed IDSR??")
print_huggingface_result(results)

'\nThe third edition of the Integrated Disease Surveillance and Response (IDSR) Technical Guidelines was developed by the WHO Health Emergencies (WHE) Programme in collaboration with programmes dealing with disease surveillance at the WHO Regional Office for Africa (AFRO), with technical reviews provided by the U.S. Centers for Disease Control and Prevention (CDC) and the U.S. Agency for International Development (USAID). The purpose of revising'