#Import the required libraries




In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import VectorDBQA, RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader
import os
from dotenv import load_dotenv
load_dotenv()


True

# Loading Documents


In [2]:
loader = PyPDFLoader('https://www.nestle.com/sites/default/files/asset-library/documents/jobs/the_nestle_hr_policy_pdf_2012.pdf')
documents = loader.load()

# initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, # Maximum size of each chunk
    chunk_overlap=200 # Overlap between chunks to preserve context
)

# Split the loaded documents
split_documents = text_splitter.split_documents(documents)

# verify the split documents
print(f"Number of split documents: {len(split_documents)}")
# preview the last 3 chunks
for i, doc in enumerate(split_documents[-3:]):  # Preview last 3 chunks
    print(f"Chunk {i+1}: {doc.page_content[:200]}...")  # Print first 200 characters of each chunk


Number of split documents: 20
Chunk 1: that goes beyond the traditional aspects of 
collective bargaining in order to share knowledge 
and to jointly find opportunities related to 
important matters such as Creating Shared Value, 
the heal...
Chunk 2: minimal levels of management and broad spans 
of control, which enable people development, 
increase efficiency, and ease implementation 
of our “Nestlé Management and Leadership 
Principles”.
Less hi...
Chunk 3: A dynamic organisation creates a climate 
of innovation and allows people to think from 
different perspectives. At Nestlé we encourage 
our people to take risks. Mistakes may be made 
but there is al...


 # Creating Vector Representation of Texts

In [3]:
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# number of chunks for cheaper embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(texts, embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

llm = HuggingFaceEndpoint(
    repo_id="google/flan-t5-large",
    temperature=0,
    max_new_tokens=512
)

# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=retriever,
#     return_source_documents=True
# )


qa = RetrievalQA.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
    # llm=llm,
    # chain_type="stuff",
    # retriever=vectordb.as_retriever()
)




  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Setting Up Question-Answering System

In [4]:

# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectordb.as_retriever()
# )

# Example query
# query = "What is the purpose of the Nestlé HR Policy?"
# result = qa({"query": query})

# print(f"Query: {query}")
# print(f"Answer: {result['result']}")


query = "What is the purpose of the Nestlé HR Policy?"

docs = retriever.get_relevant_documents(query)
print(f"Retrieved {len(docs)} documents")

result = qa({"query": query})
print(result)


  docs = retriever.get_relevant_documents(query)
  result = qa({"query": query})


Retrieved 3 documents


StopIteration: 

# Defining Prompt Template

In [None]:
from langchain import PromptTemplate

# Define the prompt template in English
template = """
I am a HR helpful assistant. Please answer the following question in English.
Question: {question}
Answer:
"""

# Create the PromptTemplate instance with the modified English template
prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)


#  Building Chat Interface with Gradio and Launching the Chat Interface

In [None]:
import gradio as gr

def add_text(history, text):
    history = history + [(text, None)]
    return history, ""

def bot(history):
    query = history[-1][0]
    query = prompt.format(question=query)
    answer = qa.run(query)
    source = qa._get_docs(query)[0]
    source_sentence = source.page_content
    answer_source = source_sentence +"\n"+"source:"+source.metadata["source"] + ", page:" + str(source.metadata["page"])
    history[-1][1] = answer # + answer_source
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)

    with gr.Row():
        with gr.Column(scale=0.6):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter text and press enter",
            ).style(container=False)

    txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
        bot, chatbot, chatbot
    )

demo.launch(share=True)