# Nestlé HR Assistant Setup

1. Load Nestlé HR PDF  
2. Split into chunks with PyPDFLoader  
3. Create embeddings & vectorstore  
4. Build QA retrieval chain  
5. Launch Gradio interface


In [1]:
import os

# Ensure we’re at the project root
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
print("Working directory:", os.getcwd())

# Show what’s in data/raw
raw_dir = os.path.join(os.getcwd(), "data", "raw")
print("data/raw contains:", os.listdir(raw_dir))

from langchain.document_loaders import PyPDFLoader

# Adjust this to match the exact filename you see above
pdf_filename = "the_nestle_hr_policy_pdf_2012.pdf"
pdf_path = os.path.join(raw_dir, pdf_filename)
print("Loading:", pdf_path)

loader = PyPDFLoader(pdf_path)
docs = loader.load()
print(f"Loaded {len(docs)} pages")



Working directory: /Users/sheilamcgovern/Desktop/Projects2025/nestle_hr_assistant
data/raw contains: ['the_nestle_hr_policy_pdf_2012.pdf', '.ipynb_checkpoints']
Loading: /Users/sheilamcgovern/Desktop/Projects2025/nestle_hr_assistant/data/raw/the_nestle_hr_policy_pdf_2012.pdf
Loaded 8 pages


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import os

# assume `docs` is already loaded
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks   = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks")

df = pd.DataFrame([
    {"chunk_id": i, "page": c.metadata.get("page"), "text": c.page_content.replace("\n"," ")}
    for i, c in enumerate(chunks)
])
df.head(10)

#—and if you want to save:
os.makedirs("data/processed", exist_ok=True)
df.to_csv("data/processed/hr_policy_chunks.csv", index=False)
print("CSV written.")


Split into 20 chunks
CSV written.


In [3]:
import os
os.environ["CHROMA_DISABLE_TELEMETRY"] = "1" # for chroma telemetry warnings
from dotenv import load_dotenv
load_dotenv()



from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

import gradio as gr



# rebuild docs from CSV
df = pd.read_csv("data/processed/hr_policy_chunks.csv")
docs = [
    Document(page_content=row["text"], metadata={"page": row["page"], "chunk_id": int(row["chunk_id"])})
    for _, row in df.iterrows()
]

# (re)create vector store & QA chain
embeddings = OpenAIEmbeddings()
vectordb   = Chroma.from_documents(docs, embeddings, persist_directory="db/chroma")
qa_chain   = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=vectordb.as_retriever()
)

def respond(message, history):
    answer = qa_chain.run(message)
    history = history + [(message, answer)]
    return history, ""

with gr.Blocks() as demo:
    gr.Markdown("## Nestlé HR Assistant\nAsk anything about Nestlé’s HR policy documents.")
    chatbot = gr.Chatbot()
    txt     = gr.Textbox(show_label=False, placeholder="Type your question and hit enter")
    txt.submit(respond, [txt, chatbot], [chatbot, txt])
    # optional clear button:
    # clear = gr.Button("Clear")
    # clear.click(lambda: [], None, chatbot)

demo.launch()




Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [5]:
print(qa_chain.run("What are the working hours?"))


  print(qa_chain.run("What are the working hours?"))
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


I don't have specific information about the working hours from the context provided. It would be best to refer to the company's official documentation or speak directly with the HR department for details on working hours.


In [7]:
# debug what your retriever is pulling back
retriever = vectordb.as_retriever()
docs = retriever.get_relevant_documents("working hours")

print(f"Found {len(docs)} candidate chunks:")
for d in docs:
    page = d.metadata.get("page")
    chunk_id = d.metadata.get("chunk_id")
    snippet = d.page_content.replace("\n", " ")[:200]
    print(f"• page {page}, chunk {chunk_id}: {snippet}")
    print("---")


  docs = retriever.get_relevant_documents("working hours")


Found 4 candidate chunks:
• page 4, chunk 10: working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---
• page 4, chunk 10: working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---
• page 4, chunk 10: working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---
• page 4, chunk 10: working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---


In [9]:
# rebuild the retriever to pull back 5 documents
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

# now ask Chroma for relevant chunks
docs = retriever.get_relevant_documents("hours of work")

print(f"Found {len(docs)} candidate chunks:")
for d in docs:
    print(f"• page {d.metadata.get('page')}, chunk {d.metadata.get('chunk_id')}")
    print("  ", d.page_content.replace('\n',' ')[:200])
    print("---")



Found 5 candidate chunks:
• page 4, chunk 10
   working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---
• page 4, chunk 10
   working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---
• page 4, chunk 10
   working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---
• page 4, chunk 10
   working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---
• page 4, chunk 10
   working inside or outside our premises under

In [10]:
# embed & search directly, then dedupe by chunk_id
query = "hours of work"
results = vectordb.similarity_search(query, k=5)

unique = {}
for d in results:
    cid = d.metadata["chunk_id"]
    if cid not in unique:
        unique[cid] = d

print(f"Unique hits for '{query}':")
for d in unique.values():
    print(f"• page {d.metadata['page']}, chunk {d.metadata['chunk_id']}")
    print("  ", d.page_content.replace('\n',' ')[:200])
    print("---")


Unique hits for 'hours of work':
• page 4, chunk 10
   working inside or outside our premises under  contractual obligations with service providers  and we insist that they also take steps so that  adequate working conditions are made available  to them. 
---


In [11]:
print(qa_chain.run("What does the policy say about workplace safety?"))


I don't have specific information on what the Nestlé Human Resources Policy says about workplace safety.
