# Nestlé HR Assistant Setup

1. Load Nestlé HR PDF  
2. Split into chunks with PyPDFLoader  
3. Create embeddings & vectorstore  
4. Build QA retrieval chain  
5. Launch Gradio interface


In [1]:
import os

# Ensure we’re at the project root
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
print("Working directory:", os.getcwd())

# Show what’s in data/raw
raw_dir = os.path.join(os.getcwd(), "data", "raw")
print("data/raw contains:", os.listdir(raw_dir))

from langchain.document_loaders import PyPDFLoader

# Adjust this to match the exact filename you see above
pdf_filename = "the_nestle_hr_policy_pdf_2012.pdf"
pdf_path = os.path.join(raw_dir, pdf_filename)
print("Loading:", pdf_path)

loader = PyPDFLoader(pdf_path)
docs = loader.load()
print(f"Loaded {len(docs)} pages")



Working directory: /Users/sheilamcgovern/Desktop/Projects2025/nestle_hr_assistant
data/raw contains: ['the_nestle_hr_policy_pdf_2012.pdf', '.ipynb_checkpoints']
Loading: /Users/sheilamcgovern/Desktop/Projects2025/nestle_hr_assistant/data/raw/the_nestle_hr_policy_pdf_2012.pdf
Loaded 8 pages


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import os

# Assume `docs` is loaded from PDF (execution_count 1)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " ", ""]
)
chunks = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks")

# Deduplicate and fix page numbers
chunk_data = []
seen_content = set()
for i, c in enumerate(chunks):
    content = c.page_content.replace("\n", " ").strip()
    if content not in seen_content:
        seen_content.add(content)
        chunk_data.append({
            "chunk_id": i,
            "page": c.metadata.get("page") + 1,
            "text": content
        })
df = pd.DataFrame(chunk_data)
print(f"Unique chunks: {len(df)}")

# Save to CSV
os.makedirs("data/processed", exist_ok=True)
df.to_csv("data/processed/hr_policy_chunks.csv", index=False)
print("CSV written.")

Split into 60 chunks
Unique chunks: 60
CSV written.


In [3]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
print("API Key loaded:", "Yes" if api_key else "No")
if api_key:
    print("API Key preview:", api_key[:5] + "...")

API Key loaded: Yes
API Key preview: sk-pr...


In [4]:
import os
os.environ["CHROMA_DISABLE_TELEMETRY"] = "1"
from dotenv import load_dotenv
load_dotenv()

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import pandas as pd
from langchain.schema import Document

# Load chunks
df = pd.read_csv("data/processed/hr_policy_chunks.csv")
docs = [Document(page_content=row["text"], metadata={"page": int(row["page"]), "chunk_id": int(row["chunk_id"])}) for _, row in df.iterrows()]
print(f"Loaded {len(docs)} docs")

# Create vector store
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
vectordb = Chroma(collection_name="hr_policy_2012", embedding_function=embeddings, persist_directory="db/chroma")
# Clear any existing data
if vectordb.get()['ids']:
    vectordb.delete_collection()
    vectordb = Chroma(collection_name="hr_policy_2012", embedding_function=embeddings, persist_directory="db/chroma")
# Add unique documents with unique IDs
unique_docs = list({doc.page_content: doc for doc in docs}.values())
ids = [f"doc_{i}" for i in range(len(unique_docs))]  # Unique IDs
vectordb.add_documents(documents=unique_docs, ids=ids)
print(f"Stored {len(unique_docs)} unique docs")

# Verify content
docs = vectordb.get()
print(f"Docs in vector store: {len(docs['ids'])}")
if docs['ids']:
    print("Sample metadata:", docs["metadatas"][0])
    print("Sample content:", docs["documents"][0][:100], "...")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


Loaded 60 docs
Stored 60 unique docs
Docs in vector store: 60
Sample metadata: {'chunk_id': 0, 'page': 1}
Sample content: Policy Mandatory September  2012 The Nestlé   Human Resources Policy ...


In [None]:
import os
os.environ["CHROMA_DISABLE_TELEMETRY"] = "1"
from dotenv import load_dotenv
load_dotenv()

from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import gradio as gr
import pandas as pd
import traceback

try:
    # Load chunks
    df = pd.read_csv("data/processed/hr_policy_chunks.csv")
    docs = [Document(page_content=row["text"], metadata={"page": int(row["page"]), "chunk_id": int(row["chunk_id"])}) for _, row in df.iterrows()]
    print(f"Loaded {len(docs)} docs")

    # Create vector store
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    vectordb = Chroma(collection_name="hr_policy_2012", embedding_function=embeddings, persist_directory="db/chroma")
    unique_docs = list({doc.page_content: doc for doc in docs}.values())
    ids = [f"doc_{i}" for i in range(len(unique_docs))]
    vectordb.add_documents(documents=unique_docs, ids=ids)
    print(f"Stored {len(unique_docs)} docs")

    # Prompt
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="Answer using only the provided snippets. Quote exact text relevant to the question. If the snippets do not directly address the question, say: 'I don’t know based on the 2012 HR Policy document.'\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"
    )

    # QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
        chain_type="stuff",
        retriever=vectordb.as_retriever(search_kwargs={"k": 10}),
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True
    )

    # Gradio
    def respond(message, history=[]):
        try:
            print(f"Processing query: {message}")
            if not message:
                return history, "Please enter a question."
            result = qa_chain.invoke({"query": message})
            answer = str(result["result"])
            source_docs = result["source_documents"]
            seen = set()
            unique_sources = [doc for doc in source_docs if not (doc.metadata["chunk_id"] in seen or seen.add(doc.metadata["chunk_id"]))]
            if unique_sources and "I don’t know" not in answer.lower():
                pages = sorted({doc.metadata["page"] for doc in unique_sources})
                answer += f"\n\n*Source: Page{'s' if len(pages) > 1 else ''} {', '.join(map(str, pages))}*"
            elif "I don’t know" in answer.lower():
                answer += "\n\n*Note: The 2012 policy lacks specific details on this topic. Contact Nestlé HR for current policies.*"
            history.append((str(message), answer))
            print(f"Returning history: {len(history)} messages")
            return history, ""
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            print(error_msg)
            traceback.print_exc()
            return history + [(str(message), error_msg)], ""

    with gr.Blocks() as demo:
        gr.Markdown("## Nestlé HR Assistant\nAsk about 2012 HR Policy. Ex: 'What are the total rewards?'")
        chatbot = gr.Chatbot()
        txt = gr.Textbox(show_label=False, placeholder="Type question and hit enter")
        txt.submit(respond, [txt, chatbot], [chatbot, txt])

    demo.launch()

except Exception as e:
    print(f"Error: {str(e)}")
    traceback.print_exc()

In [6]:
import gradio
print("Gradio version:", gradio.__version__)

Gradio version: 5.36.2


In [7]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
vectordb = Chroma(collection_name="hr_policy_2012", embedding_function=embeddings, persist_directory="db/chroma")
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

for query in ["total rewards", "working hours"]:
    print(f"\nQuery: {query}")
    docs = retriever.invoke(query)
    for d in docs:
        print(f"Page {d.metadata['page']}, Chunk {d.metadata['chunk_id']}: {d.page_content[:100]}...")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given



Query: total rewards


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Page 5, Chunk 19: value and trust that our name brings to those  who work with us; the relationships with our line  ma...
Page 5, Chunk 20: receive. Nestlé, therefore, focuses on Fixed Pay,  Variable Pay, Benefits, Personal Growth and  Deve...
Page 5, Chunk 22: Nestlé Total Rewards programmes must be  established within the social and legal framework  of each ...
Page 5, Chunk 24: transparency. Corporate policy:  Nestlé Total Rewards Policy We are committed to providing our emplo...
Page 5, Chunk 18: The Nestlé Human Resources Policy 3  Total rewards Attracting new hires and keeping current  employe...

Query: working hours
Page 5, Chunk 30: employees is heard. Corporate policy:  Policy on Conditions of Work and Employment  Employment and w...
Page 5, Chunk 28: and we insist that they also take steps so that  adequate working conditions are made available  to ...
Page 7, Chunk 50: communication is established in the workplace.  While dialogue with trade unions is essential, it  d...
Pa

In [8]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
vectordb = Chroma(collection_name="hr_policy_2012", embedding_function=embeddings, persist_directory="db/chroma")
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

for query in ["total rewards", "working hours"]:
    print(f"\nQuery: {query}")
    docs = retriever.invoke(query)
    for d in docs:
        print(f"Page {d.metadata['page']}, Chunk {d.metadata['chunk_id']}: {d.page_content[:100]}...")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given



Query: total rewards
Page 5, Chunk 19: value and trust that our name brings to those  who work with us; the relationships with our line  ma...
Page 5, Chunk 20: receive. Nestlé, therefore, focuses on Fixed Pay,  Variable Pay, Benefits, Personal Growth and  Deve...
Page 5, Chunk 22: Nestlé Total Rewards programmes must be  established within the social and legal framework  of each ...
Page 5, Chunk 24: transparency. Corporate policy:  Nestlé Total Rewards Policy We are committed to providing our emplo...
Page 5, Chunk 18: The Nestlé Human Resources Policy 3  Total rewards Attracting new hires and keeping current  employe...

Query: working hours
Page 5, Chunk 30: employees is heard. Corporate policy:  Policy on Conditions of Work and Employment  Employment and w...
Page 5, Chunk 28: and we insist that they also take steps so that  adequate working conditions are made available  to ...
Page 7, Chunk 50: communication is established in the workplace.  While dialogue with trade unions is 

In [9]:
from langchain_openai import ChatOpenAI
import traceback

try:
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    response = llm.invoke("Test query: Say 'Hello'")
    print("API Response:", response.content)
except Exception as e:
    print(f"OpenAI API error: {str(e)}")
    traceback.print_exc()

API Response: Hello!
