Imports

In [21]:
import pandas as pd

# Load CSV safely
df = pd.read_csv("HRDataset_v14.csv", encoding="utf-8")

print("Columns detected:", list(df.columns))
print("NO Columns detected:", len(df.columns))
print("Total rows:", len(df))


Columns detected: ['Employee_Name', 'EmpID', 'MarriedID', 'MaritalStatusID', 'GenderID', 'EmpStatusID', 'DeptID', 'PerfScoreID', 'FromDiversityJobFairID', 'Salary', 'Termd', 'PositionID', 'Position', 'State', 'Zip', 'DOB', 'Sex', 'MaritalDesc', 'CitizenDesc', 'HispanicLatino', 'RaceDesc', 'DateofHire', 'DateofTermination', 'TermReason', 'EmploymentStatus', 'Department', 'ManagerName', 'ManagerID', 'RecruitmentSource', 'PerformanceScore', 'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount', 'LastPerformanceReview_Date', 'DaysLateLast30', 'Absences']
NO Columns detected: 36
Total rows: 311


csv to text

In [22]:
def row_to_text(row):
    parts = []
    for col, val in row.items():
        if pd.notna(val):
            parts.append(f"{col}: {val}")
    return " | ".join(parts)

documents = [row_to_text(row) for _, row in df.iterrows()]

print("Sample document:")
print(documents[0])


Sample document:
Employee_Name: Adinolfi, Wilson  K | EmpID: 10026 | MarriedID: 0 | MaritalStatusID: 0 | GenderID: 1 | EmpStatusID: 1 | DeptID: 5 | PerfScoreID: 4 | FromDiversityJobFairID: 0 | Salary: 62506 | Termd: 0 | PositionID: 19 | Position: Production Technician I | State: MA | Zip: 1960 | DOB: 07/10/83 | Sex: M  | MaritalDesc: Single | CitizenDesc: US Citizen | HispanicLatino: No | RaceDesc: White | DateofHire: 7/5/2011 | TermReason: N/A-StillEmployed | EmploymentStatus: Active | Department: Production        | ManagerName: Michael Albert | ManagerID: 22.0 | RecruitmentSource: LinkedIn | PerformanceScore: Exceeds | EngagementSurvey: 4.6 | EmpSatisfaction: 5 | SpecialProjectsCount: 0 | LastPerformanceReview_Date: 1/17/2019 | DaysLateLast30: 0 | Absences: 1


Chunking

In [23]:
from langchain_core.documents import Document

# 1. Create LangChain Documents directly from your strings
# No RecursiveCharacterTextSplitter needed here because each row is already a discrete "chunk"
chunks = [Document(page_content=doc) for doc in documents]

print(f"Total documents created: {len(chunks)}")

# 2. Verify the content to ensure DateofHire is intact
print("\nSample Document Content:")
print(chunks[0].page_content)

Total documents created: 311

Sample Document Content:
Employee_Name: Adinolfi, Wilson  K | EmpID: 10026 | MarriedID: 0 | MaritalStatusID: 0 | GenderID: 1 | EmpStatusID: 1 | DeptID: 5 | PerfScoreID: 4 | FromDiversityJobFairID: 0 | Salary: 62506 | Termd: 0 | PositionID: 19 | Position: Production Technician I | State: MA | Zip: 1960 | DOB: 07/10/83 | Sex: M  | MaritalDesc: Single | CitizenDesc: US Citizen | HispanicLatino: No | RaceDesc: White | DateofHire: 7/5/2011 | TermReason: N/A-StillEmployed | EmploymentStatus: Active | Department: Production        | ManagerName: Michael Albert | ManagerID: 22.0 | RecruitmentSource: LinkedIn | PerformanceScore: Exceeds | EngagementSurvey: 4.6 | EmpSatisfaction: 5 | SpecialProjectsCount: 0 | LastPerformanceReview_Date: 1/17/2019 | DaysLateLast30: 0 | Absences: 1


Embedding chunks to FAISS

In [24]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model (open-source, local)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Create in-memory FAISS vector store
vectorstore = FAISS.from_documents(
    documents=chunks,
    embedding=embedding_model
)

print("Baseline FAISS vector store ready (in-memory)")


Baseline FAISS vector store ready (in-memory)


LLM Initialisation

In [25]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os

load_dotenv()

groq_api_key = os.environ.get("GROQ_API_KEY")

try:
    if not groq_api_key:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        groq_api_key=groq_api_key,
        model_name="llama-3.1-8b-instant",
        temperature=0.7
    )

    # Minimal sanity check call
    llm.invoke("ping")

    print("‚úÖ ChatGroq LLM initialized successfully")

except Exception as e:
    print("‚ùå ChatGroq LLM initialization failed")
    print(f"Error: {e}")
    raise


‚úÖ ChatGroq LLM initialized successfully


Retrival 

In [17]:
from langchain_classic.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# 1. Initialize Vector Retriever (Semantic)
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# 2. Initialize BM25 Retriever (Keyword/Exact Match)
# We recreate this here to ensure it uses the latest 'chunks'
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 3

# 3. Create Ensemble Retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.5, 0.5]  # Adjust weights if you want to favor keywords more (e.g., [0.7, 0.3])
)

# 4. Take query from user input
query = input("Enter your query for Retrieval: ").strip()

if not query:
    raise ValueError("Query cannot be empty")

print(f"\nüîç Searching for: '{query}'...")

# 5. Perform Hybrid Search
results = ensemble_retriever.invoke(query)

print("\nRetrieved chunks:\n")
for i, doc in enumerate(results, 1):
    print(f"--- Chunk {i} ---")
    print(doc.page_content)
    print("\n")


üîç Searching for: 'DOB: 06/14/87'...

Retrieved chunks:

--- Chunk 1 ---
Employee_Name: Le, Binh | EmpID: 10232 | MarriedID: 0 | MaritalStatusID: 0 | GenderID: 0 | EmpStatusID: 1 | DeptID: 3 | PerfScoreID: 3 | FromDiversityJobFairID: 0 | Salary: 81584 | Termd: 0 | PositionID: 22 | Position: Senior BI Developer | State: MA | Zip: 1886 | DOB: 06/14/87 | Sex: F | MaritalDesc: Single | CitizenDesc: US Citizen | HispanicLatino: No | RaceDesc: Asian | DateofHire: 10/2/2016 | TermReason: N/A-StillEmployed | EmploymentStatus: Active | Department: IT/IS | ManagerName: Brian Champaigne | ManagerID: 13.0 | RecruitmentSource: Indeed | PerformanceScore: Fully Meets | EngagementSurvey: 4.1 | EmpSatisfaction: 5 | SpecialProjectsCount: 7 | LastPerformanceReview_Date: 1/8/2019 | DaysLateLast30: 0 | Absences: 2


--- Chunk 2 ---
Employee_Name: Bernstein, Sean | EmpID: 10046 | MarriedID: 0 | MaritalStatusID: 0 | GenderID: 1 | EmpStatusID: 1 | DeptID: 5 | PerfScoreID: 3 | FromDiversityJobFairID: 0 | Sa

LLM QA Chain

In [33]:
from langchain_core.prompts import ChatPromptTemplate

# 1. Take query from user input
# (You can remove this input() line if you want to reuse the query from Cell 1)
query = input("Enter your query for LLM Answer: ").strip()

if not query:
    raise ValueError("Query cannot be empty")

# 2. Retrieve Documents using the Ensemble Retriever from Cell 1
# This ensures we get the exact date matches AND the semantic meaning
retrieved_docs = ensemble_retriever.invoke(query)

# 3. Combine retrieved chunks into a single context string
context_text = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])

# 4. Create the Prompt Template
prompt_template = ChatPromptTemplate.from_template("""
You are a helpful HR assistant. Use the following employee records to answer the question accurately.
If the answer is not in the context, strictly state that you don't know.

Context:
{context}

Question: 
{question}
""")

# 5. Format the prompt
prompt = prompt_template.format_messages(
    context=context_text,
    question=query
)

# 6. Generate Response
print(f"\nüß† Generating answer for: '{query}'...")
response = llm.invoke(prompt)

print("\nü§ñ LLM Answer:")
print("-" * 50)
print(response.content)
print("-" * 50)


üß† Generating answer for: '5Which recruitment source was used to hire "Elijiah Gray"?'...

ü§ñ LLM Answer:
--------------------------------------------------
I don't know. 

However, I can tell you that Elijiah Gray is a manager in the Production department, but I do not see any information about his own hiring or recruitment source in the provided data.
--------------------------------------------------
