In [1]:
!pip install openai
!pip install pypdf
!pip install tiktoken
!pip install langchain
!pip install -q langchain langchain-community openai chromadb pypdf
!pip install langchain-openai
!pip install chromadb
!pip install ragas datasets evaluate



# Using Langchain to Load Documents

In [1]:
from langchain_community.document_loaders import PyPDFLoader

In [2]:
# 1. Download from GitHub
import requests

url = 'https://github.com/cfernandez3/LLM_Projects/raw/main/welcome-book-faculty-univ-staff.pdf'
response = requests.get(url)

with open('welcome_book.pdf', 'wb') as f:
    f.write(response.content)

loader = PyPDFLoader('welcome_book.pdf')
pages = loader.load_and_split()

# Show a preview of the first page
print(pages[0].page_content[:1000])


1
2025 | Faculty and Gables/Marine Staff
Yourjourneystartshere.
2025 | FACULTY (NON-CLINICAL) 
AND CORAL GABLES/MARINE STAFF


In [3]:
pages[10]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2024-12-09T18:10:26-05:00', 'moddate': '2024-12-09T18:10:34-05:00', 'title': 'Welcome Book - Faculty and Staff', 'trapped': '/False', 'source': 'welcome_book.pdf', 'total_pages': 41, 'page': 10, 'page_label': '11'}, page_content='11\n2025 | Faculty and Gables/Marine Staff\nUHEALTH TOTAL CARE\nThis plan is designed to provide you and your family with top-tier healthcare services at a lower cost while giving you access to a network of primarily \nUHealth and Jackson Health System providers and facilities. \n • Lowest premiums and deductibles to ensure affordable care at all levels.\n • Comprehensive coverage, including preventive care, diagnostics, and specialty services, all conveniently available at UM facilities\n • UHealth Total Care gives you access to an exclusive network of primarily UHealth and Jackson Health System providers and facilities. This plan \nalso inclu

In [5]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key="")


In [10]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [11]:

texts = [doc.page_content for doc in pages]
faiss_db = FAISS.from_texts(texts, embedding=embeddings)
faiss_db.save_local("vectorstore")


In [12]:
import shutil
shutil.make_archive("vectorstore", 'zip', "vectorstore")


'/content/vectorstore.zip'

In [13]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size = 100, chunk_overlap = 0)
texts = text_splitter.split_documents(pages)

In [14]:
texts[20]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2024-12-09T18:10:26-05:00', 'moddate': '2024-12-09T18:10:34-05:00', 'title': 'Welcome Book - Faculty and Staff', 'trapped': '/False', 'source': 'welcome_book.pdf', 'total_pages': 41, 'page': 19, 'page_label': '20'}, page_content='20\n2025 | Faculty and Gables/Marine Staff\nUHealth World-Class Care\nCOORDINATING YOUR CARE\nKeeping your health care within UHealth means a more coordinated approach, as doctors can collaborate and have a clear picture of your overall \nhealth. Rest assured that we take your privacy very seriously. As our clinical staff does with all patients, your records are kept confidential and private, \nbefore, during and after your health care visits as legally required by the Health Insurance Portability and Accountability Act (HIPAA).\n OUR GROWING NETWORK\nWhen it comes to convenience, UHealth care is closer than you think. Besides the medical campu

# Using Tokens to Chunk Text

In [15]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [16]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(model_name = 'gpt-4',
                                                                     chunk_size = 300,
                                                                     chunk_overlap =50)
texts = text_splitter.split_documents(pages)

In [17]:
texts[2]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2024-12-09T18:10:26-05:00', 'moddate': '2024-12-09T18:10:34-05:00', 'title': 'Welcome Book - Faculty and Staff', 'trapped': '/False', 'source': 'welcome_book.pdf', 'total_pages': 41, 'page': 2, 'page_label': '3'}, page_content='3\n2025 | Faculty and Gables/Marine Staff\nTable\nof Contents\nDIRECCT Values 4\nBefore Your First Day 5\nDuring Your First Week 5\nYour UM Experience  6\nFind Help When You Need It 7\nWhat’s Happening at the U? 8\n’Canes Total Rewards 9\nCommonly Used Terms 10\n2025 Medical Plans 11\nHealth Reimbursement Account   12\n2025 Medical Plan Summary 13\n2025 Medical Plan Comparison Chart 14\n2025 Medical Plan Premiums    16\nBehavioral & Mental Health     17\nWell ‘Canes Preventive Care    18\nSave Money on Health Care    19\nUHealth World-Class Care    20\nHigh End Imaging    24\nNeed Immediate Care?   25\nPrescription Drugs    27\nFlexible Spending 

# Create an OpenAI instance for Embedding Model

In [18]:
from langchain_openai import OpenAIEmbeddings
embeddings_model = OpenAIEmbeddings(api_key = '')

# Create Vector database with Chroma

In [19]:
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(texts, embeddings_model)

#Similarity search using a natural language query

In [20]:
query = 'what are the first steps for a new UM employee?'
docs = db.similarity_search(query)
print(docs[0].page_content)

5
2025 | Faculty and Gables/Marine Staff
CREATE A CANE ID
Visit workday.miami.edu to create a Cane ID and password.
PROTECT YOUR CANE ID
Complete Multi-Factor Authentication at caneid.miami.edu to ensure additional protection of your new Cane ID.
LEARN ABOUT THE UNIVERSITY OF MIAMI
Visit firstdays.miami.edu to learn more about working at the University of Miami.
Before Your First Day
During Your First Week
GET A PARKING PERMIT
Register your car and purchase a parking permit.  
View the menu of options at miami.edu/parking (Coral Gables), earth.miami.edu/about-us/administration/campus-safety/
parking/index.html (Marine), or med.miami.edu/offices/public-safety/parking-and-transportation (UHealth/Miller School).
 
GET YOUR BENEFITS
Make your benefit elections within 15 days of hire at workday.miami.edu. 
KNOW WHAT TO EXPECT
Meet with your supervisor to familiarize yourself with the workplace and discuss your job role, performance 
expectations/goals, and any required training.


# Create a prompt for RAG application

In [21]:
prompt = f"""CONTEXT:
{docs[0].page_content}

INSTRUCTIONS:
- You are an HR expert on University of Miami policies and operations.
- Your role is to assist new employees by answering questions and guiding them through the initial setup of benefits, parking, and other general concerns related to their new employment.
- Answer directly first, then provide any extra info if needed.

RULES:
- Base your answers ONLY on the context provided above.
- If the answer is not found in the context, respond with "I don't know."
- Politely refuse to answer any question that is not related to University of Miami policies and operations.

QUESTION:
{query}
"""
print(prompt)


CONTEXT:
5
2025 | Faculty and Gables/Marine Staff
CREATE A CANE ID
Visit workday.miami.edu to create a Cane ID and password.
PROTECT YOUR CANE ID
Complete Multi-Factor Authentication at caneid.miami.edu to ensure additional protection of your new Cane ID.
LEARN ABOUT THE UNIVERSITY OF MIAMI
Visit firstdays.miami.edu to learn more about working at the University of Miami.
Before Your First Day
During Your First Week
GET A PARKING PERMIT
Register your car and purchase a parking permit.  
View the menu of options at miami.edu/parking (Coral Gables), earth.miami.edu/about-us/administration/campus-safety/
parking/index.html (Marine), or med.miami.edu/offices/public-safety/parking-and-transportation (UHealth/Miller School).
 
GET YOUR BENEFITS
Make your benefit elections within 15 days of hire at workday.miami.edu. 
KNOW WHAT TO EXPECT
Meet with your supervisor to familiarize yourself with the workplace and discuss your job role, performance 
expectations/goals, and any required training.

I

# Create prompting function

In [22]:
from openai import OpenAI
client = OpenAI(api_key = '')

#create prompting function
def prompt_function(prompt: str):
  response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
      {
        "role": "user",
        "content": prompt
      }
    ],
    temperature=0.3,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )
  return response.choices[0].message.content

In [23]:
#print response
prompt_function(prompt)

'The first steps for a new UM employee would be to create a Cane ID and password, complete Multi-Factor Authentication, learn about the University of Miami, get a parking permit, make benefit elections, and meet with their supervisor to discuss their job role and expectations.'

# log_interaction() function

In [24]:
import csv
from datetime import datetime

def log_interaction(query, answer, docs, k=8):
    with open("rag_logs.csv", mode="a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow([
            datetime.now().isoformat(),
            query,
            "\n\n".join(doc.page_content for doc in docs),
            answer,
            k
        ])


# RAG function

In [25]:
def RAG(query: str):
    # Create embedding for the query
    embedding_vector = embeddings_model.embed_query(query)

    # Semantic search on vector DB
    docs = db.similarity_search_by_vector(embedding_vector, k=8)

    # Combine top docs into a single context string
    context = "\n\n".join(doc.page_content for doc in docs)

    # Create the prompt
    prompt = f"""
CONTEXT:
{context}

INSTRUCTIONS:
- You are an HR expert on University of Miami policies and operations.
- Your role is to assist new employees by answering questions and guiding them through the initial setup of benefits, parking, and other general concerns related to their new employment.
- Answer directly first, then provide any extra info if needed.

RULES:
- Use only the provided context above to answer the question.
- If the answer is not in the context, say "I don't know."
- Refuse to answer questions unrelated to the University of Miami.

QUESTION:
{query}
"""

    # Run the LLM and get result
    answer = prompt_function(prompt)

    # ✅ Log query, context, and answer
    log_interaction(query, answer, docs, k=8)

    return answer


# Using the RAG Application

In [26]:
query = "What are the medical plans available?"
response = RAG(query)
print(response)

The medical plans available for University of Miami faculty and Gables/Marine staff in 2025 are UHealth Total Care, Select 1, Select 2, and Choice POSII HRA.


In [27]:
query = "What are the medical plan premium prices?"
response = RAG(query)
print(response)

The medical plan premium prices vary depending on the plan and the coverage level. For example, for the UHealth Total Care plan, the monthly pre-tax premium for Employee Only ranges from $4.15 to $37.58, while for Employee + Family, it ranges from $16.98 to $144.52. For more specific premium prices, you can visit benefits.miami.edu.


In [19]:
query = "What is the HRA?"
response = RAG(query)
print(response)


The Health Reimbursement Account (HRA) is a fund provided by the University of Miami to help employees pay for their deductible ($1,500 individual/$4,500 family) and other eligible medical and pharmacy expenses. Participants receive a HealthEquity Visa card pre-loaded with a fund ($400 per person; up to $1,200 per family) to use for these expenses. The fund is administered by HealthEquity and can be used for medical and pharmacy expenses until depleted, after which participants pay the negotiated rates for their expenses until their deductible is met.


In [20]:
query = "How many people work for UM?"
response = RAG(query)
print(response)

Approximately 18,000 faculty and staff work for the University of Miami.


In [21]:
query = "What are my investment options for retirement?"
response = RAG(query)
print(response)

Your investment options for retirement at the University of Miami include Tier One Fidelity Freedom Index Funds, Tier Two Passive and Active Mutual Funds, Tier Three TIAA-CREF Annuities, and Tier Four Fidelity BrokerageLink. Each tier offers different investment choices based on your risk tolerance and investment preferences.


In [22]:
#Question that should prompt a "I dont know"
query = "Could I paint my office on blue color?"
response = RAG(query)
print(response)

I don't know.


# RAGAS to Evaluate the RAG Application

In [28]:
import os
os.environ["OPENAI_API_KEY"] = ""


In [24]:
from datasets import Dataset

# Provide your test questions
test_questions = [
    "How do I apply for a UM parking permit?",
    "What benefits are offered to new employees?",
    "Where do I go to set up my UM email?",
    "How do I enroll in health insurance?",
    "What are the financial security benefits offered by UM at not cost?",
    "What are the DIRECCT Values at UM?",
    "Are free flu shots available for UM employees "
]

# Provide ground truths (must be same order as test_questions)
ground_truths = [
    "You can apply for a UM parking permit by visiting miami.edu/parking and completing the online form.",
    "New employees receive health, dental, vision, retirement, and tuition benefits. Full list at miami.edu/hr/benefits.",
    "To set up your UM email, log in to CaneID and access your mailbox through Outlook or the UM portal.",
    "Enrollment is done online via Workday. You must enroll within 14 days of your start date.",
    "Life Insurance, Accidental Death & Dismemberment (AD&D), Long-Term Disability (LTD), One Month's Pay Death Benefit.",
    "DIVERSITY, INTEGRITY, RESPONSIBILITY, EXCELLENCE, COMPASSION, CREATIVITY, TEAMWORK",
    "Free flu shots are offered during flu season at the annual HR-Total Rewards fairs, Healthy ’Canes Employee Clinic, and UHealth at Walgreens."
]

# Generate the dataset
def create_ragas_dataset(test_questions, ground_truths=None, k=2):
    dataset_entries = []

    for i, question in enumerate(test_questions):
        # Embed and retrieve
        embedding_vector = embeddings_model.embed_query(question)
        docs = db.similarity_search_by_vector(embedding_vector, k=k)
        context_passages = [doc.page_content for doc in docs]

        # Create prompt and get answer
        context = "\n\n".join(context_passages)
        prompt = f"""
CONTEXT:
{context}

INSTRUCTIONS:
- You are an HR expert on University of Miami policies and operations.
- Your role is to assist new employees by answering questions and guiding them through the initial setup of benefits, parking, and other general concerns related to their new employment.
- Answer directly first, then provide any extra info if needed.

RULES:
- Use only the provided context above to answer the question.
- If the answer is not in the context, say "I don't know."
- Refuse to answer questions unrelated to the University of Miami.

QUESTION:
{question}
"""
        answer = prompt_function(prompt)

        # Create record
        entry = {
            "question": question,
            "contexts": context_passages,
            "answer": answer,
        }

        # Add ground truth if provided
        if ground_truths and i < len(ground_truths):
            entry["ground_truth"] = ground_truths[i]

        dataset_entries.append(entry)

    return Dataset.from_list(dataset_entries)

# RAGAS dataset
ragas_dataset = create_ragas_dataset(test_questions, ground_truths)

# Preview sample entry
print(ragas_dataset[0])


{'question': 'How do I apply for a UM parking permit?', 'contexts': ['5\n2025 | Faculty and Gables/Marine Staff\nCREATE A CANE ID\nVisit workday.miami.edu to create a Cane ID and password.\nPROTECT YOUR CANE ID\nComplete Multi-Factor Authentication at caneid.miami.edu to ensure additional protection of your new Cane ID.\nLEARN ABOUT THE UNIVERSITY OF MIAMI\nVisit firstdays.miami.edu to learn more about working at the University of Miami.\nBefore Your First Day\nDuring Your First Week\nGET A PARKING PERMIT\nRegister your car and purchase a parking permit.  \nView the menu of options at miami.edu/parking (Coral Gables), earth.miami.edu/about-us/administration/campus-safety/\nparking/index.html (Marine), or med.miami.edu/offices/public-safety/parking-and-transportation (UHealth/Miller School).\n \nGET YOUR BENEFITS\nMake your benefit elections within 15 days of hire at workday.miami.edu. \nKNOW WHAT TO EXPECT\nMeet with your supervisor to familiarize yourself with the workplace and discus

In [25]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

results = evaluate(
    ragas_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall]
)

print(results)


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

{'faithfulness': 0.8571, 'answer_relevancy': 0.8112, 'context_precision': 0.4286, 'context_recall': 0.1429}


# *Streamlit App*

In [32]:
import csv
from datetime import datetime

def log_interaction(query, answer, docs, k=8):
    with open("rag_logs.csv", mode="a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            datetime.now().isoformat(),
            query,
            "\n\n".join(doc.page_content for doc in docs),
            answer,
            k
        ])

def RAG(query: str):
    embedding_vector = embeddings_model.embed_query(query)
    k = 8
    docs = db.similarity_search_by_vector(embedding_vector, k=k)
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = f"""
CONTEXT:
{context}

INSTRUCTIONS:
- You are an HR expert on University of Miami policies and operations.
- Your role is to assist new employees by answering questions and guiding them through the initial setup of benefits, parking, and other general concerns related to their new employment.
- Answer directly first, then provide any extra info if needed.

RULES:
- Use only the provided context above to answer the question.
- If the answer is not in the context, say "I don't know."
- Refuse to answer questions unrelated to the University of Miami.

QUESTION:
{query}
"""
    answer = prompt_function(prompt)
    log_interaction(query, answer, docs, k)
    return answer
