In [1]:
## Load environment variables

import os
from dotenv import load_dotenv, find_dotenv, dotenv_values

# Load with explicit path and allow override
dotenv_path = find_dotenv(usecwd=True)
print("dotenv_path:", dotenv_path or "NOT FOUND")
load_dotenv(dotenv_path=dotenv_path, override=True)

# Show what was parsed from the file (safe preview)
parsed = dotenv_values(dotenv_path) if dotenv_path else {}
print("Keys in .env:", sorted(parsed.keys()))
print("Has OPENAI_API_KEY in .env?:", "OPENAI_API_KEY" in parsed)

val = os.getenv("OPENAI_API_KEY")
print("Env OPENAI_API_KEY present?:", val is not None)
print("Value prefix (masked):", (val[:6] + "…") if val else None)

# Current working directory (to catch path mistakes)
print("cwd:", os.getcwd())

dotenv_path: /Users/anupam/Documents/Programming/rag101/.env
Keys in .env: ['HANDBOOK_SOURCE', 'LANGSMITH_API_KEY', 'LANGSMITH_ENDPOINT', 'LANGSMITH_PROJECT', 'LANGSMITH_TRACING', 'OPENAI_API_KEY', 'POSTS_SOURCE']
Has OPENAI_API_KEY in .env?: True
Env OPENAI_API_KEY present?: True
Value prefix (masked): sk-pro…
cwd: /Users/anupam/Documents/Programming/rag101


In [2]:
# Define LLM model

import getpass, os
from langchain.chat_models import init_chat_model

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

llm = init_chat_model("gpt-4o-mini", model_provider="openai", verbose=True)

In [3]:
# Choose embeddings

import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [4]:
# Chose vector store

from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

In [5]:
# Chunk handbook and load into Langchain as documents

from typing import TypedDict, List, Dict
import json, os
from langchain_core.documents import Document

class HandbookEntry(TypedDict):
    url: str
    title: str
    sections: Dict[str, str]

def load_handbook(json_path: str) -> List[HandbookEntry]:
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def create_documents(entries: List[HandbookEntry]) -> List[Document]:
    documents = []
    """
    # chunk each section within article individually
    for entry in entries:
        for section_title, section_text in entry['sections'].items():
            if not section_text:
                continue
            metadata = {
                'url': entry['url'],
                'title': entry['title'],
                'section': section_title,
                }
            documents.append(Document(page_content=section_text, metadata=metadata))
    """
    # chunk each article individually
    for entry in entries:
        metadata = {
                'url': entry['url'],
                'title': entry['title'],
                }
        article_text = "\n\n".join(f"{section}\n\n{text}" for section, text in entry["sections"].items())
        documents.append(Document(page_content=article_text, metadata=metadata))    
    return documents

print("Loading handbook...")
handbook_entries = load_handbook(os.environ.get("HANDBOOK_SOURCE"))
print(f"Loaded {len(handbook_entries)} handbook entries")

# Convert to Langchain documents (one per section)
print("Converting handbook sections to Langchain documents...")
documents = create_documents(handbook_entries)
print(f"Created {len(documents)} documents")

# Index documents
document_ids = vector_store.add_documents(documents=documents)
print("Document Ids:", document_ids[:5])


Loading handbook...
Loaded 17 handbook entries
Converting handbook sections to Langchain documents...
Created 17 documents
Document Ids: ['13edb3bc-6198-4963-a8c8-1289f08d78bb', '566d03c5-cc58-4d45-a588-63cfc3e61986', '044a934f-0a7b-4ff9-a2ba-74f1aca40ef8', '0ff32a91-6311-4821-82f0-595fc159a8fc', '981b1a9b-b790-4000-bac6-a74298fa0bdd']


In [6]:
from langchain_core.prompts import PromptTemplate
from typing_extensions import List, TypedDict
import json

# Define prompt for question-answering
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="""
        Act as a conversational interface for answering questions based on the content of the handbook in your knowledge base.

        When information related to a specific topic does not exist, return no results.
                
        Question: {question} 
        Context: {context} 
        Answer:
        """
)


In [7]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
import json

# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State, min_similarity: float = 0.10 , max_docs: int = 8):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}
    """
    results = vector_store.similarity_search_with_score(state["question"], k=max_docs)
    # Filter by threshold; note: depending on backend, higher score can mean closer or farther.
    # For Chroma + cosine similarity in LC, score is often distance; adjust comparator accordingly.
    relevant = []
    relevant_log = []
    for doc, score in results:
        if score >= min_similarity:
            relevant.append(doc)
            relevant_log.append(f"Doc: {doc.metadata.get('title', 'Unknown')}\nScore: {score}")
    print("\n\n".join(relevant_log))
    return {"context": relevant}
    """

def generate_with_links(state: State):
    if not state["context"]:
        
        return {"answer": "I don't know." + "\n\nNo relevant documents found."}
    
    # Get the base answer
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    base_answer = response.content
    
    # Extract unique links from context
    unique_links = {}
    for doc in state["context"]:
        title = doc.metadata.get('title', 'Unknown')
        url = doc.metadata.get('url', '')
        if url and title not in unique_links:
            unique_links[title] = url
    
    # Format links section
    if unique_links:
        links_section = "\n\nRelevant documents posts:\n"
        for title, link in unique_links.items():
            links_section += f"- [{title}]({link})\n"
        
        final_answer = base_answer + links_section
    else:
        final_answer = base_answer + "\n\nNo relevant documents found."
    
    return {"answer": final_answer}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate_with_links])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [8]:
# Cell 8: Create evaluation dataset
from typing import List, Dict

def create_evaluation_dataset(question: str  ) -> List[Dict]:
    """Create evaluation dataset by running questions through the RAG system"""
    
    evaluation_data = []
    # Get RAG response
    response = graph.invoke({"question": question})
    # Extract retrieved contexts (from the retrieve step)
    retrieved_docs = response.get("context", [])
    retrieved_contexts = [doc.page_content for doc in retrieved_docs] if retrieved_docs else []
    answer = response["answer"]
    evaluation_data.append({
            "user_input": question,
            "retrieved_contexts": retrieved_contexts,
            "response": answer
        })   
    return evaluation_data

In [9]:
# Cell 9: Setup Ragas evaluation
import gc
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    LLMContextRecall, 
    Faithfulness, 
    FactualCorrectness,
    AnswerRelevancy,
    LLMContextPrecisionWithoutReference,
    LLMContextPrecisionWithReference,
    NonLLMContextPrecisionWithReference,
    LLMContextRecall,
    NonLLMContextRecall
    
)
def perform_ragas_evaluation(evaluation_dataset_raw):
    # Convert to Ragas format
    evaluation_dataset = EvaluationDataset.from_list(evaluation_dataset_raw)
    
    # Setup evaluator LLM (using the same LLM for consistency)
    evaluator_llm = LangchainLLMWrapper(llm)
    
    # Choose metrics (start with lighter ones to avoid memory issues)
    metrics = [
        AnswerRelevancy(),      # How relevant is the answer to the question
        Faithfulness(),         # Is the answer faithful to the retrieved context
        LLMContextPrecisionWithoutReference(), 
        # LLMContextPrecisionWithReference(),
        # NonLLMContextPrecisionWithReference(),
        # LLMContextRecall(),
        # NonLLMContextRecall(),
    ]
    
    print("Starting Ragas evaluation...")
    print("This may take a few minutes...")
    
    # Add garbage collection before evaluation
    gc.collect()
    
    # Run evaluation
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=metrics,
        llm=evaluator_llm
    )
    print("Evaluation completed!")
    print(f"Results: {result}")

In [21]:
# Sample question for evaluation
sample_question = "Can I work on another job in parallel?"

print("Creating evaluation dataset...")
evaluation_dataset_raw = create_evaluation_dataset(sample_question)
print('Here is the response: \n\n', evaluation_dataset_raw[0]['response']) 
perform_ragas_evaluation(evaluation_dataset_raw)

Creating evaluation dataset...
Here is the response: 

 You can work on another job in parallel, but there are specific guidelines to follow. Occasional side gigs, speaking engagements, or advisory roles are generally acceptable as long as they don't conflict with your responsibilities at 37signals or require significant time commitments. However, working full-time or part-time for another company in the same industry is not allowed, and any side work should not interfere with your performance or dedication to your role at 37signals. If you're unsure about a specific situation, it's best to reach out to your manager for clarification.

Relevant documents posts:
- [A Note About Moonlighting](https://basecamp.com/handbook/moonlighting)
- [How We Work](https://basecamp.com/handbook/how-we-work)
- [Making a Career](https://basecamp.com/handbook/making-a-career)
- [Getting Started](https://basecamp.com/handbook/getting-started)

Starting Ragas evaluation...
This may take a few minutes...


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation completed!
Results: {'answer_relevancy': 0.0000, 'faithfulness': 0.8750, 'llm_context_precision_without_reference': 1.0000}
