# RAG Chatbot with Local LLM

A Retrieval-Augmented Generation (RAG) chatbot that answers questions using only internal company documents.

**Features:**
- Uses local LLM (Qwen2.5-1.5B-Instruct) - no API keys required
- Multilingual support (English & German)
- Only answers based on provided documents
- Admits when information is not available

**Architecture:**
1. **Document Processing**: Load and chunk markdown documents
2. **Embedding**: Multilingual sentence embeddings for semantic search
3. **Retrieval**: Find relevant document chunks based on query similarity
4. **Generation**: Use LLM to generate answers grounded in retrieved context

## 1. Setup and Imports

In [1]:
import os
from pathlib import Path
import json
import hashlib
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Configuration
DOCS_DIR = Path("data/docs")
EMBEDDING_MODEL = "intfloat/multilingual-e5-small"
LLM_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"  # Smaller, faster, good multilingual
TOP_K = 3  # Number of chunks to retrieve
SIMILARITY_THRESHOLD = 0.3  # Minimum similarity to consider context relevant
BM25_WEIGHT = 0.35  # Weight for keyword score in hybrid retrieval
INDEX_DIR = Path("index")

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"MPS available: {torch.backends.mps.is_available()}")

PyTorch version: 2.9.1
CUDA available: False
MPS available: True


## 2. Document Loading and Chunking

In [6]:
def load_documents(docs_dir: Path) -> list[dict]:
    """Load all markdown documents from directory."""
    documents = []
    for file_path in docs_dir.glob("*.md"):
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
            documents.append({
                "filename": file_path.name,
                "content": content
            })
    return documents

def extract_title(content: str, filename: str) -> str:
    """Extract document title from markdown header or derive from filename."""
    lines = content.strip().split("\n")
    for line in lines:
        if line.startswith("# "):
            return line[2:].strip()
    return filename.replace("_", " ").replace(".md", "").title()

def chunk_document(doc: dict) -> list[dict]:
    """Split document into chunks by sections, including title for better retrieval."""
    content = doc["content"]
    filename = doc["filename"]
    title = extract_title(content, filename)
    
    sections = [s.strip() for s in content.split("\n\n") if s.strip()]
    
    chunks = []
    for section in sections:
        if section.startswith("# "):
            continue  # Skip title line
        if section:
            # Prepend title for better semantic matching
            chunk_text = f"[{title}]\n{section}"
            chunks.append({
                "text": chunk_text,
                "source": filename,
                "title": title
            })
    return chunks


# Load and chunk all documents
documents = load_documents(DOCS_DIR)
print(f"Loaded {len(documents)} documents:")
for doc in documents:
    print(f"  - {doc['filename']}")

# Create chunks
all_chunks = []
for doc in documents:
    chunks = chunk_document(doc)
    all_chunks.extend(chunks)

print(f"\nCreated {len(all_chunks)} chunks total")

Loaded 5 documents:
  - travel_expenses.md
  - it_security.md
  - vacation_policy.md
  - employee_handbook.md
  - remote_work_policy.md

Created 20 chunks total


In [7]:
all_chunks

[{'text': '[Travel & Expense Guidelines]\n**1. Booking Travel**  \nBusiness trips must be booked through the approved travel portal. Economy class is standard for flights under six hours.',
  'source': 'travel_expenses.md',
  'title': 'Travel & Expense Guidelines'},
 {'text': '[Travel & Expense Guidelines]\n**2. Accommodation**  \nHotel expenses are reimbursed up to 120 EUR per night, including taxes.',
  'source': 'travel_expenses.md',
  'title': 'Travel & Expense Guidelines'},
 {'text': '[Travel & Expense Guidelines]\n**3. Meals**  \nPer-diem allowances follow local regulations. Receipts must be uploaded within ten days after the trip.',
  'source': 'travel_expenses.md',
  'title': 'Travel & Expense Guidelines'},
 {'text': '[Travel & Expense Guidelines]\n**4. Reimbursement Process**  \nExpenses are submitted via the HR portal and typically reimbursed within 14 days.',
  'source': 'travel_expenses.md',
  'title': 'Travel & Expense Guidelines'},
 {'text': '[IT Security Guidelines]\n**1

## 3. Embedding Model and Semantic Search

In [8]:
# Load embedding model
print(f"Loading embedding model: {EMBEDDING_MODEL}")
embed_model = SentenceTransformer(EMBEDDING_MODEL)

# Generate embeddings for all chunks
def compute_docs_hash(docs_dir: Path) -> str:
    hasher = hashlib.sha256()
    for file_path in sorted(docs_dir.glob("*.md")):
        hasher.update(file_path.name.encode("utf-8"))
        hasher.update(b"\0")
        hasher.update(file_path.read_bytes())
    return hasher.hexdigest()

def load_cached_index(docs_hash: str):
    INDEX_DIR.mkdir(exist_ok=True)
    meta_path = INDEX_DIR / "index_meta.json"
    embed_path = INDEX_DIR / "embeddings.npy"
    chunks_path = INDEX_DIR / "chunks.json"
    if not (meta_path.exists() and embed_path.exists() and chunks_path.exists()):
        return None
    meta = json.loads(meta_path.read_text())
    if meta.get("docs_hash") != docs_hash:
        return None
    if meta.get("embedding_model") != EMBEDDING_MODEL:
        return None
    cached_chunks = json.loads(chunks_path.read_text())
    cached_embeddings = np.load(embed_path)
    if len(cached_chunks) != len(cached_embeddings):
        return None
    return cached_chunks, cached_embeddings

def save_cached_index(docs_hash: str, chunks: list[dict], embeddings: np.ndarray) -> None:
    INDEX_DIR.mkdir(exist_ok=True)
    meta = {"docs_hash": docs_hash, "embedding_model": EMBEDDING_MODEL}
    (INDEX_DIR / "index_meta.json").write_text(json.dumps(meta, ensure_ascii=True))
    (INDEX_DIR / "chunks.json").write_text(json.dumps(chunks, ensure_ascii=True))
    np.save(INDEX_DIR / "embeddings.npy", embeddings)

print("Generating embeddings for document chunks...")
docs_hash = compute_docs_hash(DOCS_DIR)
cached = load_cached_index(docs_hash)
if cached:
    all_chunks, chunk_embeddings = cached
    print(f"Loaded cached embeddings: {chunk_embeddings.shape}")
else:
    chunk_texts = [chunk["text"] for chunk in all_chunks]
    chunk_embeddings = embed_model.encode([f"passage: {t}" for t in chunk_texts], normalize_embeddings=True)
    save_cached_index(docs_hash, all_chunks, chunk_embeddings)
    print(f"Created embeddings with shape: {chunk_embeddings.shape}")

# Ensure chunk_texts is available for BM25
chunk_texts = [chunk["text"] for chunk in all_chunks]
# Build BM25 index for keyword retrieval
def bm25_tokenize(text: str) -> list[str]:
    return [t for t in "".join(ch if ch.isalnum() or ch.isspace() else " " for ch in text.lower()).split() if t]

bm25_corpus = [bm25_tokenize(t) for t in chunk_texts]
bm25 = BM25Okapi(bm25_corpus)


Loading embedding model: intfloat/multilingual-e5-small
Generating embeddings for document chunks...
Created embeddings with shape: (20, 384)


In [9]:
def expand_query(query: str) -> str:
    """Add lightweight synonym hints for better recall."""
    q = query.strip()
    q_lower = q.lower()
    additions = []
    # English synonyms
    if "work from home" in q_lower or "wfh" in q_lower:
        additions += ["remote work", "home office", "telework"]
    if "remote work" in q_lower:
        additions += ["work from home", "home office"]
    # German synonyms
    if "von zu hause" in q_lower or "zu hause" in q_lower:
        additions += ["remote arbeiten", "homeoffice", "telearbeit"]
    if "homeoffice" in q_lower:
        additions += ["remote arbeiten", "von zu hause arbeiten"]

    if not additions:
        return q
    # Append synonyms to help embedding recall
    return q + " | " + " | ".join(sorted(set(additions)))


def semantic_search(query: str, top_k: int = TOP_K) -> list[dict]:
    """Find most relevant chunks for a query using cosine similarity."""
    # Embed the query
    expanded_query = expand_query(query)
    query_embedding = embed_model.encode([f"query: {expanded_query}"], normalize_embeddings=True)[0]
    
    # Calculate cosine similarities (embeddings are normalized, so dot product = cosine)
    similarities = np.dot(chunk_embeddings, query_embedding)
    
    # Get top-k indices
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    # Return results with scores
    results = []
    for idx in top_indices:
        results.append({
            "text": all_chunks[idx]["text"],
            "source": all_chunks[idx]["source"],
            "score": float(similarities[idx])
        })
    
    return results


# Test semantic search
test_query = "How many vacation days do I get?"
results = semantic_search(test_query)
print(f"Query: {test_query}\n")
for i, r in enumerate(results, 1):
    print(f"{i}. [{r['source']}] (score: {r['score']:.3f})")
    print(f"   {r['text'][:100]}...\n")

Query: How many vacation days do I get?

1. [vacation_policy.md] (score: 0.892)
   [Vacation Policy]
**1. Annual Leave Entitlement**  
Employees receive 25 days of paid vacation per y...

2. [vacation_policy.md] (score: 0.866)
   [Vacation Policy]
**3. Carryover Rules**  
Up to 5 unused days may be carried over into the next cal...

3. [vacation_policy.md] (score: 0.861)
   [Vacation Policy]
**2. Request Procedure**  
Vacation requests must be submitted via the HR portal a...



## 4. Load Local LLM

In [10]:
# Determine best available device
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")
print(f"Loading LLM: {LLM_MODEL}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    dtype=torch.float16 if device != "cpu" else torch.float32,
    device_map="auto"
)

print("Model loaded successfully!")

Using device: mps
Loading LLM: Qwen/Qwen2.5-1.5B-Instruct
Model loaded successfully!


## 5. RAG Pipeline

In [11]:
SYSTEM_PROMPT = """You are a helpful company assistant that answers questions based ONLY on the provided context.

STRICT RULES:
1. ONLY use information from the provided context to answer questions
2. If the context does not contain the answer, say: "I don't have information about that in our company documents."
3. Never make up or infer information not explicitly stated in the context
4. Respond in the same language as the user's question (German or English)
5. Keep answers concise but complete
6. If asked about topics outside the context (weather, general knowledge, etc.), politely explain you can only answer questions about company policies"""


def build_context(search_results: list[dict], threshold: float = SIMILARITY_THRESHOLD) -> tuple[str, bool]:
    """Build context string from search results. Returns (context, has_relevant_info)."""
    # Filter by similarity threshold
    relevant_results = [r for r in search_results if r["score"] >= threshold]
    
    if not relevant_results:
        return "", False
    
    context_parts = []
    for r in relevant_results:
        context_parts.append(f"[From {r['source']}]:\n{r['text']}")
    
    return "\n\n".join(context_parts), True


def generate_response(query: str, context: str, has_context: bool | None = None) -> str:
    """Generate response using the LLM with proper chat template."""
    
    if has_context:
        user_message = f"""Context from company documents:
---
{context}
---

Question: {query}

Answer based ONLY on the context above:"""
    else:
        user_message = f"""Question: {query}

Note: No relevant information was found in the company documents for this question."""
    
    # Build messages for chat template
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_message}
    ]
    
    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Tokenize
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode only the new tokens
    response = tokenizer.decode(
        outputs[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True
    )
    
    return response.strip()


def ask(query: str) -> str:
    """Main RAG function: retrieve context and generate answer."""
    # Step 1: Retrieve relevant documents
    search_results = semantic_search(query, top_k=TOP_K)
    
    # Step 2: Build context with relevance check
    context, has_relevant_context = build_context(search_results)
    
    # Step 3: Generate response
    response = generate_response(query, context, has_relevant_context)
    
    return response

## 6. Test the Chatbot

In [15]:
# Test with English questions
test_questions_en = [
    "How many vacation days do employees get per year?",
    "What are the password requirements?",
    "Can I work from home?",
    "Do I need my own laptop?",
    "What is the weather like today?",  # Should say "I don't know"
    "What is the capital of France?"  # Should say "I don't know"
]

print("=" * 60)
print("ENGLISH QUERIES")
print("=" * 60)

for q in test_questions_en:
    print(f"\nQ: {q}")
    answer = ask(q)
    print(f"A: {answer}")
    print("-" * 40)

ENGLISH QUERIES

Q: How many vacation days do employees get per year?
A: Employees receive 25 days of paid vacation per year.
----------------------------------------

Q: What are the password requirements?
A: According to the IT Security Guidelines, passwords must contain at least 12 characters and need to be changed every 180 days. Multi-factor authentication is also mandatory for accessing systems.
----------------------------------------

Q: Can I work from home?
A: Yes, you can work from home if it meets the following conditions:

- You are eligible under the Remote Work Policy with your manager's approval.
- Your request should be submitted at least one month before the intended start date.
- The period of remote work cannot exceed 20 days per year.
- During working hours, you must remain reachable via Slack and email.
----------------------------------------

Q: Do I need my own laptop?
A: Yes, Schaefer Corporation provides a laptop to employees, so you do need your own laptop.


In [14]:
# Test with German questions
test_questions_de = [
    "Wie viele Urlaubstage habe ich pro Jahr?",
    "Welche Passwortanforderungen gibt es?",
    "Kann ich von zu Hause aus arbeiten?",
    "Wie ist das Wetter heute?",  # Should say "I don't know"
    "Was ist die Hauptstadt von Frankreich?",  # Should say "I don't know"
]

print("=" * 60)
print("GERMAN QUERIES")
print("=" * 60)

for q in test_questions_de:
    print(f"\nQ: {q}")
    answer = ask(q)
    print(f"A: {answer}")
    print("-" * 40)

GERMAN QUERIES

Q: Wie viele Urlaubstage habe ich pro Jahr?
A: Ich habe pro Jahr 25 Tage freien Urlaub.
----------------------------------------

Q: Welche Passwortanforderungen gibt es?
A: Passwörter müssen mindestens 12 Zeichen lang sein und jedes Jahr umgelegt werden. Mehrfachauthentifizierung ist erforderlich.
----------------------------------------

Q: Kann ich von zu Hause aus arbeiten?
A: Ja, Sie können von zu Hause aus arbeiten, aber nur für bis zu 20 Tage pro Jahr. Alle Datenschutzregeln bleiben vollständig anwendbar.
----------------------------------------

Q: Wie ist das Wetter heute?
A: Ich kann Ihnen leider nicht sagen, wie es heute im Wetter ist, da ich keine Informationen darüber habe.
----------------------------------------

Q: Was ist die Hauptstadt von Frankreich?
A: Ich kann Ihnen leider keine Informationen über die Hauptstadt Frankreichs geben, da sie nicht im angegebenen Kontext erwähnt wird.
----------------------------------------


## 8. Debug View (Optional)

Use this to see the retrieval results and understand how the system works.

In [24]:
def ask_debug(query: str) -> dict:
    """RAG with debug info showing retrieval results."""
    # Step 1: Retrieve
    search_results = semantic_search(query, top_k=TOP_K)
    
    # Step 2: Build context
    context, has_relevant_context = build_context(search_results)
    
    # Step 3: Generate
    response = generate_response(query, context, has_relevant_context)
    
    return {
        "query": query,
        "retrieved_chunks": search_results,
        "has_relevant_context": has_relevant_context,
        "response": response
    }

# Example debug view
debug_result = ask_debug("What is the hotel limit for business travel?")

print(f"Query: {debug_result['query']}")
print(f"\nRetrieved {len(debug_result['retrieved_chunks'])} chunks:")
for i, chunk in enumerate(debug_result['retrieved_chunks'], 1):
    print(f"  {i}. [{chunk['source']}] score={chunk['score']:.3f}")
print(f"\nHas relevant context: {debug_result['has_relevant_context']}")
print(f"\nResponse: {debug_result['response']}")

Query: What is the hotel limit for business travel?

Retrieved 3 chunks:
  1. [travel_expenses.md] score=0.401
  2. [travel_expenses.md] score=0.262
  3. [travel_expenses.md] score=0.216

Has relevant context: True

Response: I don't have information about that in our company documents.
