In [1]:
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
import tempfile
import chromadb
from pathlib import Path

load_dotenv()

  from tqdm.autonotebook import tqdm, trange
2026-02-25 10:11:01.255062: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


True

In [2]:
client = chromadb.PersistentClient(path="./chroma_db")
print(f"‚úÖChromaDB client initialized")

collection = client.get_or_create_collection(name="techcop_rag")
print(f"‚úÖCollection '{collection.name}' ready")

model = SentenceTransformer("all-MiniLM-L6-v2")

test_text = "Testing RAG setup"
test_embedding = model.encode(test_text)
print(f"‚úÖ Test embedding created: {len(test_embedding)} dimension")

‚úÖChromaDB client initialized
‚úÖCollection 'techcop_rag' ready




‚úÖ Test embedding created: 384 dimension


In [3]:
def smart_chunk(text, overlap_ratio=0.2):
    """
    Smart paragraph-based chunking with overlap
    """

    paragraphs = text.split("\n\n")

    chunks = []
    for i in range(len(paragraphs)):
        chunk_parts = []
        chunk_parts.append(paragraphs[i])

        if i + 1 < len(paragraphs):
            chunk_parts.append(paragraphs[i+1])

        if i > 0 and overlap_ratio > 0:
            overlap_chars = int(len(paragraphs[i-1]) * overlap_ratio)
            chunk_parts.insert(0,paragraphs[i-1][-overlap_chars:])

        chunk = " ".join(chunk_parts)
        chunks.append(chunk)
    return chunks

doc_dir = Path("techcorp-docs/techcorp-docs")
total_chunks = 0
docs_processed = 0

for category_dir in doc_dir.iterdir():
    if category_dir.is_dir():
        print(f"\n üóÇÔ∏è Processing {category_dir.name}")

        for doc_file in category_dir.glob("*.md"):
            metadata = {
                "source": doc_file.name,
                "section": category_dir.name
            }

            with open(doc_file, "r") as f:
                content = f.read()

            chunks = smart_chunk(content)

            for i, chunk in enumerate(chunks):
                chunk_id = f"{category_dir.name}_{doc_file.stem}_chunk_{i}"
                embedding = model.encode(chunk).tolist()

                collection.add(
                    ids=[chunk_id],
                    embeddings=[embedding],
                    documents=[chunk],
                    metadatas=[metadata]
                )
                total_chunks +=1
            docs_processed +=1
            print(f" {doc_file.name}: {len(chunks)} chunks")

print("-"*50)
print(f"    Document processed: {docs_processed}")
print(f"    Total chunks created: {total_chunks}")
print(f"    Collection size: {collection.count()}")


Insert of existing embedding ID: products_datastream_chunk_0
Add of existing embedding ID: products_datastream_chunk_0
Insert of existing embedding ID: products_datastream_chunk_1
Add of existing embedding ID: products_datastream_chunk_1
Insert of existing embedding ID: products_cloudsync_chunk_0
Add of existing embedding ID: products_cloudsync_chunk_0



 üóÇÔ∏è Processing products
 datastream.md: 2 chunks


Insert of existing embedding ID: products_cloudsync_chunk_1
Add of existing embedding ID: products_cloudsync_chunk_1
Insert of existing embedding ID: policies_remote_work_chunk_0
Add of existing embedding ID: policies_remote_work_chunk_0
Insert of existing embedding ID: policies_remote_work_chunk_1
Add of existing embedding ID: policies_remote_work_chunk_1
Insert of existing embedding ID: policies_dress_code_chunk_0
Add of existing embedding ID: policies_dress_code_chunk_0
Insert of existing embedding ID: policies_dress_code_chunk_1
Add of existing embedding ID: policies_dress_code_chunk_1
Insert of existing embedding ID: support_vpn_troubleshooting_chunk_0
Add of existing embedding ID: support_vpn_troubleshooting_chunk_0


 cloudsync.md: 2 chunks

 üóÇÔ∏è Processing policies
 remote_work.md: 2 chunks
 dress_code.md: 2 chunks

 üóÇÔ∏è Processing support


Insert of existing embedding ID: support_vpn_troubleshooting_chunk_1
Add of existing embedding ID: support_vpn_troubleshooting_chunk_1
Insert of existing embedding ID: support_password_reset_chunk_0
Add of existing embedding ID: support_password_reset_chunk_0
Insert of existing embedding ID: support_password_reset_chunk_1
Add of existing embedding ID: support_password_reset_chunk_1


 vpn_troubleshooting.md: 2 chunks
 password_reset.md: 2 chunks
--------------------------------------------------
    Document processed: 6
    Total chunks created: 12
    Collection size: 12


In [15]:
llm = ChatOpenAI(
        model="gpt-4.1-mini",
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_API_BASE"),
    )
def test_generation():
    temperature = 0.3
    max_tokens = 500

    llm.temperature = temperature
    llm.max_tokens = max_tokens

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role:": "user", "content": "What is RAG in AI? Answer in one sentence"}
    ]

    response = llm.invoke(messages)
    answer = response.content

    print(f"Test response: {answer}")

    return True
    

In [None]:
"""
Task 4: Prompt Engineering
Build the RAG prompt template that ensures accurate, context-based answers
"""
def create_rag_prompt(context_chunks, user_question):
    """Create the RAF prompt with context and question"""
    
    system_prompt = """You are TechCorp's helpful AI assistant.
Answer ONLY based on the provided context.
If the answer is not in the context, say 'I don't have that information in the provided documents'.
Be concise and accurate"""

    context_text = "Context from TechCorp documents:\n\n"
    for i, chunk in enumerate(context_chunks,1):
        context_text += f"[Document {i}]\n{chunk}\n\n"

    user_prompt = f"""
{context_text}

Question: {user_question}
Answer:""" 
    
    return system_prompt, user_prompt

def test_prompt_engineering():
    test_chunks = [
        "TechCorp allows employees to work remotely up to 3 days per week. Core hours are 10 AM to 3 PM.",
        "Remote work arrangements must be approved by your manager and documented with HR.",
        "VPN is mandatory when accessing company resources from home."
    ]

    test_question = "How many days can I work from home?"

    system_prompt = """You are TechCorp's helpful AI assistant.
Answer ONLY based on the provided context.
If the answer is not in the context, say 'I don't have that information in the provided documents'.
Be concise and accuratte"""

    context_text = "Context from TechCorp documents:\n\n"
    for i, chunk in enumerate(test_chunks, 1):
        context_text += f"[Document {i}]\n{chunk}\n\n"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "Use the following context to answer the question."},
        {"role": "assistant", "content": context_text},
        {"role": "user", "content": test_question},
    ]

    response_1 = llm.invoke(messages)
    print(response_1)
    print("\nü§ñ Generated answer:")
    print("-" * 40)
    print(response_1.content)

    return True

success = test_prompt_engineering()


content='You can work from home up to 3 days per week.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 13, 'prompt_tokens': 151, 'total_tokens': 164, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_c5f3afa7ce', 'finish_reason': 'stop', 'logprobs': None} id='run-75c96c2e-8a0f-4f1a-8a01-ce09b3f479d4-0' usage_metadata={'input_tokens': 151, 'output_tokens': 13, 'total_tokens': 164, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}

ü§ñ Generated answer:
----------------------------------------
You can work from home up to 3 days per week.


In [23]:
"""Task 5: Complete RAG pipeline"""
# client = chromadb.PersistentClient(path="./chroma_db")
# collection = client.get_or_create_collection(name="techcop_rag")
# model = SentenceTransformer("all-MiniLM-L6-v2")
def rag_pipeline(user_question):
    """Complete RAG pipeline: Retrieve -> Augment -> Generate"""

    print(f"\n‚ùî Question: {user_question}")
    print("-"*50)

    # STEP 1: RETRIEVE
    print(f"1Ô∏è‚É£ RETRIEVE: Converting to embeddings...")
    query_embeddings = model.encode(user_question).tolist()

    results = collection.query(
        query_embeddings=[query_embeddings],
        n_results=3
    )

    retrieved_chunks = results['documents'][0]
    metadatas = results['metadatas'][0]

    print(f"     Retrieved {len(retrieved_chunks)} relevant chunks")
    for i, meta in enumerate(metadatas):
        print(f"    - {meta['source']} ({meta['section']})")

    # STEP 2: AUGMENT
    print("\n2Ô∏è‚É£ AUGMENT: Building context...")

    system_prompt = """You are TechCorp's helpful AI assistant.
Answer ONLY based on the provided context.
If the answer is not in the context, say 'I don't have that information in the provided documents'.""" 

    context_text_1 = "Context from TechCorp documents:\n\n"
    for i, chunk in enumerate(retrieved_chunks,1):
        context_text_1 += f"[Document {i}]\n{chunk}\n\n"

    user_prompt_1=f"{context_text_1}\nQuestion: {user_question}\n\nAnswer:"

    print(" ‚úÖ Context prepared with retrieved document")

    # STEP 3: GENERATE
    print("n3Ô∏è‚É£ GENERATE: Creating answer...")
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_1}
    ]

    response_1 = llm.invoke(messages)
    answer_1 = response_1.content

    sources = [meta['source'] for meta in metadatas]
    unique_sources = list(set(sources))

    final_response = f"{answer_1}\n\nüìé Sources: {', '.join(unique_sources)}"

    return final_response

In [25]:
def test_rag_pipeline():
    
    test_questions = [
        "Can I bring my dog to the office?",
        "How many vacation days do I get?",
        "What is the remote work policy?"
    ]

    for question in test_questions:
        answer = rag_pipeline(question)
        print("\n" + "=" * 50)
        print("üí¨ANSWER:")
        print(answer)

success_1 = test_rag_pipeline()


‚ùî Question: Can I bring my dog to the office?
--------------------------------------------------
1Ô∏è‚É£ RETRIEVE: Converting to embeddings...
     Retrieved 3 relevant chunks
    - dress_code.md (policies)
    - dress_code.md (policies)
    - remote_work.md (policies)

2Ô∏è‚É£ AUGMENT: Building context...
 ‚úÖ Context prepared with retrieved document
n3Ô∏è‚É£ GENERATE: Creating answer...

üí¨ANSWER:
I don't have that information in the provided documents.

üìé Sources: dress_code.md, remote_work.md

‚ùî Question: How many vacation days do I get?
--------------------------------------------------
1Ô∏è‚É£ RETRIEVE: Converting to embeddings...
     Retrieved 3 relevant chunks
    - remote_work.md (policies)
    - remote_work.md (policies)
    - dress_code.md (policies)

2Ô∏è‚É£ AUGMENT: Building context...
 ‚úÖ Context prepared with retrieved document
n3Ô∏è‚É£ GENERATE: Creating answer...

üí¨ANSWER:
I don't have that information in the provided documents.

üìé Sources: dress_code