<a href="https://colab.research.google.com/github/ahmedellaboudy/HRPolicesRAGpipeline/blob/main/RAGProjectHRPolicies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# Versions are pinned to the ones this notebook was last run with (see the install
# log below). langchain is held below 1.0 because the import cell relies on the
# pre-1.0 module layout (langchain.chains, langchain.llms, langchain.schema, ...).
%pip install -q "langchain<1.0" langchain-community==0.3.29 pymupdf4llm==0.0.27 chromadb==1.0.20 sentence-transformers


Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.27-py3-none-any.whl.metadata (4.8 kB)
Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pymupdf>=1.26.3 (from pymupdf4llm)
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Co

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q transformers torch accelerate



In [None]:
# Colab-only step: opens a file picker and saves the chosen file(s) next to the notebook.
# main() later opens "HR_policies 1.pdf" by name, so the uploaded PDF must be saved
# under exactly that name (the recorded output below shows it was).
from google.colab import files
uploaded = files.upload()

Saving HR_policies 1.pdf to HR_policies 1.pdf


In [None]:
# CELL 1: Import All Required Modules
import os
import re
import torch
import warnings
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# LangChain imports
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

# Transformers imports
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    StoppingCriteria,
    StoppingCriteriaList,
    set_seed
)

# PyMuPDF import
import pymupdf4llm

# Hide warnings whose message mentions "generation flags".
warnings.filterwarnings("ignore", message=".*generation flags.*")
# Fix the seed at 42 — presumably so the sampled generations (do_sample=True in
# setup_model_and_chains) are repeatable between runs.
set_seed(42)

In [None]:
# CELL 2: Extract PDF to Markdown
def extract_pdf_to_markdown(pdf_path):
    """Convert the PDF at ``pdf_path`` to Markdown text with pymupdf4llm.

    Prints a confirmation line and the length of the extracted text, then
    returns that text unchanged.
    """
    markdown_text = pymupdf4llm.to_markdown(pdf_path)
    extracted_length = len(markdown_text)
    print("PDF extracted to markdown successfully!")
    print(f"Total markdown length: {extracted_length} characters")
    return markdown_text


In [None]:
# CELL 3: Split by Headers
def split_by_headers(md_docs):
    """Cut the markdown text into sections at ``#`` and ``##`` headers.

    The header text of each section is stored in its metadata under
    "Main Topic" (for ``#``) and "Sub-topic" (for ``##``). Prints the number
    of sections found and returns them.
    """
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Main Topic"), ("##", "Sub-topic")]
    )
    sections = header_splitter.split_text(md_docs)

    print(f"Found {len(sections)} header-based sections")
    return sections


In [None]:
# CELL 4: Create Larger Chunks
def create_chunks(docs, chunk_size=2000, chunk_overlap=200):
    """Split every header section into character-bounded chunks.

    Each chunk becomes a new Document that inherits the section's metadata and
    adds two keys: ``chunk_id`` (1-based position of the chunk inside its own
    section) and ``chunk_size`` (length of the chunk text in characters).
    Prints the total number of chunks and returns them as a list.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    final_chunks = [
        Document(
            page_content=piece,
            # Merge into a fresh dict so the section's own metadata is never mutated.
            metadata={**section.metadata, 'chunk_id': position, 'chunk_size': len(piece)},
        )
        for section in docs
        for position, piece in enumerate(splitter.split_text(section.page_content), start=1)
    ]

    print(f"Created {len(final_chunks)} chunks with {chunk_size} character size")
    return final_chunks


In [None]:
# CELL 5: Clean Text Functions
def clean_metadata_value(value):
    """Strip markdown decoration and punctuation from a metadata string.

    Non-string values are returned untouched. For strings the steps are, in
    order: trim, drop ``#`` header markers, drop ``**`` / ``__`` / ``*``
    emphasis markers, drop every character that is not a word character,
    whitespace or in the Arabic block, collapse whitespace runs to one space,
    and trim again.
    """
    if isinstance(value, str):
        cleaned = value.strip()
        substitutions = (
            (r'#+\s*', ''),                 # header markers
            (r'(\*\*|__|\*)', ''),          # bold / italic markers
            (r'[^\w\s\u0600-\u06FF]', ''),  # remaining punctuation
            (r'\s+', ' '),                  # whitespace runs
        )
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return value

def clean_text(final_chunks):
    """Return new Documents with markdown markers removed from text and metadata.

    Text: ``#`` header markers and ``**`` / ``__`` / ``*`` emphasis markers are
    removed everywhere, then a leading ``-``, ``*`` or ``+`` bullet (plus the
    whitespace after it) is removed from the start of every line.
    Metadata: every value is passed through ``clean_metadata_value``.
    The input Documents are not modified.
    """
    def _strip_markdown(text):
        text = re.sub(r'#+\s*', '', text)
        text = re.sub(r'(\*\*|__|\*)', '', text)
        return re.sub(r'^[\-\*\+]\s*', '', text, flags=re.MULTILINE)

    return [
        Document(
            page_content=_strip_markdown(chunk.page_content),
            metadata={key: clean_metadata_value(val) for key, val in chunk.metadata.items()},
        )
        for chunk in final_chunks
    ]


In [None]:
# CELL 6: Arabic Normalization
def normalize_arabic(text):
    """Fold common Arabic spelling variants onto a single letter each.

    Alef variants become bare alef, alef maqsura and hamza-on-yeh become yeh,
    hamza-on-waw becomes waw, and teh marbuta becomes heh. Every other
    character (including Latin text) is left as is.
    """
    letter_folds = (
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
        (r'ة', 'ه'),
    )
    for variant_pattern, canonical in letter_folds:
        text = re.sub(variant_pattern, canonical, text)
    return text


In [None]:
# CELL 7: Topic Questions Dictionary
# Seed questions for every HR topic, in English and Arabic.
# Keys are topic labels; values are example questions about that topic.
# prepare_topic_embeddings() embeds every question, and
# get_most_relevant_topic_names() returns the key whose questions best match a
# user query. smart_search() then embeds that key itself and compares it with each
# retrieved chunk's "Sub-topic" header, so keys presumably should read like the
# section headings of the policy document — verify against the PDF when adding topics.
topic_questions = {
    "dress code": [
        "What is the company dress code?",
        "Do I need to wear formal attire?",
        "Is casual clothing allowed?",
        "ما هو الزي الرسمي المطلوب في الشركة؟",
        "هل يسمح بالملابس الكاجوال؟",
        "ما هي قواعد المظهر العام؟"
    ],
    "working office hours": [
        "What are the official working hours?",
        "what is office hours?",
        "Can I come late or leave early?",
        "Is there a flexible schedule?",
        "ما هي ساعات العمل الرسمية؟",
        "هل يمكنني الحضور متأخراً أو المغادرة مبكراً؟",
        "هل يوجد جدول عمل مرن؟"
    ],
    "leaves and holidays": [
        "How many vacation days do I have?",
        "What is the leave policy?",
        "Can I take a holiday anytime?",
        "كم عدد أيام الإجازة المتاحة لي؟",
        "ما هي سياسة الإجازات؟",
        "هل يمكنني أخذ عطلة في أي وقت؟"
    ],
    "probation period": [
        "What is the probation period?",
        "How long is the trial period for new employees?",
        "ما هي فترة التجربة للموظفين الجدد؟",
        "كم تستمر فترة العقد التجريبي؟"
    ],
    "employees termination": [
        "What is the termination policy?",
        "How do I submit a resignation?",
        "What is notice period?",
        "ما هي سياسة إنهاء الخدمة؟",
        "كيف أقدم استقالتي؟",
    ],
    "staff payroll": [
        "When will I receive my salary?",
        "How does payroll work?",
        "متى يتم صرف الراتب؟",
        "كيف يتم احتساب الرواتب؟",
    ],
    "HR Initiatives / Employee Engagement and Development Activities": [
        "Are there any team-building programs?",
        "what about employee welfare and activities?",
        "هل يوجد أنشطة لزيادة تفاعل الموظفين؟",
    ],
    "mobile phone": [
        "Can I use my mobile phone at work?",
        "Is social media allowed during office hours?",
        "هل يمكنني استخدام الهاتف المحمول في العمل؟",
        "هل يسمح باستخدام وسائل التواصل الاجتماعي أثناء الدوام؟",
    ],
    "absence": [
        "What should I do if I cannot come to the office?",
        "Who should I notify if I am absent from work?",
        "What happens if I am absent for two consecutive days without notice?",
        "ما هي عواقب الغياب بدون إذن؟",
        "هل يتم معاقبة الغياب غير المصرح به؟"
    ],
    "staff movement": [
        "Can I request a transfer to another department?",
        "What is the relocation policy?",
        "هل يمكنني طلب نقل إلى قسم آخر؟",
        "ما هي سياسة النقل أو الانتداب؟"
    ],
    "staff orientation": [
        "What is included in the orientation program?",
        "Are company policies explained in orientation?",
        "ما هو برنامج التعريف بالموظفين الجدد؟",
        "هل يتم شرح السياسات خلال فترة التعريف؟"
    ],
    "Interim positions and promotions": [
        "How can I get a promotion?",
        "What is the policy for reclassification?",
        "كيف أحصل على ترقية؟",
        "ما هي سياسة إعادة التصنيف؟"
    ],
    "تقييم الاداء": [
        "How is performance evaluated?",
        "When will I get my performance assessment?",
        "ما هي طريقة تقييم الأداء؟",
        "متى يتم عمل تقييم للموظف؟"
    ],
    "التدريب والتطوير": [
        "Are there training programs for employees?",
        "Can I attend workshops for development?",
        "هل يوجد برامج تدريبية للموظفين؟",
        "هل يمكنني حضور ورش عمل للتطوير؟"
    ],
    "مدونه السلوك": [
        "What is the code of conduct?",
        "How is workplace harassment handled?",
        "ما هو السلوك المطلوب في مكان العمل؟",
        "كيف يتم التعامل مع حالات التحرش؟"
    ],
    "الصحة والسلامة": [
        "What are the health and safety rules?",
        "What should I do in case of an emergency?",
        "ما هي قواعد الصحة والسلامة؟",
        "ماذا أفعل في حالة الطوارئ؟"
    ],
    "العمل عن بعد": [
        "Is remote work allowed?",
        "Can I work from home?",
        "هل يسمح بالعمل عن بعد؟",
        "هل يمكنني العمل من المنزل؟"
    ],
    "تسويه الشكاوي": [
        "How can I file a complaint?",
        "What is the grievance process?",
        "كيف أقدم شكوى؟",
        "ما هي إجراءات النظر في التظلمات؟"
    ],
    "personnel files and documentations": [
        "What documents are kept in personnel files?",
        "Are personnel files confidential?",
        "ما هي المستندات الموجودة في ملفات الموظفين؟",
        "هل تعتبر ملفات الموظفين سرية؟"
    ],
    "job description": [
        "Where can I find my job description?",
        "What are my roles and responsibilities?",
        "أين أجد الوصف الوظيفي الخاص بي؟",
        "ما هي مهامي ومسؤولياتي؟"
    ],
    "suggestions": [
        "How can I submit a suggestion?",
        "Does the company accept employee ideas?",
        "كيف يمكنني تقديم اقتراح؟",
        "هل تقبل الشركة أفكار الموظفين؟"
    ],
    "الانفصال": [
        "What is the exit process?",
        "Will there be an exit interview?",
        "ما هي إجراءات الخروج من الشركة؟",
        "هل يوجد مقابلة خروج عند الاستقالة؟"
    ],
    "tax deduction": [
        "What is the professional tax deduction?",
        "How much tax is deducted from my salary?",
        "ما هو الخصم الضريبي من المرتب؟",
        "كم قيمة الضريبة التي تخصم من راتبي؟",
    ],
    "late_coming": [
        "What is considered late coming at work?",
        "What should I do if I know I will be late?",
        "هل يعتبر التأخير عن مواعيد العمل مخالفة؟",
        "كم دقيقة تأخير مسموحة قبل اعتباره تأخير رسمي؟",
    ],
    "office_rooms": [
        "what should i do after leaving office room",
        "كيف يمكنني حجز غرفة مكتب للاجتماعات؟",
        "هل توجد غرف مشتركة للموظفين؟",
    ],
    "traveling_allowance": [
        "What is the traveling allowance policy?",
        "what should i mention in the allowence form?",
        "ما هي سياسة بدل السفر؟",
    ]
}

In [None]:
# CELL 8: Setup Vector Store and Smart Search
def setup_vectorstore(final_cleaned_chunks):
    """Embed the cleaned chunks and index them in a Chroma collection.

    Args:
        final_cleaned_chunks: list of Documents to index.

    Returns:
        ``(vectorstore, embedding_model)`` — the Chroma store and the
        HuggingFaceEmbeddings instance used to build it, so callers can embed
        queries with the same model.
    """
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    vectorstore = Chroma.from_documents(
        final_cleaned_chunks,
        embedding_model,
        persist_directory=None,
        # Stable ids ("0", "1", ...) so re-running this cell addresses the same
        # entries instead of adding a second copy of every chunk.
        ids=[str(i) for i in range(len(final_cleaned_chunks))]
    )

    # No vectorstore.persist() here: no persist_directory is given, and the method is
    # deprecated in langchain-community (the previous run logged that warning).
    print("Chroma database with 2000-char chunks stored successfully!")

    return vectorstore, embedding_model

def prepare_topic_embeddings(topic_questions, embedding_model):
    """Embed every seed question, grouped by topic.

    Each question is lower-cased, stripped of leading/trailing ``?``, ``!`` and
    ``.``, passed through ``normalize_arabic`` and embedded with
    ``embedding_model.embed_query``.

    Returns a dict mapping each topic to the list of its question embeddings,
    in the same order as the input questions.
    """
    # NOTE(review): the Arabic question mark "؟" is not in the strip set, so Arabic
    # questions keep it. Queries are prepared the same way elsewhere — change both together.
    embedded_by_topic = {}
    for topic, questions in topic_questions.items():
        vectors = []
        for question in questions:
            prepared = normalize_arabic(question.lower().strip("?!."))
            vectors.append(embedding_model.embed_query(prepared))
        embedded_by_topic[topic] = vectors
    return embedded_by_topic

def get_most_relevant_topic_names(query, topic_questions_embedded, embedding_model):
    """Return the topic whose seed questions are closest to ``query``.

    The query is prepared like the seed questions (lower-cased, ``?!.``
    stripped, Arabic-normalized) and embedded. Each topic is scored by the
    highest cosine similarity, rounded to two decimals, between the query and
    any of its question embeddings. On equal scores the topic that comes first
    in ``topic_questions_embedded`` wins. Topics with no questions are ignored.

    Raises:
        ValueError: if no topic has any question embedding.
    """
    query_vector = embedding_model.embed_query(
        normalize_arabic(query.lower().strip("?!."))
    )

    best_per_topic = {}
    for topic, question_vectors in topic_questions_embedded.items():
        for vector in question_vectors:
            similarity = round(cosine_similarity([query_vector], [vector])[0][0], 2)
            # -1.0 is the floor for a topic that has not been scored yet.
            best_per_topic[topic] = max(best_per_topic.get(topic, -1.0), similarity)

    return max(best_per_topic, key=best_per_topic.get)

def smart_search(query, vectorstore, topic_questions_embedded, embedding_model, k=5, debug=False):
    """Vector search filtered by the topic the query most likely belongs to.

    Steps:
      1. Fetch the ``k`` nearest chunks for the normalized query together with
         the scores reported by ``similarity_search_with_score``.
      2. Pick the best-matching topic label via ``get_most_relevant_topic_names``.
      3. Keep a chunk only if it has a "Sub-topic" header whose embedding has a
         cosine similarity above 0.4 (after rounding to two decimals) with the
         embedding of that topic label.
      4. If nothing survives the filter, fall back to the first two raw results.

    Returns a list of ``(document, score)`` tuples. With ``debug=True`` the
    intermediate scores and topics are printed.
    """
    def embed_label(label):
        # Same preparation as queries and seed questions.
        return embedding_model.embed_query(normalize_arabic(label.lower().strip("?!.")))

    results = vectorstore.similarity_search_with_score(
        normalize_arabic(query.lower().strip("?!.")), k=k
    )
    best_topic = get_most_relevant_topic_names(query, topic_questions_embedded, embedding_model)

    if debug:
        print(f'Most related topic: {best_topic}')
        print('++++++++++++++++++++++++++++++++++++++++++++++++')

    best_topic_vector = embed_label(best_topic)

    related_chunks = []
    for doc, score in results:
        doc_topic = doc.metadata.get('Sub-topic', '')
        if debug:
            print(f'Document score: {score}')
            print(f'Document topic: {doc_topic}')

        on_topic = False
        if doc_topic:  # chunks without a sub-topic header can never pass the filter
            topic_similarity = round(
                cosine_similarity([embed_label(doc_topic)], [best_topic_vector])[0][0], 2
            )
            if debug:
                print(f'Cosine similarity with topic: {topic_similarity}')
            on_topic = topic_similarity > 0.4

        if on_topic:
            related_chunks.append((doc, score))

        if debug:
            print('----------------------------------------------')

    # Nothing matched the topic: hand back the top raw hits instead of an empty list.
    if not related_chunks:
        related_chunks = results[:2]

    return related_chunks


In [None]:
# CELL 9: Model Setup and LLM Pipeline
class StopOnTokens(StoppingCriteria):
    """Stop generation once the output ends with one of the given stop sequences.

    Every stop sequence is tokenized once at construction. During generation
    the tail of the first sequence in the batch is compared with each tokenized
    stop sequence; only batch element 0 is inspected.
    """
    def __init__(self, stop_sequences, tokenizer):
        """Tokenize ``stop_sequences`` with ``tokenizer`` (no special tokens added)."""
        self.tokenizer = tokenizer
        # Preferred device for the stop tensors; __call__ still moves them if the
        # model ends up generating on a different device.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.stop_token_ids = []
        for seq in stop_sequences:
            token_ids = self.tokenizer.encode(seq, add_special_tokens=False)
            if not token_ids:
                # An empty encoding would turn the tail slice below into the whole
                # sequence and make the comparison meaningless.
                continue
            self.stop_token_ids.append(torch.tensor(token_ids, device=device))

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the tokens generated so far end with a stop sequence."""
        sequence = input_ids[0]
        for stop_ids in self.stop_token_ids:
            stop_len = len(stop_ids)
            if len(sequence) < stop_len:
                continue
            # .to() is a no-op when both tensors already share a device; otherwise it
            # avoids a cross-device comparison error.
            if torch.all(sequence[-stop_len:] == stop_ids.to(sequence.device)):
                return True
        return False

def setup_model_and_chains():
    """Load the Qwen instruct model and build one LLMChain per language.

    The model is loaded in bfloat16 on GPU or float32 on CPU; if that load
    raises, it is retried in float16. A sampling text-generation pipeline
    (150 new tokens max, temperature 0.1, repetition penalty 1.2) is wrapped
    for LangChain and combined with an English and an Arabic prompt template.
    Both templates take ``context`` and ``input``.

    Returns:
        ``(llm_chain_en, llm_chain_ar)``.
    """
    print("Loading the model and tokenizer...")
    model_name = "Qwen/Qwen2.5-3B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load model based on available hardware
    try:
        if torch.cuda.is_available():
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16
            ).to("cuda")
            print("Model loaded with bfloat16 on GPU.")
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32
            )
            print("Model loaded with float32 on CPU.")
    except Exception as e:
        # Reached for any failure above (GPU or CPU path), not only bfloat16 problems.
        print(f"BF16 load failed. Falling back to float16. Error: {e}")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        print("Model loaded with float16.")

    # Define multilingual stop words
    stop_words = ["User:", "\nUser:", "المستخدم:", "\nالمستخدم:", "Human:", "\nHuman:", "Question:", "\nQuestion:"]
    stopping_criteria = StoppingCriteriaList([StopOnTokens(stop_words, tokenizer)])

    # Create the pipeline.
    # The stopping criteria are passed here, as a generation argument, because
    # HuggingFacePipeline's `model_kwargs` is not forwarded to generation calls —
    # passing them there left the stop sequences without any effect.
    # NOTE(review): generation halts right after a stop phrase is produced, so that
    # phrase can remain at the end of the returned text.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        return_full_text=False,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=stopping_criteria
    )

    # Wrap the pipeline in LangChain LLM
    llm = HuggingFacePipeline(pipeline=pipe)

    # Create prompt templates
    prompt_template_en = PromptTemplate(
        input_variables=["context", "input"],
        template="""Answer the question directly using only the provided context. Be concise and specific.
If the answer is not in the context, say: "I don't know based on the policy."

Context: {context}

Question: {input}

Answer:"""
    )

    prompt_template_ar = PromptTemplate(
        input_variables=["context", "input"],
        template="""أجب عن السؤال باختصار باستخدام المعلومات التالية فقط. إذا لم تجد الإجابة في السياق، قل: "لا أعلم بناءً على السياسة."

السياق: {context}

السؤال: {input}

الجواب:"""
    )

    # Create LLM chains
    llm_chain_en = LLMChain(llm=llm, prompt=prompt_template_en, verbose=False)
    llm_chain_ar = LLMChain(llm=llm, prompt=prompt_template_ar, verbose=False)

    print("Model and chains ready!")

    return llm_chain_en, llm_chain_ar


In [None]:
# CELL 10: Helper Functions
def is_arabic(text: str) -> bool:
    """Return True if ``text`` contains at least one character in U+0600–U+06FF."""
    for character in text:
        if '\u0600' <= character <= '\u06FF':
            return True
    return False

def is_vague(query: str) -> bool:
    """Decide whether a query is too generic to answer from the policy.

    A query is vague when, after lower-casing and trimming punctuation from
    each word, it has at most one word, or every word is a generic term such
    as "policy" or "leave".

    Earlier every query of two words or fewer was rejected, which made the
    generic-term list unreachable and refused the very examples the assistant
    suggests to users ("working hours", "annual leave", "ساعات العمل").
    """
    vague_terms = {"policy", "leave", "hours", "benefits", "ساعات", "اجازة", "سياسة", "work"}

    # Trim sentence punctuation (Latin and Arabic) so "policy?" still counts as "policy".
    words = [word.strip("?!.,؟،") for word in query.strip().lower().split()]
    words = [word for word in words if word]

    if len(words) <= 1:
        return True
    return all(word in vague_terms for word in words)

def retrieve_context_with_smart_search(query: str, vectorstore, topic_questions_embedded, embedding_model, k: int = 2, max_chars: int = 4000) -> str:
    """Build the prompt context for ``query`` from the retrieved chunks.

    Chunks come from ``smart_search``; if that raises, the error is printed and
    a plain ``vectorstore.similarity_search`` on the raw query is used instead.
    Chunk texts are joined with blank lines. A result longer than ``max_chars``
    is cut at ``max_chars`` and then trimmed back to the last line break inside
    that window (if there is one).

    Returns the context string, or "" when nothing was retrieved.
    """
    def _assemble(documents):
        if not documents:
            return ""
        joined = "\n\n".join(document.page_content for document in documents)
        if len(joined) <= max_chars:
            return joined
        return joined[:max_chars].rsplit('\n', 1)[0]

    try:
        scored_hits = smart_search(query, vectorstore, topic_questions_embedded, embedding_model, k=k)
        if not scored_hits:
            return ""
        # Each hit is a (document, score) pair; only the document is needed here.
        return _assemble([hit[0] for hit in scored_hits])
    except Exception as e:
        print(f"Error in smart search: {e}")
        # Fallback to regular similarity search
        return _assemble(vectorstore.similarity_search(query, k=k))

def clean_response(response: str, is_arabic_query: bool = False) -> str:
    """Reduce raw model output to at most two clean sentences.

    Processing:
      * The text is split into sentences at periods. A period between two
        digits ("9.30 am", "1.5 days") is not treated as a sentence end, so
        times and decimals stay intact.
      * Sentences containing a known boilerplate phrase are dropped.
      * For Arabic queries only whitespace, characters in U+0600–U+06FF, ASCII
        digits and the punctuation ``.:،؛؟!-()/`` are kept in each sentence.
      * The first two remaining sentences are joined with ". " and closed
        with a period.

    If every sentence is filtered out, the unfiltered text is used instead:
    returned as is when it has at most two sentences, otherwise cut to its
    first two.

    Args:
        response: raw text produced by the model.
        is_arabic_query: apply the Arabic character filter when True.

    Returns:
        The cleaned text; "" for empty input.
    """
    response = response.strip()

    # Remove common unwanted phrases
    unwanted_phrases = [
        "Based on the context", "If you have any more questions", "please let me know",
        "The answer should be", "There is no need to repeat", "However, since",
        "إذا كنت تريد", "يمكنك كتابة المزيد", "كما هو موضح أدناه",
        "إذا كنت تريد توضيح", "في حالة عدم وجود"
    ]

    # Split at a period unless it has a digit on both sides.
    sentences = [
        piece.strip()
        for piece in re.split(r'(?<!\d)\.|\.(?!\d)', response)
        if piece.strip()
    ]

    clean_sentences = []
    for sentence in sentences:
        if any(phrase in sentence for phrase in unwanted_phrases):
            continue
        if is_arabic_query:
            cleaned_sentence = ''.join(
                char for char in sentence
                if char.isspace()
                or '\u0600' <= char <= '\u06FF'   # Arabic
                or '\u0030' <= char <= '\u0039'   # Numbers
                or char in '.:،؛؟!-()/'
            ).strip()
            if cleaned_sentence:
                clean_sentences.append(cleaned_sentence)
        else:
            clean_sentences.append(sentence)

    if not clean_sentences:
        # Filtering removed everything: prefer the unfiltered answer over no answer.
        if len(sentences) <= 2:
            return response
        clean_sentences = sentences

    # Pieces are already stripped, so joining cannot produce double spaces.
    response = '. '.join(clean_sentences[:2])
    if not response.endswith('.'):
        response += '.'

    return response

def debug_print_retrieval(query, vectorstore, topic_questions_embedded, embedding_model, k=5):
    """Run ``smart_search`` in debug mode and print every chunk it returns.

    For each hit the score (six decimals), the metadata and the first 300
    characters of the text are printed; longer texts are marked with "...".
    """
    hits = smart_search(query, vectorstore, topic_questions_embedded, embedding_model, k=k, debug=True)
    print("---- Retrieved chunks (debug) ----")
    for rank, (doc, score) in enumerate(hits, start=1):
        preview = doc.page_content[:300]
        if len(doc.page_content) > 300:
            preview += "..."
        print(f"[{rank}] Score: {score:.6f}")
        print(f"Metadata: {doc.metadata}")
        print(preview)
        print("-" * 60)


In [None]:
# CELL 11: Main Execution Pipeline
def main(pdf_path="HR_policies 1.pdf"):
    """Build the retrieval pipeline for one PDF and run the interactive chat.

    Pipeline: PDF -> markdown -> header sections -> chunks -> cleaned chunks ->
    vector store, plus topic embeddings and the English/Arabic LLM chains.

    Chat loop commands:
      * ``exit`` / ``quit`` / ``خروج`` — end the session.
      * ``debug <query>`` — print retrieval details instead of answering.
    An interrupt or end-of-input at the prompt also ends the session.

    Args:
        pdf_path: path of the policy PDF to index. Defaults to the file name
            used when the notebook was written.
    """

    md_docs = extract_pdf_to_markdown(pdf_path)

    docs = split_by_headers(md_docs)

    final_chunks = create_chunks(docs)

    final_cleaned_chunks = clean_text(final_chunks)

    vectorstore, embedding_model = setup_vectorstore(final_cleaned_chunks)

    topic_questions_embedded = prepare_topic_embeddings(topic_questions, embedding_model)

    llm_chain_en, llm_chain_ar = setup_model_and_chains()

    print("\n=== HR Assistant Chat (English/Arabic) ===")
    print("Type 'exit', 'quit', or 'خروج' to end the conversation.")
    print("Type 'debug' followed by your query to see retrieval details.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Stopping the cell (or a closed stdin) while waiting for input ends the
            # chat cleanly instead of surfacing a traceback.
            print("\nBot: Goodbye! / وداعاً!")
            break

        # Exit conditions
        if user_input.lower() in {"exit", "quit", "خروج"}:
            print("Bot: Goodbye! / وداعاً!")
            break

        if not user_input:
            continue

        # Debug mode
        if user_input.lower().startswith("debug "):
            debug_query = user_input[6:].strip()
            if debug_query:
                debug_print_retrieval(debug_query, vectorstore, topic_questions_embedded, embedding_model)
            continue

        # The reply language follows the question language.
        is_arabic_query = is_arabic(user_input)

        # Handle vague queries
        if is_vague(user_input):
            if is_arabic_query:
                print("Bot: هل يمكنك توضيح أي سياسة تقصد؟ مثل 'الإجازة المرضية' أو 'ساعات العمل'")
            else:
                print("Bot: Could you be more specific? For example: 'sick leave policy', 'working hours', 'annual leave'")
            continue

        try:
            # Retrieve context using smart search
            context = retrieve_context_with_smart_search(
                user_input, vectorstore, topic_questions_embedded, embedding_model, k=2, max_chars=4000
            )

            if not context:
                if is_arabic_query:
                    print("Bot: لا أعلم بناءً على السياسة.")
                else:
                    print("Bot: I don't know based on the policy.")
                continue

            # Choose the appropriate chain
            if is_arabic_query:
                response = llm_chain_ar.invoke({"input": user_input, "context": context})
            else:
                response = llm_chain_en.invoke({"input": user_input, "context": context})

            # Extract response text: the chain may return a dict (answer under "text")
            # or something that only needs converting to a string.
            if isinstance(response, dict):
                bot_response = response.get("text", str(response))
            else:
                bot_response = str(response)

            # Clean the response
            bot_response = clean_response(bot_response, is_arabic_query)

            # Handle empty responses (nothing left, or only an echoed answer label)
            if not bot_response or bot_response.lower() in {"", ".", "answer:", "الجواب:"}:
                if is_arabic_query:
                    bot_response = "لا أعلم بناءً على السياسة."
                else:
                    bot_response = "I don't know based on the policy."

            print(f"Bot: {bot_response}")
            print("-" * 50)

        except Exception as e:
            # Keep the chat alive: report the failure and wait for the next question.
            print(f"Bot: Sorry, an error occurred: {str(e)}")
            continue

    print("Chat session ended.")

In [None]:
# CELL 12: Run the application
# Builds the whole pipeline and starts the interactive chat loop; the cell keeps
# running (waiting on input()) until the user types an exit command.
if __name__ == "__main__":
    main()

PDF extracted to markdown successfully!
Total markdown length: 13664 characters
Found 27 header-based sections
Created 27 chunks with 2000 character size


  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  vectorstore.persist()


Chroma database with 2000-char chunks stored successfully!
Loading the model and tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(
  llm_chain_en = LLMChain(llm=llm, prompt=prompt_template_en, verbose=False)


Model loaded with bfloat16 on GPU.
Model and chains ready!

=== HR Assistant Chat (English/Arabic) ===
Type 'exit', 'quit', or 'خروج' to end the conversation.
Type 'debug' followed by your query to see retrieval details.

Bot: The annual leave policy provides for up to 12 days of paid leave per year.  Additionally, there's an additional one casual leave (CL) available each month after completing the probation period, which totals out to about 13 months' worth over a year if taken annually.
--------------------------------------------------
Bot: لا أعلم بناءً على السياسة.  بناءً على المعلومات المتاحة، فإن الجواب المناسب هو:

لا أعلم بن   

هذا يعني أن هناك عدم وجود بيانات واضحة حول الزي الرسمي للموظفين ضمن هذا السياق.
--------------------------------------------------
You: ما سياسه الزي للموظفين 
Bot: لا أعلم بناءً على السياسة.  بناءً على المعلومات المعطاة في السياق، ليس هناك أي ذكر للسياسة المتعلقة بالزي الرسمي للموظفين.
--------------------------------------------------
