# RAG Implementation with Quantized Models

This notebook implements a Retrieval-Augmented Generation (RAG) system using:
- ChromaDB for vector storage and retrieval
- Sentence Transformer for embedding generation
- A quantized GGUF LLM (Llama 3, Gemma 3, or DeepSeek-R1 distill) run through llama-cpp-python for text generation

## 1. Import Dependencies

In [None]:
import time
# import torch
import chromadb
import pandas as pd
from llama_cpp import Llama
from bert_score import score
from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer, util
# from transformers import AutoModelForSeq2SeqLM, MT5Tokenizer

In [None]:
# Lower the transformers library's log verbosity to "error" so that its
# warnings and informational messages do not clutter the notebook output.
transformers_logging.set_verbosity_error()

## 2. Load Data

In [None]:
# Read the question / answer / context table that backs the knowledge base.
data_file_name_ = 'endpoints_info_v3.csv'
df = pd.read_csv(data_file_name_)

# Only these three columns are used by the notebook.
# (Optional row filter kept for reference:
#  .where(df['endpoint'] == 'تمام اندپوینت ها').dropna())
slice_df = df[['question', 'answer', 'context']]

# Evaluation inputs: every question with its reference answer, in file order.
questions = slice_df['question'].tolist()
answers = slice_df['answer'].tolist()

# Retrieval corpus: each distinct context passage first, followed by every
# "question answer" pair treated as an additional passage.
pure_contexts = slice_df['context'].unique().tolist()
qa_as_context = (slice_df['question'] + ' ' + slice_df['answer']).to_list()
contexts = [*pure_contexts, *qa_as_context]

# Drop the intermediates; only `questions`, `answers` and `contexts` are
# needed by the later cells.
del slice_df, df, pure_contexts, qa_as_context


## 3. Configure Embedding Model

In [None]:
# Load embedding model
def load_embedding_model(model_name='all-MiniLM-L6-v2'):
    """Build the sentence-transformer used to embed documents and queries.

    Args:
        model_name: Name of the model directory inside ``./models``.

    Returns:
        A ``SentenceTransformer`` constructed from the path
        ``./models/<model_name>``.
    """
    return SentenceTransformer(f"./models/{model_name}")

# Load the "bge-m3" embedding model from ./models/bge-m3. The same instance is
# reused for indexing, for retrieval, and for the cosine-similarity evaluation.
embedding_model = load_embedding_model(model_name="bge-m3")

## 4. Set Up Vector Database

In [None]:
def setup_vector_db(collection_name="novacloud_knowledge", reset=True):
    """Open the persistent ChromaDB store and return a cosine-space collection.

    Args:
        collection_name: Name of the collection to create or fetch.
        reset: When true (the default, matching the previous unconditional
            delete), an existing collection with this name is dropped first
            so that the returned collection starts out empty. Pass ``False``
            to keep previously stored documents.

    Returns:
        A ``(client, collection)`` tuple: the ``PersistentClient`` rooted at
        ``./chromadb`` and the requested collection.
    """
    client = chromadb.PersistentClient(path="./chromadb")

    if reset:
        # delete_collection raises when the collection does not exist (for
        # example on the very first run), so only delete what is really there.
        # Depending on the chromadb release, list_collections yields either
        # Collection objects or plain names; handle both shapes.
        existing_names = {
            getattr(entry, "name", entry) for entry in client.list_collections()
        }
        if collection_name in existing_names:
            client.delete_collection(collection_name)

    # Cosine space so that ``1 - distance`` can be read as a similarity score.
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )

    return client, collection


# Open the persistent ChromaDB store under ./chromadb and obtain the
# 'endpoints_info' collection that the retrieval functions below query.
chroma_client, collection = setup_vector_db(collection_name = 'endpoints_info')

## 5. Add Documents to Vector Database

In [None]:
def add_documents_to_collection(collection, documents, embedding_model):
    """Embed ``documents`` and store them in ``collection``.

    Args:
        collection: Target collection; must provide
            ``add(embeddings=..., documents=..., ids=...)``.
        documents: Texts to index. Any iterable is accepted.
        embedding_model: Object whose ``encode(list_of_texts)`` returns one
            embedding per text.

    Returns:
        The number of documents added. ``0`` when ``documents`` is empty, in
        which case neither the model nor the collection is touched.
    """
    # Materialize once so that len(), encode() and add() all see the same
    # sequence, whatever iterable the caller passed in.
    documents = list(documents)
    if not documents:
        # Nothing to embed or store; avoid handing empty batches downstream.
        return 0

    embeddings = embedding_model.encode(documents)

    # IDs are simply the positions of the documents, as strings.
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=[str(position) for position in range(len(documents))],
    )

    return len(documents)
# Index the whole retrieval corpus (`contexts`) in the collection and report
# how many documents were stored.
num_added = add_documents_to_collection(collection, contexts, embedding_model)
print(f"Added {num_added} documents to vector database")

## 6. Implement Retrieval Function

In [None]:
def retrieve_relevant_documents(query, collection, embedding_model, top_k=1, similarity_threshold=0.6):
    """Return the stored documents that are semantically close to ``query``.

    The query is embedded, the ``top_k`` nearest documents are fetched from
    ``collection``, and only those whose similarity (computed as
    ``1 - distance``) reaches ``similarity_threshold`` are kept.

    Args:
        query: The user question to search for.
        collection: Collection providing ``query(query_embeddings=..., n_results=...)``.
        embedding_model: Object whose ``encode(list_of_texts)`` returns embeddings.
        top_k: Maximum number of candidates requested from the collection.
        similarity_threshold: Minimum ``1 - distance`` a candidate needs to be kept.

    Returns:
        The list of qualifying documents, or ``[""]`` when the collection
        returned nothing or every candidate fell below the threshold.
    """
    # Placeholder returned when nothing relevant is found: a single empty
    # string, so the prompt simply ends up with an empty context.
    not_found_message = [""]

    query_embedding = embedding_model.encode([query])
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_k
    )

    # Results are nested per query; only one query was sent, so take index 0.
    documents = results["documents"][0] if results["documents"] else not_found_message
    distances = results["distances"][0] if results["distances"] else [1.0]  # Higher distance = less relevant

    # Print similarity scores for debugging
    print(f"Similarity scores: {[1-d for d in distances]}")

    filtered_docs = [
        doc
        for doc, dist in zip(documents, distances)
        if 1 - dist >= similarity_threshold
    ]
    return filtered_docs if filtered_docs else not_found_message

# Smoke-test the retrieval function with the first question of the dataset.
# NOTE: `top_k` defined here is also the default used by retrieve_context().
top_k = 5
test_query = questions[0]
print(test_query)
retrieved_docs = retrieve_relevant_documents(test_query, collection, embedding_model, top_k)
print(retrieved_docs)
# Use a distinct loop variable so `retrieved_docs` keeps referring to the list
# instead of being rebound to its last element.
for retrieved_doc in retrieved_docs:
    print(f"Retrieved document: {retrieved_doc[:100]}...")

## 7. Load LLM for Generation

In [None]:
# Context window size handed to llama.cpp as ``n_ctx``.
context_window_size = 15360


def load_llm_model(model_path="./models/Dorna-Llama3-8B-Instruct-GGUF-Q8/dorna-llama3-8b-instruct.Q8_0.gguf",chat_format='auto'):
    """Create the ``Llama`` instance used for text generation.

    Args:
        model_path: Path of the GGUF model file to load.
        chat_format: Chat format name forwarded to ``Llama``.

    Returns:
        The configured ``Llama`` object. Its context size is taken from the
        module-level ``context_window_size``.
    """
    llama_options = {
        "model_path": model_path,
        "chat_format": chat_format,
        "n_gpu_layers": -1,  # Use all available GPU layers
        "n_ctx": context_window_size,  # Context window size
        "verbose": False,
    }
    return Llama(**llama_options)


# Registry of the local GGUF checkpoints that can be loaded:
# model key -> file path and the chat format to use with it.
# NOTE(review): 'deepseek-r1-7b-qwen' is registered with the 'gemma' chat
# format; confirm that this is intended for a Qwen-based model.
llm_models_details = {
    'dorna-llama3-8b-q8': {
        'path': './models/Dorna-Llama3-8B-Instruct-GGUF-Q8/dorna-llama3-8b-instruct.Q8_0.gguf',
        'chat_format': 'llama-3',
    },
    'deepseek-r1-7b-qwen': {
        'path': './models/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B.Q8_0.gguf',
        'chat_format': 'gemma',
    },
    'gemma-3-4b-q2': {
        'path': './models/gemma-3-4b-it.Q2_K/gemma-3-4b-it.Q2_K.gguf',
        'chat_format': 'gemma',
    },
    'gemma-3-4b-q8': {
        'path': './models/gemma-3-4b-it.Q8_0/gemma-3-4b-it.Q8_0.gguf',
        'chat_format': 'gemma',
    },
    'gemma-3-4b-fp16': {
        'path': './models/gemma-3-4b-it.fp16/gemma-3-4b-it.fp16.gguf',
        'chat_format': 'gemma',
    },
}

# Select the checkpoint to run, look up its path and chat format, and load it.
target_llm_model = 'gemma-3-4b-q8'
llm_model_path, llm_chat_format = (
    llm_models_details[target_llm_model][field] for field in ('path', 'chat_format')
)
llm = load_llm_model(llm_model_path, llm_chat_format)

## 8. Load NLP Model for Summarization (optional, currently disabled)

In [None]:

# def load_summarization_model(model_path="./models/mt5-persian-summary/"):
#     """Load and configure the MT5 model for Persian summarization"""    
#     # Initialize components
#     tokenizer = MT5Tokenizer.from_pretrained(
#         model_path,
#         local_files_only=True,
#         legacy=False,
#         use_fast=True
#     )
    
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     model = AutoModelForSeq2SeqLM.from_pretrained(
#         model_path,
#         local_files_only=True
#     ).to(device)
    
#     return model, tokenizer, device

# # Define available summarization models
# summarization_models = {
#     'mt5-persian-base': {
#         'path': './models/mt5-persian-summary/',
#         'description': 'Base MT5 model fine-tuned for Persian summarization'
#     },
#     'parst5-summary': {
#         'path': './models/parsT5-base/',
#         'description': 'Specialized Persian T5 for summarization'
#     }
# }

# # Load model
# target_model = 'mt5-persian-base'
# summarizer_model, summarizer_tokenizer, summarizer_device = load_summarization_model(
#     summarization_models[target_model]['path']
# )

## 9. Create RAG Pipeline

In [None]:
# Prompt template for the user message sent to the LLM. rag_chat() fills it
# with str.format using three placeholders:
#   {context} - the retrieved documents (Persian label: "context")
#   {history} - the conversation so far (Persian label: "conversation history")
#   {prompt}  - the current user question (Persian label: "user question")
PROMPT_TEMPLATE = '''

کانتکست:
{context}

---

تاریخچه مکالمات:
{history}


---
سوال کاربر:
{prompt}
'''


# System prompt (in Persian) for the answering model, sent as the "system"
# message by generate_response() and generate_response_stream(). In short it
# tells the model to: pay attention to the conversation history; use only the
# retrieved context and the history; answer in a single very short sentence
# unless the user explicitly asks for more detail in a follow-up; skip any
# preamble or commentary; never repeat the user's question; and reply with a
# fixed "no information available, contact the admin" sentence when the answer
# is not in the context.
technical_support_system_message = """
# 🔧 دستیار فنی
- به تاریخچه مکالمات حتما توجه کن.
- فقط از کانتکست بازیابی‌شده و تاریخچه مکالمات استفاده کن. هرگونه اطلاعات اضافی ممنوع است.
- پاسخ باید فقط **۱ جمله خیلی کوتاه** باشد. توضیح بیشتر فقط در صورت درخواست کاربر مجاز است (مثلاً: "بیشتر بگو").
- از هیچ گونه توضیح اضافی، مقدمه، یا تفسیر خودداری کن.
- تکرار سوال کاربر در پاسخ ممنوع است.
- اگر جواب در کانتکست نبود، فقط بگو: «اطلاعاتی در دسترس نیست. با ادمین تماس بگیرید.»
- اگر سوال اول مکالمه است → فقط یک جمله کوتاه پاسخ بده.
اگر کاربر سوال پیگیری (follow-up) پرسید، و از تو خواست "بیشتر توضیح بده"، "مثال بزن"، یا "جزییات بگو"، آنگاه می‌توانی پاسخ مفصل و چند جمله‌ای بدهی.

"""



# System prompt (in Persian) used by summarize_query_with_llm(). It tells the
# model that its only job is summarizing: drop greetings, keep just the core
# of the text, repeat the text verbatim when it is shorter than 10 words,
# never answer the text, and return only the summary or the original text.
summarizer_system_message = """
تو یک خلاصه‌کننده متون هستی. وظیفه تو فقط و فقط خلاصه کردن است.
- هرگونه سلام و احوالپرسی را حذف کن
- فقط هسته اصلی متن را نگه دار
- اگر متن کوتاه است (کمتر از 10 کلمه)، عیناً تکرارش کن
- تحت هیچ شرایطی به متن پاسخ نده
- فقط متن خلاصه شده یا متن اصلی را برگردان
"""


# Module-level conversation log. rag_chat() falls back to this list when it is
# called without an explicit `history`, and appends one user line and one
# system line to it per turn.
conversation_history = []


def summarize_query_with_llm(query,llm=llm):
    """Ask the LLM for a condensed version of ``query``.

    The request consists of the summarizer system prompt, one worked example
    (a greeting-laden question and its trimmed form) and finally ``query``
    prefixed with a Persian "summarize the following text" instruction.

    Args:
        query: Text to summarize.
        llm: Chat model providing ``create_chat_completion``.

    Returns:
        The content of the first completion choice; it is also printed.
    """
    few_shot_dialogue = [
        {"role": "system", "content": summarizer_system_message},
        {"role": "user", "content": "سلام دوست عزیز، می‌خواستم بپرسم معنی کد ۵۰۲ چیه؟"},
        {"role": "assistant", "content": "معنی کد ۵۰۲ چیه؟"},
        {"role": "user", "content": 'متن زیر را خلاصه کن:\n' + query},
    ]

    # Temperature 0.0 for deterministic summaries.
    completion = llm.create_chat_completion(
        messages=few_shot_dialogue,
        top_p=0.85,
        temperature=0.0,
    )

    # For models that emit reasoning, the text after '</think>' would have to
    # be split off here.
    summary_text = completion['choices'][0]['message']['content']
    print(f"Summarized version: {summary_text}")
    return summary_text
    


# def summarize_query_with_mt5(text, model, tokenizer, device, max_length=250):
#     """Generate summary using the loaded model"""
#     # Persian-specific task prefix
#     input_text = f"خلاصه‌سازی متن فارسی: {text}"
    
#     # Tokenize and move to correct device
#     inputs = tokenizer(
#         input_text,
#         return_tensors="pt",
#         max_length=512,
#         truncation=True,
#         padding="max_length"
#     ).to(device)
    
#     # Generate with optimized parameters
#     response = model.generate(
#         input_ids=inputs.input_ids,
#         attention_mask=inputs.attention_mask,
#         max_length=max_length,
#         num_beams=4,
#         repetition_penalty=2.5,
#         length_penalty=0.8,
#         early_stopping=True,
#         no_repeat_ngram_size=3
#     )
#     response_clean = tokenizer.decode(response[0], skip_special_tokens=True)
#     print(f"Summarized version: {response_clean}")
#     return response_clean



def retrieve_context(query, collection=collection, embedding_model=embedding_model,top_k=top_k):
    """Fetch the documents relevant to ``query`` as one newline-joined string.

    Each retrieved document is also previewed on stdout (first 100 characters).
    """
    matches = retrieve_relevant_documents(query, collection, embedding_model, top_k)
    for preview in (match[:100] for match in matches):
        print(f"Retrieved document: {preview}...")
    context_text = "\n".join(matches)
    return context_text


def generate_response_stream(model_input, llm=llm):
    """Stream the assistant's answer to ``model_input`` fragment by fragment.

    Args:
        model_input: The fully formatted prompt, sent as the user message.
        llm: Chat model providing ``create_chat_completion``.

    Yields:
        Each text fragment in arrival order. Every fragment is also printed
        to stdout immediately, without a trailing newline.
    """
    messages = [
        {"role": "system", "content": technical_support_system_message},
        {"role": "user", "content": f"{model_input}"}
    ]

    response = llm.create_chat_completion(
        messages=messages,
        top_p=0.85,
        temperature=0.1,  # Low temperature for more deterministic responses
        repeat_penalty=1.2,
        stream=True
    )

    for chunk in response:
        content = chunk['choices'][0]['delta'].get('content')
        if content is None:
            # Deltas without text (or with a null content field) carry
            # nothing to print or to hand to the caller.
            continue
        print(content, end='', flush=True)
        yield content


def generate_response(model_input, llm=llm):
    """Return the assistant's complete (non-streamed) answer to ``model_input``.

    Args:
        model_input: The fully formatted prompt, sent as the user message.
        llm: Chat model providing ``create_chat_completion``.

    Returns:
        The content of the first completion choice; it is also printed.
    """
    chat_messages = [
        {"role": "system", "content": technical_support_system_message},
        {"role": "user", "content": f"{model_input}"},
    ]
    # Low temperature for more deterministic responses.
    sampling_options = {"top_p": 0.85, "temperature": 0.1, "repeat_penalty": 1.2}

    completion = llm.create_chat_completion(messages=chat_messages, **sampling_options)

    # For models that emit reasoning, the text after '</think>' would have to
    # be split off here.
    answer_text = completion['choices'][0]['message']['content']
    print(answer_text)
    return answer_text


def rag_chat(user_query, history=None, stream=False, summary = False):
    """Run one chat turn of the RAG pipeline: retrieve, generate, record.

    Args:
        user_query: The user's question.
        history: List of earlier conversation lines. When ``None``, the
            module-level ``conversation_history`` is used. The list is
            extended in place with this turn's question and answer.
        stream: Generate through the streaming API when true.
        summary: When true, retrieval uses an LLM-made summary of the query,
            and the summarized query and summarized answer (rather than the
            raw texts) are what gets stored in ``history``.

    Returns:
        A ``(response, context)`` tuple: the generated answer and the
        retrieved context string that was placed in the prompt.
    """
    history = conversation_history if history is None else history

    # Snapshot of the conversation as it was before this turn.
    history_text = "\n".join(history)

    # The text used for retrieval is also the one recorded as the question:
    # the LLM summary when summarization is on, the raw query otherwise.
    # (An MT5-based summarizer is available as a commented-out alternative
    # elsewhere in this notebook.)
    recorded_question = summarize_query_with_llm(user_query) if summary else user_query
    context = retrieve_context(recorded_question)

    # The prompt always carries the original, unsummarized question.
    prompt = PROMPT_TEMPLATE.format(
        context=context,
        history=history_text,
        prompt=user_query,
    )

    if stream:
        # Drain the generator and stitch the fragments back together.
        response = "".join(generate_response_stream(prompt))
    else:
        response = generate_response(prompt)

    recorded_answer = summarize_query_with_llm(response) if summary else response

    # Update conversation history
    history.append(f"سوال کاربر: {recorded_question}")
    history.append(f"پاسخ سیستم: {recorded_answer}")

    llm.reset()
    return response,context

## 10. Test RAG System

In [None]:
# # Example 1: Basic question (With Summary)
# query1 = questions[0]
# print(f"User query: {query1}")

# # Reset conversation history
# conversation_history = []

# # Time the response
# start = time.time()
# response, context = rag_chat(query1,stream = True, summary=True)
# end = time.time()

# print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 1: Basic question (Without Summary)
# Uses the first question of the dataset as the user query.
query1 = questions[0]
print(f"User query: {query1}")

# Reset conversation history. rag_chat() looks up this module-level name at
# call time, so rebinding it here starts the conversation from scratch.
conversation_history = []

# Time the response (streaming generation, no query/answer summarization)
start = time.time()
response, context = rag_chat(query1,stream = True, summary=False)
end = time.time()

print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 2: Follow-up question
# The Persian query asks the assistant to explain the previous answer further.
query2 = "میشه بیشتر راجع به این توضیح بدی؟"
print(f"User query: {query2}")

# Time the response (using existing conversation history, i.e. no reset here)
start = time.time()
response, context = rag_chat(query2,stream = True)
end = time.time()

print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 3: Different topic question
# Uses the second-to-last question of the dataset.
query3 = questions[-2]
print(f"User query: {query3}")

# Reset conversation history so the earlier examples do not leak into this one
conversation_history = []

# Time the response
start = time.time()
response, context = rag_chat(query3,stream = True)
end = time.time()


print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 4: The same follow-up question (query2), this time asked after
# Example 3, so the conversation history holds only Example 3's exchange.
print(f"User query: {query2}")

# Time the response (using existing conversation history)
start = time.time()
response, context = rag_chat(query2,stream = True)
end = time.time()

print(f"---\nProcessing time: {end - start:.2f} seconds")

## 11. RAG System Evaluation

Test with more complex queries to evaluate retrieval performance and answer quality.

In [None]:
def evaluate_generated_response_cosine(generated_response, ground_truth_answer,embedding_model=embedding_model):
    """Score how close the generated answer is to the reference answer.

    Both texts are embedded with ``embedding_model`` and compared by cosine
    similarity.

    Returns:
        The similarity as a percentage rounded to two decimals; it is also
        printed.
    """
    response_embedding, truth_embedding = (
        embedding_model.encode(text, convert_to_tensor=True)
        for text in (generated_response, ground_truth_answer)
    )

    similarity_matrix = util.pytorch_cos_sim(response_embedding, truth_embedding)

    # The single similarity value sits at [0][0]; express it as a percentage.
    cosine_score = round(float(similarity_matrix[0][0]) * 100, 2)
    print("Cosine Similarity between generated response and ground truth answer:", cosine_score)
    return cosine_score

In [None]:
def evaluate_generated_response_prf(generated_response, ground_truth_answer, lang='en'):
    """Score a generated answer against the reference answer with BERTScore.

    Args:
        generated_response: Text produced by the model.
        ground_truth_answer: Reference answer to compare against.
        lang: Language code forwarded to ``bert_score.score``. Defaults to
            ``'en'``, the value that used to be hard-coded.
            NOTE(review): the prompts and queries in this notebook are
            Persian; passing ``lang='fa'`` is probably more appropriate.
            Confirm before comparing reported numbers.

    Returns:
        A ``(precision, recall, f1)`` tuple, each a percentage rounded to two
        decimals. The three values are also printed.
    """
    # score() works on batches; wrap the single pair in one-element lists and
    # read element 0 of each returned metric.
    raw_metrics = score([generated_response], [ground_truth_answer], lang=lang)  # model_type='distilbert-base-uncased'
    P, R, F1 = (round(float(metric[0]) * 100, 2) for metric in raw_metrics)

    print("Precision: ", P)
    print("Recall: ", R)
    print("F1 Score: ", F1)

    return P, R, F1

In [None]:
# def evaluate_retrieved_context_cosine(retrieved_context, ground_truth_context,embedding_model=embedding_model):
#     """Evaluation for retrieval quality; retrieved contexts vs ground truth context"""
#     # TODO: It is just the sample to remember. Make it correct later

#     retrieved_context_embeddings = embedding_model.encode(retrieved_context, convert_to_tensor=True)
#     ground_truth_context_embeddings = embedding_model.encode(ground_truth_context, convert_to_tensor=True)

#     cosine_score = util.pytorch_cos_sim(retrieved_context_embeddings, ground_truth_context_embeddings)

#     print("Cosine Similarity between generated response and ground truth answer:", cosine_score)
    
#     return cosine_score

In [None]:
def rag_chat_with_processing_time(query):
    """Answer ``query`` in a fresh conversation and measure how long it takes.

    Args:
        query: The user question.

    Returns:
        A ``(response, context, processing_time)`` tuple, where
        ``processing_time`` is the elapsed wall-clock time in seconds,
        formatted as a string with two decimals.
    """
    print(f"User query: {query}")

    # Start every call with an empty history. Assigning a local
    # `conversation_history = []` never reached rag_chat(), which fell back to
    # the module-level list and so carried earlier questions into each prompt.
    # Passing the list explicitly keeps evaluation questions independent.
    fresh_history = []

    # Time the response
    start = time.time()
    response, context = rag_chat(query, history=fresh_history, stream=False)
    end = time.time()

    processing_time = f"{end - start:.2f}"
    print(f"---\nProcessing time: {processing_time} seconds")
    return response, context, processing_time

In [None]:
def generate_report(questions=questions,answers=answers):
    """Run the RAG pipeline on every question and collect evaluation results.

    For each question/answer pair the pipeline's response is generated and
    timed, then scored against the reference answer with cosine similarity
    and with precision / recall / F1.

    Returns:
        A dict of equally long lists, one list per report column.
    """
    columns = (
        'question', 'response', 'answer', 'cosine', 'precision',
        'recall', 'f1_score', 'context', 'processing_time',
    )
    reports = {column: [] for column in columns}
    separator = "======================================"

    for question, answer in zip(questions, answers):
        response, context, processing_time = rag_chat_with_processing_time(question)
        cosine = evaluate_generated_response_cosine(response, answer, embedding_model=embedding_model)
        precision, recall, f1_score = evaluate_generated_response_prf(response, answer)

        # Values listed in the same order as `columns`.
        row = (
            question, response, answer, cosine, precision,
            recall, f1_score, context, processing_time,
        )
        for column, value in zip(columns, row):
            reports[column].append(value)

        # Visual divider between two evaluated questions.
        print(separator)
        print(separator)

    return reports

# Evaluate the pipeline on the full question/answer set loaded from the CSV.
reports = generate_report()

In [None]:
def save_report_to_csv(reports, output_path='report.csv'):
    """Write the evaluation report to a CSV file and print summary figures.

    Args:
        reports: Mapping of column name to list of values. It must contain a
            numeric ``'cosine'`` column and a ``'processing_time'`` column
            whose values can be converted to float.
        output_path: Destination CSV file. Defaults to ``'report.csv'``, the
            path that used to be hard-coded.

    Returns:
        ``0`` after the file has been written.
    """
    df_reports = pd.DataFrame(reports)
    df_reports.to_csv(output_path, index=False)

    print("Cosine Similarity Average:",df_reports['cosine'].mean(), '%')
    # Processing times may be stored as formatted strings; convert before summing.
    print("Sum processing time:",df_reports['processing_time'].astype(float).sum(), 'sec')
    print("Report saved to the file successfully.")
    return 0

# Persist the evaluation results and print the average cosine similarity and
# the total processing time.
save_report_to_csv(reports)