# RAG Implementation with Quantized Models

This notebook implements a Retrieval-Augmented Generation (RAG) system using:
- ChromaDB for vector storage and retrieval
- Sentence Transformer for embedding generation
- A local GGUF chat model run through llama-cpp-python (Dorna-Llama3, DeepSeek-R1-Distill-Qwen, or Gemma 3) for text generation

## 1. Import Dependencies

In [None]:
import sys
import time
import random
import chromadb
import pandas as pd
from llama_cpp import Llama
from bert_score import score
from chromadb.utils import embedding_functions
from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer, util

In [None]:
# Keep cell output clean: only ERROR-level messages from the `transformers`
# logger are shown from here on.
transformers_logging.set_verbosity_error()

## 2. Load Sample Data

In [None]:
# # Define knowledge base chunks about NovaCloud service
# context_data = {
#     "services": '''
# شرکت نواکلود سه سرویس اصلی ارائه می‌دهد:  
# 1. نوااستورج (NovaStorage) – یک سرویس ذخیره‌سازی ابری که برای شرکت‌های بزرگ طراحی شده است و امکان رمزگذاری سرتاسری (E2EE) و پشتیبان‌گیری خودکار را دارد.  
# 2. نواکامپیوتر (NovaCompute) – یک سرویس پردازش ابری که از پردازنده‌های ZetaCore X200 استفاده می‌کند و برای مدل‌های یادگیری ماشین سنگین بهینه‌سازی شده است.  
# 3. نواکانکت (NovaConnect) – یک پلتفرم شبکه خصوصی ابری (VPC) که به شرکت‌ها امکان ایجاد زیرشبکه‌های ایزوله با IP ثابت خصوصی را می‌دهد.  
# ''',
#     "pricing": '''
# نواکلود سه طرح قیمت‌گذاری ارائه می‌دهد:  
# - طرح پایه (Basic): شامل ۵۰ گیگابایت فضای ذخیره‌سازی و ۲ هسته پردازشی با هزینه‌ی ۱۵ دلار در ماه  
# - طرح حرفه‌ای (Pro): شامل ۵۰۰ گیگابایت فضای ذخیره‌سازی، ۸ هسته پردازشی و ترافیک نامحدود با هزینه‌ی ۶۰ دلار در ماه  
# - طرح سازمانی (Enterprise): شامل ۵ ترابایت فضای ذخیره‌سازی، ۳۲ هسته پردازشی، و قابلیت تنظیم شبکه خصوصی اختصاصی با هزینه‌ی ۲۰۰ دلار در ماه  
# ''',
#     "security": '''
# نواکلود امنیت داده‌ها را با سه مکانیزم کلیدی تضمین می‌کند:  
# - رمزگذاری سرتاسری (E2EE) برای داده‌های ذخیره‌شده در NovaStorage  
# - احراز هویت چندمرحله‌ای (MFA) برای ورود به تمامی سرویس‌ها  
# - فایروال هوشمند که تنها IPهای تأیید‌شده را به شبکه NovaConnect متصل می‌کند  
# ''',
#     "uptime": '''
# در سه ماه گذشته، NovaCompute در ۹۸.۹٪ مواقع بدون اختلال کار کرده است، اما یک قطعی ۲ ساعته در تاریخ ۱۵ فوریه ۲۰۲۴ به دلیل بروزرسانی سخت‌افزاری رخ داده است. در همین مدت، NovaStorage بدون هیچ اختلالی فعال بوده است.  
# '''
# }

# # Convert dictionary to list of chunks for embedding
# chunks = list(context_data.values())

In [None]:
# Load the question/answer knowledge base.
df = pd.read_csv('endpoints_info.csv')

# Keep only the rows of the "all endpoints" category ('تمام اندپوینت ها') that
# have a question, an answer and a context. Selecting with a boolean mask yields
# the same rows as `.where(mask).dropna()` without first blanking every other
# row, and the name `endpoint_rows` avoids shadowing the built-in `slice`.
endpoint_rows = df.loc[
    df['endpoint'] == 'تمام اندپوینت ها',
    ['question', 'answer', 'context'],
].dropna()

chunks = endpoint_rows['context'].unique().tolist()
questions = endpoint_rows['question'].tolist()
answers = endpoint_rows['answer'].tolist()

# Documents that get indexed: each question concatenated with its answer.
# NOTE(review): these documents contain the ground-truth answers of the very
# questions that are later used for evaluation -- confirm this is intended.
# Alternative: (endpoint_rows['question'] + ' ' + endpoint_rows['context']).tolist()
chunks_cooked = (endpoint_rows['question'] + ' ' + endpoint_rows['answer']).tolist()

# Free the DataFrames; only the plain lists above are used from here on.
del endpoint_rows
del df

# TODO: add chunks, questions and answers in the pipeline.

## 3. Configure Embedding Model

In [None]:
# Load embedding model
def load_embedding_model(model_name='all-MiniLM-L6-v2'):
    """Load the sentence-transformer model and a matching ChromaDB embedding function.

    Both objects are created from the same local directory
    ``./models/<model_name>`` so that they are guaranteed to use identical weights.

    Args:
        model_name: Name of the model directory located under ``./models``.

    Returns:
        A tuple ``(embedding_model, sentence_transformer_ef)`` holding the
        ``SentenceTransformer`` instance and the ChromaDB
        ``SentenceTransformerEmbeddingFunction`` built for the same model.
    """
    embedding_model_path = f"./models/{model_name}"

    # Load model from local path
    embedding_model = SentenceTransformer(embedding_model_path)

    # Create embedding function for ChromaDB from the same local path.
    # Passing only the bare model name here would let ChromaDB resolve the
    # model on its own, independently of the local copy loaded above.
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_model_path
    )

    return embedding_model, sentence_transformer_ef

# Initialize models.
# `embedding_model` encodes documents and queries directly;
# `sentence_transformer_ef` is handed to ChromaDB when the collection is created.
embedding_model, sentence_transformer_ef = load_embedding_model()

## 4. Set Up Vector Database

In [None]:
def setup_vector_db(collection_name="novacloud_knowledge", embedding_function=None, reset=True):
    """Open the persistent ChromaDB store and return a (fresh) collection.

    Args:
        collection_name: Name of the collection to create or open.
        embedding_function: Embedding function attached to the collection.
        reset: When True (the default, matching the previous behaviour) an
            existing collection with this name is deleted first, so documents
            are re-indexed from scratch. Pass False to keep existing data.

    Returns:
        A tuple ``(client, collection)``.
    """
    client = chromadb.PersistentClient(path="./chromadb")

    if reset:
        # Only delete when the collection is really there: on a brand-new
        # database an unconditional delete_collection() raises.
        # Depending on the chromadb release, list_collections() yields
        # Collection objects or bare names; getattr() covers both.
        existing_names = {getattr(item, "name", item) for item in client.list_collections()}
        if collection_name in existing_names:
            client.delete_collection(collection_name)

    # Get or create collection
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},  # Use cosine similarity for matching
        embedding_function=embedding_function
    )

    return client, collection


# Set up ChromaDB: open the persistent store and (re)create the
# 'endpoints_info' collection that the retrieval functions query.
chroma_client, collection = setup_vector_db(collection_name = 'endpoints_info',embedding_function=sentence_transformer_ef)

## 5. Add Documents to Vector Database

In [None]:
def add_documents_to_collection(collection, documents, embedding_model):
    """Embed ``documents`` and store them in a ChromaDB collection.

    Args:
        collection: Collection object exposing ``add(embeddings, documents, ids)``.
        documents: Iterable of document strings; it is materialised into a list
            so generators and other one-shot iterables are supported.
        embedding_model: Object exposing ``encode(list_of_texts)``.

    Returns:
        The number of documents added (0 when ``documents`` is empty, in which
        case neither the encoder nor the collection is called).
    """
    documents = list(documents)
    if not documents:
        # Nothing to index; skip the encoder and the collection entirely.
        return 0

    # Generate embeddings for documents
    embeddings = embedding_model.encode(documents)

    # Add documents with embeddings to collection; ids are the positions "0".."n-1"
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=[str(position) for position in range(len(documents))]
    )

    return len(documents)

# Add documents to collection.
# The indexed texts are the "question + answer" strings built in the data-loading cell.
num_added = add_documents_to_collection(collection, chunks_cooked, embedding_model)
print(f"Added {num_added} documents to vector database")

## 6. Implement Retrieval Function

In [None]:
def retrieve_relevant_documents(query, collection, embedding_model, top_k=1, similarity_threshold=0.7):
    """Retrieve the ``top_k`` documents closest to ``query``.

    Args:
        query: User query string.
        collection: Collection object exposing ``query(query_embeddings, n_results)``.
        embedding_model: Object exposing ``encode(list_of_texts)``.
        top_k: Number of results requested from the collection.
        similarity_threshold: Currently unused; kept for the optional filter
            sketched in the comment below.

    Returns:
        The list of retrieved document strings, or
        ``["No relevant information found."]`` when the query produced no hits.
    """
    # Create embedding for query
    query_embedding = embedding_model.encode([query])

    # Query the collection
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_k
    )

    # Results hold one inner list per query. A query without hits comes back
    # as ``[[]]``, which is truthy, so the *inner* list has to be checked for
    # the fallback to ever trigger.
    document_batches = results["documents"]
    distance_batches = results["distances"]
    documents = document_batches[0] if document_batches else []
    distances = distance_batches[0] if distance_batches else []

    if not documents:
        documents = ["No relevant information found."]
    if not distances:
        distances = [1.0]  # Higher distance = less relevant

    # Print similarity scores for debugging
    # (assumes the collection uses cosine distance, so similarity = 1 - distance)
    print(f"Similarity scores: {[1-d for d in distances]}")

    # Optional: Filter by similarity threshold
    # filtered_docs = [doc for doc, dist in zip(documents, distances) if 1-dist >= similarity_threshold]
    # return filtered_docs if filtered_docs else ["No sufficiently relevant information found."]

    return documents

# Test retrieval function
# `top_k` is also picked up as the default of retrieve_context() further down.
top_k = 3
test_query = questions[0]
print(test_query)
retrieved_docs = retrieve_relevant_documents(test_query, collection, embedding_model,top_k)
print(retrieved_docs)
# Use a separate loop variable: iterating with `retrieved_docs` itself would
# rebind the result list to its last element once the loop finishes.
for doc in retrieved_docs:
    print(f"Retrieved document: {doc[:100]}...")

## 7. Load LLM for Generation

In [None]:
def load_llm_model(model_path="./models/Dorna-Llama3-8B-Instruct-GGUF-Q8/dorna-llama3-8b-instruct.Q8_0.gguf",chat_format='auto', n_ctx=9216, n_gpu_layers=-1):
    """Load a GGUF model through llama-cpp-python for chat completion.

    Args:
        model_path: Path to the ``.gguf`` model file.
        chat_format: Name of the llama-cpp chat format to use. The special
            value ``'auto'`` (the default) is translated to ``None`` so that
            llama-cpp-python picks the format itself.
        n_ctx: Context window size in tokens.
        n_gpu_layers: Number of layers to offload to the GPU; ``-1`` offloads all.

    Returns:
        The configured ``Llama`` instance.
    """
    # 'auto' is not one of llama-cpp-python's registered chat formats; passing
    # it through would fail when a chat completion is requested. ``None`` is
    # the library's way of asking for automatic selection.
    resolved_chat_format = None if chat_format == 'auto' else chat_format

    llm = Llama(
        model_path=model_path,
        chat_format=resolved_chat_format,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
        verbose=False
    )

    return llm


# Registry of the locally available GGUF models: for every short name, the
# path of the model file and the chat format handed to llama-cpp.
# NOTE(review): confirm that every chat_format value (in particular 'deepseek')
# is a format registered by the installed llama-cpp-python version.
llm_models_details = {
    'dorna-llama3-8b-q8': {
        'path': './models/Dorna-Llama3-8B-Instruct-GGUF-Q8/dorna-llama3-8b-instruct.Q8_0.gguf',
        'chat_format': 'llama-3',
    },
    'deepseek-r1-7b-qwen': {
        'path': './models/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B.Q8_0.gguf',
        'chat_format': 'deepseek',
    },
    'gemma-3-4b-q2': {
        'path': './models/gemma-3-4b-it.Q2_K/gemma-3-4b-it.Q2_K.gguf',
        'chat_format': 'gemma',
    },
    'gemma-3-4b-q8': {
        'path': './models/gemma-3-4b-it.Q8_0/gemma-3-4b-it.Q8_0.gguf',
        'chat_format': 'gemma',
    },
    'gemma-3-4b-fp16': {
        'path': './models/gemma-3-4b-it.fp16/gemma-3-4b-it.fp16.gguf',
        'chat_format': 'gemma',
    },
}

# Select the model used by the rest of the notebook and load it.
target_llm_model = 'gemma-3-4b-fp16'
llm_model_path = llm_models_details[target_llm_model]['path']
llm_chat_format = llm_models_details[target_llm_model]['chat_format']
llm = load_llm_model(llm_model_path, llm_chat_format)

## 8. Create RAG Pipeline

In [None]:
# Define prompt template.
# The Persian section labels mean, in order: "conversation history",
# "base knowledge" (the retrieved context) and "user question".
# rag_chat() fills the three placeholders {history}, {context} and {prompt}.
PROMPT_TEMPLATE = '''
تاریخچه مکالمات:
{history}

دانش پایه:
{context}

سوال کاربر:
{prompt}
'''

# Initialize conversation history.
# Module-level list that rag_chat() falls back to when no `history` argument is
# given; rag_chat() appends one user line and one assistant line per call.
conversation_history = []

def summarize_query(query,llm=llm):
    """Ask the LLM for a condensed version of ``query``.

    The Persian system prompt instructs the model to act as a summarizer: drop
    the pleasantries and shorten the text without answering it or harming its
    core meaning.

    Args:
        query: Text to summarize.
        llm: Object exposing ``create_chat_completion``; defaults to the
            notebook-level model bound when this function was defined.

    Returns:
        The content of the first completion choice (also printed).
    """
    system_turn = {
        "role": "system",
        "content": "تو یک دستیار هستی که وظیفه تو خلاصه کردن متن است. سعی نکن به کاربر جواب بدی فقط تشریفات رو از بین ببر و تا جایی که امکان داره خلاصه کن. فقط تا جایی پرامپت کاربر رو کوتاه کن که به هسته اصلی مطلب آسیبی وارد نشه.",
    }
    user_turn = {"role": "user", "content": query}

    # Low temperature for more deterministic responses
    completion = llm.create_chat_completion(
        messages=[system_turn, user_turn],
        top_p=0.85,
        temperature=0.1,
    )

    summary = completion['choices'][0]['message']['content']
    print(f"Summarized version: {summary}")
    return summary


def retrieve_context(query, collection=collection, embedding_model=embedding_model,top_k=top_k):
    """Fetch the documents relevant to ``query`` and merge them into one context string.

    A preview (first 100 characters) of every retrieved document is printed.

    Returns:
        The retrieved documents joined with newlines.
    """
    matches = retrieve_relevant_documents(query, collection, embedding_model, top_k)

    for text in matches:
        preview = text[:100]
        print(f"Retrieved document: {preview}...")

    context = "\n".join(matches)
    return context

def generate_response(model_input, llm=llm):
    """Run one chat completion for ``model_input`` and return the answer text.

    The Persian system prompt casts the model as a technical-support assistant
    for service status that answers in Persian based on the supplied knowledge.

    Args:
        model_input: Fully formatted prompt; sent as the user message.
        llm: Object exposing ``create_chat_completion``; defaults to the
            notebook-level model bound when this function was defined.

    Returns:
        The content of the first completion choice.
    """
    system_prompt = "تو یک دستیار متخصص و پشتیبانی فنی وضعیت سرویس ها هستی که با توجه به دانش پایه، به کاربر پاسخ فارسی میدی."
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{model_input}"},
    ]

    # Low temperature for more deterministic responses
    completion = llm.create_chat_completion(messages=chat, top_p=0.85, temperature=0.1)

    first_choice = completion['choices'][0]
    return first_choice['message']['content']

def rag_chat(user_query, history=None):
    """Answer ``user_query`` with the full RAG pipeline (retrieve, prompt, generate).

    Args:
        user_query: The user's question.
        history: List of earlier conversation lines. When omitted, the
            module-level ``conversation_history`` is used. The list is extended
            in place with the new question and answer.

    Returns:
        A tuple ``(response, context)``: the generated answer and the
        retrieved context string that was placed in the prompt.
    """
    # Fall back to the notebook-wide history when the caller supplies none.
    history = conversation_history if history is None else history

    # Input/output summarization via summarize_query() is currently disabled.

    # Retrieval step
    context = retrieve_context(user_query)

    # Prompt assembly: earlier turns, retrieved knowledge and the new question
    prompt = PROMPT_TEMPLATE.format(
        history="\n".join(history),
        context=context,
        prompt=user_query,
    )

    # Generation step
    response = generate_response(prompt)

    # Record this exchange so follow-up questions can see it
    history.extend([
        f"سوال کاربر: {user_query}",
        f"پاسخ کمک کننده: {response}",
    ])

    return response,context

## 9. Text Streaming Utility

In [None]:
def stream_text(text, min_delay=0.02, max_delay=0.08):
    """Write ``text`` to stdout one character at a time, typewriter style.

    After each character the output is flushed and the function sleeps for a
    random duration drawn uniformly between ``min_delay`` and ``max_delay``
    seconds. A newline is written once the whole text has been emitted.
    """
    for symbol in text:
        sys.stdout.write(symbol)
        sys.stdout.flush()

        # Randomised pause so the output does not look mechanical
        pause = random.uniform(min_delay, max_delay)
        time.sleep(pause)

    # Terminate the streamed line
    sys.stdout.write("\n")

## 10. Test RAG System

In [None]:
# Example 1: Basic question
# Ask the first question of the dataset, starting from an empty conversation.
query1 = questions[0]
print(f"User query: {query1}")

# Reset conversation history.
# Rebinding the module-level name is sufficient here: rag_chat() looks up
# `conversation_history` at call time whenever no `history` argument is passed.
conversation_history = []

# Time the response
start = time.time()
response, context = rag_chat(query1)
end = time.time()

# Stream or print the response
# stream_option = input("Stream response? (y/n): ").lower().strip() == 'y'
# `stream_option` is also read by the following example cells.
stream_option = False
if stream_option:
    stream_text(response)
else:
    print(response)

print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 2: Follow-up question
# The Persian query means "Can you explain more about this?". It only makes
# sense together with the previous exchange, so the history is NOT reset here.
query2 = "میشه بیشتر راجع به این توضیح بدی؟"
print(f"User query: {query2}")

# Time the response (using existing conversation history)
start = time.time()
response, context = rag_chat(query2)
end = time.time()

# Stream or print the response (`stream_option` comes from the Example 1 cell)
if stream_option:
    stream_text(response)
else:
    print(response)

print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 3: Different topic question
# Ask the third question of the dataset in a fresh conversation.
query3 = questions[2]
print(f"User query: {query3}")

# Reset conversation history so the earlier examples do not end up in the prompt
conversation_history = []

# Time the response
start = time.time()
response, context = rag_chat(query3)
end = time.time()

# Stream or print the response (`stream_option` comes from the Example 1 cell)
if stream_option:
    stream_text(response)
else:
    print(response)

print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# TEST SUMMARY
# Build and print a deliberately wordy message: greetings, the first dataset
# question and a thank-you, prefixed with a Persian instruction that means
# "Please summarize the following message:".
# Note: this rebinds `query3`, replacing the value set in Example 3.

prefix = 'لطفا پیام زیر رو خلاصه کن:'

query3 = f'''
سلام و عرض ادب
وقت شما بخیر
من یک سوالی داشتم ازتون
{questions[0]}
ممنون از شما
'''
query3 = prefix + query3

print(f"User query: {query3}")


In [None]:
# TEST SUMMARY
# Send a wordy message (greetings + first dataset question + thanks) straight
# to the LLM, without system prompt or retrieval, and print its summary.
# The Persian prefix means "Please summarize the following message:".
prefix = 'لطفا پیام زیر رو خلاصه کن:'

query3 = f'''
سلام و عرض ادب
وقت شما بخیر
من یک سوالی داشتم ازتون
{questions[0]}
ممنون از شما
'''

query3 = prefix + query3

print(f"User query: {query3}")
print("------------------------")
# Summarize query using the LLM
# (a real comment instead of a stray string literal, which was a no-op expression)
messages = [
    {"role": "user", "content": query3}
]

response = llm.create_chat_completion(
    messages=messages,
    top_p=0.85,
    temperature=0.1  # Low temperature for more deterministic responses
)
print(f"Summarized version:\n {response['choices'][0]['message']['content']}")


## 11. RAG System Evaluation

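Run every question from the dataset through the RAG pipeline and score each generated answer against its ground-truth answer (embedding cosine similarity and BERTScore precision/recall/F1); the results are saved to `report.csv`. Evaluating retrieval quality is still a TODO.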

In [None]:
def evaluate_generated_response_cosine(generated_response, ground_truth_answer,embedding_model=embedding_model):
    """Score a generated answer against the reference answer by embedding similarity.

    Both texts are encoded with ``embedding_model`` and compared with cosine
    similarity. The value is multiplied by 100, rounded to two decimals,
    printed and returned.
    """
    response_vec = embedding_model.encode(generated_response, convert_to_tensor=True)
    reference_vec = embedding_model.encode(ground_truth_answer, convert_to_tensor=True)

    # The similarity comes back as a matrix; entry [0][0] is the single pair compared here.
    similarity_matrix = util.pytorch_cos_sim(response_vec, reference_vec)
    similarity = float(similarity_matrix[0][0])

    cosine_score = round(similarity * 100, 2)
    print("Cosine Similarity between generated response and ground truth answer:", cosine_score)

    return cosine_score

In [None]:
def evaluate_generated_response_prf(generated_response, ground_truth_answer, lang='fa'):
    """Score a generated answer against the reference answer with BERTScore.

    Args:
        generated_response: Answer produced by the model.
        ground_truth_answer: Reference answer from the dataset.
        lang: Language code passed to ``bert_score.score``. Defaults to
            ``'fa'`` because the notebook's questions and answers are Persian;
            scoring them with ``lang='en'`` selects an English-only model.
            NOTE(review): confirm which model bert_score picks for 'fa' in the
            installed version.

    Returns:
        A tuple ``(P, R, F1)`` of precision, recall and F1, each multiplied by
        100 and rounded to two decimals (the values are also printed).
    """
    P_raw, R_raw, F1_raw = score([generated_response], [ground_truth_answer], lang=lang) # model_type='distilbert-base-uncased'

    # One candidate/reference pair was scored, so each result holds one value.
    P = round(float(P_raw[0]) * 100, 2)
    R = round(float(R_raw[0]) * 100, 2)
    F1 = round(float(F1_raw[0]) * 100, 2)
    print("Precision: ", P)
    print("Recall: ", R)
    print("F1 Score: ", F1)

    return P, R, F1

In [None]:
# def evaluate_retrieved_context_cosine(retrieved_context, ground_truth_context,embedding_model=embedding_model):
#     """Evaluation for retrieval quality; retrieved contexts vs ground truth context"""
#     # TODO: It is just the sample to remember. Make it correct later

#     retrieved_context_embeddings = embedding_model.encode(retrieved_context, convert_to_tensor=True)
#     ground_truth_context_embeddings = embedding_model.encode(ground_truth_context, convert_to_tensor=True)

#     cosine_score = util.pytorch_cos_sim(retrieved_context_embeddings, ground_truth_context_embeddings)

#     print("Cosine Similarity between generated response and ground truth answer:", cosine_score)
    
#     return cosine_score

In [None]:
def rag_chat_with_processing_time(query):
    """Run one stand-alone RAG chat turn and measure how long it takes.

    Args:
        query: The user's question.

    Returns:
        A tuple ``(response, context, processing_time)`` where
        ``processing_time`` is the elapsed time in seconds formatted as a
        string with two decimals.
    """
    print(f"User query: {query}")

    # Start every timed call from an empty conversation. Assigning
    # `conversation_history = []` inside this function would only create a
    # local variable and leave the shared history untouched, so earlier
    # questions would leak into (and keep growing) the prompt.
    fresh_history = []

    # Time the response with a monotonic clock
    start = time.perf_counter()
    response, context = rag_chat(query, history=fresh_history)
    end = time.perf_counter()

    print(response)
    processing_time = f"{end - start:.2f}"
    print(f"---\nProcessing time: {processing_time} seconds")
    return response, context, processing_time

In [None]:
def generate_report(questions=questions,answers=answers):
    """Evaluate the RAG pipeline on every question/answer pair.

    For each pair the question is answered with
    ``rag_chat_with_processing_time`` and the answer is scored against the
    reference with the cosine and BERTScore helpers.

    Returns:
        A dict of equally long lists, one per report column: question,
        response, answer, cosine, precision, recall, f1_score, context and
        processing_time.
    """
    columns = (
        'question', 'response', 'answer',
        'cosine', 'precision', 'recall', 'f1_score',
        'context', 'processing_time',
    )
    reports = {column: [] for column in columns}
    separator = "======================================"

    for question, answer in zip(questions, answers):
        response, context, processing_time = rag_chat_with_processing_time(question)
        cosine = evaluate_generated_response_cosine(response, answer,embedding_model=embedding_model)
        precision, recall, f1_score = evaluate_generated_response_prf(response, answer)

        row = {
            'question': question,
            'response': response,
            'answer': answer,
            'cosine': cosine,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'context': context,
            'processing_time': processing_time,
        }
        for column, value in row.items():
            reports[column].append(value)

        # Visual divider between two evaluated questions
        for _ in range(2):
            print(separator)

    return reports

def save_report_to_csv(reports, path='report.csv'):
    """Write the evaluation report to a CSV file.

    Args:
        reports: Mapping of column name to list of values (as returned by
            ``generate_report``); anything ``pd.DataFrame`` accepts works.
        path: Destination file. Defaults to ``'report.csv'`` in the working
            directory, the location that was previously hard-coded.

    Returns:
        0 after the file has been written.
    """
    df_reports = pd.DataFrame(reports)
    df_reports.to_csv(path, index=False)
    print("Report saved to the file successfully.")
    return 0

# Run the evaluation over all dataset questions (one LLM call per question)
# and write the collected metrics to report.csv.
reports = generate_report()
save_report_to_csv(reports)