# RAG Implementation with Quantized-Models

This notebook implements a Retrieval-Augmented Generation (RAG) system using:
- Milvus for vector storage and retrieval
- Sentence Transformer for embedding generation
- Gemma for text generation

## 1. Import Dependencies

In [1]:
import time
import re, json
import numpy as np
import pandas as pd
from llama_cpp import Llama
from bert_score import score
from sentence_transformers import util
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from transformers import logging as transformers_logging
from pymilvus import DataType, MilvusClient, WeightedRanker, RRFRanker, AnnSearchRequest

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ignore warnings
transformers_logging.set_verbosity_error()

## 2. Load Data

In [13]:
data_file_name_= 'endpoints_info_v5.csv'
df = pd.read_csv(data_file_name_)

slice_df = df[['title','question','answer','context']]

pure_contexts = slice_df['context'].tolist()
questions = slice_df['question'].tolist()
answers = slice_df['answer'].tolist()
titles = slice_df['title'].tolist()
qa_as_context = (slice_df['question'] + ' ' + slice_df['answer']).to_list()
contexts = []
for pure_context in pure_contexts:
    contexts.append(pure_context)

# for qa in qa_as_context:
#     contexts.append(qa)


del slice_df
del df
del pure_contexts
del qa_as_context

## 3. Configure Embedding Model

In [4]:
computation_device = "cuda:0"
# Load embedding model
def load_embedding_model(model_name='bge-m3'):
    """Load and configure the sentence transformer model for embeddings"""
    embedding_model_path = f"./models/{model_name}"
    
    # Load model from local path
    embedding_model = BGEM3EmbeddingFunction(
        model_name=embedding_model_path,
        device=computation_device
    )
    
    return embedding_model

# Initialize models
embedding_model = load_embedding_model()

## 4. Set Up Vector Database

In [5]:
def setup_client_vector_db():
    """Initialize Milvus and create or get collection"""
    vector_db_client = MilvusClient( 
    uri = "http://192.168.100.118:19530",
    user= "admin",
    password= "admin",
    db_name= "default"
    )
    return vector_db_client

def setup_schema_vector_db(vector_db_client):
    schema = vector_db_client.create_schema(
    auto_id=False,
    enable_dynamic_field=True
        )
    # Add fields to schema
    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
    schema.add_field(field_name="title", datatype=DataType.VARCHAR, max_length=2000)
    schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=20000)
    schema.add_field(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)
    schema.add_field(field_name="dense", datatype=DataType.FLOAT_VECTOR, dim=1024)
    return schema 

def setup_index_vector_db(vector_db_client):
    index_params = vector_db_client.prepare_index_params()
    # Add indexes
    index_params.add_index(
        field_name="dense",
        index_name="dense_index",
        index_type="AUTOINDEX",
        metric_type="IP"
        )

    index_params.add_index(
        field_name="sparse",
        index_name="sparse_index",
        index_type="SPARSE_INVERTED_INDEX",  # Index type for sparse vectors
        metric_type="IP",  # Currently, only IP (Inner Product) is supported for sparse vectors
        params={"drop_ratio_build": 0.2},  # The ratio of small vector values to be dropped during indexing
        )    
    return index_params 

def setup_collection_vector_db(vector_db_client,collection_name, schema,index_params):
    result_drop_collection = vector_db_client.drop_collection(collection_name)
    result_setup_collection_vector_db = vector_db_client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_params
        )    
    return result_setup_collection_vector_db 


def preview_documents_in_collection_vector_db(vector_db_client,collection_name,filter_expression = "text like '%%'"):
    result_preview_documents_in_collection_vector_db = vector_db_client.query(collection_name, filter = filter_expression, output_fields= ["title","text"])
    return result_preview_documents_in_collection_vector_db 


def delete_documents_in_collection_vector_db(vector_db_client,collection_name,filter_expression = "text like '%%'"):
    result_delete_documents_in_collection_vector_db = vector_db_client.delete(collection_name, filter = filter_expression)
    return result_delete_documents_in_collection_vector_db 



# Set up Milvus
collection_name = 'endpoints_info'
vector_db_client = setup_client_vector_db()
schema_vector_db = setup_schema_vector_db(vector_db_client)
index_params_vector_db = setup_index_vector_db(vector_db_client)
result_setup_collection_vector_db = setup_collection_vector_db(vector_db_client,collection_name, schema_vector_db,index_params_vector_db)

## 5. Add Documents to Vector Database

In [6]:
def extract_sparse_vector(encoded_document):    
    # Convert to dictionary format {index: value}
    coo = encoded_document.tocoo()
    sparse_dict = {int(col): float(val) for col, val in zip(coo.col, coo.data)}
    return sparse_dict

def l2_normalize(vec):
    norm = np.linalg.norm(vec)
    return (vec / norm).tolist() if norm > 0 else vec

def add_documents_to_collection(vector_db_client, collection_name, titles, documents, embedding_model):
    """Add documents to Milvus collection with embeddings"""
    # Generate embeddings for documents
    encoded_documents  = embedding_model.encode_documents(documents)
    print("Dense vector shape:", encoded_documents["dense"][0].shape)
    print("Sparse vector shape:", list(encoded_documents["sparse"])[0].shape)
    
    # Prepare id
    result_flush = vector_db_client.flush(collection_name)    
    row_count = vector_db_client.get_collection_stats(collection_name)['row_count']

    data = []
    for i in range(len(encoded_documents['dense'])):
        # Get the sparse vector for this row
        sparse_dict = extract_sparse_vector(encoded_documents['sparse'][i,:])

        # Get and normalize dense vector for this row
        dense_normalized_list = l2_normalize(encoded_documents['dense'][i].tolist())    

        data.append({
            "id": row_count + i,
            "title": titles[i],
            "text": documents[i],
            "sparse": sparse_dict,
            "dense": dense_normalized_list 
        })
    
    # Add documents with embeddings to collection
    result_add_documents_to_collection = vector_db_client.insert(
        collection_name=collection_name,
        data=data
    )
    return result_add_documents_to_collection['insert_count']

# Add documents to collection
insert_count = add_documents_to_collection(vector_db_client, collection_name, titles, contexts, embedding_model)
print(f"Added {insert_count} documents to vector database")

Dense vector shape: (1024,)
Sparse vector shape: (250002,)
Added 134 documents to vector database


In [7]:
preview_documents_in_collection_vector_db(vector_db_client,collection_name,'id in [0]')
# delete_documents_in_collection_vector_db(vector_db_client,collection_name)

data: []

## 6. Implement Retrieval Function

In [8]:
def prepare_ann_search_request(query_dense_vector, query_sparse_vector, top_k):
    search_param_1 = {
        "data": [query_dense_vector],
        "anns_field": "dense",
        "param": {
            "metric_type": "IP",
            "params": {"nprobe": 10}
        },
        "limit": top_k
    }
    request_1 = AnnSearchRequest(**search_param_1)

    search_param_2 = {
        "data": [query_sparse_vector],
        "anns_field": "sparse",
        "param": {
            "metric_type": "IP",
            "params": {"drop_ratio_build": 0.2}
        },
        "limit": top_k
    }
    request_2 = AnnSearchRequest(**search_param_2)

    reqs = [request_1, request_2]
    
    return reqs

def retrieve_relevant_documents(vector_db_client, query, collection_name, embedding_model, top_k=2, similarity_threshold=0.48):
    """Retrieve relevant documents based on semantic similarity"""
    # Create embedding for query
    query_embedding = embedding_model.encode_documents([query])
    query_dense_vector = l2_normalize(query_embedding['dense'][0].tolist())
    query_sparse_vector = extract_sparse_vector(query_embedding['sparse'][0,:])

    # Prepare ANN Search Request
    requests = prepare_ann_search_request(query_dense_vector, query_sparse_vector, top_k)

    # Setup Ranker
    ranker = WeightedRanker(0.3, 0.85) 

    # ranker = RRFRanker(100)

    # Query the collection
    results = vector_db_client.hybrid_search(
        collection_name=collection_name,
        reqs=requests,
        ranker=ranker,
        limit=top_k,
        output_fields=["text"],
    )


    not_found_message = ["برای پاسخ این سوال با ادمین تماس بگیرید."]
    not_found_message = [""]

    
    # Extract results
    results = results[0]
    documents = []
    distances = []
    for hits in results:
        documents.append(hits['entity']['text'])
        distances.append(hits['distance'])
    
    # Print similarity scores for debugging
    print(f"Similarity scores: {[d for d in distances]}")
    
    # Optional: Filter by similarity threshold
    filtered_docs = [doc for doc, dist in zip(documents, distances) if dist >= similarity_threshold]
    return filtered_docs if filtered_docs else not_found_message
    
    # return documents

# Test retrieval function
top_k = 5
test_query = questions[14]
print(test_query)
retrieved_docs = retrieve_relevant_documents(vector_db_client, test_query, collection_name, embedding_model,top_k)
# print(retrieved_docs)
for retrieved_docs in retrieved_docs:
    print(f"Retrieved document: {retrieved_docs[:400]}...")

سرویسای دیگه هم   به افرا  اضافه میشه یا فقط همیناست؟
Similarity scores: []
Retrieved document: ...


## 7. Evaluation Retreival System

In [9]:
def evaluate_retrieval(retrieved_contexts, expected_context, top_k=top_k):
    """
    Checks at which rank the expected (gold) context was retrieved
    and calculates basic metrics like MRR (Mean reciprocal rank).
    """
    for rank, doc in enumerate(retrieved_contexts, start=1):  # ranks start at 1
        if expected_context in doc:
            return {
                "rank": rank,
                "recall@k": 1 if rank <= top_k else 0,
                "MMR": 1 / rank
            }
    
    # Not found
    return {
        "rank": None,
        "recall@k": 0,
        "MMR": 0
    }

In [10]:
def generate_retrieval_system_report(vector_db_client, questions, expected_contexts, collection_name, top_k):
    reports = {
        'question': [],
        'expected_context': [],
        'retrieved_contexts': [],
        'rank': [],
        'recall@k': [],
        'MMR': [],
    }
    for question, expected_context in zip(questions,expected_contexts):
        retrieved_contexts = retrieve_relevant_documents(vector_db_client, question, collection_name, embedding_model, top_k)
        metrics = evaluate_retrieval(retrieved_contexts, expected_context, top_k=top_k)
        rank = metrics['rank']
        recall_at_k = metrics['recall@k']
        mmr = metrics['MMR']
        reports['question'].append(question)
        reports['expected_context'].append(expected_context)
        reports['retrieved_contexts'].append(retrieved_contexts)
        reports['rank'].append(rank)
        reports['recall@k'].append(recall_at_k)
        reports['MMR'].append(mmr)
        print("Question: ",question)
        print("Found: ",recall_at_k)
        print("Rank: ", rank)
        print("MMR: ", mmr)
        print("Retrieved Contexts count: ", len(retrieved_contexts))
        print("======================================")
        print("======================================")
    return reports

retrieval_system_report = generate_retrieval_system_report(vector_db_client, questions, contexts, collection_name, top_k)

Similarity scores: []
Question:  x trace id برای چیه؟
Found:  0
Rank:  None
MMR:  0
Retrieved Contexts count:  1
Similarity scores: []
Question:  x-trace-id کجاها استفاده میشه؟
Found:  0
Rank:  None
MMR:  0
Retrieved Contexts count:  1
Similarity scores: [0.6897344589233398, 0.6897344589233398, 0.6897344589233398, 0.6814343929290771, 0.6811771988868713]
Question:  باندل آی دی برای چیه؟
Found:  1
Rank:  1
MMR:  1.0
Retrieved Contexts count:  5
Similarity scores: [0.7057149410247803, 0.7057149410247803, 0.7057149410247803, 0.6708484888076782, 0.669792890548706]
Question:  bundle id برای چیه؟
Found:  1
Rank:  1
MMR:  1.0
Retrieved Contexts count:  5
Similarity scores: [0.7499792575836182, 0.7202954888343811, 0.6801072359085083, 0.6636160612106323, 0.44051843881607056]
Question:  کلید انحصاری اپلیکیشن user app key چیه؟
Found:  1
Rank:  1
MMR:  1.0
Retrieved Contexts count:  4
Similarity scores: [0.7084743976593018, 0.7080954909324646, 0.6679520606994629, 0.6679520606994629, 0.6679520606994

In [11]:
def save_retrieval_system_report(reports):
    df_reports = pd.DataFrame(reports)
    df_reports.to_csv('retrieval_system_report.csv', index=False)
    # print("Cosine Similarity Average:",df_reports['cosine'].mean(), '%')
    # print("Sum processing time:",df_reports['processing_time'].astype(float).sum(), 'sec')
    # print("Report saved to the file successfully.")
    return df_reports

df_retrieval_system_report = save_retrieval_system_report(retrieval_system_report)

In [12]:
df_retrieval_system_report[df_retrieval_system_report['recall@k'] == 0]

Unnamed: 0,question,expected_context,retrieved_contexts,rank,recall@k,MMR
0,x trace id برای چیه؟,در راستاي افزایش دقت لاگ تراکنش ها و تسهیل فرآ...,[],,0,0.0
1,x-trace-id کجاها استفاده میشه؟,در راستاي افزایش دقت لاگ تراکنش ها و تسهیل فرآ...,[],,0,0.0
14,سرویسای دیگه هم به افرا اضافه میشه یا فقط ه...,در صورت ارائه هر سرویس جدید قابل استفاده توسط ...,[افرا به عنوان اپراتور خدمات انتظامی کشور سروی...,,0,0.0
24,چقدر وقت دارم صورتحسابو پرداخت کنم؟,فاکتورهای موضوع قرارداد با مشتری، با رعایت الز...,[مطابق قرارداد مشتری موظف است کلیه مغایرت های ...,,0,0.0
28,چجوری میتونم برای مشتریم دسترسی بگیرم؟,در صورت استفاده سرویس ها توسط مشتریان زیرمجموع...,[اطلاعات دسترسی پنل یعنی نام کاربری (user name...,,0,0.0
37,سرویسو فراخوانی کردم، کد وضعیت 202 گرفتم، این ...,کد وضعیت ۲۰۲ از سمت سرویس دهنده دریافت میشود و...,[کد وضعیت 200 از سمت سرویس دهنده دریافت میشود ...,,0,0.0
39,سرویسو فراخوانی کردم، کد وضعیت 401 گرفتم، این ...,این کد وضعیت از سمت سرویس دهنده دریافت میشود و...,[کد وضعیت 404 ارسالی توسط سرویس دهنده، به معنا...,,0,0.0


In [13]:
df_retrieval_system_report.sort_values(by = ['rank'], ascending = False)

Unnamed: 0,question,expected_context,retrieved_contexts,rank,recall@k,MMR
10,اگه درباره پنل مشتریان سوال داشتم، چیکار کنم؟,اطلاعات دسترسی پنل یعنی نام کاربری (user name)...,[پنل مشتریان پلتفرم افرا به آدرس https://apiad...,2.0,1,0.5
34,جزئیات بیشتر ورودی و خروجی سرویس‌ها را چجوری ت...,پس از عقد قرارداد و درخواست سرویس ها توسط مشتر...,[در صورت استفاده سرویس ها توسط مشتریان زیرمجمو...,2.0,1,0.5
26,در صورت وجود مغایرت در صورت وضعیت ماهانه چه اق...,در صورت بروز هرگونه اختلاف و مغایرت بین طرفین ...,[مطابق قرارداد مشتری موظف است کلیه مغایرت های ...,2.0,1,0.5
58,سرویسو فراخوانی کردم، روی HTTP کد 403 گرفتم، ا...,کد 403 در HTTP به معنای دسترسی غیرمجاز است - ا...,[کد 503 در HTTP به معنای این است که سرویس در د...,2.0,1,0.5
65,سرویسو فراخوانی کردم، روی HTTP کد 410 گرفتم، ا...,کد 410 در HTTP به معنای این است که منبع درخواس...,[کد وضعیت 410 ارسالی توسط سرویس دهنده، به این ...,2.0,1,0.5
...,...,...,...,...,...,...
14,سرویسای دیگه هم به افرا اضافه میشه یا فقط ه...,در صورت ارائه هر سرویس جدید قابل استفاده توسط ...,[افرا به عنوان اپراتور خدمات انتظامی کشور سروی...,,0,0.0
24,چقدر وقت دارم صورتحسابو پرداخت کنم؟,فاکتورهای موضوع قرارداد با مشتری، با رعایت الز...,[مطابق قرارداد مشتری موظف است کلیه مغایرت های ...,,0,0.0
28,چجوری میتونم برای مشتریم دسترسی بگیرم؟,در صورت استفاده سرویس ها توسط مشتریان زیرمجموع...,[اطلاعات دسترسی پنل یعنی نام کاربری (user name...,,0,0.0
37,سرویسو فراخوانی کردم، کد وضعیت 202 گرفتم، این ...,کد وضعیت ۲۰۲ از سمت سرویس دهنده دریافت میشود و...,[کد وضعیت 200 از سمت سرویس دهنده دریافت میشود ...,,0,0.0


## 8. Load LLM for Generation

In [9]:
context_window_size = 20000

def load_llm_model(model_path="./models/gemma-3-4b-it.Q2_K/gemma-3-4b-it.Q2_K.Q8_0.gguf",chat_format='gemma'):
    """Load and configure the LLM for text generation"""
    llm = Llama(
        model_path=model_path,
        chat_format=chat_format,
        n_gpu_layers=-1,  # Use all available GPU layers
        n_ctx=context_window_size,       # Context window size
        verbose=False
    )
    
    return llm


# Define LLM models details like: path and chat_format
llm_models_details = {
    'dorna-llama3-8b-q8' : {'path': './models/Dorna-Llama3-8B-Instruct-GGUF-Q8/dorna-llama3-8b-instruct.Q8_0.gguf',
                            'chat_format': 'llama-3'},
    'deepseek-r1-7b-qwen' : {'path': './models/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B.Q8_0.gguf',
                            'chat_format': 'gemma'},
    'gemma-3-4b-q2': {'path':'./models/gemma-3-4b-it.Q2_K/gemma-3-4b-it.Q2_K.gguf',
                     'chat_format': 'gemma'},
    'gemma-3-4b-q8': {'path':'./models/gemma-3-4b-it.Q8_0/gemma-3-4b-it.Q8_0.gguf',
                      'chat_format': 'gemma'},
    'gemma-3-4b-fp16': {'path':'./models/gemma-3-4b-it.fp16/gemma-3-4b-it.fp16.gguf',
                        'chat_format': 'gemma'},
    'gemma-3-12b-q4': {'path':'./models/gemma-3-12b-it.Q4_0/gemma-3-12b-it-q4_0.gguf',
                        'chat_format': 'gemma'}
    }

# Load Llama model
target_llm_model = 'gemma-3-12b-q4'
llm_model_path, llm_chat_format = llm_models_details[target_llm_model]['path'], llm_models_details[target_llm_model]['chat_format']
llm = load_llm_model(llm_model_path, llm_chat_format)

llama_init_from_model: n_ctx_per_seq (20000) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


## 9. Create RAG Pipeline

In [None]:
# Define prompt template
USER_PROMPT_TEMPLATE = '''
<role>You are “Technical Assistant.”</role>

<goal>
Answer technical questions concisely, using **only** chat history or retrieved context, and reply in formal Persian.
</goal>

<precedence>
1.<guardrails> 2.<behavior> 3.Latest user request 4.History 5.Context
</precedence>

<behavior>
  • Pay attention to conversation history always.  
  • Follow-ups like «بیشتر توضیح بده / مثال بزن» → extend last answer and explain more details.  
  • If no clear keyword remains, ask one clarifying question (Persian).  
  • If info is absent in both history & context → output «اطلاعاتی در دسترس نیست. با ادمین تماس بگیرید.»  
</behavior>

<steps>
  0. **Short-Reply**: new question → 2 brief sentence.  
  1. **Detailed-Reply**: user asks for more → up to 10 sentences, only existing info.  
  2. **Clarify**: ambiguous or keywordless → ask 1 clarifying Q.  
  3. **No-Info**: nothing relevant → fixed no-info sentence.
</steps>

<style>
Formal Persian; no emojis; bullets only on request.
</style>

<guardrails>
  • No external knowledge.  
  • Reveal technical details only when asked.  
  • Reject out-of-scope requests with «درخواست خارج از حیطه است».  
  • All answers Persian.  
  • If origin queried → «من توسط شرکت امیرمهدی ساخته شده‌ام.» (never mention Google).
</guardrails>

Context:
{context}

Conversation history:
{history}

User:
{prompt}
'''


SUMMARY_PROMPT_TEMPLATE = """
<role>You are “Conversation Summarizer.”</role>

<goal>
Return a **concise Persian chronology** of every user–assistant turn in the form:  
«اول، کاربر … و دستیار … . دوم، کاربر … و دستیار … . سوم، … .»  
Continue sequentially until the last turn; each clause must capture only the key request and the core response.
</goal>

<precedence>
1. <guardrails>  
2. <behavior>  
3. Conversation history  
</precedence>

<behavior>
  <rule>Process the entire history (summarise internally if > 2 000 characters).</rule>
  <rule>Extract just the essential intent of each user message and the essence of the assistant’s reply.</rule>
  <rule>Limit the whole summary to ≤ 60 Persian words **or** 6 numbered pairs—whichever comes first.</rule>
  <rule>Use the fixed numbered format; no extra commentary, emojis, or citations.</rule>
</behavior>

<steps>
  <step0 title="Chronology">
    <trigger>Always (history is present)</trigger>
    <action>Return the numbered Persian chronology.</action>
  </step0>
</steps>

<style>
  <item>Formal Persian; no bullet points, no emojis.</item>
</style>

<guardrails>
  <rule>No external knowledge or personal assumptions.</rule>
  <rule>All outputs must be in Persian.</rule>
</guardrails>

Conversation History:
{conversation_history}
"""




EXTRACT_KEYWORD_PROMPT_TEMPLATE = """
<role>You are “Keyword Extractor.”</role>

<goal>
Return the **maximum 10 most relevant Persian keywords / phrases** from the user prompt.  
Example – input:  
«سلام … کد پاسخ ۵۰۳ معنیش چیه؟ ممنون» → output: «کد پاسخ ۵۰۳»
</goal>

<precedence>
1. <guardrails>  
2. <behavior>  
3. Latest user prompt
</precedence>

<behavior>
  <rule>Remove greetings, courtesy phrases, punctuation, and stop-words.</rule>
  <rule>Otherwise return up to ten Persian words that best capture the technical intent (e.g., «اتصال پایگاه‌داده»).</rule>
  <rule>Output must be one Persian phrase, no extra characters, no brackets, no emojis.</rule>
  <rule>If no meaningful technical term exists, output the full text without extraction.</rule>
</behavior>

<steps>
  <step0 title="Extract">
    <trigger>Always</trigger>
    <action>Return the cleaned Persian keywords/phrases per rules.</action>
  </step0>
</steps>

<style>
  <item>Output exactly the keywords phrase—nothing else.</item>
</style>

<guardrails>
  <rule>No external knowledge; extract only from the prompt text.</rule>
  <rule>All outputs must be in Persian.</rule>
</guardrails>

User Prompt:
{user_prompt}

"""

# Initialize conversation history
conversation_history = []


def summarize_history_with_llm(history,llm=llm):
    """Summarize query using the LLM"""
    summary_prompt = SUMMARY_PROMPT_TEMPLATE.format(conversation_history = history)
    messages = [
        {"role": "user", "content": summary_prompt}
    ]
    
    response = llm.create_chat_completion(
        messages=messages,
        top_p=0.85,
        temperature=0.0  # Low temperature for more deterministic responses
    )
    # .split('</think>')[-1] if the model thinks!
    response_clean = response['choices'][0]['message']['content']
    print(f"Summarized version: {response_clean}")

    return response_clean


def extract_keyword_with_llm(user_prompt,llm=llm):
    """Extract Keyword using the LLM"""
    extract_keyword_prompt = EXTRACT_KEYWORD_PROMPT_TEMPLATE.format(user_prompt = user_prompt)
    messages = [
        {"role": "user", "content": extract_keyword_prompt}
    ]
    
    response = llm.create_chat_completion(
        messages=messages,
        top_p=0.85,
        temperature=0.0  # Low temperature for more deterministic responses
    )
    # .split('</think>')[-1] if the model thinks!
    response_clean = response['choices'][0]['message']['content']
    print(f"Extracted version: {response_clean}")

    return response_clean


def retrieve_context(vector_db_client, query, collection_name=collection_name, embedding_model=embedding_model,top_k=top_k):
    """Retrieve relevant context based on the query"""
    docs = retrieve_relevant_documents(vector_db_client, query, collection_name, embedding_model,top_k)
    for doc in docs:
        print(f"Retrieved document: {doc[:100]}...")
    return "\n".join(docs)


def generate_response_stream(model_input, llm=llm):
    """Generate streaming response using the LLM"""
    messages = [
        {"role": "user", "content": f"{model_input}"}
    ]
    
    response = llm.create_chat_completion(
        messages=messages,
        top_p=0.85,
        temperature=0.1,  # Low temperature for more deterministic responses
        repeat_penalty= 1.2,
        stream=True
    )
    
    full_response = ""
    for chunk in response:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            content = delta['content']
            print(content, end='', flush=True)
            full_response += content
            yield content


def generate_response(model_input, llm=llm):
    """Generate response using the LLM"""
    messages = [
        {"role": "user", "content": model_input}
    ]
    
    response = llm.create_chat_completion(
        messages=messages,
        top_p=0.85,
        temperature=0.1,  # Low temperature for more deterministic responses
      repeat_penalty= 1.2
    )
    # .split('</think>')[-1] if the model thinks!
    response_clean = response['choices'][0]['message']['content']
    print(response_clean)

    return response_clean


def rag_chat(user_query, collection_name=collection_name, history=None, stream=False):
    """Complete RAG pipeline: Retrieve → Generate → Respond"""
    if history is None:
        history = conversation_history
    
    summarized_history_text = ""

    # Format conversation history
    history_text = "\n".join(history)

    if history:
        summarized_history_text = summarize_history_with_llm(history_text)
        
    
    user_query_keywords = extract_keyword_with_llm(user_query)

    # Retrieve relevant context
    context = retrieve_context(vector_db_client, user_query_keywords, collection_name, embedding_model,top_k)

        
    
    # Create prompt with context and history
    prompt = USER_PROMPT_TEMPLATE.format(
        history=summarized_history_text,
        context=context, 
        prompt=user_query
    )

    if stream:
        # Generate streaming response
        response = ""
        response_stream = generate_response_stream(prompt)
        for chunk in response_stream:
            response += chunk
    else:
        # Generate response (non-stream)
        response = generate_response(prompt)


    history.append(f"User: {user_query}")
    history.append(f"Assistant: {response}")


    
    llm.reset()
    return response,context

## 10. Test RAG System

In [14]:
# Example 1: Basic question (Without Summary)
query1 = questions[1]
print(f"User query: {query1}")

# Reset conversation history
conversation_history = []

# Time the response
start = time.time()
response, context = rag_chat(query1,stream = True)
end = time.time()

print(f"\nProcessing time: {end - start:.2f} seconds")

User query: x-trace-id کجاها استفاده میشه؟
Extracted version: x-trace-id
Similarity scores: [0.683688759803772, 0.6830585598945618, 0.6352226138114929, 0.6352226138114929, 0.445304811000824]
Retrieved document: در راستاي افزایش دقت لاگ تراکنش ها و تسهیل فرآیندهای RA، مولفه x-trace-id یا X trace id كه در هدر (H...
Retrieved document: در راستاي افزایش دقت لاگ تراکنش ها و تسهیل فرآیندهای RA، مولفه X-trace-id كه در هدر (Header) پاسخ بر...
Retrieved document: جهت حل مشکل تمدید بسته ها و استفاده از سرویس بسته های متنوع، متغیری به نام باندل آی دی یا bundle -id...
Retrieved document: جهت حل مشکل تمدید بسته ها و استفاده از سرویس بسته های متنوع، متغیری به نام باندل آی دی یا bundle -id...
x-trace-id در هدر پاسخ برگشتی ارسال می‌شود و باید توسط دریافت‌کننده سرویس ذخیر

KeyboardInterrupt: 

In [None]:
# Example 2: Follow-up question
query2 = "میشه بیشتر راجع به این توضیح بدی؟"
print(f"User query: {query2}")

# Time the response (using existing conversation history)
start = time.time()
response, context = rag_chat(query2,stream = True)
end = time.time()

print(f"---\nProcessing time: {end - start:.2f} seconds")

User query: میشه بیشتر راجع به این توضیح بدی؟
Summarized version: اول، کاربر پرسید x-trace-id کجاها استفاده میشه؟ و دستیار پاسخ داد در هدر پاسخ برگشتی ارسال می‌شود و برای افزایش دقت لاگ تراکنش‌ها صادر می‌شود.
Extracted version: توضیح
Similarity scores: [0.6261348128318787, 0.19380313158035278, 0.19331420958042145, 0.193064883351326, 0.1930556744337082]
Retrieved document: در رنج کدهای پاسخ 10xxx، درخواست کاربر به درستی به تامین کننده نهایی (سازمان مقصد) ارسال شده است. دق...
بله، می‌توانم بیشتر توضیح دهم. x-trace-id در هدر پاسخ برگشتی ارسال می‌شود تا به افزایش دقت لاگ تراکنش‌ها کمک کند و امکان ردیابی دقیق‌تر جریان درخواست را فراهم آورد. این شناسه برای شناسایی یک تراکنش خاص در سیستم‌های مختلف مورد استفاده قرار می‌گیرد.---
Processing time: 5.36 seconds


In [None]:
# Example 3: Follow-up question
query3 = "تو رو کی ساخته؟"
print(f"User query: {query3}")

# Time the response
start = time.time()
response, context = rag_chat(query3,stream = True)
end = time.time()


print(f"\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 4: Follow-up question
query4 = "اولش ازت چی پرسیدم؟"
print(f"User query: {query4}")

# Time the response
start = time.time()
response, context = rag_chat(query4,stream = True)
end = time.time()


print(f"\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 5: Follow-up question
query5 = "بعدش ازت چی پرسیدم؟"
print(f"User query: {query5}")

# Time the response
start = time.time()
response, context = rag_chat(query5,stream = True)
end = time.time()


print(f"\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 6: Different topic question
query6 = questions[29]
print(f"User query: {query6}")

# Reset conversation history
conversation_history = []

# Time the response
start = time.time()
response, context = rag_chat(query6,stream = True)
end = time.time()


print(f"---\nProcessing time: {end - start:.2f} seconds")

In [None]:
# Example 7: Follow-up question
query7 = "چرا؟"
print(f"User query: {query7}")

# Reset conversation history
# conversation_history = []

# Time the response
start = time.time()
response, context = rag_chat(query7,stream = True)
end = time.time()


print(f"\nProcessing time: {end - start:.2f} seconds")

## 11. RAG System Evaluation

Test with more complex queries to evaluate retrieval performance and answer quality.

In [None]:
JUDGE_TEMPLATE = """
<role>You are an Answer-Quality Judge.</role>

<goal>
  Return ONE and only ONE JSON object with exactly two keys:
    "score" → an integer 1-10 (no quotes, no decimals)
    "tags"  → a non-empty JSON array whose elements come only from:
              helpful, partially_helpful, unhelpful, off_topic,
              unclear, incorrect, incomplete, redundant,
              verbose
</goal>

<rules>
  <rule>No other keys, comments, or text are allowed.</rule>
  <rule>Do NOT echo the question or answers.</rule>
  <rule>Do NOT wrap the JSON in markdown.</rule>
  <rule>If you violate any rule, output exactly
        {{\"error\":\"invalid output\"}} and STOP.</rule>
</rules>

<output_format>
<![CDATA[
{{"score": <1-10>, "tags": ["<allowed_tag>", …]}}
]]>
</output_format>

Give me score and tags in JSON format.

Question: {question}
Answer (Model): {generated_response}
Answer (Ground truth): {ground_truth_answer}
"""


In [None]:
def evaluate_generated_response_llm_as_a_judge(question, generated_response, ground_truth_answer):
    
    """Evaluation for generated response by model vs ground truth answer"""
    judge_prompt = JUDGE_TEMPLATE.format(question = question, generated_response = generated_response, ground_truth_answer = ground_truth_answer)
    messages = [
        {"role": "user", "content": judge_prompt}
    ]
    
    response = llm.create_chat_completion(
        messages=messages,
        top_p=0.85,
        temperature=0.1,  # Low temperature for more deterministic responses
        repeat_penalty= 1.2 
        )
    response_clean = response['choices'][0]['message']['content']
    
    print("LLM as a judge result:", response_clean)
    
    return response_clean


def evaluate_generated_response_cosine(generated_response, ground_truth_answer,embedding_model=embedding_model):
    
    """Evaluation for generated response by model vs ground truth answer"""
    generated_response_embeddings = embedding_model.encode_documents([generated_response])['dense'][0]
    ground_truth_answer_embeddings = embedding_model.encode_documents([ground_truth_answer])['dense'][0]

    cosine_score_raw = util.pytorch_cos_sim(generated_response_embeddings, ground_truth_answer_embeddings)
    
    cosine_score = round(float(cosine_score_raw[0][0]) * 100, 2)
    print("Cosine Similarity between generated response and ground truth answer:", cosine_score)
    
    return cosine_score


def evaluate_generated_response_prf(generated_response, ground_truth_answer):
    
    """Evaluation for generated response by model vs ground truth answer"""

    P_raw, R_raw, F1_raw = score([generated_response], [ground_truth_answer], lang='en') # model_type='distilbert-base-uncased'
    P = round(float(P_raw[0]) * 100, 2)
    R = round(float(R_raw[0]) * 100, 2)
    F1 = round(float(F1_raw[0]) * 100, 2)
    print("Precision: ", P)
    print("Recall: ", R)
    print("F1 Score: ", F1)
    
    return P, R, F1

In [None]:
def rag_chat_with_processing_time(query):
    """RAG chat + processing time """    
    print(f"User query: {query}")

    # Time the response
    start = time.time()
    response, context = rag_chat(query,stream=False)
    end = time.time()
    
    processing_time = f"{end - start:.2f}"
    print(f"\nProcessing time: {processing_time} seconds")
    return response, context, processing_time

In [None]:
def extract_judgment_json_from_llm_response(llm_response):
    judgment_json_str = re.sub(r'^```json\s*|\s*```$', '', llm_response).strip()

    judgment_json = json.loads(judgment_json_str)
    return judgment_json

In [None]:
def generate_rag_system_report(questions=questions,answers=answers):
    reports = {
        'question': [],
        'response': [],
        'answer': [],
        'judgement score' : [],
        'judgement_tags':[],
        'cosine': [],
        'precision': [],
        'recall': [],
        'f1_score': [],
        'context': [],
        'processing_time': []
    }
    for question,answer in zip(questions,answers):
        response, context, processing_time = rag_chat_with_processing_time(question)
        llm_judgement = evaluate_generated_response_llm_as_a_judge(question,response,answer)
        llm_judgement_json = extract_judgment_json_from_llm_response(llm_judgement)
        llm_judgement_judgment_score = llm_judgement_json['score']
        llm_judgement_judgment_tags = ', '.join(llm_judgement_json['tags'])
        cosine = evaluate_generated_response_cosine(response, answer,embedding_model=embedding_model)
        precision, recall, f1_score = evaluate_generated_response_prf(response, answer)
        reports['question'].append(question)
        reports['response'].append(response)
        reports['answer'].append(answer)
        reports['judgement score'].append(llm_judgement_judgment_score)
        reports['judgement_tags'].append(llm_judgement_judgment_tags)
        reports['cosine'].append(cosine)
        reports['precision'].append(precision)
        reports['recall'].append(recall)
        reports['f1_score'].append(f1_score)
        reports['context'].append(context)
        reports['processing_time'].append(processing_time)
        print("======================================")
        print("======================================")
    return reports

reports = generate_rag_system_report()

In [None]:
def save_rag_system_report(reports):
    df_reports = pd.DataFrame(reports)
    df_reports.to_csv('rag_system_report.csv', index=False)
    print("Cosine Similarity Average:",df_reports['cosine'].mean(), '%')
    print("Judgement Score Average:",df_reports['judgement score'].mean(), '%')
    print("Sum processing time:",df_reports['processing_time'].astype(float).sum(), 'sec')
    print("Report saved to the file successfully.")
    return df_reports

df_rag_system_report = save_rag_system_report(reports)

In [None]:
df_rag_system_report['processing_time'].astype(float).plot()