In [1]:
# %load_ext autoreload
# %autoreload 2

In [1]:
from tqdm.auto import tqdm

In [2]:
import pandas as pd

## Ingestion

In [3]:
import json

# Real IDs
with open('../Data/documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

len(documents)

424

In [4]:
documents[3]

{'intent': 'report_copyright_infringement',
 'question': 'can uhelp me submitting a notification of copyright  infringement',
 'response': 'To address copyright infringement concerns, please adhere to the following guidelines:\n\n1. Access our website at {{WEBSITE_URL}}.\n2. Go to the {{COPYRIGHT_SECTION}} area.\n3. Find and select the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} choice.\n4. Complete the necessary fields, providing comprehensive details regarding the infringement.\n5. Submit the completed form for our assessment.\n\nYour submission will be reviewed, and we will take the requisite steps in line with our policies.',
 'category': 'CONTENT',
 'id': '1e254822'}

In [5]:
# Generated questions
df_question = pd.read_csv('../Data/ground-truth-data.csv')
df_question.shape

(1441, 3)

In [6]:
df_question.head()

Unnamed: 0,question,category,id
0,I suspect my copyright was violated on your pl...,CONTENT,34b742ae
1,Could you explain the process for filing a rep...,CONTENT,34b742ae
2,For someone with limited tech skills watching ...,CONTENT,34b742ae
3,"I need the procedure written out, from start t...",CONTENT,34b742ae
4,How can we submit our detailed content violati...,CONTENT,34b742ae


In [7]:
df_question.isna().sum()

question    0
category    0
id          0
dtype: int64

In [8]:
df_question[df_question.question.isna()]

Unnamed: 0,question,category,id


In [9]:
df_question = df_question.dropna()
df_question.shape

(1441, 3)

In [10]:
ground_truth = df_question.to_dict(orient='records')

In [11]:
ground_truth[0]

{'question': 'I suspect my copyright was violated on your platform; what do I need to know about reporting it?',
 'category': 'CONTENT',
 'id': '34b742ae'}

## RAG flow

In [12]:
documents[0].keys(), ground_truth[0].keys()

(dict_keys(['intent', 'question', 'response', 'category', 'id']),
 dict_keys(['question', 'category', 'id']))

In [13]:
import minsearch

# Create a MinSearch index with specified text and keyword fields
index = minsearch.Index(
    text_fields=['intent', 'question', 'response'],  # full-text searchable fields
    keyword_fields=['id', 'category'],  # fields for exact matching
)

# Fit the index to our document list
index.fit(documents)

<minsearch.minsearch.Index at 0x157faf04800>

In [17]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query['question'],  # string directly or export
        filter_dict={'category': query["category"]},
        boost_dict=boost,
        num_results=10
    )
    return results

## Retrieval evaluation

In [None]:
# Compute Hit Rate: % of queries for which the correct document was retrieved
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:  # If any retrieved doc matches the correct ID
            cnt = cnt + 1
    return cnt / len(relevance_total)


# Compute Mean Reciprocal Rank (MRR)
def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:  # True means relevant doc found at this rank
                total_score = total_score + 1 / (rank + 1)
                # break  # only the first correct hit counts for MRR
    return total_score / len(relevance_total)


# Main evaluation loop
def evaluate(ground_truth, search_function, disable=False):
    relevance_total = []

    for q in tqdm(ground_truth, disable=disable):  # iterate over each query
        doc_id = q['id']  # correct document id
        results = search_function(q)  # run search
        relevance = [d['id'] == doc_id for d in results]  # check if results match the true id
        relevance_total.append(relevance)  # collect all relevance flags

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [19]:
# Evaluate using the ground truth and the defined search function
evaluate(ground_truth, lambda q: minsearch_search(q), disable=None)

  0%|          | 0/1441 [00:00<?, ?it/s]

{'hit_rate': 0.046495489243580844, 'mrr': 0.01509368494101318}

## VectorSearch

In [18]:
# !pip install minsearch -qq
# !pip install fastembed -qq

In [None]:
import numpy as np

In [None]:
import logging
# Suppress FastEmbed internal logs
# logging.getLogger("fastembed").setLevel(logging.CRITICAL)

In [None]:
import fastembed  # required "HF_TOKEN" for download model
fastembed.__version__

'0.7.3'

In [None]:
# os.environ["HF_TOKEN"]="hf_..."

In [None]:
from fastembed.text import TextEmbedding  # required "HF_TOKEN" for download model
sorted([ i['model'] for i in TextEmbedding.list_supported_models()])

['BAAI/bge-base-en',
 'BAAI/bge-base-en-v1.5',
 'BAAI/bge-large-en-v1.5',
 'BAAI/bge-small-en',
 'BAAI/bge-small-en-v1.5',
 'BAAI/bge-small-zh-v1.5',
 'Qdrant/clip-ViT-B-32-text',
 'intfloat/multilingual-e5-large',
 'jinaai/jina-clip-v1',
 'jinaai/jina-embeddings-v2-base-code',
 'jinaai/jina-embeddings-v2-base-de',
 'jinaai/jina-embeddings-v2-base-en',
 'jinaai/jina-embeddings-v2-base-es',
 'jinaai/jina-embeddings-v2-base-zh',
 'jinaai/jina-embeddings-v2-small-en',
 'jinaai/jina-embeddings-v3',
 'mixedbread-ai/mxbai-embed-large-v1',
 'nomic-ai/nomic-embed-text-v1',
 'nomic-ai/nomic-embed-text-v1.5',
 'nomic-ai/nomic-embed-text-v1.5-Q',
 'sentence-transformers/all-MiniLM-L6-v2',
 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
 'snowflake/snowflake-arctic-embed-l',
 'snowflake/snowflake-arctic-embed-m',
 'snowflake/snowflake-arctic-embed-m-long',
 'snowflake/snowflake-arctic-embed-s',
 'snowflake/snowflake-ar

In [None]:
from fastembed import TextEmbedding  # required "HF_TOKEN" for download model
from fastembed.common.model_description import PoolingType, ModelSource

# TextEmbedding.add_custom_model(
#     model="nomic-ai/nomic-embed-text-v1.5",
#     pooling=PoolingType.MEAN,
#     normalization=True,
#     sources=ModelSource(hf="nomic-ai/nomic-embed-text-v1.5"),  # can be used with an `url` to load files from a private storage
#     dim=384,
#     model_file="onnx/model.onnx",  # can be used to load an already supported model with another optimization or quantization, e.g. onnx/model_O4.onnx
# )
model = TextEmbedding(model_name="nomic-ai/nomic-embed-text-v1.5")
model.embed?

[31mSignature:[39m
model.embed(
    documents: Union[str, Iterable[str]],
    batch_size: int = [32m256[39m,
    parallel: Optional[int] = [38;5;28;01mNone[39;00m,
    **kwargs: Any,
) -> Iterable[Union[numpy.ndarray[Any, numpy.dtype[numpy.float64]], numpy.ndarray[Any, numpy.dtype[numpy.float32]], numpy.ndarray[Any, numpy.dtype[numpy.float16]], numpy.ndarray[Any, numpy.dtype[numpy.int8]], numpy.ndarray[Any, numpy.dtype[numpy.int64]], numpy.ndarray[Any, numpy.dtype[numpy.int32]]]]
[31mDocstring:[39m
Encode a list of documents into list of embeddings.
We use mean pooling with attention so that the model can handle variable-length inputs.

Args:
    documents: Iterator of documents or single document to embed
    batch_size: Batch size for encoding -- higher values will use more memory, but be faster
    parallel:
        If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
        If 0, use all available cores.
        If None, don't us

In [22]:
from minsearch import VectorSearch  # For vector-based semantic search
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF text vectorization
from sklearn.decomposition import TruncatedSVD  # Dimensionality reduction
from sklearn.pipeline import make_pipeline  # To chain TF-IDF + SVD

In [23]:
documents[0]

{'intent': 'report_copyright_infringement',
 'question': 'how can i report copyright violation',
 'response': 'To report copyright infringement, please adhere to the following procedures:\n\n1. Access our website at {{WEBSITE_URL}}.\n2. Direct yourself to the {{COPYRIGHT_SECTION}} section.\n3. Find and click on the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} option.\n4. Provide all required details regarding the infringement.\n5. Submit the completed form for our assessment.\n\nUpon receiving your report, it will be carefully analyzed, and appropriate measures will be implemented in accordance with our guidelines.',
 'category': 'CONTENT',
 'id': '34b742ae'}

In [24]:
def prepare_texts(documents, show_progress: bool = True):
    """Concatenate question and response into a single text string."""
    iterator = documents
    if show_progress:
        iterator = tqdm(documents, desc="Preparing texts", disable=False)

    texts = []
    for doc in iterator:
        # Concatenate question and answer text
        q = doc.get("question", "")
        r = doc.get("response", "")
        t = (q + " " + r).strip()
        texts.append(t)
    return texts

texts = prepare_texts(documents)
texts

Preparing texts:   0%|          | 0/424 [00:00<?, ?it/s]

['how can i report copyright violation To report copyright infringement, please adhere to the following procedures:\n\n1. Access our website at {{WEBSITE_URL}}.\n2. Direct yourself to the {{COPYRIGHT_SECTION}} section.\n3. Find and click on the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} option.\n4. Provide all required details regarding the infringement.\n5. Submit the completed form for our assessment.\n\nUpon receiving your report, it will be carefully analyzed, and appropriate measures will be implemented in accordance with our guidelines.',
 'need to report fucking copyright violation how to do it To facilitate a conversation with a customer service representative, please adhere to the following instructions:\n\n1. Visit {{WEBSITE_URL}}.\n2. Locate the {{CONTACT_US_SECTION}} section on the site.\n3. Choose the option for connecting with a customer service representative.\n4. Provide the necessary information in the form and submit your inquiry.\n\nShould you have any additional conce

In [None]:
def get_embeddings(
    texts: "str | list[str]",
    model_name="nomic-ai/nomic-embed-text-v1.5",
    batch_size=384,  # 256
    show_progress: bool = True,
):
    """
    Generate embeddings with FastEmbed.
    """
    # Ensure list input
    if isinstance(texts, str):
        texts = [texts]
    # Automatically uses BAAI/bge-small-en-v1.5 without credentials + invalid token (authentication fails)
    embedder = TextEmbedding(model_name=model_name)
    # embeddings = embedder.embed(texts, batch_size=batch_size)  # handle all 424
    if show_progress:
        texts = tqdm(texts, desc="Embedding texts", disable=False)
    embeddings = []
    for text in texts:
        embeddings.extend(embedder.embed(text, batch_size=batch_size))
    return embeddings


def embed_documents(
    documents, 
    model_name="nomic-ai/nomic-embed-text-v1.5", 
    show_progress: bool = True
):
    """
    Main function: Convert documents (list of dicts) -> embeddings (numpy array).

    Args:
        documents (list[dict]): Documents with text field(s).
        model_name (str): Embedding model name.
        show_progress (bool): Whether to show progress bar.

    Returns:
        np.ndarray: Matrix of embeddings (num_docs x dim).
    """
    # Prepare texts (assuming prepare_texts is your own function)
    texts = prepare_texts(documents, show_progress=show_progress)

    # Get embeddings as a list of vectors
    embeddings = get_embeddings(texts, model_name)

    # Convert to numpy array for easier downstream use
    return embeddings  # np.array(embeddings)

In [None]:
X = embed_documents(documents)
print("Embeddings shape:", len(X), len(X[0]))  # if array (n_docs, embedding_dim)

Preparing texts:   0%|          | 0/424 [00:00<?, ?it/s]

Embeddings shape: 424 768


In [25]:
# # Create a pipeline: TF-IDF vectorization + Truncated SVD (128 dimensions)
# pipeline = make_pipeline(
#     TfidfVectorizer(min_df=1, ngram_range=(1,1), norm='l1'),  # ignore rare words
#     TruncatedSVD(n_components=128, random_state=0)  # reduce to 128D
# )#.set_output(transform='pandas')

# # Fit and transform the texts into embeddings (2D numpy array)
# X = pipeline.fit_transform(texts)  # pipeline.transform([query["question"]])  # shape: (1, 128) query is still only the question
# X

In [84]:
# Initialize the vector index with 'id' as a keyword filter
vindex = VectorSearch(keyword_fields=['category'])

# Fit the vector index with our embeddings and original documents
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x23a38c4e9f0>

In [96]:
def vector_search_function(query, model_name="nomic-ai/nomic-embed-text-v1.5", show_progress: bool = True):
    # Convert the input question to a single embedding vector
    # query_vec = pipeline.transform([query["question"]])  # shape: (1, 128) query is still only the question
    query_vec = get_embeddings([query["question"]], model_name, show_progress=show_progress)

    # Run the vector search, filter by 'id'
    return vindex.search(
        query_vector=query_vec[0],  # use 1D vector
        filter_dict={"category": query["category"]},  # filter documents by id
        num_results=5  # return top 5 matches
    )

In [97]:
# Use the same evaluation function from earlier
results = evaluate(ground_truth, lambda x: vector_search_function(x, model_name="nomic-ai/nomic-embed-text-v1.5", show_progress=False))

from IPython.display import clear_output
clear_output(wait=True)

results

{'hit_rate': 0.031922276197085354, 'mrr': 0.014688873467499416}

## Finding the best parameters

In [98]:
df_validation = df_question[:len(df_question)//2]
df_test = df_question[len(df_question)//2:]

In [31]:
import random

random.seed(0)

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)

        # Evaluate the objective function
        current_score = objective_function(current_params)

        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [32]:
gt_val = df_validation.to_dict(orient='records')

In [33]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query['question'],  # string directly or export
        filter_dict={'category': query["category"]},
        boost_dict=boost,
        num_results=10
    )

    return results

In [34]:
param_ranges = {
    'intent': (0.0, 10.0),
    'question': (0.0, 10.0),
    'response': (0.0, 10.0),
    'category': (0.0, 10.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q, boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [35]:
# return best_params, best_score
best_params, best_score = simple_optimize(param_ranges, objective, n_iterations=20)
best_params, best_score

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/720 [00:00<?, ?it/s]

({'intent': 5.756510141648885,
  'question': 2.90329502402758,
  'response': 1.8939132855435614,
  'category': 1.867295282555551},
 0.016361331569664902)

In [42]:
import optuna

def objective(trial):
    # Suggest float values in the given ranges
    boost_params = {
        k: trial.suggest_float(k, low, high)
        for k, (low, high) in param_ranges.items()
    }

    def search_function(q):
        return minsearch_search(q, boost_params)

    # Evaluate on validation set
    results = evaluate(gt_val, search_function, disable=True)
    return results['mrr']

# Run optimization
study = optuna.create_study(direction="maximize")  # maximize MRR
study.optimize(objective, n_trials=100, gc_after_trial=True, show_progress_bar=True, n_jobs=-1)

# Best params
print("Best params:", study.best_params)
print("Best MRR:", study.best_value)

[I 2025-09-08 06:22:01,024] A new study created in memory with name: no-name-a39b601e-cfec-42cc-a450-e723177135b8


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-09-08 06:22:45,820] Trial 1 finished with value: 0.01093915343915344 and parameters: {'intent': 1.5403377451088096, 'question': 2.5476811960809806, 'response': 9.02394355393403, 'category': 8.850083695483962}. Best is trial 1 with value: 0.01093915343915344.
[I 2025-09-08 06:22:46,149] Trial 6 finished with value: 0.01358851410934744 and parameters: {'intent': 8.903157315448468, 'question': 3.593114631113875, 'response': 7.034764818790285, 'category': 9.753407017451167}. Best is trial 6 with value: 0.01358851410934744.
[I 2025-09-08 06:22:46,555] Trial 2 finished with value: 0.01482142857142857 and parameters: {'intent': 1.8465988153968715, 'question': 6.68811792610421, 'response': 0.06792788057002008, 'category': 1.0474587725479945}. Best is trial 2 with value: 0.01482142857142857.
[I 2025-09-08 06:22:46,829] Trial 5 finished with value: 0.015577050264550262 and parameters: {'intent': 2.6185354376755976, 'question': 4.501145161177468, 'response': 7.3610847684101195, 'category'

In [37]:
best_params

{'intent': 5.756510141648885,
 'question': 2.90329502402758,
 'response': 1.8939132855435614,
 'category': 1.867295282555551}

In [43]:
study.best_params

{'intent': 0.022894885883346205,
 'question': 5.120311766582832,
 'response': 5.035355456071596,
 'category': 9.847893877170346}

In [44]:
def minsearch_improved(query, best_params):
    boost = {
       'intent': best_params["intent"],
        'question': best_params["question"],
        'response': best_params["response"],
        'category': best_params["category"],
    }

    results = index.search(
        query=query['question'],  # string directly or export
        filter_dict={'category': query["category"]},
        boost_dict=boost,
        num_results=10
    )

    return results

In [None]:
# {'hit_rate': 0.046495489243580844, 'mrr': 0.01509368494101318}

In [47]:
evaluate(ground_truth, lambda q: minsearch_improved(q, best_params))

  0%|          | 0/1441 [00:00<?, ?it/s]

{'hit_rate': 0.04371963913948647, 'mrr': 0.014952413998215525}

In [46]:
evaluate(ground_truth, lambda q: minsearch_improved(q, study.best_params))

  0%|          | 0/1441 [00:00<?, ?it/s]

{'hit_rate': 0.04718945176960444, 'mrr': 0.015215128383067307}

In [14]:
best_params = {'intent': 0.022894885883346205,
 'question': 5.120311766582832,
 'response': 5.035355456071596,
 'category': 9.847893877170346}

# RAG evaluation/monitoring

In [None]:
import sys
sys.path.append("../assistant")

import os
os.environ["LLM_PROVIDER"]="HF"
from config import SETTINGS

In [None]:
SETTINGS.HF_API_KEY = "hf_..."

In [25]:
print(SETTINGS.LLM_PROVIDER)
# print(SETTINGS.API_KEY)
print(SETTINGS.BASE_URL)
print(SETTINGS.MODEL_EMBED)
print(SETTINGS.MODEL_CHAT)

HF
https://router.huggingface.co/v1
nomic-ai/nomic-embed-text-v1.5
openai/gpt-oss-120b


In [26]:
from openai import OpenAI

# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # Initialize the OpenAI client

# Connect to local Ollama instead of OpenAI cloud
# client = OpenAI(
#     # https://huggingface.co/openai/gpt-oss-120b
#     # https://ollama.com/library/gpt-oss
#     # ollama pull gpt-oss:20b
#     # ollama run gpt-oss:120b
#     # curl https://<your-forwarded-url>.app.github.dev/v1/models
#     base_url="http://localhost:11434/v1",  # Ollama API endpoint
#     # base_url="https://glowing-carnival-qgwprp5pjjj255q-11434.app.github.dev/v1",  # Ollama API endpoint
#     api_key="ollama"  # dummy key (ignored by Ollama)
# )

client = OpenAI(
    base_url=SETTINGS.BASE_URL,
    api_key=SETTINGS.API_KEY,
)

In [27]:
def search(query, best_params):
    boost = {
       'intent': best_params["intent"],
        'question': best_params["question"],
        'response': best_params["response"],
        'category': best_params["category"],
    }
    results = index.search(
        query=query,
        filter_dict={'category': 'CONTENT'},  # Example filter on category field
        boost_dict=boost,
        num_results=5
    )
    return results

In [None]:
PROMPT_TEMPLATE = """
You are a customer support assistant specialized in the Media domain.
Your task is to answer the QUESTION strictly using the information provided in the CONTEXT.

Rules of conduct:
1. Only use facts explicitly present in the CONTEXT. Do not rely on external knowledge.
2. If the CONTEXT contains partial information, base your answer only on what is given.
3. If the CONTEXT does not contain the required information, respond exactly with:
   "I don’t have that information."
4. Keep answers clear, concise, and factual. Avoid speculation, assumptions, or subjective language.
5. Do not rephrase or fabricate details not present in the CONTEXT.
6. Do not merge or infer facts across unrelated sections of the CONTEXT unless explicitly stated.
7. Preserve numerical values, names, and terminology exactly as they appear in the CONTEXT.
8. Do not output meta-comments about your limitations or process (e.g., “As an AI…”).

QUESTION: {instruction}

CONTEXT: '''
{context}
'''
""".strip()


def build_prompt(query, search_results):
    context = ""
    for doc in search_results:
        context += (
            f"intent: {doc['intent']}\n"
            f"question: {doc['question']}\n"
            f"answer: {doc['response']}\n\n"
        )

    # Add to template instruction and context
    prompt = PROMPT_TEMPLATE.format(
        instruction=query,
        context=context,
    ).strip()
    return prompt


def llm(prompt):
    response = client.chat.completions.create(
        # model='gpt-4o-mini',  # OpenAI
        # model="gpt-oss:20b",   # Ollama "llama3" or any model available in your Ollama
        model=SETTINGS.MODEL_CHAT,  # "openai/gpt-oss-120b:together",  # huggingface
        messages=[{"role": "user", "content": prompt}],
        # temperature=0.0
    )
    return response


def rag(query):
    search_results = search(query, best_params)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer.choices[0].message.content

In [33]:
question = 'I need assistance to the Media domain for copyright.'
answer = rag(question)
print(answer)

2025-09-08 18:43:44,157 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"


To report a copyright infringement, please follow these steps:

1. Access our official website at {{WEBSITE_URL}}.  
2. Go to the {{COPYRIGHT_SECTION}} section.  
3. Find and select the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} option.  
4. Complete the necessary fields with accurate information regarding the infringement.  
5. Submit your completed form for our evaluation.  

Your report will be reviewed and appropriate actions will be taken in accordance with our policies.


In [36]:
# from rag import rag as rag_test
# rag_test(question)

In [None]:
EVAL_PROMPT_TEMPLATE = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to assess the **relevance** of the generated answer to the given question.

Evaluation Guidelines:
1. Judge only on relevance between the QUESTION and the GENERATED ANSWER.
2. Ignore style, grammar, tone, and fluency. They are not part of this evaluation.
3. Choose exactly one of the following labels:
   - "NON_RELEVANT": The answer does not address the question at all.
   - "PARTLY_RELEVANT": The answer addresses the question partially OR mixes relevant and irrelevant content.
   - "RELEVANT": The answer fully and directly answers the question with no irrelevant content.
4. Provide a short, factual explanation (maximum 2 sentences).
5. The output must be strictly valid JSON, without code fences, without extra text, and without additional commentary.

Evaluation Data:
- Question: {question}
- Generated Answer: '''
{answer}
'''

Output format:
{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Your brief explanation here]"
}}
""".strip()

In [24]:
len(ground_truth)

1441

In [25]:
record = ground_truth[0]


In [26]:
question

'I need assistance to the Media domain for copyright.'

In [None]:
prompt = EVAL_PROMPT_TEMPLATE.format(
    question=question,
    answer=answer,
)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to assess the **relevance** of the generated answer to the given question.

Evaluation Guidelines:
1. Judge only on relevance between the QUESTION and the GENERATED ANSWER.
2. Ignore style, grammar, tone, and fluency. They are not part of this evaluation.
3. Choose exactly one of the following labels:
   - "NON_RELEVANT": The answer does not address the question at all.
   - "PARTLY_RELEVANT": The answer addresses the question partially OR mixes relevant and irrelevant content.
   - "RELEVANT": The answer fully and directly answers the question with no irrelevant content.
4. Provide a short, factual explanation (maximum 2 sentences).
5. The output must be strictly valid JSON, without code fences, without extra text, and without additional commentary.

Evaluation Data:
- Question: I need assistance to the Media domain for copyright.
- Generated Answer: '''
To report a copyright infringement, plea

In [29]:
import json

In [30]:
df_sample = df_question.sample(n=200, random_state=0)

In [31]:
sample = df_sample.to_dict(orient='records')

In [32]:
import time

In [33]:
import numpy as np
np.random.uniform(10, 25)

20.49616112886401

In [None]:
evaluations = []

In [None]:
for record in tqdm(sample[126:]):
    time.sleep(np.random.uniform(20, 30))
    question = record['question']
    answer = rag(question) 
    prompt = EVAL_PROMPT_TEMPLATE.format(
        question=question,
        answer=answer
    )

    evaluation_response = llm(prompt).choices[0].message.content
    evaluation = json.loads(evaluation_response)

    evaluations.append((record, answer, evaluation))
    with open('evaluation.json', 'w') as json_file:
        json.dump(evaluations, json_file)

  0%|          | 0/74 [00:00<?, ?it/s]

2025-09-08 15:38:11,522 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-08 15:38:13,298 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-08 15:38:44,032 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-08 15:38:45,132 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-08 15:39:14,123 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-08 15:39:15,960 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-08 15:39:46,891 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-08 15:39:48,504 [INFO] httpx - HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 

In [37]:
len(evaluations)

200

In [38]:
# # Load
# with open("evaluation.json", "rt", encoding="utf-8") as f_in:
#     evaluations = json.load(f_in)
# len(evaluations)

In [41]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [42]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.54
NON_RELEVANT       0.33
PARTLY_RELEVANT    0.13
Name: proportion, dtype: float64

In [43]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
1,I don’t have that information.,46728d3b,If someone comes across material online I thin...,NON_RELEVANT,The answer states a lack of information and do...
2,I don’t have that information.,4333143e,"Hypothetically speaking, if the TV shows don't...",NON_RELEVANT,The answer does not provide any information ab...
3,I don’t have that information.,c7116609,"Gotcha! So, when reporting this bad content on...",NON_RELEVANT,The answer states lack of information and does...
8,I don’t have that information.,bc29a102,I need help with submitting an inappropriate c...,NON_RELEVANT,The answer does not provide any details about ...
9,I don’t have that information.,53bb44bc,Direct and easy access is much appreciated; ca...,NON_RELEVANT,The answer does not provide any instructions o...
...,...,...,...,...,...
187,I don’t have that information.,8b344933,I've encountered offensive content during my d...,NON_RELEVANT,The answer does not provide any information ab...
192,I don’t have that information.,5fdf52b9,Could you guide me through the steps to report...,NON_RELEVANT,"The response does not provide any steps, instr..."
194,I don’t have that information.,6f7fa215,After stumbling upon explicit scenes on the Fr...,NON_RELEVANT,The answer states it lacks the information and...
195,I don’t have that information.,f11d5700,What steps are needed to inform someone about ...,NON_RELEVANT,The answer does not provide any steps or infor...


In [49]:
df_eval[df_eval.relevance == 'NON_RELEVANT'].iloc[:3].head().to_dict()

{'answer': {1: 'I don’t have that information.',
  2: 'I don’t have that information.',
  3: 'I don’t have that information.'},
 'id': {1: '46728d3b', 2: '4333143e', 3: 'c7116609'},
 'question': {1: "If someone comes across material online I think breaches copyright policy when watching educational programs like documentaries on a Smart TV, how do they start this formal complaint process quickly and accurately while ensuring their report is taken seriously promptly through 'Contact Us' at example.com support email?",
  2: "Hypothetically speaking, if the TV shows don't adhere to acceptable public presentation themes on Smart TV platforms like mine owned by your service provider using English as preferred language option (but I also understand Spanish), how do we formally submit a report right from this page? Plus could you clarify once again about reporting such misconduct and any additional steps post submission with credible payment method for fee?",
  3: "Gotcha! So, when reporting 

In [44]:
df_eval[df_eval.relevance == 'PARTLY_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
23,To submit a copyright‑infringement report:\n\n...,6bd60204,What exactly do I have to fill out and where c...,PARTLY_RELEVANT,The answer outlines the general steps and requ...
34,"To report the inappropriate content, follow th...",46728d3b,What steps do I need to follow if there's cont...,PARTLY_RELEVANT,The answer provides generic steps for reportin...
46,To lodge a formal copyright‑infringement compl...,1bf6d2f3,I am really frustrated because my favorite sho...,PARTLY_RELEVANT,The answer provides the general steps to lodge...
47,"To report the offensive content, follow these ...",a9727580,Where can I go to report offensive stuff I jus...,PARTLY_RELEVANT,The answer attempts to explain how to report t...
56,"To report the offensive content, please follow...",821a0af5,Could you assist me on submitting a notice of ...,PARTLY_RELEVANT,The answer provides generic steps for reportin...
78,Visit {{WEBSITE_URL}} and go to the {{COPYRIGH...,ab58d4a4,Where on your website should I go to report so...,PARTLY_RELEVANT,The answer refers to visiting the website and ...
80,"To formally report the copyright infringement,...",caa8e588,My friend uploaded an entire season of that do...,PARTLY_RELEVANT,The answer provides generic steps for reportin...
82,"To report the unlicensed documentary series, f...",4da36375,Can you guide me through reporting that my unl...,PARTLY_RELEVANT,The answer outlines generic reporting steps bu...
105,To report the content you believe is unsuitabl...,74d54826,Could I kindly ask for assistance while report...,PARTLY_RELEVANT,The answer gives generic steps to report copyr...
110,"To report disturbing scenes, follow these step...",e6f08a22,What is the procedure I should follow when enc...,PARTLY_RELEVANT,The answer outlines how to report disturbing s...


In [50]:
df_eval[df_eval.relevance == 'PARTLY_RELEVANT'].iloc[:3].head().to_dict()

{'answer': {23: 'To submit a copyright‑infringement report:\n\n1. Go to the website at **{{WEBSITE_URL}}**.  \n2. Open the **{{COPYRIGHT_SECTION}}** area.  \n3. Click the **{{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}}** link.  \n4. Fill in the required fields – include all details about the unauthorized streaming (who, what was streamed, the device used, timestamps, etc.).  \n5. Submit the completed form online.  \n\nYour report will then be reviewed according to the platform’s policies.',
  34: 'To report the inappropriate content, follow these steps:\n\n1. Visit **{{WEBSITE_URL}}**.  \n2. Identify the specific segment of the documentary that you consider inappropriate.  \n3. Click on the **{{REPORT_SECTION}}** link that appears next to the content (marked by a **{{FLAG_ICON}}**).  \n4. Choose the **{{REPORT_INAPPROPRIATE_CONTENT_OPTION}}** option from the menu.  \n5. Select the reason that best matches the issue from the provided list.  \n6. If needed, add any additional details and then

In [45]:
df_eval.to_csv('../Data/rag-eval-gpt-oss-120b.csv', index=False)

In [None]:
# df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

In [34]:
# Install vLLM with GPT-OSS support:
# uv venv --python 3.12 --seed
# source .venv/bin/activate
# uv pip install --pre vllm==0.10.1+gptoss \
#     --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
#     --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
#     --index-strategy unsafe-best-match

# Launch the API server:
# vllm serve openai/gpt-oss-20b
# or
# vllm serve openai/gpt-oss-120b

# from openai import OpenAI
# client = OpenAI(
#     base_url="http://localhost:8000/v1",
#     api_key="EMPTY"
# )