In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

## Ingestion

In [66]:
import json

# Load the source documents. Each record is a dict that already carries its
# own 'id' (see the sample record printed below).
# The encoding is pinned to UTF-8 instead of relying on the platform default,
# which is locale-dependent and not UTF-8 everywhere.
with open('../Data/documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [4]:
len(documents), documents[3]

(424,
 {'intent': 'report_copyright_infringement',
  'question': 'can uhelp me submitting a notification of copyright  infringement',
  'response': 'To address copyright infringement concerns, please adhere to the following guidelines:\n\n1. Access our website at {{WEBSITE_URL}}.\n2. Go to the {{COPYRIGHT_SECTION}} area.\n3. Find and select the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} choice.\n4. Complete the necessary fields, providing comprehensive details regarding the infringement.\n5. Submit the completed form for our assessment.\n\nYour submission will be reviewed, and we will take the requisite steps in line with our policies.',
  'category': 'CONTENT',
  'id': '1e254822'})

In [5]:
# Generated questions
df_question = pd.read_csv('../Data/ground-truth-data.csv')
df_question.shape

(1931, 3)

In [6]:
df_question.head()

Unnamed: 0,question,category,id
0,How do I go about reporting a copyright violat...,CONTENT,34b742ae
1,What steps are necessary to report an infringe...,CONTENT,34b742ae
2,Could you guide me through the process of fili...,CONTENT,34b742ae
3,"When submitting evidence for potential piracy,...",CONTENT,34b742ae
4,What kind of responses can users expect after ...,CONTENT,34b742ae


In [7]:
df_question.isna().sum()

question    1
category    0
id          0
dtype: int64

In [8]:
df_question[df_question.question.isna()]

Unnamed: 0,question,category,id
606,,CONTENT,800255d0


In [9]:
df_question = df_question.dropna()
df_question.shape

(1930, 3)

In [10]:
ground_truth = df_question.to_dict(orient='records')

In [11]:
ground_truth[0]

{'question': 'How do I go about reporting a copyright violation on your platform?',
 'category': 'CONTENT',
 'id': '34b742ae'}

## RAG flow

In [12]:
documents[0].keys(), ground_truth[0].keys()

(dict_keys(['intent', 'question', 'response', 'category', 'id']),
 dict_keys(['question', 'category', 'id']))

In [13]:
import minsearch

# Create a MinSearch index with specified text and keyword fields
index = minsearch.Index(
    text_fields=['intent', 'question', 'response'],  # full-text searchable fields
    keyword_fields=['id', 'category'],  # fields for exact matching
)

# Fit the index to our document list
index.fit(documents)

<minsearch.minsearch.Index at 0x163ad11cec0>

In [14]:
def minsearch_search(query, boost=None):
    """Run a category-filtered text search for one ground-truth record.

    Args:
        query: Mapping with a 'question' entry (the search text) and a
            'category' entry (used as an exact-match filter).
        boost: Optional mapping of field name to weight, forwarded to the
            index as ``boost_dict``. ``None`` means no boosting.

    Returns:
        The result of ``index.search`` for at most 10 hits.
    """
    # This function is defined again in the parameter-tuning section with a
    # `boost` argument. Keeping both signatures identical means the notebook
    # behaves the same no matter which of the two cells was executed last.
    if boost is None:
        boost = {}

    results = index.search(
        query=query['question'],
        filter_dict={'category': query["category"]},
        boost_dict=boost,
        num_results=10
    )
    return results

## Retrieval evaluation

In [15]:
from tqdm.auto import tqdm


# Compute Hit Rate: % of queries for which the correct document was retrieved
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One list of booleans per query; an entry is True
            when the result at that position has the expected id.

    Returns:
        A float in [0, 1]. Returns 0.0 when there are no queries at all
        (instead of dividing by zero).
    """
    if not relevance_total:
        return 0.0

    # A query counts as a hit when any of its result flags is True.
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)


# Compute Mean Reciprocal Rank (MRR)
def mrr(relevance_total):
    """Mean Reciprocal Rank over all queries.

    For each query only the first relevant result contributes, with score
    1 / rank (rank is 1-based). A query with no relevant result contributes 0.

    Args:
        relevance_total: One list of booleans per query; an entry is True
            when the result at that position has the expected id.

    Returns:
        The mean of the per-query reciprocal ranks, in [0, 1]. Returns 0.0
        when there are no queries at all (instead of dividing by zero).
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit: summing later hits as well would no
                # longer be a reciprocal rank and could push the score above 1.
                break
    return total_score / len(relevance_total)


# Main evaluation loop
def evaluate(ground_truth, search_function, disable=False):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records; each record has an 'id' entry
            naming the document that should be retrieved for it.
        search_function: Callable taking one record and returning an
            iterable of result dicts that each have an 'id' entry.
        disable: Passed straight to tqdm's ``disable`` argument.

    Returns:
        Dict with the keys 'hit_rate' and 'mrr'.
    """
    def relevance_flags(record):
        # Read the expected id first, then flag every result that matches it.
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [
        relevance_flags(record)
        for record in tqdm(ground_truth, disable=disable)
    ]

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [24]:
# Evaluate using the ground truth and the defined search function
evaluate(ground_truth, lambda q: minsearch_search(q), disable=None)

  0%|          | 0/1930 [00:00<?, ?it/s]

{'hit_rate': 0.04559585492227979, 'mrr': 0.015142898264659919}

# VectorSearch

In [16]:
# !pip install minsearch -qq

In [17]:
from minsearch import VectorSearch  # For vector-based semantic search
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF text vectorization
from sklearn.decomposition import TruncatedSVD  # Dimensionality reduction
from sklearn.pipeline import make_pipeline  # To chain TF-IDF + SVD

In [18]:
documents[0]

{'intent': 'report_copyright_infringement',
 'question': 'how can i report copyright violation',
 'response': 'To report copyright infringement, please adhere to the following procedures:\n\n1. Access our website at {{WEBSITE_URL}}.\n2. Direct yourself to the {{COPYRIGHT_SECTION}} section.\n3. Find and click on the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} option.\n4. Provide all required details regarding the infringement.\n5. Submit the completed form for our assessment.\n\nUpon receiving your report, it will be carefully analyzed, and appropriate measures will be implemented in accordance with our guidelines.',
 'category': 'CONTENT',
 'id': '34b742ae'}

In [19]:
# One string per document: the question, a single space, then the response.
texts = [
    doc['question'] + ' ' + doc['response']
    for doc in tqdm(documents)
]

  0%|          | 0/424 [00:00<?, ?it/s]

In [20]:
# Create a pipeline: TF-IDF vectorization + Truncated SVD (128 dimensions)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=1, ngram_range=(1,1), norm='l1'),  # min_df=1 keeps every term (no rare-word pruning); unigrams only; L1-normalised rows
    TruncatedSVD(n_components=128, random_state=0)  # reduce to 128D; fixed random_state for repeatable components
)#.set_output(transform='pandas')

# Fit the pipeline on the document texts and project them into the reduced
# space. The result is a 2D numpy array with one row per text.
X = pipeline.fit_transform(texts)
X

array([[ 8.61903956e-02, -3.32329444e-02,  1.84945311e-02, ...,
        -4.48703441e-03,  4.45804900e-04, -1.82140197e-05],
       [ 5.39078893e-02, -1.60753488e-03, -8.88495363e-02, ...,
         1.12538530e-03, -2.76375777e-03,  7.13602034e-04],
       [ 1.04216803e-01, -4.18585586e-02,  1.36393373e-02, ...,
         4.17043648e-04,  1.47759318e-03,  9.44728636e-04],
       ...,
       [ 9.11410523e-02, -5.00482363e-02,  9.04754123e-03, ...,
         4.46023998e-03,  4.19823652e-03, -6.38576361e-03],
       [ 9.84283755e-02, -3.84751829e-02, -1.32054598e-02, ...,
         9.42712159e-04, -3.55781865e-04, -3.01358641e-03],
       [ 8.87460397e-02, -3.18344791e-02,  1.48513572e-02, ...,
        -2.57281015e-03,  3.11055623e-03,  3.82499272e-03]],
      shape=(424, 128))

In [21]:
# Initialize the vector index with 'category' as its keyword (exact-match filter) field
vindex = VectorSearch(keyword_fields=['category'])

# Fit the vector index with our embeddings and the original documents
# (row i of X is the embedding built from documents[i]).
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1639cb9ede0>

In [22]:
def vector_search_function(query):
    """Embed a record's question and look it up in the vector index.

    Only ``query["question"]`` is embedded. ``query["category"]`` is passed
    as a filter, and at most 5 results are requested.

    Args:
        query: Mapping with 'question' and 'category' entries.

    Returns:
        The result of ``vindex.search``.
    """
    # The pipeline transforms a batch, so wrap the question in a list and
    # use row 0 of the output as the query vector.
    embedded = pipeline.transform([query["question"]])
    query_vector = embedded[0]
    category_filter = {"category": query["category"]}

    return vindex.search(
        query_vector=query_vector,
        filter_dict=category_filter,
        num_results=5,
    )

In [55]:
# Use the same evaluation function from earlier
results = evaluate(ground_truth, vector_search_function)
results

  0%|          | 0/1930 [00:00<?, ?it/s]

{'hit_rate': 0.032642487046632127, 'mrr': 0.01636442141623488}

## Finding the best parameters

In [23]:
# Split the questions in half by position: the first half is used for
# parameter tuning, the second half is kept as df_test.
# NOTE(review): rows are taken in file order (no shuffling) — confirm that is
# intended. df_test is not referenced again in the cells visible here.
df_validation = df_question[:len(df_question)//2]
df_test = df_question[len(df_question)//2:]

In [24]:
import random

random.seed(0)

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    """Random search that keeps the highest-scoring parameter set.

    Args:
        param_ranges: Mapping of parameter name to a ``(low, high)`` pair.
            When both bounds are ints the value is drawn with
            ``random.randint`` (inclusive); otherwise with ``random.uniform``.
        objective_function: Callable taking the sampled parameter dict and
            returning a score; larger is better.
        n_iterations: Number of random candidates to try.

    Returns:
        ``(best_params, best_score)``. With zero iterations this is
        ``(None, float('-inf'))``.
    """
    def draw(low, high):
        # Integer bounds give an integer draw; anything else a float draw.
        if isinstance(low, int) and isinstance(high, int):
            return random.randint(low, high)
        return random.uniform(low, high)

    # Maximising, so start from the lowest possible score.
    best_params, best_score = None, float('-inf')

    for _ in range(n_iterations):
        candidate = {
            name: draw(low, high)
            for name, (low, high) in param_ranges.items()
        }
        score = objective_function(candidate)

        # Strict '>' means the earliest candidate wins on ties.
        if score > best_score:
            best_params, best_score = candidate, score

    return best_params, best_score

In [25]:
gt_val = df_validation.to_dict(orient='records')

In [26]:
def minsearch_search(query, boost=None):
    """Category-filtered text search with optional per-field boosts.

    Args:
        query: Mapping with 'question' (search text) and 'category'
            (exact-match filter value) entries.
        boost: Optional mapping forwarded as ``boost_dict``; an empty dict
            is used when it is None.

    Returns:
        The result of ``index.search`` for at most 10 hits.
    """
    return index.search(
        query=query['question'],
        filter_dict={'category': query["category"]},
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

In [27]:
# Search space for the boost weights: field name -> (low, high).
# The bounds are floats, so simple_optimize draws each weight with
# random.uniform rather than random.randint.
# NOTE(review): 'category' is registered on the index as a keyword (filter)
# field, not a text field — confirm that boosting it has any effect.
param_ranges = {
    'intent': (0.0, 10.0),
    'question': (0.0, 10.0),
    'response': (0.0, 10.0),
    'category': (0.0, 10.0),
}

def objective(boost_params):
    """Score one boost configuration: MRR on the validation questions.

    Args:
        boost_params: Mapping of field name to boost weight.

    Returns:
        The 'mrr' value reported by ``evaluate`` on ``gt_val``.
    """
    scores = evaluate(gt_val, lambda q: minsearch_search(q, boost_params))
    return scores['mrr']

In [74]:
# return best_params, best_score
best_params, best_score = simple_optimize(param_ranges, objective, n_iterations=20)
best_params, best_score

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

({'intent': 1.1713429320851798,
  'question': 2.204605368678285,
  'response': 7.9458297171057595,
  'category': 3.3253614921965546},
 0.02115593387614112)

In [81]:
import optuna

def optuna_objective(trial):
    """Optuna objective: validation MRR for one sampled boost configuration.

    Named differently from the dict-based ``objective`` used with
    ``simple_optimize``, so running this cell does not replace that function
    with one that expects an Optuna trial.

    Args:
        trial: The Optuna trial used to sample one float per entry of
            ``param_ranges``.

    Returns:
        The 'mrr' value reported by ``evaluate`` on ``gt_val``.
    """
    # Suggest one float per field, within the bounds from param_ranges.
    boost_params = {
        name: trial.suggest_float(name, low, high)
        for name, (low, high) in param_ranges.items()
    }

    def search_function(q):
        return minsearch_search(q, boost_params)

    # disable=True: no per-trial progress bar; the study shows its own.
    results = evaluate(gt_val, search_function, disable=True)
    return results['mrr']

# Seed the sampler so the suggested values are repeatable.
# With n_jobs=-1 the trials run in parallel, so the trial history can still
# differ between runs; use n_jobs=1 for a fully reproducible study.
study = optuna.create_study(
    direction="maximize",  # maximize MRR
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(optuna_objective, n_trials=50, gc_after_trial=True, show_progress_bar=True, n_jobs=-1)

# Best params
print("Best params:", study.best_params)
print("Best MRR:", study.best_value)

[I 2025-08-23 16:59:57,856] A new study created in memory with name: no-name-16bf3947-c49d-41e8-8b50-201fa591c352


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-08-23 17:00:56,789] Trial 4 finished with value: 0.020615182169586306 and parameters: {'intent': 2.249132414867029, 'question': 2.234031749802022, 'response': 9.771698228947187, 'category': 1.9755464781730725}. Best is trial 4 with value: 0.020615182169586306.
[I 2025-08-23 17:00:58,251] Trial 3 finished with value: 0.020734846615675623 and parameters: {'intent': 4.709344164326785, 'question': 0.9312117423908761, 'response': 5.71511647366386, 'category': 9.689631225548114}. Best is trial 3 with value: 0.020734846615675623.
[I 2025-08-23 17:00:58,354] Trial 0 finished with value: 0.017276503001891597 and parameters: {'intent': 5.950954075284947, 'question': 0.09971871764905882, 'response': 8.47176229701623, 'category': 4.472868179991415}. Best is trial 3 with value: 0.020734846615675623.
[I 2025-08-23 17:00:58,643] Trial 5 finished with value: 0.020581873509334644 and parameters: {'intent': 5.176019844107193, 'question': 1.329122027882893, 'response': 9.78720728619323, 'category

In [28]:
def minsearch_improved(query, best_params):
    """Category-filtered text search using tuned boost weights.

    Args:
        query: Mapping with 'question' (search text) and 'category'
            (exact-match filter value) entries.
        best_params: Mapping that must contain the keys 'intent',
            'question', 'response' and 'category'; a missing key raises
            KeyError.

    Returns:
        The result of ``index.search`` for at most 10 hits.
    """
    boost = {
        field: best_params[field]
        for field in ('intent', 'question', 'response', 'category')
    }

    return index.search(
        query=query['question'],
        filter_dict={'category': query["category"]},
        boost_dict=boost,
        num_results=10,
    )

In [80]:
# Score the boosts found by simple_optimize.
# NOTE(review): this runs on the full ground_truth, which includes the
# validation half the boosts were tuned on; df_test is not used here.
evaluate(ground_truth, lambda q: minsearch_improved(q, best_params))

  0%|          | 0/1930 [00:00<?, ?it/s]

{'hit_rate': 0.06217616580310881, 'mrr': 0.02112385886997287}

In [84]:
best_params

{'intent': 1.1713429320851798,
 'question': 2.204605368678285,
 'response': 7.9458297171057595,
 'category': 3.3253614921965546}

In [83]:
study.best_params

{'intent': 4.709344164326785,
 'question': 0.9312117423908761,
 'response': 5.71511647366386,
 'category': 9.689631225548114}

In [82]:
# Score the boosts found by the Optuna study.
# NOTE(review): this runs on the full ground_truth, which includes the
# validation half the boosts were tuned on; df_test is not used here.
evaluate(ground_truth, lambda q: minsearch_improved(q, study.best_params))

  0%|          | 0/1930 [00:00<?, ?it/s]

{'hit_rate': 0.06424870466321243, 'mrr': 0.021246813060284583}

# RAG evaluation/monitoring

In [39]:
best_params = {'intent': 4.709344164326785,
 'question': 0.9312117423908761,
 'response': 5.71511647366386,
 'category': 9.689631225548114}

In [40]:
def search(query, best_params, category='CONTENT'):
    """Retrieve context documents for a free-text question.

    Args:
        query: The question text to search for.
        best_params: Mapping that must contain the keys 'intent',
            'question', 'response' and 'category', used as boost weights;
            a missing key raises KeyError.
        category: Value for the exact-match filter on the 'category' field.
            Defaults to 'CONTENT', the value that used to be hard-coded.

    Returns:
        The result of ``index.search`` for at most 5 hits.
    """
    boost = {
        field: best_params[field]
        for field in ('intent', 'question', 'response', 'category')
    }
    results = index.search(
        query=query,
        filter_dict={'category': category},
        boost_dict=boost,
        num_results=5
    )
    return results

In [None]:
# from dotenv import load_dotenv
# load_dotenv()

import os
# os.environ["HF_TOKEN"]

In [None]:
from openai import OpenAI

# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # Initialize the OpenAI client

# Connect to local Ollama instead of OpenAI cloud
# client = OpenAI(
#     # https://huggingface.co/openai/gpt-oss-120b
#     # https://ollama.com/library/gpt-oss
#     # ollama pull gpt-oss:20b
#     # ollama run gpt-oss:120b
#     # curl https://<your-forwarded-url>.app.github.dev/v1/models
#     base_url="http://localhost:11434/v1",  # Ollama API endpoint
#     # base_url="https://glowing-carnival-qgwprp5pjjj255q-11434.app.github.dev/v1",  # Ollama API endpoint
#     api_key="ollama"  # dummy key (ignored by Ollama)
# )

client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

In [48]:
def build_prompt(query, search_results):
    """Fill the support-assistant prompt with the question and retrieved context.

    Args:
        query: The user's question; inserted at the QUESTION placeholder.
        search_results: Iterable of dicts with 'intent', 'question' and
            'response' entries. Each one becomes an intent/question/answer
            block in the CONTEXT section.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    prompt_template = """
    You are a customer support assistant for the Media domain.
    Answer the QUESTION using only the information from CONTEXT.

    Rules:
    - Only use facts from CONTEXT.
    - Keep your answer clear, factual and concise.
    - Do not hallucinate or add outside knowledge.
    - Do not invent information not found in CONTEXT.
    - If CONTEXT doesn’t contain the answer, respond: "I don’t have that information."

    QUESTION: {instruction}

    CONTEXT:
    {context}
    """.strip()

    # One block per retrieved document, each terminated by a blank line.
    context_blocks = [
        f"intent: {doc['intent']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['response']}\n\n"
        for doc in search_results
    ]

    filled = prompt_template.format(
        instruction=query,
        context="".join(context_blocks),
    )
    # The final strip removes the blank line left after the last block.
    return filled.strip()


def llm(prompt):
    """Send ``prompt`` as a single user message and return the raw response.

    Args:
        prompt: Text used as the content of the one user message.

    Returns:
        The object returned by ``client.chat.completions.create``; callers
        read the text from ``.choices[0].message.content``.
    """
    user_message = {"role": "user", "content": prompt}
    # Model ids tried with other backends: 'gpt-4o-mini' (OpenAI),
    # "gpt-oss:20b" (Ollama). No temperature is passed.
    return client.chat.completions.create(
        model="openai/gpt-oss-120b:fireworks-ai",  # huggingface
        messages=[user_message],
    )


def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    Args:
        query: The user's question text.

    Returns:
        The message content of the first choice in the LLM response.
    """
    retrieved = search(query, best_params)
    completion = llm(build_prompt(query, retrieved))
    return completion.choices[0].message.content

In [50]:
question = 'I need assistance to the Media domain for copyright.'
answer = rag(question)
print(answer)

To report a copyright violation, please follow these steps:

1. Visit **{{WEBSITE_URL}}**.  
2. Go to the **{{COPYRIGHT_SECTION}}** section.  
3. Select the **{{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}}** option.  
4. Complete the required fields with details about the infringement.  
5. Submit the form for review.


In [51]:
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to judge the **relevance** of the generated answer to the given question.

Guidelines:
- Focus and analyze only on relevance the content and context of the generated answer in relation to the question, not style, grammar, or tone.
- Use exactly one of the following labels:
  - "NON_RELEVANT": The answer does not address the question.
  - "PARTLY_RELEVANT": The answer addresses the question partially or contains both correct and irrelevant parts.
  - "RELEVANT": The answer fully and directly addresses the question.
- Keep the explanation brief (1-2 sentences).
- Output valid parsable JSON without using code blocks only, no extra text.

Evaluation Data:

Question: {question}
Generated Answer: {answer_llm}

Output format:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [52]:
len(ground_truth)

1930

In [53]:
record = ground_truth[0]


In [54]:
question

'I need assistance to the Media domain for copyright.'

In [55]:
answer_llm = answer

In [56]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to judge the **relevance** of the generated answer to the given question.

Guidelines:
- Focus and analyze only on relevance the content and context of the generated answer in relation to the question, not style, grammar, or tone.
- Use exactly one of the following labels:
  - "NON_RELEVANT": The answer does not address the question.
  - "PARTLY_RELEVANT": The answer addresses the question partially or contains both correct and irrelevant parts.
  - "RELEVANT": The answer fully and directly addresses the question.
- Keep the explanation brief (1-2 sentences).
- Output valid parsable JSON without using code blocks only, no extra text.

Evaluation Data:

Question: I need assistance to the Media domain for copyright.
Generated Answer: To report a copyright violation, please follow these steps:

1. Visit **{{WEBSITE_URL}}**.  
2. Go to the **{{COPYRIGHT_SECTION}}** section.  
3. Select the **{{REPOR

In [57]:
import json

In [59]:
df_sample = df_question.sample(n=200, random_state=0)

In [60]:
sample = df_sample.to_dict(orient='records')

In [63]:
import time

In [62]:
import numpy as np
np.random.uniform(10, 25)

13.034609954165976

In [64]:
# LLM-as-judge pass over the sampled questions.
# evaluations    : (record, generated answer, parsed judge verdict) triples
# parse_failures : (record, generated answer, raw judge output) for replies
#                  that could not be used, kept for inspection
evaluations = []
parse_failures = []

for record in tqdm(sample):
    # Random 20-30 s pause before each record (presumably to throttle API calls).
    time.sleep(np.random.uniform(20, 30))
    question = record['question']
    answer_llm = rag(question)
    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation_response = llm(prompt).choices[0].message.content

    # The judge is asked for JSON but may still reply with something else.
    # One bad reply should not abort a run in which every record costs a
    # long sleep plus two API calls, so record it and move on.
    try:
        evaluation = json.loads(evaluation_response)
    except (json.JSONDecodeError, TypeError):
        parse_failures.append((record, answer_llm, evaluation_response))
        continue

    # The later analysis reads these two keys from every verdict.
    if not (
        isinstance(evaluation, dict)
        and 'Relevance' in evaluation
        and 'Explanation' in evaluation
    ):
        parse_failures.append((record, answer_llm, evaluation_response))
        continue

    evaluations.append((record, answer_llm, evaluation))
    # Checkpoint after every record so a mid-run failure keeps the work so far.
    with open('evaluation.json', 'w') as json_file:
        json.dump(evaluations, json_file)

  0%|          | 0/200 [00:00<?, ?it/s]

APIStatusError: Error code: 402 - {'error': 'You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.'}

In [68]:
# Flatten the (record, answer, evaluation) triples into one row per question.
# Resulting columns: answer, id, question, relevance, explanation.
df_eval = (
    pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].apply(lambda d: d['id']),
        question=lambda frame: frame['record'].apply(lambda d: d['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda d: d['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda d: d['Explanation']),
    )
    # The nested dicts are no longer needed once their fields are columns.
    .drop(columns=['record', 'evaluation'])
)

In [69]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.826087
NON_RELEVANT       0.123188
PARTLY_RELEVANT    0.050725
Name: proportion, dtype: float64

In [70]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
13,I don’t have that information.,0d94a91c,Could I contact the community managers directl...,NON_RELEVANT,The answer states lack of information and does...
33,I don’t have that information.,7de19bdb,What steps should I take to submit a complaint...,NON_RELEVANT,The answer does not provide any steps or infor...
39,I don’t have that information.,c1eb2c2b,What are the steps and options available for r...,NON_RELEVANT,The answer does not provide any steps or optio...
40,I don’t have that information.,f634cfdf,Could you inform me about any possible procedu...,NON_RELEVANT,The answer states a lack of information and do...
56,I don’t have that information.,dc9097b0,Can someone assist me in understanding the pro...,NON_RELEVANT,The response states a lack of information and ...
57,I don’t have that information.,3f3d1f47,If an advertisement seems misleading and thus ...,NON_RELEVANT,The answer does not provide any information ab...
63,I don’t have that information.,fba65cab,"I need directions, but I am currently not near...",NON_RELEVANT,The answer does not provide any directions or ...
67,I don’t have that information.,5b52f0ec,'Deny unauthorised content' or 'flag policy br...,NON_RELEVANT,The answer provides no clarification or instru...
71,I don’t have that information.,ed546a3b,As someone bilingual who uses both English and...,NON_RELEVANT,The answer states lack of information and does...
78,I don’t have that information.,521e7904,If we're discussing rights enforcement on your...,NON_RELEVANT,The answer states a lack of information and do...


In [73]:
df_eval[df_eval.relevance == 'PARTLY_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
8,"To report the disturbing article, follow these...",3c7c1979,I encountered a disturbing article on your web...,PARTLY_RELEVANT,"The answer outlines the reporting steps, which..."
16,You should start by visiting the company’s off...,3e980aab,I want to alert you about someone posting my o...,PARTLY_RELEVANT,The answer suggests starting at the company's ...
50,"After you submit the accusation, your report i...",c3c92293,What actions are taken after I submit an accus...,PARTLY_RELEVANT,The answer mentions that the report is reviewe...
55,To submit a complaint about sexually explicit ...,d8906304,What steps should be taken to submit a complai...,PARTLY_RELEVANT,The answer outlines general steps for reportin...
80,Yes. When you report a potential copyright inf...,521e7904,Is there specific information I need to provid...,PARTLY_RELEVANT,The answer acknowledges that specific details ...
125,"To report a possible copyright infringement, f...",f5810df0,What steps do I need to follow after finding a...,PARTLY_RELEVANT,The answer offers generic reporting steps rela...
129,You’ll need to fill out the official copyright...,caa8e588,What information do I have to provide in detai...,PARTLY_RELEVANT,The answer mentions that detailed information ...


In [72]:
df_eval.to_csv('../Data/rag-eval-gpt-oss-120b.csv', index=False)

In [None]:
# df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

In [34]:
# Install vLLM with GPT-OSS support:
# uv venv --python 3.12 --seed
# source .venv/bin/activate
# uv pip install --pre vllm==0.10.1+gptoss \
#     --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
#     --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
#     --index-strategy unsafe-best-match

# Launch the API server:
# vllm serve openai/gpt-oss-20b
# or
# vllm serve openai/gpt-oss-120b

# from openai import OpenAI
# client = OpenAI(
#     base_url="http://localhost:8000/v1",
#     api_key="EMPTY"
# )