In [1]:
import pandas as pd

## Ingestion

In [2]:
import json

with open('../Data/documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [3]:
import minsearch

In [4]:
import minsearch

index = minsearch.Index(
    text_fields=['intent', 'question', 'response', 'category'],
    keyword_fields=['id']
)

In [5]:
index.fit(documents)

<minsearch.Index at 0x7cc9f61dfb30>

## RAG flow

In [7]:
from openai import OpenAI

client = OpenAI(
)

In [8]:
def search(query):
    boost = {'instruction': 3.0, 'intent': 0.5}

    results = index.search(
        query=query,
        filter_dict={'category': 'ORDER'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [9]:
def build_prompt(query, search_results):
    prompt_template = """
    You are a customer support assistant. Answer the following question based on the information provided in the CONTEXT from our knowledge base. 

    Use only the facts from the CONTEXT to respond accurately.

    QUESTION: {instruction}

    CONTEXT:
    {context}
    """.strip()


    context = ""
    
    for doc in search_results:
        context = context + f"intent: {doc['intent']}\\question: {doc['question']}\\nanswer: {doc['response']}\\n\\n"
    
    prompt = prompt_template.format(instruction=query, context=context).strip()
    return prompt

def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )
      
    return response

In [10]:

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer.choices[0].message.content

In [11]:
question = 'I need assistance to shop some articles.'
answer = rag(question)
print(answer)

Thank you for reaching out! I'm here to provide you with the assistance you need in shopping for the articles you desire. To begin, could you please let me know the specific articles you are interested in? Providing me with additional details such as preferences, sizes, or colors will allow me to tailor my recommendations to your needs. Together, we'll make sure you find the perfect articles and have a seamless shopping experience.


## Retrieval evaluation

In [12]:
df_question = pd.read_csv('../Data/ground-truth-data.csv')

In [13]:
df_question.head()

Unnamed: 0,question,category,document
0,I'm trying to cancel order {{Order Number}} as...,ORDER,4474359
1,What should I do after logging into my online ...,ORDER,4474359
2,How can I find and select my specific '{{Onlin...,ORDER,4474359
3,Are there additional steps or is it just navig...,ORDER,4474359
4,If I encounter any issues while following thes...,ORDER,4474359


In [14]:
ground_truth = df_question.to_dict(orient='records')

In [15]:
ground_truth[0]

{'question': "I'm trying to cancel order {{Order Number}} as stated in the record.",
 'category': 'ORDER',
 'document': '04474359'}

In [16]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [17]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [18]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [19]:
from tqdm.auto import tqdm

In [20]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1667 [00:00<?, ?it/s]

{'hit_rate': 0.21415716856628675, 'mrr': 0.09635239618742919}

## Finding the best parameters

In [21]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [22]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [23]:
gt_val = df_validation.to_dict(orient='records')

In [24]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [25]:
param_ranges = {
    'intent': (0.0, 3.0),
    'category': (0.0, 3.0),
    'question': (0.0, 3.0),
    'response': (0.0, 3.0)
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [26]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'intent': 0.7205284327307347,
  'category': 2.8106613005066308,
  'question': 0.5505803374561609,
  'response': 2.963078424135383},
 0.12219841269841272)

In [27]:
def minsearch_improved(query):
    boost = {
       'intent': 2.85,
        'category': 0.12,
        'question': 0.03,
        'response': 0.96
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1667 [00:00<?, ?it/s]

{'hit_rate': 0.27714457108578283, 'mrr': 0.12176517077536858}

## RAG evaluation

In [33]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [34]:
len(ground_truth)

1667

In [35]:
record = ground_truth[0]


In [36]:
question

'I need assistance to shop some articles.'

In [37]:
answer_llm = answer

In [38]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: I need assistance to shop some articles.
Generated Answer: Thank you for reaching out! I'm here to provide you with the assistance you need in shopping for the articles you desire. To begin, could you please let me know the specific articles you are interested in? Providing me with additional details such as preferences, sizes, or colors will allow me to tailor my recommendations to your needs. Together, we'll make sure you find the perfect articles and have a seamless shopping experience.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_

In [39]:
import json

In [40]:
df_sample = df_question.sample(n=200, random_state=1)

In [41]:
sample = df_sample.to_dict(orient='records')

In [42]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation_response = llm(prompt).choices[0].message.content
    evaluation = json.loads(evaluation_response)

    evaluations.append((record, answer_llm, evaluation))
    with open('my_dict.json', 'w') as json_file:
        json.dump(evaluations, json_file)

  0%|          | 0/200 [00:00<?, ?it/s]

In [44]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['document'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [45]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.78
PARTLY_RELEVANT    0.19
NON_RELEVANT       0.03
Name: proportion, dtype: float64

In [47]:
df_eval.to_csv('../Data/rag-eval-gpt-4o-mini.csv', index=False)

In [None]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

In [104]:
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)