In [1]:
import pandas as pd

## RAG flow

In [2]:
df = pd.read_csv('../data/data.csv')

In [3]:
documents = df.to_dict(orient='records')

In [4]:
documents[0]

{'id': 1,
 'attraction': 'Geirangerfjord',
 'activity_type': 'Sightseeing / Nature',
 'county': 'Møre og Romsdal',
 'time_to_visit': 'May, June, July, August, September',
 'description': 'One of Norway’s most iconic fjords, surrounded by steep cliffs, waterfalls, and small farms. It is a UNESCO World Heritage Site and offers boat tours, kayaking, and breathtaking viewpoints such as Dalsnibba.'}

In [5]:
import minsearch

In [6]:
index = minsearch.Index(
    text_fields = ['attraction', 'activity_type', 'county', 'time_to_visit',
       'description'],
    keyword_fields=['id']
)

In [7]:
index.fit(documents)

<minsearch.minsearch.Index at 0x79b3172389b0>

In [8]:
query = 'give me one hiking in Rogaland'

In [9]:
def search(query):
    boost = {
        'attraction': 3,
        'activity_type': 2,
        'county': 2,
        'time_to_visit': 1,
        'description': 1
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results

In [10]:
prompt_template = """
You're a tourist guide in Norway. Answer the QUESTION based on the CONTEXT from our attractions database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


entry_template = """
attraction: {attraction}
activity_type: {activity_type}
county: {county}
time_to_visit: {time_to_visit}
description: {description}
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [13]:
search_results=search(query)

In [14]:
prompt = build_prompt(query, search_results)
print(prompt)

You're a tourist guide in Norway. Answer the QUESTION based on the CONTEXT from our attractions database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: give me one hiking in Rogaland

CONTEXT:
attraction: Preikestolen (Pulpit Rock)
activity_type: Hiking
county: Rogaland
time_to_visit: May, June, July, August, September, October
description: A flat plateau on a steep cliff 604 meters above Lysefjord. The hike takes 2–4 hours each way and rewards visitors with one of Norway’s most photographed panoramic views.

attraction: Kjeragbolten
activity_type: Hiking
county: Rogaland
time_to_visit: June, July, August, September
description: A massive boulder wedged between cliffs 984 meters above Lysefjord. Adventurous hikers can step onto the rock for dramatic photos, while the hike also offers views of waterfalls and mountains.

attraction: Kjerag Waterfalls
activity_type: Nature / Hiking
county: Rogaland
time_to_visit: June, July, August, September
description: Mas

In [15]:
import openai
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI()

In [52]:
def llm(prompt, model = 'gpt_5-mini'):
    response = client.chat.completions.create(
        model = model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [55]:
def rag(query, model='gpt-5-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model = model)
    return answer

In [56]:
answer = rag('I want to visit a beautifull fjord in Norway in December')
print(answer)

From the attractions listed, take the Hurtigruten Coastal Voyage. It runs year‑round (including December) and offers breathtaking fjords plus northern lights in winter.

Note: Balestrand is a picturesque village on the Sognefjord, but its listed best time to visit is May–September (not December).


## test LLM as a judge

In [20]:
df_question = pd.read_csv('../data/ground-truth-data.csv')

In [21]:
ground_truth = df_question.to_dict(orient='records')

In [22]:
ground_truth[0]

{'id': 1, 'question': 'In which county is Geirangerfjord located?'}

In [23]:
len(ground_truth)

750

In [35]:
prompt_judge_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [26]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [30]:
print(answer_llm)

Geirangerfjord is located in Møre og Romsdal county.


In [36]:
prompt = prompt_judge_template.format(question = question, answer_llm = answer_llm)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: In which county is Geirangerfjord located?
Generated Answer: Geirangerfjord is located in Møre og Romsdal county.

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [32]:
llm(prompt)

'{\n  "Relevance": "RELEVANT",\n  "Explanation": "The generated answer directly and correctly names the county (Møre og Romsdal) where Geirangerfjord is located, matching the expected/original answer."\n}'

## Use LLM as a judge

In [33]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
df_sample = df_question.sample(n=100, random_state=1)
sample = df_sample.to_dict(orient='records')

In [58]:
sample

[{'id': 126, 'question': 'Which county is Åndalsnes in?'},
 {'id': 64, 'question': 'Does Drammensbadet have pools, slides and saunas?'},
 {'id': 68,
  'question': 'How well preserved is Fredrikstad Fortress compared to other fortified towns in Europe?'},
 {'id': 120, 'question': 'Is Vangskyrkja still used for services today?'},
 {'id': 19, 'question': 'Which county is Tromsø in?'},
 {'id': 134,
  'question': 'Which months are best to visit Sunndalsøra for hiking and nature activities?'},
 {'id': 53, 'question': 'What activities can you do at Gaustatoppen?'},
 {'id': 17, 'question': 'During which months can I visit Nidaros Cathedral?'},
 {'id': 118, 'question': 'Which county is Grimstad located in?'},
 {'id': 18, 'question': 'How long is Lysefjord?'},
 {'id': 114,
  'question': 'Was Steinkjer Church built before or after World War II?'},
 {'id': 133,
  'question': 'Are there opportunities to experience Sami culture in Malangen?'},
 {'id': 44, 'question': 'Which months are best to visit 

In [47]:
import json

In [86]:
evaluations = []

In [87]:
for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-5-nano')

    prompt = prompt_judge_template.format(question=question, answer_llm=answer_llm)
    evaluation = llm(prompt, model='gpt-5-mini')
    evaluation = json.loads(evaluation)
    
    evaluations.append((record, answer_llm, evaluation))


100%|██████████| 100/100 [13:45<00:00,  8.26s/it]


In [88]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

In [89]:
df_eval

Unnamed: 0,record,answer,evaluation
0,"{'id': 126, 'question': 'Which county is Åndal...",Møre og Romsdal.,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
1,"{'id': 64, 'question': 'Does Drammensbadet hav...",Yes. Drammensbadet is described as Norway’s la...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
2,"{'id': 68, 'question': 'How well preserved is ...",Fredrikstad Fortress is one of Europe’s best-p...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
3,"{'id': 120, 'question': 'Is Vangskyrkja still ...",Yes. Vangskyrkja is still in use today; it is ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
4,"{'id': 19, 'question': 'Which county is Tromsø...",Troms og Finnmark.,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
...,...,...,...
95,"{'id': 120, 'question': 'From which century do...",The Vangskyrkja dates from the 13th century.,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
96,"{'id': 1, 'question': 'Is Geirangerfjord desig...",Yes. Geirangerfjord is a UNESCO World Heritage...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
97,"{'id': 125, 'question': 'During which months c...","January, February, March, April, May, June, Ju...","{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
98,"{'id': 89, 'question': 'Which months are best ...","June, July, August, and September.","{'Relevance': 'RELEVANT', 'Explanation': 'The ..."


In [90]:
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])
df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])
del df_eval['record']
del df_eval['evaluation']

In [91]:
df_eval

Unnamed: 0,answer,id,question,relevance,explanation
0,Møre og Romsdal.,126,Which county is Åndalsnes in?,RELEVANT,The generated answer correctly names the count...
1,Yes. Drammensbadet is described as Norway’s la...,64,"Does Drammensbadet have pools, slides and saunas?",RELEVANT,The generated answer directly affirms that Dra...
2,Fredrikstad Fortress is one of Europe’s best-p...,68,How well preserved is Fredrikstad Fortress com...,RELEVANT,The generated answer directly addresses the co...
3,Yes. Vangskyrkja is still in use today; it is ...,120,Is Vangskyrkja still used for services today?,RELEVANT,The generated answer directly affirms that Van...
4,Troms og Finnmark.,19,Which county is Tromsø in?,RELEVANT,The generated answer correctly identifies Trom...
...,...,...,...,...,...
95,The Vangskyrkja dates from the 13th century.,120,From which century does Vangskyrkja date?,RELEVANT,The generated answer states that Vangskyrkja d...
96,Yes. Geirangerfjord is a UNESCO World Heritage...,1,Is Geirangerfjord designated as a UNESCO World...,RELEVANT,The generated answer directly and correctly af...
97,"January, February, March, April, May, June, Ju...",125,During which months can I visit Fjellheisen Tr...,RELEVANT,The generated answer directly addresses the qu...
98,"June, July, August, and September.",89,Which months are best for visiting Mjøsa Lake?,RELEVANT,The generated answer directly and succinctly l...


In [92]:
df_eval.relevance.value_counts()

relevance
RELEVANT           82
PARTLY_RELEVANT    15
NON_RELEVANT        3
Name: count, dtype: int64

In [93]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
49,It runs through Vestland and Innlandet counties.,98,Which counties does Rallarvegen run through?,NON_RELEVANT,The generated answer is incorrect. Rallarvegen...
64,Runde Bird Island is located in Vestland county.,148,Which county is Runde Bird Island located in?,NON_RELEVANT,"The generated answer names Vestland county, wh..."
68,Vestland.,120,In which county is Vangskyrkja located?,NON_RELEVANT,The answer 'Vestland' is incorrect. Vangskyrkj...
