In [1]:
import pandas as pd

## Ingestion

In [2]:
# Load the attractions dataset from the CSV file into a DataFrame.
df = pd.read_csv('../data/data.csv')

In [3]:
df

Unnamed: 0,id,attraction,activity_type,county,time_to_visit,description
0,1,Geirangerfjord,Sightseeing / Nature,Møre og Romsdal,"May, June, July, August, September","One of Norway’s most iconic fjords, surrounded..."
1,2,Preikestolen (Pulpit Rock),Hiking,Rogaland,"May, June, July, August, September, October",A flat plateau on a steep cliff 604 meters abo...
2,3,Trolltunga,Hiking,Vestland,"June, July, August, September",A famous rock ledge jutting horizontally out f...
3,4,Lofoten Islands,Sightseeing / Outdoor Activities,Nordland,"June, July, August","An Arctic archipelago with dramatic peaks, san..."
4,5,North Cape (Nordkapp),Sightseeing,Troms og Finnmark,"May, June, July, August",The northernmost point accessible by car in Eu...
...,...,...,...,...,...,...
145,146,Helgeland Coast,Sightseeing / Outdoor Activities,Nordland,"June, July, August",A stunning stretch of coastline with thousands...
146,147,Hjørundfjord,Sightseeing / Nature,Møre og Romsdal,"June, July, August, September",A scenic fjord surrounded by the Sunnmøre Alps...
147,148,Runde Bird Island,Wildlife / Sightseeing,Vestland,"April, May, June, July, August","An island famous for seabird colonies, includi..."
148,149,Fauske,Nature / Outdoor Activities,Nordland,"June, July, August, September",A town near mountains and glaciers. Popular fo...


In [4]:
df.columns

Index(['id', 'attraction', 'activity_type', 'county', 'time_to_visit',
       'description'],
      dtype='object')

In [5]:
# Convert the frame into a list of dicts, one per row, keyed by column name.
documents = df.to_dict(orient='records')

In [6]:
documents[0]

{'id': 1,
 'attraction': 'Geirangerfjord',
 'activity_type': 'Sightseeing / Nature',
 'county': 'Møre og Romsdal',
 'time_to_visit': 'May, June, July, August, September',
 'description': 'One of Norway’s most iconic fjords, surrounded by steep cliffs, waterfalls, and small farms. It is a UNESCO World Heritage Site and offers boat tours, kayaking, and breathtaking viewpoints such as Dalsnibba.'}

## Search

In [7]:
import minsearch

In [8]:
# Build the search index: the descriptive columns are registered as text
# fields, while `id` is registered as a keyword field.
index = minsearch.Index(
    text_fields=[
        'attraction',
        'activity_type',
        'county',
        'time_to_visit',
        'description',
    ],
    keyword_fields=['id'],
)

In [9]:
# Fit the index on all records so they can be searched.
index.fit(documents)

<minsearch.minsearch.Index at 0x75236ecee930>

In [10]:
def search(query, num_results=5):
    """Search the attraction index for records matching a query.

    Args:
        query: Free-text search string.
        num_results: Value passed to `index.search` as `num_results`.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        Whatever `index.search` returns for the query; an empty boost dict
        and an empty filter dict are passed.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=num_results,
    )

In [11]:
search('Hemsedal')

[{'id': 34,
  'attraction': 'Hemsedal',
  'activity_type': 'Skiing / Outdoor Activities',
  'county': 'Viken',
  'time_to_visit': 'December, January, February, March, April',
  'description': 'One of Norway’s top ski resorts. It offers alpine slopes, cross-country trails, and summer activities like hiking and mountain biking.'}]

## Make ground truth

In [12]:
def search(query, num_results=5):
    """Search the attraction index for records matching a query.

    NOTE(review): this redefines the `search` helper already declared in the
    "Search" section; once this cell runs, this definition is the one in
    effect. Consider keeping a single definition.

    Args:
        query: Free-text search string.
        num_results: Value passed to `index.search` as `num_results`.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        Whatever `index.search` returns for the query; an empty boost dict
        and an empty filter dict are passed.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=num_results,
    )

In [30]:
# Prompt used to generate ground-truth questions for one attraction record.
# The placeholders are filled with `str.format` from a record dict; the model
# is asked to answer with a bare JSON array so the reply can be parsed directly.
prompt_template = """
You emulate a user of our Norway guide application.
Formulate 5 questions this user might ask based on a provided tourist attraction in Norway. Formulate only the questions, not the answers.
The record provided to you should contain the answer to the question that you formulate.
The questions should be complete and not too short. If possible, use as few words as possible from the record.

The record:

attraction: {attraction}
activity_type: {activity_type}
county: {county}
time_to_visit: {time_to_visit}
description: {description}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [31]:
print(prompt_template.format(**documents[0]))

You emulate a user of our Norway guide application.
Formulate 5 questions this user might ask based on a provided tourist attraction in Norway. Formualte only the questions, not the answers.
The record provided to you should contain the answer to the question that you formulate.
The questions should be complete and not too short. If possible, use as fewer words as possible from the record. 

The record:

attraction: Geirangerfjord
activity_type: Sightseeing / Nature
county: Møre og Romsdal
time_to_visit: May, June, July, August, September
description: One of Norway’s most iconic fjords, surrounded by steep cliffs, waterfalls, and small farms. It is a UNESCO World Heritage Site and offers boat tours, kayaking, and breathtaking viewpoints such as Dalsnibba.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [32]:
# Render the prompt for the first record; used below for a single trial call.
prompt = prompt_template.format(**documents[0])

In [33]:
import openai
from openai import OpenAI
import os
from dotenv import load_dotenv

# Actually call load_dotenv(): the bare name `load_dotenv` was a no-op
# expression, so variables defined in a local .env file were never loaded.
load_dotenv()

# Pass the key explicitly so the value read here is the one the client uses.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [34]:
def llm(prompt):
    """Send `prompt` as a single user message and return the reply text.

    The request goes to the 'gpt-5-mini' chat model through the module-level
    `client`; the content of the first returned choice is handed back.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-5-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [35]:
# Trial run: raw reply text for the first record's prompt.
questions = llm(prompt)

In [36]:
import json

In [37]:
json.loads(questions)

['What makes Geirangerfjord famous?',
 'In which county is Geirangerfjord located?',
 'What months are best to visit Geirangerfjord?',
 'What activities can I do at Geirangerfjord?',
 'Are there any notable viewpoints at Geirangerfjord and which ones?']

In [38]:
def generate_questions(doc):
    """Ask the LLM for user-style questions about one attraction record.

    Args:
        doc: Mapping that supplies the placeholders used by `prompt_template`
            (attraction, activity_type, county, time_to_visit, description).
            Extra keys are ignored by `str.format`.

    Returns:
        The raw reply text from `llm`. The prompt asks for a JSON array of
        questions, but the text is returned unparsed.
    """
    # Reuse `llm` instead of repeating the chat-completion call here, so the
    # model name and request shape live in one place.
    prompt = prompt_template.format(**doc)
    return llm(prompt)

In [39]:
from tqdm.auto import tqdm

In [40]:
# Maps record id -> raw LLM reply text. Because this lives in its own cell,
# re-running only the generation loop keeps the replies collected so far.
results = {}

In [41]:
# Request questions for every record. Ids already present in `results` are
# skipped, so re-running this cell only asks for the ones still missing.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions = generate_questions(doc)
        results[doc_id] = questions

100%|██████████| 150/150 [26:20<00:00, 10.54s/it]


In [42]:
results

{1: '["In which county is Geirangerfjord located?", "What months are best for visiting Geirangerfjord?", "What activities are available at Geirangerfjord, such as boat tours or kayaking?", "Is Geirangerfjord designated as a UNESCO World Heritage Site?", "What natural features and viewpoints can I expect at Geirangerfjord, for example Dalsnibba?"]',
 2: '["How high is the plateau at Preikestolen above Lysefjord?", "In which county is Preikestolen located?", "How long does the hike to Preikestolen typically take each way?", "Which months are best for visiting Preikestolen?", "What kind of panoramic view can visitors expect from Preikestolen?"]',
 3: '["How long does the hike to Trolltunga typically take?", "When is the recommended time of year to visit Trolltunga?", "In which county is Trolltunga located?", "What type of activity is Trolltunga best known for?", "What views or scenery can I expect from the Trolltunga rock ledge?"]',
 4: '["In which county are the Lofoten Islands located?"

In [47]:
# Decode each raw JSON reply into a Python object, keyed by record id.
parsed_resulst = {
    doc_id: json.loads(raw_reply)
    for doc_id, raw_reply in results.items()
}

In [48]:
parsed_resulst

{1: ['In which county is Geirangerfjord located?',
  'What months are best for visiting Geirangerfjord?',
  'What activities are available at Geirangerfjord, such as boat tours or kayaking?',
  'Is Geirangerfjord designated as a UNESCO World Heritage Site?',
  'What natural features and viewpoints can I expect at Geirangerfjord, for example Dalsnibba?'],
 2: ['How high is the plateau at Preikestolen above Lysefjord?',
  'In which county is Preikestolen located?',
  'How long does the hike to Preikestolen typically take each way?',
  'Which months are best for visiting Preikestolen?',
  'What kind of panoramic view can visitors expect from Preikestolen?'],
 3: ['How long does the hike to Trolltunga typically take?',
  'When is the recommended time of year to visit Trolltunga?',
  'In which county is Trolltunga located?',
  'What type of activity is Trolltunga best known for?',
  'What views or scenery can I expect from the Trolltunga rock ledge?'],
 4: ['In which county are the Lofoten 

In [49]:
# Lookup table from record id to the full record.
# NOTE(review): not referenced by the cells shown after this one.
doc_index = {d['id']: d for d in documents}

In [56]:
# Flatten the per-record question lists into (record id, question) pairs.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in parsed_resulst.items()
    for question in doc_questions
]

In [57]:
final_results

[(1, 'In which county is Geirangerfjord located?'),
 (1, 'What months are best for visiting Geirangerfjord?'),
 (1,
  'What activities are available at Geirangerfjord, such as boat tours or kayaking?'),
 (1, 'Is Geirangerfjord designated as a UNESCO World Heritage Site?'),
 (1,
  'What natural features and viewpoints can I expect at Geirangerfjord, for example Dalsnibba?'),
 (2, 'How high is the plateau at Preikestolen above Lysefjord?'),
 (2, 'In which county is Preikestolen located?'),
 (2, 'How long does the hike to Preikestolen typically take each way?'),
 (2, 'Which months are best for visiting Preikestolen?'),
 (2, 'What kind of panoramic view can visitors expect from Preikestolen?'),
 (3, 'How long does the hike to Trolltunga typically take?'),
 (3, 'When is the recommended time of year to visit Trolltunga?'),
 (3, 'In which county is Trolltunga located?'),
 (3, 'What type of activity is Trolltunga best known for?'),
 (3, 'What views or scenery can I expect from the Trolltunga r

In [58]:
# Tabulate the (id, question) pairs, one row per generated question.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [59]:
df_results

Unnamed: 0,id,question
0,1,In which county is Geirangerfjord located?
1,1,What months are best for visiting Geirangerfjord?
2,1,What activities are available at Geirangerfjor...
3,1,Is Geirangerfjord designated as a UNESCO World...
4,1,What natural features and viewpoints can I exp...
...,...,...
745,150,In which county is Romsdalsfjord located?
746,150,What months are best to visit Romsdalsfjord?
747,150,What outdoor activities are recommended at Rom...
748,150,What natural features and sights make Romsdals...


In [None]:
# Write the ground-truth questions to CSV without the DataFrame index column.
df_results.to_csv('../data/ground-truth-data.csv', index=False)