In [1]:
from tqdm.auto import tqdm
import pandas as pd
from dotenv import load_dotenv
import os
from openai import OpenAI
import json

# Load variables from a local .env file into the process environment so the
# API key does not have to be hardcoded in the notebook.
load_dotenv()

# os.getenv returns None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the full POI table (path is relative to the notebook's directory).
df = pd.read_csv('../data/krakow_pois_for_rag.csv')

# NOTE(review): text_columns is not referenced by any other cell visible in
# this notebook — confirm whether it is still needed.
text_columns = ['name','amenity','leisure','natural','tourism','historic','wiki_summary_en']

In [3]:
# Single client instance reused by generate_questions() for every request.
openai_client = OpenAI(api_key=OPENAI_API_KEY)


In [4]:
df.shape

(11694, 49)

There are too many records, so I will keep every row that has a Wikipedia summary (`wiki_summary_en`) plus an equally sized random sample of rows without one.

In [5]:
# Keep every row that has a Wikipedia summary; 'no information' is the
# placeholder value this notebook treats as "no summary".
df_with_wiki = df[df['wiki_summary_en'] != 'no information']


In [6]:
df_with_wiki.shape

(335, 49)

In [7]:
# Draw as many rows without a summary as there are rows with one;
# random_state pins the draw so the same rows are selected on every run.
df_without_wiki = df[df['wiki_summary_en'] == 'no information'].sample(df_with_wiki.shape[0], random_state=123)

In [8]:
df_without_wiki.shape

(335, 49)

In [9]:
# Working set: all rows with a summary followed by the sampled rows without one.
# NOTE(review): this rebinds `df`, so the two filtering cells above operate on
# the reduced table if they are re-run without reloading the CSV first.
df = pd.concat([df_with_wiki, df_without_wiki])


In [10]:
df.shape

(670, 49)

In [12]:
# Save the selected subset with a header row and without the DataFrame index.
df.to_csv('../data/krakow_pois_selected.csv', index=False,header=True)

In [13]:
# One dict per row (column name -> value); each dict is later expanded into
# the prompt template by generate_questions().
documents = df.to_dict(orient='records')

In [14]:
# Prompt sent once per POI record. Every {placeholder} is filled from the
# record dict via str.format, so each column listed here must exist in the
# data; the literal braces of the JSON example are escaped as {{ }}.
# NOTE(review): the prompt tells the model to skip fields equal to
# "No data available", while the row filter earlier in this notebook treats
# 'no information' as the empty marker for wiki_summary_en — confirm which
# placeholder the CSV actually contains, otherwise this instruction may have
# no effect.
prompt_template = """
Generate a list of realistic and diverse questions that a traveler might ask a virtual travel assistant while visiting Kraków, Poland.

Based on the given attraction record, which is Krakow's POI (Point of interest), create 5 complete, specific questions the user might ask about the attraction. 
Only create questions for categories that contain information different than "No data available".
The questions must:

* Be relevant to the details in the record.
* Be clear and self-contained (not too short).
* Use as few exact words from the record as possible while keeping the meaning.

Attraction record format:

phone : {phone}
cemetery : {cemetery}
emergency : {emergency}
opening_hours : {opening_hours}
website : {website}
pets_allowed : {pets_allowed}
geometry : {geometry}
historic : {historic}
wiki_summary_en : {wiki_summary_en}
postal_code : {postal_code}
toilets : {toilets}
natural : {natural}
description : {description}
visiting_time : {visiting_time}
leisure : {leisure}
tourism : {tourism}
public_transport : {public_transport}
brand : {brand}
alt_name : {alt_name}
amenity : {amenity}
reservation : {reservation}
attraction : {attraction}
highchair : {highchair}
parking : {parking}
swimming_pool : {swimming_pool}
contact_phone : {contact_phone}
community_centre : {community_centre}
addr_street : {addr_street}
contact_twitter : {contact_twitter}
social_facility : {social_facility}
contact_facebook : {contact_facebook}
zoo : {zoo}
email : {email}
wheelchair : {wheelchair}
cuisine : {cuisine}
contact_website : {contact_website}
internet_access : {internet_access}
opening_hours_reception : {opening_hours_reception}
guest_house : {guest_house}
addr_city : {addr_city}
contact_instagram : {contact_instagram}
image : {image}
location : {location}
outdoor_seating : {outdoor_seating}
museum : {museum}
takeaway : {takeaway}
smoking : {smoking}
name : {name}
id : {id} 

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}

""".strip()

In [15]:
def generate_questions(doc, model='gpt-4o-mini'):
    """Ask the chat model for traveler questions about one POI record.

    Args:
        doc: Mapping with one value for every ``{placeholder}`` used in
            ``prompt_template`` (extra keys are ignored by ``str.format``).
        model: Name of the chat-completion model to call. Defaults to the
            model this notebook was originally run with.

    Returns:
        The raw message content returned by the model. JSON mode is
        requested, so the content is expected to be a JSON object string of
        the form ``{"questions": [...]}`` suitable for ``json.loads``.

    Raises:
        KeyError: If ``doc`` lacks a field referenced by the template.
    """
    prompt = prompt_template.format(**doc)

    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # JSON mode makes the API return syntactically valid JSON instead of
        # relying on the prompt alone. The API requires the word "JSON" to
        # appear in the messages, which the prompt template satisfies.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [16]:
documents[0]['name']

'Wierzynek'

In [17]:
# Smoke test on the first record before the full run; the value is the raw
# string returned by the model, not yet parsed.
questions = generate_questions(documents[0])

In [18]:

json.loads(questions)

{'questions': ['What are the unique dining rooms available at Wierzynek and do they have different themes?',
  "Can you tell me about the restaurant's history and any notable figures who have dined there?",
  'How many guests can Wierzynek accommodate at once?',
  "What are the restaurant's opening hours during the week?",
  'Is there an official website where I can check the menu or make reservations for Wierzynek?']}

In [19]:
# Generated questions keyed by document id. The generation loop skips ids
# already present here, so re-running THIS cell discards that progress and
# causes every document to be sent to the API again.
results = {}

In [None]:
import re

{'questions': ['What services are provided at the Oddział Rehabilitacji Kardiologicznej?', 'Is there any information available about the visiting hours or schedules for the facility?', 'Are there any specific requirements for patients wishing to use the rehabilitation services offered here?', 'Can I find assistance for someone with mobility challenges at the Oddział Rehabilitacji Kardiologicznej?', 'Is it possible to contact the rehabilitation department for inquiries or more information?', "Judging from the attraction's focus, are there any particular treatment programs offered?"]}


In [32]:
# Generate questions for every document that does not have them yet.
# `results` is filled incrementally, so this cell can be re-run after an
# interruption (or to retry failures) without repeating finished API calls.
failed_ids = []  # ids whose response could not be parsed on this pass

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:  # Skip if already processed
        continue

    questions_raw = generate_questions(doc)

    try:
        questions = json.loads(questions_raw)
    except json.JSONDecodeError:
        # Repair attempt: re-insert a missing `, "` between a question's
        # closing quote and the capital letter that starts the next one.
        # Only malformed responses reach this branch; valid JSON is never
        # rewritten.
        repaired = re.sub(r'(\?")\s*([A-Z])', r'?", "\2', questions_raw)
        try:
            questions = json.loads(repaired)
        except json.JSONDecodeError:
            # Do not abort the whole run for one bad response.
            failed_ids.append(doc_id)
            continue

    # Well-formed JSON can still lack the expected top-level key.
    if not isinstance(questions, dict) or 'questions' not in questions:
        failed_ids.append(doc_id)
        continue

    results[doc_id] = questions['questions']

if failed_ids:
    print(f'{len(failed_ids)} document(s) could not be parsed; re-run this cell to retry: {failed_ids}')

100%|██████████| 670/670 [07:35<00:00,  1.47it/s] 


In [33]:
# Flatten {doc_id: [question, ...]} into one (id, question) row per question,
# keeping the insertion order of `results`.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]

df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [34]:
df_results.tail(10)

Unnamed: 0,id,question
3330,9193,What is the best way to get to Bronowice SKA 0...
3331,9193,Could you provide information about the surrou...
3332,9193,Is there a place to park nearby if I plan to v...
3333,9193,Are there any notable landmarks or attractions...
3334,9193,What are some recommended activities or places...
3335,4984,What types of gardens can I expect to see at t...
3336,4984,Is there any designated area for leisure activ...
3337,4984,How would you suggest I spend my visiting time...
3338,4984,Are there any recommendations for nearby publi...
3339,4984,Are pets allowed in the garden at this location?


In [35]:
df_results.shape

(3340, 2)

In [36]:
# Write the (id, question) pairs to CSV without the DataFrame index.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)