In [1]:
import pandas as pd

In [4]:
from openai import OpenAI

# Shared API client used by every completion call in this notebook.
# No arguments are passed, so credentials/configuration must come from the
# SDK's defaults (presumably the OPENAI_API_KEY environment variable — confirm).
client = OpenAI()

In [5]:
# Load the exercise catalogue and turn every row into a plain dict keyed by
# column name. Later cells read doc['id'] and expand each dict into the prompt
# template, so the CSV is expected to provide `id` plus the template's fields.
df = pd.read_csv('../data/data.csv')
documents = df.to_dict(orient='records')

In [27]:
# Prompt sent to the model for each exercise record.
# - The single-brace fields are filled by str.format from one record dict.
# - The doubled braces around the JSON example are str.format escapes and
#   render as literal `{` / `}` in the final prompt.
# - .strip() drops the newlines introduced by the triple-quote layout.
prompt_template = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [28]:
# Render the template for the first record only; this prompt feeds the
# single-call cells below. Keys of the record that the template does not
# reference (e.g. `id`) are ignored by str.format.
prompt = prompt_template.format(**documents[0])

In [29]:
def llm(prompt):
    """Send ``prompt`` as one user message and return the reply text.

    Args:
        prompt: Full prompt text, used as the content of a single
            ``user``-role message.

    Returns:
        The ``message.content`` of the first choice in the completion
        returned by the module-level ``client``.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [30]:
# Single trial call: `questions` holds the raw reply text (a string, not yet
# parsed), which the next cells try to decode as JSON.
questions = llm(prompt)

In [31]:
import json

In [32]:
# Check that the trial reply decodes as JSON and display the parsed value;
# json.loads raises json.JSONDecodeError if the reply is not valid JSON.
json.loads(questions)

{'questions': ['What is the correct starting position for performing a push-up?',
  'Which muscle groups are primarily activated when doing push-ups?',
  'How do I properly lower my body during a push-up?',
  'What equipment do I need to perform push-ups?',
  'Can push-ups target the upper body effectively?']}

In [33]:
def generate_questions(doc):
    """Ask the model for questions about one exercise record.

    Args:
        doc: Mapping that provides every field referenced by
            ``prompt_template``. Extra keys (such as ``id``) are ignored by
            ``str.format``.

    Returns:
        The raw ``message.content`` string of the first choice. It is not
        parsed here; callers pass it to ``json.loads``.

    Raises:
        KeyError: If ``doc`` lacks a field referenced by ``prompt_template``.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: callers feed the reply straight into json.loads, so ask
        # the API for a JSON object instead of relying on the prompt wording
        # alone (which still mentions JSON, as JSON mode requires).
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [34]:
from tqdm.auto import tqdm

In [35]:
# Accumulator for the generation loop: document id -> list of questions.
# The loop skips ids already present here, so re-running this cell discards
# that progress (NOTE(review): presumably kept in its own cell for that reason).
results = {}

In [38]:
# Generate questions for every document whose id is not yet in `results`, so
# the cell can be re-run after an interruption without repeating earlier calls.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        # Raw reply text -> parsed dict -> list stored under the document id.
        questions_raw = generate_questions(doc)
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']

  0%|          | 0/207 [00:00<?, ?it/s]

In [42]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) tuple per
# question, preserving the dict's insertion order and each list's order.
final_results = [
    (doc_id, question)
    for doc_id, questions in results.items()
    for question in questions
]

In [43]:
# Peek at the first (doc_id, question) pair to check the flattened shape.
final_results[0]

(0, 'What is the starting position for doing push-ups?')

In [46]:
# One row per generated question: `id` is the source document id, `question`
# is the question text.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [48]:
# Persist the (id, question) pairs as CSV; index=False keeps the DataFrame's
# row index out of the file so it contains only the two named columns.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [49]:
# Preview the first lines of the exported file (IPython `!` shell escape).
!head ../data/ground-truth-retrieval.csv

id,question
0,What is the starting position for doing push-ups?
0,Which muscle groups are activated during push-ups?
0,How do you know when to push back up while doing push-ups?
0,Do you need any equipment to perform push-ups?
0,What part of the body do push-ups primarily target?
1,What is the proper stance for performing squats?
1,Which primary muscle groups are targeted when doing squats?
1,How should I position my chest during a squat?
1,What should I do with my feet while performing this exercise?
