In [1]:
import sys

# Extend the module search path so the `cards` import below can resolve.
# NOTE(review): this is an absolute, environment-specific path — presumably the
# project root that contains cards.py; confirm, and consider deriving it from a
# configurable/relative location so the notebook runs on other machines.
sys.path.append('/workspaces/llm-zoomcamp-project/')

from cards import make_table_cards, make_column_cards, make_example_cards

In [2]:
# Build the example cards; the dump further down in this notebook shows each one
# is a dict with 'type' and 'text' keys, where 'text' holds "Q: ...\nSQL: ...".
example_cards = make_example_cards()

In [3]:
# Prompt sent to the model by generate_questions(); the placeholders are filled
# via str.format(**doc).
# NOTE(review): the template requires 'section', 'question' and 'text' keys and
# is worded for a course FAQ, while the example cards shown in this notebook only
# carry 'type' and 'text' — formatting one of those cards would raise KeyError.
# Confirm which documents this prompt is meant for.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [4]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment first, then look the key up
# there. api_key ends up as None when OPENAI_API_KEY is not set anywhere.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [5]:
from openai import OpenAI
# Shared client used by generate_questions(); it is given the api_key value read
# from the environment in the previous cell.
client = OpenAI(api_key=api_key)

In [None]:
def generate_questions(doc):
    """Ask the chat model for questions about ``doc`` and return its raw reply.

    ``doc`` is unpacked into ``prompt_template.format``, so it must provide every
    placeholder the template uses (a missing one raises ``KeyError``).

    Returns the message content of the first choice exactly as received. The
    prompt asks for a JSON array, but nothing here parses or validates it.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [9]:
# Print the question line of every example card. A card's 'text' is
# "Q: <question>\nSQL: <query>", so the question is everything before the first
# newline.
for doc in example_cards:
    full_text = doc.get('text', '')  # cards without a 'text' key yield ''
    if not full_text:
        continue  # nothing to show for empty/missing text

    # partition() always returns a 3-tuple, so a text with no newline simply
    # comes back whole as the question. The previous `except IndexError` around
    # split('\n')[0] could never fire and has been removed.
    question, _, _ = full_text.partition('\n')
    print(question.strip())

Q: average bed height for Jig-1 in last 10 hours
Q: tailings chrome by machine in last 24 hours
Q: hourly make-up water flow for Jig-2 yesterday
Q: average pulsation frequency for all machines in the last 6 hours
Q: average water flow for Jig-2 in the last 3 days
Q: average clayness index for each machine yesterday
Q: correlation between clayness index and tailings chrome for Jig-1 in the last 12 hours
Q: maximum bed height for each machine today
Q: hourly average of water flow and clayness for Jig-1 over the last 8 hours
Q: difference between maximum and minimum bed height for Jig-2 in the last day
Q: hourly average tailings chrome for each machine today
Q: number of telemetry records collected per sensor in the last 24 hours
Q: average chrome content in tailings per area during the last week
Q: count of machines in the Flotation area
Q: which machine has the sensor with the highest average clayness index over the last week
Q: average water flow when tailings chrome was above 0.5 perc

In [3]:
# Display the raw example cards to inspect their structure.
example_cards

[{'type': 'example',
  'text': "Q: average bed height for Jig-1 in last 10 hours\nSQL: SELECT AVG(t.value) AS avg_bed_height_mm FROM telemetry t JOIN sensors s ON t.sensor_id=s.sensor_id JOIN machines m ON s.machine_id=m.machine_id WHERE s.name='bed_height_mm' AND m.name='Jig-1' AND t.ts >= CURRENT_TIMESTAMP - INTERVAL '10 hours';"},
 {'type': 'example',
  'text': "Q: tailings chrome by machine in last 24 hours\nSQL: SELECT m.name, AVG(l.tailings_cr2o3_pct) AS avg_cr FROM lab_samples l JOIN machines m ON l.machine_id=m.machine_id WHERE l.ts >= CURRENT_TIMESTAMP - INTERVAL '24 hours' GROUP BY m.name ORDER BY m.name;"},
 {'type': 'example',
  'text': "Q: hourly make-up water flow for Jig-2 yesterday\nSQL: SELECT date_trunc('hour', t.ts) AS h, AVG(t.value) AS water_m3h FROM telemetry t JOIN sensors s ON t.sensor_id=s.sensor_id JOIN machines m ON s.machine_id=m.machine_id WHERE s.name='water_flow_m3h' AND m.name='Jig-2' AND t.ts::date = (CURRENT_DATE - INTERVAL '1 day')::date GROUP BY 1 OR