In [1]:
import pandas as pd

In [2]:
from openai import OpenAI

# Shared OpenAI client used by the request helpers defined further down.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm before a fresh run.
client = OpenAI()

In [3]:
# Read the transcript chunks and keep them as a list of per-row dicts
# (one dict per CSV row, keyed by column name) for prompt formatting.
df = pd.read_csv(
    '../data/transcript_chunk.csv',
    sep=',',
)
documents = df.to_dict(orient='records')

In [4]:
# Prompt sent to the model for a single transcript chunk.
# - {subtitle} and {text_chunk} are placeholders filled in with str.format().
# - The doubled braces around the JSON example are str.format() escapes; they
#   render as single literal braces in the final prompt.
# - .strip() drops the leading/trailing newlines introduced by the triple quotes.
prompt_template = """
You emulate a user of our YouTube assistant application.
Formulate 5 questions this user might ask based on a provided transcript chunk.
Make the questions specific to this transcript chunk.
The record should contain the answers to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record. 

The record:

subtitle: {subtitle}
text_chunk: {text_chunk}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [5]:
# Render the template for the first transcript chunk as a sample prompt.
# The template only references these two fields, so they are passed explicitly.
prompt = prompt_template.format(
    subtitle=documents[0]['subtitle'],
    text_chunk=documents[0]['text_chunk'],
)

In [6]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only ``user`` message of the request.
        model: Name of the chat model to query. Defaults to ``'gpt-4o-mini'``,
            the value this helper previously hard-coded, so existing
            ``llm(prompt)`` calls behave exactly as before.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [7]:
# Trial run: send the sample prompt and keep the model's raw text reply.
questions = llm(prompt=prompt)

In [8]:
import json

In [9]:
# Parse the raw reply to check that it is valid JSON; as the last expression
# in the cell, the decoded object is displayed as the cell output.
json.loads(questions)

{'questions': ['What is the purpose of the office hours session mentioned in the transcript?',
  'How many people joined the office hours according to the transcript?',
  "Who asked the question about the video's content during the session?",
  'Is this the first office hours session held, based on the transcript?',
  'What is the general tone of the speaker in the introduction?']}

In [10]:
def generate_questions(doc):
    """Ask the model for questions about one transcript chunk.

    Args:
        doc: Mapping whose entries fill the placeholders of
            ``prompt_template`` via ``str.format(**doc)``; keys the template
            does not reference are ignored.

    Returns:
        The message content of the first choice, as an unparsed JSON string.
        Decoding is left to the caller (e.g. ``json.loads``).
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the reply is decoded with json.loads downstream, so ask
        # the API to return a syntactically valid JSON object instead of
        # relying on the prompt wording alone (no code fences, no preamble).
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content


In [11]:
from tqdm.auto import tqdm

In [12]:
# Accumulator for generated questions: the generation loop stores one entry
# per document, keyed by its 'chunk_id', holding the list of questions.
# Re-running this cell discards everything collected so far.
results = {}

In [13]:
# Generate questions for every chunk that is not yet in `results`; chunks
# already present are left untouched, so re-running the cell only issues
# requests for the ones still missing.
for doc in tqdm(documents):
    doc_id = doc['chunk_id']
    if doc_id not in results:
        questions_raw = generate_questions(doc)
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']

  0%|          | 0/41 [00:00<?, ?it/s]

In [14]:
# Flatten {chunk id: [question, ...]} into one (chunk id, question) tuple per
# question, preserving the insertion order of `results`.
final_results = [
    (doc_id, q)
    for doc_id, questions in results.items()
    for q in questions
]

In [15]:
# Tabulate the (chunk id, question) pairs with explicit column names.
df_results = pd.DataFrame(
    data=final_results,
    columns=['id', 'question'],
)

In [16]:
# Persist the question table as CSV; index=False keeps the DataFrame's row
# index out of the file so only the 'id' and 'question' columns are written.
df_results.to_csv(
    path_or_buf='../data/ground-truth-retrieval.csv',
    index=False,
)

In [17]:
# Shell escape: print the first lines of the written CSV as a quick sanity
# check. Relies on a `head` executable being available to the notebook's shell.
!head ../data/ground-truth-retrieval.csv

id,question
0,What is the purpose of the office hours session mentioned in the transcript?
0,How many people have joined the office hours?
0,Who asked the first question during the office hours?
0,How long has it been since the last office hours session?
0,What greeting does the host use to welcome participants?
1,What specific topics will this video cover?
1,"Is it only about projects, or can we ask other questions too?"
1,Who asked the initial question in the video?
1,What is the main purpose of this video?
