# Zero-shot KI retrieval workflow (OpenAI)

First, load the required libraries (including the OpenAI Python client) and initialize the relevant variables.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
from pprint import pprint 
from openai import OpenAI
from textwrap import wrap
from time import localtime, strftime, sleep

# Initialize our OpenAI key: the first line of the key file holds the key.
api_key_file = "/Users/tobo/Work/Experiments/DS-tryouts/openai-api-key.txt"
with open(api_key_file, 'r') as f:
    # readline() returns "" for an empty file, so api_key is always defined.
    api_key = f.readline().strip()

# The OpenAI client in this notebook is created without an explicit key, in
# which case it reads the key from the OPENAI_API_KEY environment variable.
# Export the key loaded above so it is actually used, unless the environment
# already provides one (an existing value is left untouched).
if api_key:
    os.environ.setdefault("OPENAI_API_KEY", api_key)

# Initialize variables.
N = 20   # Number of ranked guesses requested in each prompt
K = 3    # Number of responses requested per prompt (passed as 'n' to the API)


The function below takes a file containing first posts, converts them into the right prompts and runs them against the OpenAI API using the specified model.

In [15]:
def _now():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return strftime("%Y-%m-%d %H:%M:%S", localtime())


def _build_prompt(request, domain, n_guesses):
    """Build the line-wrapped prompt text for a single request.

    'domain' is the plural domain name taken from the file name (e.g. "books");
    its singular form is derived by dropping the last character.
    """
    domain_singular = domain[:-1]

    # Books are identified by title and author, every other domain by title and release year.
    answer_fields = "title and author" if domain == "books" else "title and release year"

    prompt_start = f"Identify the {domain_singular} the user is looking for as described in the request below:"
    prompt_middle = f"Request: \"{request}\""
    prompt_end = f"Please provide a ranked list of your {n_guesses} best guesses for the correct answer. Please answer in a JSON object that contains a ranked list of suggestions. Each suggestion should contain a field called 'answer' containing the suggestion ({answer_fields}), a field 'explanation' containing an explanation of why these {domain_singular}s could be the correct answer, and a 'confidence' score that represents how confident you are of your suggestion."

    # Wrap every part separately and separate the parts with an empty line.
    wrapped_parts = ["\n".join(wrap(part)) for part in (prompt_start, prompt_middle, prompt_end)]
    return "\n\n".join(wrapped_parts)


def process_first_posts(first_posts_file, model, silent=False):
    """Turn all first posts in a file into prompts and run them against the OpenAI API.

    For every row of 'first_posts_file' the prompt is saved as
    'prompt.<thread_id>.txt' and, unless the thread was processed before, sent
    to the API; each returned response is saved as
    '<thread_id>.<model>.v<k>.json'. All files go into a results directory
    named '<model>.<domain>.<data_type>.<source>' below the current working
    directory. Every newly processed thread is also appended to
    'timestamp-log.tsv' in the current working directory.

    Uses the module-level variables N (number of guesses asked for in the
    prompt) and K (number of responses requested per prompt).

    Parameters:
        first_posts_file: path to a tab-separated file with a header row and
            the columns 'thread_id' and 'request'. Its file name must have the
            form '<prefix>.<data_type>.<domain>.<source>.<extension>'.
        model: name of the OpenAI model to run the prompts against.
        silent: set to True to disable the progress output.

    Raises:
        ValueError: if the file name does not consist of five dot-separated parts.
    """

    # Keep stats on current call of this method.
    N_requests_processed_all = 0
    N_requests_processed_new = 0
    N_json_files_created = 0

    # Extract the metadata from the file name. Only the base name is parsed, so
    # dots in a directory part of the path cannot break the split.
    name_parts = os.path.basename(first_posts_file).split(".")
    if len(name_parts) != 5:
        raise ValueError(
            f"Expected a file name of the form '<prefix>.<data_type>.<domain>.<source>.<extension>', "
            f"got '{first_posts_file}'"
        )
    _, data_type, domain, source, _ = name_parts
    metadata_string = f"{model}.{domain}.{data_type}.{source}"

    # Print out the selected model.
    if not silent:
        print(f"{_now()}\tRunning against OpenAI API ({metadata_string})")

    # Read in all first posts from file into a Pandas dataframe.
    all_posts = pd.read_table(first_posts_file, header=0)

    # Where are we going to save the API responses? Make sure this results directory exists before saving to it.
    results_dir_path = os.path.join(os.getcwd(), metadata_string)
    if not os.path.exists(results_dir_path):
        if not silent:
            print(f"{_now()}\t{results_dir_path} does not exist!")
        os.makedirs(results_dir_path)

    # Also create an empty version of the timestamp log file, if it does not exist yet.
    timestamp_log_path = os.path.join(os.getcwd(), "timestamp-log.tsv")
    if not os.path.exists(timestamp_log_path):
        timestamp_header = "domain\tthread_id\tmodel\ttimestamp_processed\n"
        with open(timestamp_log_path, 'a', encoding="utf-8") as f:
            f.write(timestamp_header)
        if not silent:
            print(f"Creating timestamp log file {timestamp_log_path} at {_now()}")

    # The API client is only created once it is actually needed, so a run in
    # which every thread has already been processed never touches the API setup.
    client = None

    # For each post, transform the request into a prompt and run it. Iterating
    # over the two columns in lockstep avoids a dataframe lookup per thread.
    thread_ids = all_posts['thread_id'].tolist()
    requests = all_posts['request'].tolist()
    for thread_id, request in zip(thread_ids, requests):

        # Create the appropriate prompt from the raw post text.
        prompt_text = _build_prompt(request, domain, N)

        # Save the prompt to file (so we can easily run it manually if we wish).
        prompt_file = os.path.join(results_dir_path, f"prompt.{thread_id}.txt")
        with open(prompt_file, "w", encoding="utf-8") as f:
            f.write(prompt_text)

        # Update the stats.
        N_requests_processed_all += 1

        # Have we already processed this one? If so, skip to the next one. Only
        # the v1 file is checked; the remaining versions are written together with it.
        first_output_file = os.path.join(results_dir_path, f"{thread_id}.{model}.v1.json")
        now = _now()
        if os.path.exists(first_output_file):
            if not silent:
                print(f"{now}\tAlready processed thread '{thread_id}'")
            continue

        # Run the prompt against OpenAI's API (pausing a second between requests).
        sleep(1)
        if not silent:
            print(f"{now}\tProcessing of thread '{thread_id}'")
        if client is None:
            client = OpenAI()
        completion = client.chat.completions.create(
            model=model,
            n=K,
            messages=[{"role": "user", "content": prompt_text}]
        )

        # Process the response(s) and save to file(s).
        no_of_responses = len(completion.choices)
        now = _now()   # Taken unconditionally, because it is also written to the timestamp log.
        if not silent:
            print(f"{now}\t  * Saving {no_of_responses} responses to file")
        for version, choice in enumerate(completion.choices, start=1):
            output_name = f"{thread_id}.{model}.v{version}.json"
            if not silent:
                print(f"{now}\t    - {output_name}")

            # Extract the generated answer from the response message.
            generated_answer = choice.message.content

            # Remove the enclosing ```json and ``` lines.
            generated_answer = generated_answer.replace("```json", "").replace("```", "")

            # Save to file.
            with open(os.path.join(results_dir_path, output_name), "w", encoding="utf-8") as f:
                f.write(generated_answer)

        # Update the stats, counting the files that were actually written.
        N_requests_processed_new += 1
        N_json_files_created += no_of_responses

        # Record the current timestamp for this response in a separate file.
        timestamp_entry = f"{domain}\t{thread_id}\t{model}\t{now}\n"
        with open(timestamp_log_path, 'a', encoding="utf-8") as f:
            f.write(timestamp_entry)

    if not silent:
        now = _now()
        print(f"{now}\tDone with current crawl call")
        print(f"{now}\t  * Total requests processed: {N_requests_processed_all}")
        print(f"{now}\t  * New requests processed: {N_requests_processed_new}")
        print(f"{now}\t  * New JSON files created: {N_json_files_created}")


In [None]:
# Run every model against every first-posts file, one dataset at a time.
# Within a dataset all files are processed with one model before moving on
# to the next model.
datasets = [
    # Books - Extra - Goodreads
    [
        "first-posts.extra.books.goodreads-20250415.tsv",
        "first-posts.extra.books.goodreads-20250525.tsv",
    ],
    # Games - Extra - Reddit
    [
        "first-posts.extra.games.reddit-202504xx.tsv",
        "first-posts.extra.games.reddit-202505xx.tsv",
    ],
]

# Which OpenAI models will we use? "gpt-4o" is currently disabled for all
# of the files above.
models = ["gpt-4o-mini", "gpt-3.5-turbo"]

for dataset_files in datasets:
    for model in models:
        for first_posts_file in dataset_files:
            process_first_posts(first_posts_file, model)

# Shouldn't do "first-posts.extra.games.reddit-202506xx.tsv" (with any of
# "gpt-4o-mini", "gpt-3.5-turbo" or "gpt-4o") until the end of June 2025, as it
# is a monthly crawl and this wouldn't be a true random sample.




2025-06-19 15:53:04	Running against OpenAI API (gpt-4o-mini.books.extra.goodreads-20250415)
2025-06-19 15:53:04	Already processed thread '23100744'
2025-06-19 15:53:04	Already processed thread '23098575'
2025-06-19 15:53:04	Already processed thread '23097271'
2025-06-19 15:53:04	Already processed thread '1342078'
2025-06-19 15:53:04	Already processed thread '23102942'
2025-06-19 15:53:04	Already processed thread '23049605'
2025-06-19 15:53:05	Already processed thread '22561810'
2025-06-19 15:53:05	Already processed thread '23100145'
2025-06-19 15:53:05	Already processed thread '23098250'
2025-06-19 15:53:05	Already processed thread '23101518'
2025-06-19 15:53:05	Already processed thread '23090864'
2025-06-19 15:53:05	Already processed thread '23106345'
2025-06-19 15:53:05	Already processed thread '23098560'
2025-06-19 15:53:05	Already processed thread '23102847'
2025-06-19 15:53:05	Already processed thread '23099349'
2025-06-19 15:53:05	Already processed thread '23091078'
2025-06-19 15

To make things easier for us in the future, let's create some special subsets of this dataset.

## To do

- Add crawling timestamp
- Add the OpenAI key to my ENV so I don't have to read it from file.