# Zero-shot KI retrieval workflow (OpenAI)

First, load the required libraries (including the OpenAI client library) and initialize the relevant variables.

In [2]:
import os
import sys
import numpy as np
import pandas as pd
from pprint import pprint 
from openai import OpenAI
from textwrap import wrap
from time import localtime, strftime, sleep

# # Initialize our OpenAI key.
# api_key_file = "/Users/tobo/Work/Experiments/DS-tryouts/openai-api-key.txt"
# with open(api_key_file, 'r') as f:
#     for line in f:
#         api_key = line.strip()
#         break

# Initialize variables.
# Both constants are read as module-level globals by process_first_posts().
N = 20   # Number of results to return (the number of ranked guesses requested in each prompt)
K = 3    # Number of times to run the same prompt (passed to the API as its `n` parameter)


The function below takes a file containing first posts, converts them into the right prompts and runs them against the OpenAI API using the specified model.

In [1]:

def process_first_posts(first_posts_file, model, silent=False):
    """Run every first post in a TSV file through the OpenAI chat API.

    Each row's request is converted into a prompt that asks for a ranked JSON
    list of ``N`` guesses (``N`` and ``K`` are module-level constants). The
    prompt is saved as ``prompt.<thread_id>.txt`` and each of the ``K``
    completions requested for it is saved as ``<thread_id>.<model>.v<k>.json``.
    All files go into a results directory named
    ``<model>.<domain>.<data_type>.<source>`` under the current working
    directory, which is created if necessary.

    Threads whose ``v1`` output file already exists are skipped, so an
    interrupted run can simply be restarted.

    Args:
        first_posts_file: Path to a tab-separated file with a header row and
            at least the columns ``thread_id`` and ``request``. Its file name
            must have the form
            ``<prefix>.<data_type>.<domain>.<source>.<ext>``; ``domain`` is
            expected to be a plural such as ``books``, because its last
            character is dropped to build the singular used in the prompt.
        model: Name of the OpenAI chat model to query.
        silent: If true, suppress the progress messages printed to stdout.

    Raises:
        ValueError: If the file name does not consist of exactly five
            dot-separated parts.
    """

    def log(message):
        # Progress lines carry the time at which they are printed.
        if not silent:
            now = strftime("%Y-%m-%d %H:%M:%S", localtime())
            print(f"{now}\t{message}")

    # Extract the metadata from the file name only, so that a directory prefix
    # (which may itself contain dots) does not break the parsing.
    name_parts = os.path.basename(first_posts_file).split(".")
    if len(name_parts) != 5:
        raise ValueError(
            "Expected a file name of the form "
            "'<prefix>.<data_type>.<domain>.<source>.<ext>', "
            f"got '{first_posts_file}'"
        )
    _, data_type, domain, source, _ = name_parts
    domain_singular = domain[:-1]
    metadata_string = f"{model}.{domain}.{data_type}.{source}"

    # Print out the selected model.
    log(f"Running against OpenAI API ({metadata_string})")

    # Read in all first posts from file into a Pandas dataframe.
    all_posts = pd.read_table(first_posts_file, header=0)

    # Where are we going to save the API responses? Make sure this results
    # directory exists before saving to it.
    results_dir_path = os.path.join(os.getcwd(), metadata_string)
    if not os.path.exists(results_dir_path):
        log(f"{results_dir_path} does not exist!")
        os.makedirs(results_dir_path, exist_ok=True)

    # The opening and closing parts of the prompt are the same for every
    # thread, so build them once. Books are identified by title and author,
    # every other domain by title and release year.
    answer_format = "title and author" if domain == "books" else "title and release year"
    prompt_start = "\n".join(wrap(
        f"Identify the {domain_singular} the user is looking for as described in the request below:"
    ))
    prompt_end = "\n".join(wrap(
        f"Please provide a ranked list of your {N} best guesses for the "
        f"correct answer. Please answer in a JSON object that contains a "
        f"ranked list of suggestions. Each suggestion should contain a field "
        f"called 'answer' containing the suggestion ({answer_format}), a "
        f"field 'explanation' containing an explanation of why these "
        f"{domain_singular}s could be the correct answer, and a 'confidence' "
        f"score that represents how confident you are of your suggestion."
    ))

    # The API client is created lazily (and only once), so a file whose
    # threads have all been processed already needs no API access at all.
    client = None

    # For each thread, build its prompt and collect the model's responses.
    # Rows are paired positionally, so every row is handled on its own.
    thread_ids = all_posts['thread_id'].tolist()
    requests = all_posts['request'].tolist()
    for thread_id, request in zip(thread_ids, requests):

        # Create the appropriate prompt from the raw post text.
        prompt_middle = "\n".join(wrap(f"Request: \"{request}\""))
        prompt_text = f"{prompt_start}\n\n{prompt_middle}\n\n{prompt_end}"

        # Save the prompt to file (so we can easily run it manually if we wish).
        prompt_file = os.path.join(results_dir_path, f"prompt.{thread_id}.txt")
        with open(prompt_file, "w", encoding="utf-8") as f:
            f.write(prompt_text)

        # Have we already processed this one? If so, skip to the next one
        # (only the v1 output file is checked).
        first_output_file = os.path.join(results_dir_path, f"{thread_id}.{model}.v1.json")
        if os.path.exists(first_output_file):
            log(f"Already processed thread '{thread_id}'")
            continue

        # Run the prompt against OpenAI's API, pausing briefly between calls.
        sleep(1)
        log(f"Processing of thread '{thread_id}'")
        if client is None:
            client = OpenAI()
        completion = client.chat.completions.create(
            model=model,
            n=K,
            messages=[{"role": "user", "content": prompt_text}],
        )

        # Process the response(s) and save to file(s).
        choices = completion.choices
        log(f"  * Saving {len(choices)} responses to file")
        for version, choice in enumerate(choices, start=1):
            log(f"    - {thread_id}.v{version}.json")

            # Extract the actual JSON-formatted answer from the response
            # message. The content can be missing (None); store an empty
            # answer in that case instead of aborting the whole run.
            generated_answer = choice.message.content
            if generated_answer is None:
                log(f"      (no content returned for {thread_id}.v{version})")
                generated_answer = ""

            # Remove the enclosing ```json and ``` lines.
            generated_answer = generated_answer.replace("```json", "").replace("```", "")

            # Save to file.
            output_file = os.path.join(results_dir_path, f"{thread_id}.{model}.v{version}.json")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(generated_answer)


In [6]:

# Which OpenAI model will we use? Alternatives that can be swapped in:
#   "gpt-4o", "gpt-4o-mini", "gpt-4-turbo"
#   "gpt-4"   (note: does not answer the questions!)
model = "gpt-3.5-turbo"

# Run the selected model over every first-posts file, in the order listed.
# Comment out an entry to leave that dataset out of the run.
for first_posts_file in (
    "first-posts.jdoc-annotated.books.librarything.tsv",  # Books - JDoc annotated - LibraryThing
    "first-posts.jdoc-annotated.games.reddit.tsv",        # Games - JDoc annotated - Reddit
    "first-posts.jdoc-annotated.movies.imdb.tsv",         # Movies - JDoc annotated - IMDB
    "first-posts.extra.books.reddit.tsv",                 # Books - Extra - Reddit
    "first-posts.extra.games.reddit.tsv",                 # Games - Extra - Reddit
    "first-posts.extra.movies.imdb.tsv",                  # Movies - Extra - IMDB
):
    process_first_posts(first_posts_file, model)



2024-10-11 20:53:28	Running against OpenAI API (gpt-3.5-turbo.books.jdoc-annotated.librarything)
2024-10-11 20:53:28	Already processed thread '127931'
2024-10-11 20:53:28	Already processed thread '57827'
2024-10-11 20:53:28	Already processed thread '20772'
2024-10-11 20:53:28	Already processed thread '41717'
2024-10-11 20:53:29	Already processed thread '113809'
2024-10-11 20:53:29	Already processed thread '20193'
2024-10-11 20:53:29	Already processed thread '68922'
2024-10-11 20:53:29	Already processed thread '129136'
2024-10-11 20:53:29	Already processed thread '21768'
2024-10-11 20:53:29	Already processed thread '3861'
2024-10-11 20:53:29	Already processed thread '114836'
2024-10-11 20:53:29	Already processed thread '42904'
2024-10-11 20:53:29	Already processed thread '111795'
2024-10-11 20:53:29	Already processed thread '88491'
2024-10-11 20:53:29	Already processed thread '93190'
2024-10-11 20:53:29	Already processed thread '50119'
2024-10-11 20:53:29	Already processed thread '10163

To make things easier for us in the future, let's create some special subsets of this dataset.

## 4. OpenAI

Load the OpenAI library and set the appropriate API key.

# Workflow

- Annotate replies in Excel spreadsheet for solved requests
- Try out different ChatGPT prompts and document them
  - Original request
  - Original request + instructional prompt
  - Original request + instructional prompt + admit uncertainty
  - Original request + instructional prompt + admit uncertainty + answer explanation
- Run all requests (solved ones first!) through ChatGPT
- Check how many requests ChatGPT is able to solve