# Generation of Questions / Answers using GPT-4

In [20]:
import os
import sys
import json
import time
import openai
from multiprocessing import Pool
from tqdm import tqdm
from absl import app, flags
import importlib
import random

In [17]:
# All dataset artefacts used by this notebook live in one directory.
_DATASET_DIR = '/home/michael/Workspace/datasets/galaxy_zoo'

# JSON file that is loaded as the input dataset.
input_path = f'{_DATASET_DIR}/GZ_talk_comments_notes_urls_AISSAI.json'
# JSON file intended to receive the generated question/answer records.
output_path = f'{_DATASET_DIR}/metadata.json'
# Python file that defines the `questions` list of question templates.
question_path = f'{_DATASET_DIR}/questions.py'
# Presumably the number of question/answer pairs per entry; it is not
# referenced by any code visible in this notebook -- TODO confirm.
number_of_qa = 1

In [18]:
# Load the question templates from the Python file at `question_path`.
# `importlib.util` is a submodule: a bare `import importlib` does not
# guarantee that it is bound, so import it explicitly before using it.
import importlib.util

spec = importlib.util.spec_from_file_location("settings", question_path)
if spec is None or spec.loader is None:
    # spec_from_file_location returns None when no loader can handle the path;
    # fail with a clear message instead of an AttributeError further down.
    raise ImportError(
        "Cannot load the questions module from {!r}".format(question_path)
    )
Questions = importlib.util.module_from_spec(spec)
# Executes the file's top-level code to populate the module object.
spec.loader.exec_module(Questions)
# The loaded module must expose a module-level name `questions`.
questions = Questions.questions

In [19]:
# Display the loaded question templates (the notebook cell output follows).
questions

['Describe the following image in detail.',
 'Provide a detailed description of the given image.',
 'Give an elaborate explanation of the image you see.',
 'Share a comprehensive rundown of the presented image.',
 'Offer a thorough analysis of the image.',
 'Explain the various aspects of the image before you.',
 'Clarify the contents of the displayed image with great detail.',
 'Characterize the image using a well-detailed description.',
 'Break down the elements of the image in a detailed manner.',
 'Walk through the important details of the image.',
 'Portray the image with a rich, descriptive narrative.',
 'Narrate the contents of the image with precision.',
 'Analyze the image in a comprehensive and detailed manner.',
 'Illustrate the image through a descriptive explanation.',
 'Examine the image closely and share its details.',
 'Write an exhaustive depiction of the given image.',
 'Describe the following image concisely.',
 'Provide a brief description of the given image.',
 'Of

In [11]:
def load_dataset(input_path: str) -> list:
    """Read the JSON file at ``input_path`` and return its parsed content.

    Args:
        input_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (annotated as a list; the function
        itself returns whatever top-level value the file contains).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's locale
    # default, which would mis-decode non-ASCII text on some systems.
    with open(input_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def write_to_json(data: list, output_path: str):
    """Serialize ``data`` as JSON with a 4-space indent into ``output_path``.

    The destination is opened in write mode, so any existing content of the
    file is replaced.

    Args:
        data: JSON-serializable object to store.
        output_path: Path of the file to write.
    """
    with open(output_path, mode='w') as out_handle:
        json.dump(obj=data, fp=out_handle, indent=4)

def concat_conversation(entry: dict) -> str:
    """Flatten the turns of ``entry['conversations']`` into one text block.

    Each turn contributes ``"User: <value>"`` followed by a blank line. Every
    turn is labelled "User: "; no other key of the turn is consulted.

    Args:
        entry: Mapping with a ``'conversations'`` sequence whose items each
            carry a string under ``'value'``.

    Returns:
        The concatenated conversation text, or ``""`` when there are no turns.
    """
    # A single join avoids index-based looping and repeated string rebuilding.
    return "".join(
        "User: " + turn['value'] + "\n\n"
        for turn in entry['conversations']
    )

def _make_record(entry, question: str, answer: str) -> dict:
    """Build one output record pairing ``question`` with ``answer`` for ``entry``."""
    entry_id = entry['id']
    return {
        "id": "{}".format(entry_id),
        "image": "{}.png".format(entry_id),
        "conversations": [
            {"from": "human", "value": question},
            {"from": "gpt", "value": answer},
        ],
    }


def get_answer(entry, prompt: str, questions: list,
               model: str = "gpt-3.5-turbo", max_retries=None,
               max_backoff: float = 60.0):
    """Ask the chat model one randomly chosen question about ``entry``.

    The entry's conversation is flattened with ``concat_conversation`` and
    substituted, together with the chosen question, into ``prompt``.

    Args:
        entry: Mapping with an ``'id'`` and a ``'conversations'`` list.
        prompt: %-style template with two placeholders, filled with
            ``(conversation, question)`` in that order.
        questions: Non-empty sequence of question strings to draw from.
        model: Chat model name passed to the API.
        max_retries: Maximum number of retries after rate-limit errors, or
            ``None`` (default) to retry until the request succeeds.
        max_backoff: Upper bound, in seconds, for the wait between retries.

    Returns:
        A dict with keys ``"id"``, ``"image"`` (``"<id>.png"``) and
        ``"conversations"`` (a human question followed by a gpt answer). If
        the request fails with anything other than a rate-limit error, or if
        ``max_retries`` is exhausted, the answer is a fixed placeholder.
    """
    conversation = concat_conversation(entry)
    question = random.choice(questions)

    # NOTE: the request content is not truncated here; a very long
    # conversation can exceed the model's token limit, in which case the API
    # error is handled by the generic fallback below.
    content = prompt % (conversation, question)

    fallback_answer = "I am not sure what this image shows."
    delay = 1.0
    retries = 0

    while True:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[{'role': 'user', 'content': content}],
                temperature=0,
            )
            answer = response['choices'][0]['message'].content
            return _make_record(entry, question, answer)

        except openai.error.RateLimitError as e:
            retries += 1
            if max_retries is not None and retries > max_retries:
                # Give up instead of looping forever on a persistent limit.
                print(e)
                return _make_record(entry, question, fallback_answer)
        except Exception as e:
            # Best effort: report the problem and emit a placeholder record so
            # that a single failing entry does not abort the whole run.
            print(e)
            return _make_record(entry, question, fallback_answer)

        # Only reached after a rate-limit error: wait, then back off
        # exponentially so repeated retries do not hammer the API.
        time.sleep(delay)
        delay = min(delay * 2, max_backoff)

def generate_summaries(dataset, output_path: str, prompt: str = None,
                       question_list: list = None, processes: int = 4,
                       checkpoint_every: int = 100):
    """
    Generate one question/answer record per dataset entry in parallel.

    Each entry is passed to ``get_answer`` in a pool of worker processes.
    Results are collected in input order and checkpointed to ``output_path``
    every ``checkpoint_every`` records, and once more when all are done.

    Args:
        dataset: Sized iterable of entries accepted by ``get_answer``.
        output_path: JSON file that receives the checkpoints and final result.
        prompt: %-style template forwarded to ``get_answer``. Required.
        question_list: Questions forwarded to ``get_answer``; defaults to the
            module-level ``questions`` list.
        processes: Number of worker processes.
        checkpoint_every: Write a checkpoint after this many new results.

    Returns:
        The list of records, in the same order as ``dataset``.

    Raises:
        ValueError: If ``prompt`` is not provided.
    """
    from functools import partial

    if prompt is None:
        raise ValueError(
            "generate_summaries() needs a `prompt` template with two %s "
            "placeholders (conversation, question)"
        )
    if question_list is None:
        question_list = questions

    # Pool.imap(func, iterable, chunksize) hands exactly one item to `func`,
    # so the extra get_answer arguments have to be bound up front.
    worker = partial(get_answer, prompt=prompt, questions=question_list)

    data = []
    with Pool(processes) as pool:
        for result in tqdm(pool.imap(worker, dataset), total=len(dataset), desc="Generating QA"):
            data.append(result)
            # Periodic checkpoint so a crash does not lose all progress
            if len(data) % checkpoint_every == 0:
                write_to_json(data, output_path)

    # Final write so results after the last checkpoint are persisted as well.
    write_to_json(data, output_path)

    return data

In [6]:
# Take the OpenAI API key from the environment (None when the variable is unset)
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Parse the input JSON file into memory
dataset = load_dataset(input_path)

In [None]:
# Generate the summaries (generate_summaries also requires the output path)
#summaries = generate_summaries(dataset, output_path)

# Write the complete list of summaries to the output JSON file
#write_to_json(summaries, output_path)