This notebook illustrates how we query GPT-3 about everyday things.

In [1]:
#!pip install tenacity

In [2]:
import os
import json
import openai
import math
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed

In [3]:
# The API key is read from the environment so it is never stored in the notebook.
# If OPENAI_API_KEY is unset this assigns None.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
# Create the output directory for the prediction files.
# exist_ok=True keeps this cell idempotent, so re-running the notebook does not fail
# when the directory is already there.
os.makedirs("gpt3_query_api_output", exist_ok=True)

In [5]:
# Following function taken from Niket's defeasible inference repo
# https://github.com/allenai/defeasible_explanations/blob/fcb4a8dce3be2cb90511121f0a476b5b638d3f7c/src/prompting_utils.py
def boolean_label(x):
    """Map a token string onto a boolean label.

    The token is stripped of surrounding whitespace and lower-cased before
    being compared with the known indicator words.

    Args:
        x: Token text (str).

    Returns:
        True for a True-like token, False for a False-like token, and the
        empty string "" when the token matches neither set.
    """
    token = x.strip().lower()
    if token in {"true", "t", "yes", "correct", "1"}:
        return True
    if token in {"false", "f", "no", "wrong", "incorrect", "0"}:
        return False
    # Neither True-like nor False-like: callers test with `is True` / `is False`,
    # so the empty string acts as an "unknown" marker.
    return ""


In [6]:
@retry(
    wait=wait_chain(
        # Three retries 3 s apart, then two 5 s apart, then 10 s between
        # every further retry (wait_chain keeps using its last strategy).
        wait_fixed(3), wait_fixed(3), wait_fixed(3),
        wait_fixed(5), wait_fixed(5),
        wait_fixed(10),
    )
)
def completion_with_backoff(**kwargs):
    """Call ``openai.Completion.create`` and retry on failure.

    All keyword arguments are forwarded unchanged to
    ``openai.Completion.create`` and its return value is returned as is.

    NOTE(review): no ``stop`` condition is passed to ``retry``, so with
    tenacity's defaults a persistently failing call (e.g. an invalid API key)
    is retried indefinitely. ``stop_after_attempt`` is imported in this
    notebook but not used here -- confirm whether that is intended.
    """
    return openai.Completion.create(**kwargs)

In [7]:
def get_true_false_prob_from_gpt3(prompt_q, verbose=False):
    """Query GPT-3 with a True/False question and score the first answer token.

    The prompt is sent through ``completion_with_backoff`` with ``logprobs=5``.
    The returned candidates for the first generated token are classified with
    ``boolean_label``; the probabilities of the True-like and of the False-like
    candidates are summed and then renormalised so that the two add up to 1.

    Args:
        prompt_q: Prompt text sent to the model.
        verbose: If true, print the prompt, the raw scores and the normalised
            scores.

    Returns:
        A dict with keys ``"answer"`` (bool, True only when the True score is
        strictly larger than the False score), ``"prob_True"`` and
        ``"prob_False"``. When the response carries no log-probabilities, or no
        True/False-like token is among the candidates, the default
        ``{"answer": False, "prob_True": 0.5, "prob_False": 0.5}`` is returned
        and an alert is printed.
    """
    if verbose:
        print(prompt_q)
    response = completion_with_backoff(
        model="text-davinci-003",
        prompt=prompt_q,
        temperature=0,
        max_tokens=256,
        top_p=1,
        logprobs=5,
        frequency_penalty=0,
        presence_penalty=0,
        stop=[" ", ".", ","]
    )

    # Default answer, used whenever no usable True/False evidence is available.
    label_True_False_probs = {"answer": False, "prob_True": 0.5, "prob_False": 0.5}

    # Try to get the per-token candidate log-probabilities.
    choices = response["choices"] if response else []
    first_choice = choices[0] if choices else {}
    logprobs = first_choice["logprobs"] if "logprobs" in first_choice else None
    top_logprobs = logprobs["top_logprobs"] if logprobs else None
    if not top_logprobs:
        # Covers a missing/empty "logprobs" entry as well as an empty
        # "top_logprobs" list (no token to inspect).
        print(prompt_q.strip(), "Alert: no logprobs.")
        return label_True_False_probs

    # Candidates (token -> logprob) for the first generated token.
    candidates = top_logprobs[0]

    # True score and False score. These generally do not add up to 1 because
    # only the top candidates are returned, hence the renormalisation below.
    score_true = sum(math.exp(score) for x, score in candidates.items() if boolean_label(x) is True)
    score_false = sum(math.exp(score) for x, score in candidates.items() if boolean_label(x) is False)
    if verbose:
        print(score_true, score_false)

    total = score_true + score_false
    if total == 0.0:
        # Neither a True-like nor a False-like token among the candidates:
        # keep the default.
        print(prompt_q.strip(), "Alert: true and false not in the top 5 options.")
        return label_True_False_probs

    # Scale the two scores so that they sum to 1.
    score_true_norm = score_true / total
    score_false_norm = 1 - score_true_norm
    if verbose:
        print(score_true_norm, score_false_norm)

    return {
        "answer": score_true > score_false,
        "prob_True": score_true_norm,
        "prob_False": score_false_norm,
    }


There are many queries to send to the model, so the cell below lets you choose the `start` and `end` indices of the queries to send to the API (`start` is inclusive, `end` is exclusive). The responses are written to an output file whose name includes a `run_idx` and the chosen index range.

You may consider making copies of this notebook/code to run the queries in parallel,

e.g. run_idx = 0 for start = 0 and end = 976, run_idx = 1 for start = 976 and end = 15976, etc.

You'd then have output files like:
- gpt3_zero-shot_pred_on_full-ET-dataset_0-idx0-976.jsonl
- gpt3_zero-shot_pred_on_full-ET-dataset_1-idx976-15976.jsonl

In [8]:
# Ask GPT-3 the same questions as those asked of Macaw.
# `run_idx`, `start` and `end` select which slice of the query file this run
# covers; all three are recorded in the output file name.
run_idx = 0
start = 0
end = 976

query_path = "macaw_query_beaker_input/macaw_query_full-ET-dataset.json"
pred_path = "gpt3_query_api_output/gpt3_zero-shot_pred_on_full-ET-dataset_{}-idx{}-{}.jsonl".format(
    run_idx, start, end)

# Explicit UTF-8 so reading/writing does not depend on the platform's default encoding.
with open(query_path, "r", encoding="utf-8") as query_file, \
        open(pred_path, "w", encoding="utf-8") as predfile:
    # The query file is read as one JSON object per line. Keep the full list in
    # `query_lines`: the cost estimate further down uses len(query_lines).
    query_lines = query_file.readlines()
    selected_lines = query_lines[start:end]
    for query_line in tqdm(selected_lines, total=len(selected_lines)):
        json_query = json.loads(query_line)
        prompt_q = json_query['question'] + "\n\n"

        # Store the exact prompt and the model's answer alongside the original query.
        json_query['gpt3_input_prompt'] = prompt_q
        json_query['gpt3_answer'] = get_true_false_prob_from_gpt3(prompt_q)

        predfile.write(json.dumps(json_query) + "\n")

100%|█████████████████████████████████████████| 976/976 [11:07<00:00,  1.46it/s]


The following gives an estimate of the cost for our queries:

In [9]:
# Rough cost estimate over every line loaded into `query_lines`.
words_per_query = 30                             # assumed upper bound on words per query
tokens_per_query = 1000/750 * words_per_query    # conversion used here: 1000 tokens per 750 words
num_queries = len(query_lines)
total_tokens = tokens_per_query * num_queries

print(words_per_query, "words per query (upper bound)")
print(round(tokens_per_query, 2), "tokens per query (upper bound)")
print(round(total_tokens, 2), "total tokens across all {} queries".format(num_queries))
print("\nPricing $0.0200  / 1K tokens")
print("$", round(total_tokens/1000 * 0.02, 2), "for all {} queries".format(num_queries))

30 words per query (upper bound)
40.0 tokens per query (upper bound)
4341120.0 total tokens across all 108528 queries

Pricing $0.0200  / 1K tokens
$ 86.82 for all 108528 queries
