# Scoring the 'interactions' dataset using ChatGPT

In this notebook, we prompt ChatGPT to score each candidate answer (the answer taken from the interaction with the highest BERTScore) against the gold reference answer, on a scale of 0–5.

In [None]:
%pip install artifacts/gpt_wrapper-0.0.8-py3-none-any.whl
%pip install tiktoken

In [None]:
import os

import gpt_wrapper

# Read the API key from the environment instead of embedding it in the
# notebook, so the credential is never saved or committed with the file.
# GPT_API_KEY is the variable name this notebook expects; export it before
# starting the kernel.
_api_key = os.environ.get("GPT_API_KEY")
if not _api_key:
    # Fail here with a clear message rather than at the first API call.
    raise RuntimeError("GPT_API_KEY is not set; export it before starting the kernel.")
gpt_wrapper.api_key = _api_key

In [None]:
from gpt_wrapper.chat import Chat

In [None]:
import json

# Load the full list of datapoints to score. The encoding is given explicitly
# because JSON text is UTF-8, while open() would otherwise fall back to the
# platform's locale encoding.
with open('complete_complete.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
def flatten_and_concatenate(nested_list):
    """Collapse an arbitrarily nested list of strings into one string.

    A string is returned unchanged. A list is flattened recursively and its
    pieces are joined with single spaces. Any other value (numbers, None,
    tuples, dicts, ...) contributes an empty string.
    """
    if isinstance(nested_list, list):
        pieces = []
        for item in nested_list:
            pieces.append(flatten_and_concatenate(item))
        return ' '.join(pieces)

    # Not a list: only plain strings carry text, everything else is dropped.
    return nested_list if isinstance(nested_list, str) else ''

In [None]:
import os
import time
import tiktoken

# Scoring loop: for every datapoint, ask the chat model to grade the candidate
# answer against the reference answer (0-5), and checkpoint the results to disk
# after each datapoint so the run can be resumed after a crash or interrupt.

# Baseline subtracted from the reported usage when estimating the average
# token cost per datapoint (hard-coded; adjust it when starting a new run).
start_tokens_used = 1766408
# NOTE: used_tokens, start_time and elapsed_times are not read in this cell;
# they are kept in case other cells rely on them.
used_tokens = 0

SCORED_PATH = 'complete_complete_with_grades.json'
SKIPPED_PATH = 'skipped.json'
MAX_CONTEXT_TOKENS = 4097      # prompt + margin must stay strictly below this
COMPLETION_TOKEN_MARGIN = 10   # tokens reserved on top of the prompt
VALID_SCORES = range(6)        # the prompt asks for an integer from 0 to 5
REQUIRED_FIELDS = ("question", "answer", "candidate_answer")


def _load_checkpoint(path):
    """Return the JSON content of `path`, or an empty list if the file does not exist.

    Only a missing file is treated as "nothing saved yet". Any other failure
    (for example a corrupt file) is raised, so that existing progress is not
    silently overwritten.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"No previous data at {path}; starting with an empty list")
        return []


def _save_checkpoint(path, payload):
    """Write `payload` as JSON to `path` without ever leaving a truncated file.

    The data is written to a temporary sibling file and then renamed over the
    target, so an interrupt in the middle of the dump leaves the previous
    checkpoint intact.
    """
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f)
    os.replace(tmp_path, path)


def _record_skip(count, reason):
    """Log why datapoint number `count` is skipped and persist the skip list."""
    print("Skipping datapoint", count, "because", reason)
    skipped.append(count)
    _save_checkpoint(SKIPPED_PATH, skipped)


# Load the two checkpoints independently: the skip list may legitimately not
# exist yet even though scored datapoints were already saved. The resume
# position must always be derived from whatever was loaded, otherwise already
# scored datapoints would be processed again and appended a second time.
scored = _load_checkpoint(SCORED_PATH)
skipped = _load_checkpoint(SKIPPED_PATH)
start_index = len(scored) + len(skipped)

data_len = len(data)
start_time = time.time()
elapsed_times = []

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Iterate over the data; `count` is the 1-based position in the full dataset
for count, datapoint in enumerate(data[start_index:], start_index + 1):
    print("########################################################")
    print("Processing datapoint", count, "of", data_len, "(", round(count/data_len*100, 2), "%)")
    print("########################################################")

    has_required_fields = all(
        field in datapoint and datapoint[field] is not None
        for field in REQUIRED_FIELDS
    )
    if not has_required_fields:
        _record_skip(count, "it is missing a question, answer, or candidate_answer")
        continue

    question = datapoint["question"]
    answer = datapoint["answer"]
    candidate = datapoint["candidate_answer"]

    prompt = (
        "Given the following question, correct answer, and candidate answer, "
        "score the candidate answer on a scale of 0-5, based on how much it "
        "matches the correct answer. Just return the number, DO NOT ADD FURTHER COMMENTS."
        "\n\nQuestion: " + flatten_and_concatenate(question)
        + "\n\nAnswer: " + flatten_and_concatenate(answer)
        + "\n\nCandidate Answer: " + flatten_and_concatenate(candidate)
        + "\n\nScore: "
    )

    # Check the prompt length before opening a chat session, so no session is
    # created for a datapoint that will be skipped anyway.
    if len(encoding.encode(prompt)) + COMPLETION_TOKEN_MARGIN >= MAX_CONTEXT_TOKENS:
        _record_skip(count, "the prompt is too long for the model")
        continue

    chat = Chat.create("Score_Candidate_Answer_" + str(count))
    message = chat.ask(content=prompt)
    print(message)

    # The reply must be a bare integer in 0..5; anything else is a skip.
    try:
        score = int(message.content)
    except (ValueError, TypeError):
        score = None
    if score not in VALID_SCORES:
        _record_skip(count, "the scorer did not return a valid score")
        continue

    datapoint_dict = {
            "question": question,
            "answer": answer,
            "BERTScore": datapoint["BERTScore"],
            "candidate_answer": candidate,
            "confidence": datapoint["confidence"],
            "sol_id": datapoint["sol_id"],
            "interaction_id": datapoint["interaction_id"],
            # Optional fields: stored as None when absent or null
            "choices": datapoint.get("choices"),
            "explanation": datapoint.get("explanation"),
            "score": score
            }

    # Add the datapoint to the list and checkpoint it immediately
    scored.append(datapoint_dict)
    _save_checkpoint(SCORED_PATH, scored)

    # Query the budget once per datapoint and reuse the result
    budget = Chat.budget()
    usage = budget['usage']
    limit = budget['limit']
    print("Tokens used: ", usage / limit * 100, "%")
    print("Total tokens used: ", usage)
    available_tokens = limit - usage
    avg_tokens_per_datapoint = (usage - start_tokens_used) / count
    if avg_tokens_per_datapoint > 0:
        remaining_estimate = round(available_tokens / avg_tokens_per_datapoint)
    else:
        # Usage at or below the baseline: no meaningful average to divide by
        remaining_estimate = "unknown (no token usage recorded above the baseline)"
    print("Average remaining datapoints before running out of tokens: ", remaining_estimate)