# Introduction

The goal of this experiment is to determine the sensitivity of the results across multiple “runs” of the experiment.

- Using the training-set sizes of 60 and 90 determined in Experiment 2, the labelling task was repeated in 50 independent runs for each size.

# Libraries and Configurations

In [1]:
import os
import re
import json
import random
import openai
import tiktoken
import pandas as pd
import numpy as np

from pathlib import Path
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from utils import calculate_openai_cost


# Populate the process environment from the local .env file.
load_dotenv()

# Let pandas print frames at full width with every column visible.
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

# API key for the OpenAI client.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Model identifier; see https://platform.openai.com/docs/models
GPT_MODEL = os.environ.get("OPENAI_MODEL")

# Pricing inputs; see https://openai.com/api/pricing/
# NOTE(review): environment values are strings (or None when unset) — confirm that
# calculate_openai_cost converts them before doing arithmetic.
PROMPT_COST_PER_1000 = os.environ.get("PROMPT_COST_PER_1000")
COMPLETION_COST_PER_1000 = os.environ.get("COMPLETION_COST_PER_1000")

DATA_DIR = os.environ.get("DATA_DIR")
DATA_FILE = os.environ.get("DATA_FILE")

# dataset ref: https://doi.org/10.1017/pds.2023.100
data_path_qa = Path("..") / DATA_DIR / DATA_FILE

# Load spreadsheet columns F, G, H and O, then discard every row with a missing value.
df = pd.read_excel(data_path_qa, usecols="F,G,H,O").dropna()

def set_seed(seed):
    """Seed Python's and NumPy's global random generators and export PYTHONHASHSEED.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its string form is
            stored in the ``PYTHONHASHSEED`` environment variable.

    NOTE: the environment variable is only written here; the interpreter reads
    PYTHONHASHSEED at startup, so the running process's string hashing is unaffected.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    os.environ['PYTHONHASHSEED'] = f"{seed!s}"


# Seed for the local data split and shuffling.
SEED = 42
# Seeds passed to the API, one per "SEED n" section of the notebook.
SEED_EXPERIMENTS = [0, 1, 2, 3, 4, 42, 100, 123, 420, 999]
set_seed(SEED)

# Dataset

[Relevant Paper](https://doi.org/10.1017/pds.2023.100)

In [4]:
# Partition the cleaned dataset by its human-assigned question category.
llq_data, gdq_data, drq_data = (
    df[df['A-General Type of Questions'] == label] for label in ('LLQ', 'GDQ', 'DRQ')
)

# Hold out 30% of each category for testing; the remaining 70% is the pool of training examples.
llq_train, llq_test = train_test_split(llq_data, random_state=SEED, test_size=0.3)
gdq_train, gdq_test = train_test_split(gdq_data, random_state=SEED, test_size=0.3)
drq_train, drq_test = train_test_split(drq_data, random_state=SEED, test_size=0.3)

# Evaluation set: the first 10 held-out questions of each category,
# shuffled with the fixed seed and re-indexed from 0.
test_sample = (
    pd.concat([held_out.head(10) for held_out in (llq_test, gdq_test, drq_test)])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

def get_train_sample(num_sample):
    """Build a class-balanced, shuffled set of training examples.

    Args:
        num_sample: Total number of examples requested. Each of the three classes
            contributes ``int(num_sample / 3)`` rows (fewer if its training split is smaller).

    Returns:
        DataFrame with the selected LLQ, GDQ and DRQ training rows, shuffled with the
        fixed SEED and re-indexed from 0.
    """
    per_class = int(num_sample / 3)
    class_frames = [split.head(per_class) for split in (llq_train, gdq_train, drq_train)]
    stacked = pd.concat(class_frames)

    # Only the training rows are shuffled here; the test sample is prepared separately.
    shuffled = stacked.sample(frac=1, random_state=SEED)
    return shuffled.reset_index(drop=True)

# Prompting

## System message

In [5]:
# Tokenizer for the configured model; used to report prompt sizes in tokens.
encoding = tiktoken.encoding_for_model(GPT_MODEL)

# Decode explicitly as UTF-8: the prompt contains curly quotation marks (“ ”), which a
# platform-default codec could mis-decode. Assumes the file is saved as UTF-8 — confirm.
with open("../system-message.txt", encoding="utf-8") as f:
    persona_text = f.read()

print(f"System Prompt Message Input token size: {len(encoding.encode(persona_text))}")


# NB
# 1. The content is collected from Appendix 1 of https://doi.org/10.1016/j.destud.2016.07.002
# 2. In the prompt, right/left double quotation marks (“ ”) are used to quote Eris
#    instead of straight double quotation marks ("").

System Prompt Message Input token size: 1006


## User Message

In [9]:
def get_train_sample(num_sample):
    """Return shuffled training rows drawn evenly from the LLQ, GDQ and DRQ training splits.

    Takes up to ``int(num_sample / 3)`` leading rows from each split.

    NOTE: this re-declares, with the same behavior, the function already defined in the
    dataset cell.
    """
    num_unit = int(num_sample / 3)

    # Leading rows of each per-class training split, stacked in LLQ, GDQ, DRQ order.
    per_class_rows = []
    for class_train in (llq_train, gdq_train, drq_train):
        per_class_rows.append(class_train.head(num_unit))

    # Reproducible shuffle (fixed SEED), then a fresh 0-based index.
    return pd.concat(per_class_rows).sample(frac=1, random_state=SEED).reset_index(drop=True)


def get_user_prompt(sample_size):
    """Assemble the user message: instructions, numbered test questions, labelled examples.

    Args:
        sample_size: Total number of training examples requested; forwarded to
            ``get_train_sample``.

    Returns:
        The prompt string. The questions of the module-level ``test_sample`` are numbered
        from 1 inside one triple-backtick block, followed by a second block holding one
        ``question : label`` line per training example.
    """
    train_sample = get_train_sample(sample_size)

    # Build the example lines without adding a scratch column to the sampled frame.
    example_lines = (
        train_sample['Questions'] + ' : ' + train_sample["A-General Type of Questions"] + '\n'
    )
    example_txt = ''.join(example_lines)

    # Number the questions straight from the column. Joining them with '\n' and splitting
    # again would turn a question containing a line break into several numbered items and
    # shift the numbers of every later question.
    num_ques_list = '\n'.join(
        f"{number}. {question}"
        for number, question in enumerate(test_sample['Questions'], start=1)
    )

    return (
        "Classify each of the questions below, delimited by triple backticks, using the taxonomy proposed by Eris."
        " Label each question with one of the three categories: Low-level questions (LLQ), Deep Reasoning Questions (DRQ), or Generative Design Questions (GDQ)."
        " The result includes only the label, do not state your reasoning for the assigned label. Format the result in JSON. Do not repeat answers.\n"
        f"```\n{num_ques_list}\n```\n"
        "To help you categorize the questions above, here are some examples delimited by triple backticks, each line contains an example that has two segments - question and category separated by colon (:)\n\n"
        f"```\n{example_txt}```"
    )


# With Training Examples

## Sample Size 90

In [10]:
# Build the user prompt requesting 90 training examples and report its length in tokens.
prompt_train = get_user_prompt(90)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 2139


### SEED 1

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[0].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[0],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")

label_counts

### SEED 2

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[1].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[1],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 3

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[2].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[2],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 4

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[3].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[3],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 5

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[4].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[4],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 6

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[5].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[5],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 7

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[6].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[6],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 8

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[7].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[7],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 9

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[8].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[8],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 10

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[9].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[9],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

## Sample size 60

In [11]:
# Rebuild the user prompt requesting 60 training examples and report its length in tokens.
prompt_train = get_user_prompt(60)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 1617


### SEED 1

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[0].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[0],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 2

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[1].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[1],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 3

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[2].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[2],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 4

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[3].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[3],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 5

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[4].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[4],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 6

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[5].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[5],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 7

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[6].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[6],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 8

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[7].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[7],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 9

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[8].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[8],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

### SEED 10

In [None]:
%%time

# One request returning 50 JSON-object completions of the same prompt, with seed SEED_EXPERIMENTS[9].
c = openai.chat.completions.create(
    model=GPT_MODEL,
    messages=[
        {"role": "system", "content": persona_text},
        {"role": "user", "content": prompt_train},
    ],
    response_format={"type": "json_object"},
    temperature=0,
    seed=SEED_EXPERIMENTS[9],
    n=50,
)

In [None]:
# Print the system fingerprint reported with the response, then each raw completion.
print(f"System Fingerprint: {c.system_fingerprint}\n")
for choice in c.choices:
    print(choice.message.content)

# Cost and token counts derived from the response's usage record.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage,
    PROMPT_COST_PER_1000,
    COMPLETION_COST_PER_1000,
)

print(
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
    sep="\n",
)

In [None]:
# One row per completion, one column per key of the returned JSON object (the question
# numbers). Built in one step instead of growing the frame through the private
# DataFrame._append.
df_result = pd.DataFrame([json.loads(choice.message.content) for choice in c.choices])

# Number of distinct labels each question received across the completions (1 = consistent).
label_counts = df_result.nunique(axis=0)
print(f"All results are the same?: {(label_counts == 1).all()}")
label_counts

# Result

|     |  |           Human label          |                     | |        AI, training w/60 Qs              |                     |  |      AI, training w/90 Qs               |                     |
| --- | :----------: | :-----------------: | :-----------------: | :------------------: | :-----------------: | :-----------------: | :------------------: | :-----------------: | :-----------------: |
|     |    LLQ    |    DRQ    |    GDQ    |    LLQ    |    DRQ    |    GDQ    |    LLQ    |    DRQ    |    GDQ    |
| Q1  | 1 |   |   |  1   |      |      |  1   |      |   |
| Q2  |   |   | 1 |      |      |  1   |      |      | 1 |
| Q3  |   | 1 |   |      | 1    |      |      | 1    |   |
| Q4  |   |   | 1 |      |      | 1    |      |      | 1 |
| Q5  | 1 |   |   | 1    |      |      | 1    |      |   |
| Q6  | 1 |   |   |      | 0.24 | 0.76 |      |      | 1 |
| Q7  |   | 1 |   |      | 1    |      |      | 1    |   |
| Q8  |   | 1 |   |      | 1    |      | 1    |      |   |
| Q9  |   |   | 1 |      | 1    |      |      | 1    |   |
| Q10 | 1 |   |   | 0.86 | 0.14 |      | 1    |      |   |
| Q11 | 1 |   |   |      | 1    |      | 1    |      |   |
| Q12 |   |   | 1 | 1    |      |      | 1    |      |   |
| Q13 | 1 |   |   | 0.62 | 0.38 |      | 1    |      |   |
| Q14 |   |   | 1 |      | 0.48 | 0.52 |      |      | 1 |
| Q15 |   |   | 1 |      | 0.48 | 0.52 |      |      | 1 |
| Q16 |   | 1 |   |      | 1    |      |      | 1    |   |
| Q17 | 1 |   |   | 1    |      |      | 1    |      |   |
| Q18 | 1 |   |   | 1    |      |      | 1    |      |   |
| Q19 |   | 1 |   |      | 1    |      | 1    |      |   |
| Q20 | 1 |   |   | 1    |      |      | 1    |      |   |
| Q21 |   | 1 |   |      | 1    |      |      | 1    |   |
| Q22 |   | 1 |   | 0.66 | 0.34 |      | 1    |      |   |
| Q23 |   |   | 1 |      | 0.34 | 0.66 |      | 1    |   |
| Q24 |   | 1 |   |      | 1    |      |      | 1    |   |
| Q25 |   | 1 |   | 0.66 | 0.34 |      | 0.78 | 0.22 |   |
| Q26 | 1 |   |   |      | 1    |      | 1    |      |   |
| Q27 |   |   | 1 |      | 0.48 | 0.52 |      |      | 1 |
| Q28 |   |   | 1 |      |      | 1    |      |      | 1 |
| Q29 |   |   | 1 |      |      | 1    |      |      | 1 |
| Q30 | 1 |   |   | 1    |      |      | 1    |      |   |

- The table presents the aggregated labelling results (e.g., a score of 0.5 indicates that GPT-4 selected that particular label in half of the 50 runs (i.e., 25 times)). 
- We see the probabilistic nature of the labelling, with many questions being labelled differently on different runs. 
- With the larger training set, the variation across runs was reduced.

Explanation:
- The first column group (human labels) shows, for example, that Q5 is labelled LLQ, Q22 is labelled DRQ and Q28 is labelled GDQ.
- Next, we compare these human labels with the aggregated labelling results obtained with the training set of size 60. Some questions match in every run and some match only partially. For example, Question #4 was labelled GDQ by the human and was also labelled GDQ by the AI across all runs. As a partial match, Question #15 was labelled GDQ by the human, whereas GPT labelled it DRQ in 48% of the runs and GDQ in 52% of the runs.
- With the training set of size 90, we see only one partial match (Question #25).
