# Introduction

The goal of this experiment is to determine the effect of the size of the training set on the accuracy of labelling by GPT-4.

- The labelling task described in Experiment 1 was repeated with training sets of different sizes, varied from 0 (i.e., no training) to 300 pre-labelled questions in increments of 30.
- The testing set was kept constant and identical to the one in Experiment 1. 

# Libraries and Configuration

In [1]:
import os
import random
import openai
import tiktoken
import pandas as pd 
import numpy as np

from pathlib import Path
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from utils import calculate_openai_cost

# Let pandas print frames at full width with every column visible
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Populate the process environment from .env before any setting is read
load_dotenv()

# Credentials for the OpenAI client
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Model used for labelling -- see https://platform.openai.com/docs/models
GPT_MODEL = os.environ.get("OPENAI_MODEL")

# Token pricing -- see https://openai.com/api/pricing/
# NOTE: environment values are strings (None when unset); they are handed to
# calculate_openai_cost unchanged.
PROMPT_COST_PER_1000 = os.environ.get("PROMPT_COST_PER_1000")
COMPLETION_COST_PER_1000 = os.environ.get("COMPLETION_COST_PER_1000")

# Location of the labelled question dataset, one directory above the notebook
# dataset ref: https://doi.org/10.1017/pds.2023.100
DATA_DIR = os.environ.get("DATA_DIR")
DATA_FILE = os.environ.get("DATA_FILE")
data_path_qa = Path("../") / DATA_DIR / DATA_FILE

# Read spreadsheet columns F, G, H and O and discard every row with a missing value
df = pd.read_excel(data_path_qa, usecols="F,G,H,O").dropna()

def set_seed(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator, and exports
    ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up only, so this
    # assignment is inherited by child processes but does not change string
    # hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)


# Single source of truth for the seed: the same constant is passed to
# train_test_split, DataFrame.sample and the OpenAI requests in this notebook,
# so the generators are seeded from it rather than from a repeated literal.
SEED = 42
set_seed(SEED)

# Dataset

[Relevant Paper](https://doi.org/10.1017/pds.2023.100)

In [2]:
# Partition the cleaned dataset by its human-assigned question category
llq_data, gdq_data, drq_data = (
    df[df['A-General Type of Questions'] == category]
    for category in ('LLQ', 'GDQ', 'DRQ')
)

# Per category: hold out 30% of the rows for testing, keep 70% as the pool of training examples
llq_train, llq_test = train_test_split(llq_data, random_state=SEED, test_size=0.3)
gdq_train, gdq_test = train_test_split(gdq_data, random_state=SEED, test_size=0.3)
drq_train, drq_test = train_test_split(drq_data, random_state=SEED, test_size=0.3)

# Fixed test set: the first 10 held-out rows of each category (fewer if a category
# has fewer), shuffled with the shared seed and re-indexed from 0
test_sample = (
    pd.concat([held_out.head(10) for held_out in (llq_test, gdq_test, drq_test)])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Prompting

## System message

In [3]:
# Tokenizer for the configured model, used to measure prompt sizes
encoding = tiktoken.encoding_for_model(GPT_MODEL)

# The system message contains typographic quotation marks (see NB 2 below), so it is
# decoded explicitly instead of with the platform's default encoding.
# Assumes the file is saved as UTF-8 -- confirm if it was authored with another encoding.
with open("../system-message.txt", encoding="utf-8") as f:
    persona_text = f.read()

print(f"System Prompt Message Input token size: {len(encoding.encode(persona_text))}")


# NB
# 1. The content is collected from [https://doi.org/10.1016/j.destud.2016.07.002] Appendix 1
# 2. In the prompt, Right/Left Double Quotation Marks (“ ”) are used to quote Eris instead of Straight Double Quotation Marks ("")

System Prompt Message Input token size: 1006


## User Message

In [4]:
def get_train_sample(num_sample):
    """Assemble a class-balanced, shuffled set of training examples.

    Takes the first ``num_sample / 3`` rows (truncated to an integer) from each
    of the LLQ, GDQ and DRQ training pools, stacks them and shuffles the rows
    with the notebook-wide SEED.

    Args:
        num_sample: Total number of examples requested across the three classes.

    Returns:
        DataFrame of the selected rows in shuffled order, re-indexed from 0.
        ``head`` yields fewer rows when a pool is smaller than its share, so
        the result can hold fewer than ``num_sample`` rows.
    """
    per_class = int(num_sample / 3)
    class_pools = (llq_train, gdq_train, drq_train)
    stacked = pd.concat([pool.head(per_class) for pool in class_pools])

    # frac=1 draws every row exactly once, i.e. a seeded shuffle of the training rows
    shuffled = stacked.sample(frac=1, random_state=SEED)
    return shuffled.reset_index(drop=True)


def get_user_prompt(sample_size):
    """Build the user message: numbered test questions plus labelled examples.

    Args:
        sample_size: Total number of labelled training examples to embed
            (forwarded to ``get_train_sample``).

    Returns:
        The prompt string: the classification instruction, the numbered test
        questions inside triple backticks, and one ``question : label`` line
        per training example inside a second pair of triple backticks.
    """
    # One "question : label" line per training example
    train_sample = get_train_sample(sample_size)
    example_lines = train_sample['Questions'] \
                        + ' : ' + train_sample["A-General Type of Questions"] + '\n'
    example_txt = ''.join(example_lines)

    # Number the test questions straight from the column. Joining them on '\n'
    # and splitting again would turn a question that itself contains a newline
    # into several numbered entries and shift every following number.
    num_ques_list = '\n'.join(
        f"{position}. {question}"
        for position, question in enumerate(test_sample['Questions'], start=1)
    )

    return (
        "Classify each of the questions below, delimited by triple backticks, using the taxonomy proposed by Eris."
        " Label each question with one of the three categories: Low-level questions, Deep Reasoning Questions, or Generative Design Questions."
        " State your reasoning for the assigned label. Format the result as a markdown table.\n"
        f"```\n{num_ques_list}\n```\n"
        "To help you categorize the questions above, here are some examples delimited by triple backticks, each line contains an example that has two segments - question and category separated by colon (:)\n\n"
        f"```\n{example_txt}```"
    )


# With Training Examples

## Sample size 30

In [5]:
# Build the user prompt requesting 30 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(30)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 1105


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 30).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-30 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 60

In [94]:
# Build the user prompt requesting 60 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(60)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 1596


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 60).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-60 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 90

In [95]:
# Build the user prompt requesting 90 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(90)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 2118


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 90).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-90 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 120

In [96]:
# Build the user prompt requesting 120 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(120)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 2556


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 120).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-120 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 150

In [97]:
# Build the user prompt requesting 150 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(150)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 3134


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 150).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-150 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 180

In [98]:
# Build the user prompt requesting 180 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(180)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 3605


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 180).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-180 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 210

In [99]:
# Build the user prompt requesting 210 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(210)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 4093


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 210).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-210 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 240

In [100]:
# Build the user prompt requesting 240 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(240)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 4644


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 240).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-240 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 270

In [101]:
# Build the user prompt requesting 270 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(270)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 5258


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 270).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-270 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

## Sample size 300

In [102]:
# Build the user prompt requesting 300 labelled training examples and report its length in tokens
prompt_train = get_user_prompt(300)
print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 5867


In [None]:
%%time

# Label the test questions using the prompt built in this section (sample size 300).
# n=1 asks for a single completion; seed and temperature=0 are fixed to make the
# response as repeatable as the API allows.
c = openai.chat.completions.create(
    model=GPT_MODEL,
    n=1,
    seed=SEED,
    temperature=0,
    messages=[{"role": "system", "content": persona_text},
              {"role": "user", "content": prompt_train}])

In [None]:
# Show the backend fingerprint returned with the response, then the model's labelled table
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost of the sample-size-300 request, priced with the two constants read from the environment
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000)

print(f"Total cost: ${cost:.5f}")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")

# Result

<table border="1" cellpadding="10" cellspacing="0">
  <tr>
    <th rowspan="2"></th>
    <th colspan="11">Size of training set (# of human-labelled questions)</th>
  </tr>
  <tr>
    <th>0 (No training)</th>
    <th>30</th>
    <th>60</th>
    <th>90</th>
    <th>120</th>
    <th>150</th>
    <th>180</th>
    <th>210</th>
    <th>240</th>
    <th>270</th>
    <th>300</th>
  </tr>
  <tr>
    <th>Alignment with human-assigned labels (%)</th>
    <td align="center"><span style="border: 1px solid purple; border-radius: 50%; padding: 4px;">60</span></td>
    <td align="center">67</td>
    <td align="center" style="background-color: gray;"><span style="border: 1px solid purple; border-radius: 50%; padding: 4px;">83</span></td>
    <td align="center" style="background-color: gray;"><span style="border: 1px solid purple; border-radius: 50%; padding: 4px;">83</span></td>
    <td align="center">80</td>
    <td align="center">83</td>
    <td align="center">83</td>
    <td align="center">73</td>
    <td align="center">70</td>
    <td align="center">73</td>
    <td align="center">83</td>
  </tr>
</table>

- GPT-4-generated labels align more closely with the human labels when a training set is provided: 83% alignment when the training set size is 60 or 90, compared to 60% alignment when no training set is provided.
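- No further accuracy improvement is achieved when the training set is enlarged past 90 questions: alignment never exceeds 83% and drops to 70–73% for training sets of 210–270 questions.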