# Introduction

These experiments establish baseline performance by classifying a test set of 30 standalone question utterances, both with a training set of 90 labelled examples and without any training examples.

# Libraries and Configuration

In [13]:
import os
import random
import openai
import tiktoken
import pandas as pd 
import numpy as np

from pathlib import Path
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from utils import calculate_openai_cost

# Pull settings from the .env file into the process environment.
load_dotenv()

# Let pandas print frames without a width limit or a column cap.
for display_option in ('display.width', 'display.max_columns'):
    pd.set_option(display_option, None)

# Hand the API key to the module-level OpenAI client.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Model id, see https://platform.openai.com/docs/models
GPT_MODEL = os.getenv("OPENAI_MODEL")

# Prices per 1000 tokens, see https://openai.com/api/pricing/
# NOTE(review): os.getenv returns str (or None when the variable is unset), so
# these are not numbers here; confirm calculate_openai_cost converts them.
PROMPT_COST_PER_1000 = os.getenv("PROMPT_COST_PER_1000")
COMPLETION_COST_PER_1000 = os.getenv("COMPLETION_COST_PER_1000")

DATA_DIR = os.getenv("DATA_DIR")
DATA_FILE = os.getenv("DATA_FILE")

# dataset ref: https://doi.org/10.1017/pds.2023.100
data_path_qa = Path("..") / DATA_DIR / DATA_FILE

# Read spreadsheet columns F, G, H and O only, then discard every row that has
# a missing value in any of them.
df = pd.read_excel(data_path_qa, usecols="F,G,H,O").dropna()

def set_seed(seed):
    """Seed the global ``random`` and NumPy generators with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    # Same order as before: Python's generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


# Seed value used for the global generators above.
SEED = 42
set_seed(SEED)

# Dataset

[Relevant Paper](https://doi.org/10.1017/pds.2023.100)

In [15]:
LABEL_COLUMN = 'A-General Type of Questions'


def _split_class(label):
    """Return (all rows, train split, test split) for one question class.

    The rows of ``df`` whose label equals ``label`` are split with
    ``test_size=0.3``, i.e. 30% are held out for testing and the remaining
    70% are kept for training, using ``SEED`` as the random state.
    """
    class_rows = df[df[LABEL_COLUMN] == label]
    class_train, class_test = train_test_split(class_rows, test_size=0.3, random_state=SEED)
    return class_rows, class_train, class_test


# One 70/30 train/test split per class
llq_data, llq_train, llq_test = _split_class('LLQ')
gdq_data, gdq_train, gdq_test = _split_class('GDQ')
drq_data, drq_train, drq_test = _split_class('DRQ')

# Training sample: first 30 training rows of each class, concatenated and shuffled
train_sample = (
    pd.concat([split.head(30) for split in (llq_train, gdq_train, drq_train)])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Test sample: first 10 held-out rows of each class, concatenated and shuffled
test_sample = (
    pd.concat([split.head(10) for split in (llq_test, gdq_test, drq_test)])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

In [16]:
# Number of rows per question class in the loaded data (rows with missing values already dropped)
df['A-General Type of Questions'].value_counts()

A-General Type of Questions
LLQ    1322
GDQ     536
DRQ     174
Name: count, dtype: int64

# With Training Examples

## Prompting

### System message

In [5]:
# Tokenizer for the configured model; used in this notebook to report prompt sizes.
encoding = tiktoken.encoding_for_model(GPT_MODEL)

# Decode explicitly as UTF-8: the prompt contains non-ASCII curly quotes (see
# NB 2 below), which a platform-default codec could silently mis-decode.
# NOTE(review): assumes system-message.txt is saved as UTF-8 - confirm.
with open("../system-message.txt", encoding="utf-8") as f:
    persona_text = f.read()

print(f"System Prompt Message Input token size: {len(encoding.encode(persona_text))}")


# NB
# 1. The content is collected from [https://doi.org/10.1016/j.destud.2016.07.002] Appendix 1
# 2. In the prompt, Right/Left Double Quotation Marks (“ ”) are used to quote Eris instead of Straight Double Quotation Marks ("")

System Prompt Message Input token size: 1006


### User message

In [8]:
# Few-shot examples: one "<question> : <label>" line per training row.
train_sample['example'] = (
    train_sample['Questions'] + ' : ' + train_sample["A-General Type of Questions"] + '\n'
)
example_txt = ''.join(train_sample['example'])

# Plain newline-joined list of the test questions.
ques_list = '\n'.join(test_sample['Questions'])

# Numbered list of the test questions. Number straight from the column rather
# than by re-splitting ques_list on '\n': a question containing an embedded
# line break would otherwise receive two numbers and shift every later number
# away from its row in test_sample.
num_ques_list = '\n'.join(
    f"{number}. {question}"
    for number, question in enumerate(test_sample['Questions'], start=1)
)

# Prompt text is kept exactly as used for the recorded experiment.
prompt_train = f"Classify each of the questions below, delimited by triple backticks, using the taxonomy proposed by Eris. \
Label each question with one of the three categories: Low-level questions, Deep Reasoning Questions, or Generative Design Questions. \
State your reasoning for the assigned label. Format the result as a markdown table.\n\
```\n{num_ques_list}\n```\nTo help you categorize the questions above, here are some examples \
delimited by triple backticks, each line contains an example that has two segments - question \
and category separated by colon (:)\n\n```\n{example_txt}\n```"


print(f"User Prompt Message Input token size: {len(encoding.encode(prompt_train))}")

User Prompt Message Input token size: 2118


## Experiment 1A

Determine baseline performance: classify a test set of stand-alone question utterances, with a training set of 90 labelled examples included in the prompt.

In [None]:
%%time

# Experiment 1A conversation: the system persona followed by the user prompt
# that carries the training examples.
messages_with_examples = [
    {"role": "system", "content": persona_text},
    {"role": "user", "content": prompt_train},
]

# Request a single completion with temperature 0 and the notebook's fixed seed.
c = openai.chat.completions.create(
    messages=messages_with_examples,
    model=GPT_MODEL,
    temperature=0,
    seed=SEED,
    n=1,
)

In [None]:
# Show the backend fingerprint and the model's answer for Experiment 1A.
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost for this request, as computed by the project helper.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000
)

report_lines = [
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
]
print("\n".join(report_lines))

# Without Training Examples

## Prompting

### User message

In [7]:
# Zero-shot prompt: same instructions as the few-shot prompt but without the
# example section. Adjacent literals concatenate to exactly the original text
# (including the single space before the opening backticks).
prompt_no_train = (
    "Classify each of the questions below, delimited by triple backticks, using the taxonomy proposed by Eris. "
    "Label each question with one of the three categories: Low-level questions, Deep Reasoning Questions, or Generative Design Questions. "
    "State your reasoning for the assigned label. Format the result as a markdown table.\n\n "
    f"```\n{num_ques_list}\n```"
)

prompt_no_train_tokens = len(encoding.encode(prompt_no_train))
print(f"User Prompt Message Input token size: {prompt_no_train_tokens}")

User Prompt Message Input token size: 544


## Experiment 1B

In [None]:
%%time

# Experiment 1B conversation: the system persona followed by the user prompt
# that contains no training examples.
messages_without_examples = [
    {"role": "system", "content": persona_text},
    {"role": "user", "content": prompt_no_train},
]

# Request a single completion with temperature 0 and the notebook's fixed seed.
c = openai.chat.completions.create(
    messages=messages_without_examples,
    model=GPT_MODEL,
    temperature=0,
    seed=SEED,
    n=1,
)

In [None]:
# Show the backend fingerprint and the model's answer for Experiment 1B.
print(f"System Fingerprint: {c.system_fingerprint}\n")
print(c.choices[0].message.content)

# Token usage and cost for this request, as computed by the project helper.
cost, prompt_tokens, completion_tokens, total_tokens = calculate_openai_cost(
    c.usage, PROMPT_COST_PER_1000, COMPLETION_COST_PER_1000
)

report_lines = [
    f"Total cost: ${cost:.5f}",
    f"Prompt tokens: {prompt_tokens}",
    f"Completion tokens: {completion_tokens}",
    f"Total tokens: {total_tokens}",
]
print("\n".join(report_lines))

# Result

| Testing set: question number and utterance                                                                                                                          | Human   | AI /woT | AI /wT |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ------- | ------ |
| 1\. So how do you know how much do?                                                                                                                                 | DRQ | LLQ     | DRQ    |
| 2\. …what are the possible contributors?                                                                                                                            | GDQ | GDQ     | GDQ    |
| 3\. Why would you put that on there if you could swap in the new roll in a minute?                                                                                  | DRQ | DRQ     | DRQ    |
| 4\. What about cleats, right cleated conveyor cleats?                                                                                                               | GDQ | GDQ     | GDQ    |
| 5\. But so it's full 15 or whatever, probably right?                                                                                                                | LLQ | LLQ     | LLQ    |
| 6\. Could you take the candy and slide it on the wrapper?                                                                                                           | LLQ | GDQ     | GDQ    |
| 7\. Why would I do that?                                                                                                                                            | DRQ | DRQ     | DRQ    |
| 8\. How they wrap candy currently?                                                                                                                                  | DRQ | LLQ     | DRQ    |
| 9\. How does the very last one behave?                                                                                                                              | GDQ | LLQ     | DRQ    |
| 10\. But this whole thing has to rotate, right?                                                                                                                     | LLQ | LLQ     | LLQ    |
| 11\. So when they're wrapping in one of these candies, have you seen the whole process?                                                                             | LLQ | LLQ     | LLQ    |
| 12\. Yeah, but I think, isn't it the same?                                                                                                                          | GDQ | LLQ     | LLQ    |
| 13\. So is the main assumption right now that the wrapper will stick?                                                                                               | LLQ | LLQ     | LLQ    |
| 14\. How are you going to intersection things that are exactly the same…?                                                                                           | GDQ | GDQ     | GDQ    |
| 15\. How are you going to keep rolling and pulling it?                                                                                                              | GDQ | GDQ     | GDQ    |
| 16\. How was the candy being ejected from your machine.                                                                                                             | DRQ | LLQ     | DRQ    |
| 17\. Can I do this stuff?                                                                                                                                           | LLQ | LLQ     | LLQ    |
| 18\. Makes sense? Right?                                                                                                                                            | LLQ | LLQ     | LLQ    |
| 19\. What kind of processes out there now?                                                                                                                          | DRQ | LLQ     | DRQ    |
| 20\. Because these paper will come on to the stage right?                                                                                                           | LLQ | LLQ     | LLQ    |
| 21\. …Why did you put grid?                                                                                                                                         | DRQ | DRQ     | DRQ    |
| 22\. What function is providing?                                                                                                                                    | DRQ | LLQ     | LLQ    |
| 23\. What's gonna happen?                                                                                                                                           | GDQ | DRQ     | GDQ    |
| 24\. So what's happening here at the end of this conveyor belt, tell me a little bit more …What would happen in terms of how this thing gets pushed in the plastic? | DRQ | LLQ     | DRQ    |
| 25\. …What is the purpose of this box?                                                                                                                              | DRQ | LLQ     | LLQ    |
| 26\. And is this manual pushing? Or .. a motor pushing?                                                                                                             | LLQ | LLQ     | LLQ    |
| 27\. …How are you going to then finalize the closure?                                                                                                              | GDQ | GDQ     | GDQ    |
| 28\. So how do we animate this?                                                                                                                                    | GDQ | GDQ     | GDQ    |
| 29\. Or is it something that we can outsource to a company?                                                                                                         | GDQ | LLQ     | GDQ    |
| 30\. Do you want to a machine to produce 15 to 20 pieces of candy per min and currently they're doing what is it?                                                   | LLQ | LLQ     | LLQ    |


- GPT-4-generated labels align more closely with the human labels when a training set is provided: 83% agreement (25 of 30) with a training set of 90 examples, compared with 60% agreement (18 of 30) when no training set is provided. Therefore, a training set could be useful.