# Using GPT-4o to label the analogies

In [1]:
import openai
from tqdm import tqdm

RATE_LIMIT = .5  # 1 request per 2 seconds (not referenced by any cell visible in this notebook)

# The key file holds one value per line: the API key, then optionally the
# organization key and the project key.
with open("../../../openai_valid.key", 'r') as inputfile:
    apiKey = inputfile.readline().strip()  # Reads the first line (API key)
    orgKey = inputfile.readline().strip()  # Reads the second line (optional)
    projKey = inputfile.readline().strip()  # Reads the third line (optional)

# Fail here with a clear message rather than on the first API request.
if not apiKey:
    raise ValueError("The first line of the key file must contain the OpenAI API key")

# Set the OpenAI API key
openai.api_key = apiKey
# readline() yields "" for a missing or blank line; use None in that case so an
# absent organization/project is left unset rather than configured as "".
openai.organization = orgKey or None
openai.project = projKey or None


In [2]:
def prompt_model(prompt, reasoning=False):
    """Send one single-turn prompt to the chat model and return its reply text.

    Args:
        prompt: Text sent as the only user message of the conversation.
        reasoning: Placeholder for a reasoning code path that has not been
            written yet; must be left False.

    Returns:
        The message content of the first choice in the API response.

    Raises:
        NotImplementedError: If ``reasoning`` is true. Previously this branch
            fell through and returned None, which would end up in the label
            list unnoticed.
    """
    if reasoning:
        # TODO: add reasoning
        raise NotImplementedError("prompt_model(reasoning=True) is not implemented yet")

    response = openai.chat.completions.create(
        model="gpt-4o",
        max_tokens=100,
        temperature=0,
        messages=[
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content

In [3]:
# Read the whole labeling prompt template into a single string.
with open("../../data/labeling_promptv3.txt", "r") as prompt_file:
    label_prompt = prompt_file.read()

In [4]:
# Name of the annotator whose labels are being stored - change this:
annotator = "gpt-4"

# Derived from the annotator name - don't change this
file_name = f"labeled_analogies_{annotator}.txt"
file_path = f"results/analogy_labels/{file_name}"

In [5]:
# Load the analogies to label: one per line, surrounding whitespace removed.
path_to_data = "../../data/human_results/label_validation_analogies.txt"

with open(path_to_data, "r") as analogy_file:
    analogies = [raw_line.strip() for raw_line in analogy_file]

In [6]:
# Ask before issuing one API request per analogy.
run_model = input("Running the model? (y/n)")
if run_model == "y":
    labels = []
    for analogy in tqdm(analogies):
        # Substitute the analogy into the template's [EXPLANATION] placeholder.
        prompt = label_prompt.replace("[EXPLANATION]", "Explanation: "+analogy)
        labels.append(prompt_model(prompt))
else:
    # Previously this branch left `labels` undefined, so every later cell
    # failed with NameError. Reuse the labels saved by an earlier run instead
    # (the same file the persistence cell writes); a missing file raises
    # FileNotFoundError, which says exactly what is needed.
    with open(f"../../data/human_results/labels_{annotator}.txt", "r") as saved_labels:
        labels = [line.strip() for line in saved_labels]

100%|██████████| 30/30 [00:15<00:00,  1.91it/s]


In [7]:
# Show the collected labels, one per line, in the order they were produced.
for predicted_label in labels:
    print(predicted_label)

Physical Action
Physical Action
Physical Action
Physical Action
Cultural/Convention
Cultural/Convention
Physical Action
Interactive Entities
Cultural/Convention
Interactive Entities
Physical Action
Interactive Entities
Physical Action
Physical Action
Physical Action
No Analogy/Explanation
Interactive Entities
Physical Action
Cultural/Convention
Cultural/Convention
Cultural/Convention
Physical Action
No Analogy/Explanation
Cultural/Convention
Interactive Entities
Cultural/Convention
Physical Action
Cultural/Convention
Cultural/Convention
Cultural/Convention


In [None]:
# Labels from the two other annotators ("p" and "j"), one label per line.
with open("../../data/human_results/labels_p.txt", "r") as f:
    labels_p = [line.strip() for line in f]
with open("../../data/human_results/labels_j.txt", "r") as f:
    labels_j = [line.strip() for line in f]

# Normalise the model's labels the same way the file-based labels are
# normalised on read, so stray whitespace cannot register as a disagreement.
labels_g = [label.strip() for label in labels]

# Persist the model's labels next to the others. The in-memory list is used
# directly afterwards; the former read-back of this file was overwritten by
# the next statement and has been dropped.
with open(f"../../data/human_results/labels_{annotator}.txt", "w") as f:
    f.writelines(label + "\n" for label in labels_g)

In [9]:
from statsmodels.stats.inter_rater import fleiss_kappa
import numpy as np
from collections import Counter


# Combine the labels into a single list of lists
rater_labels = [labels_p, labels_j, labels_g]

# zip() silently stops at the shortest list, which would misalign or drop
# items and yield a wrong kappa without any warning, so check lengths first.
if len({len(rater) for rater in rater_labels}) != 1:
    raise ValueError(
        "Annotators labelled different numbers of items: "
        f"p={len(labels_p)}, j={len(labels_j)}, g={len(labels_g)}"
    )

annotations = list(zip(*rater_labels))  # Transpose so each sentence has its annotations grouped

# Define all possible topics
categories = ["Physical Action", "Cultural/Convention", "Interactive Entities", "No Analogy/Explanation"]

# Create the matrix of counts (rows: sentences, columns: categories)
def create_fleiss_matrix(annotations, categories):
    """Build the count table: one row per item, one column per category.

    Args:
        annotations: Iterable of per-item label collections (one label per
            annotator).
        categories: Ordered list of category names; fixes the column order.

    Returns:
        A numpy array whose entry [i, k] is how many annotators gave item i
        the label ``categories[k]``. Labels not listed in ``categories`` are
        not counted in any column.
    """
    rows = []
    for item_labels in annotations:
        # Counter returns 0 for labels nobody used, so every category gets a column.
        tally = Counter(item_labels)
        rows.append([tally[category] for category in categories])
    return np.array(rows)

# Item-by-category count table for all annotators.
fleiss_matrix = create_fleiss_matrix(annotations, categories)

# Uncomment to inspect the count table:
# print("Fleiss' Matrix:")
# print(fleiss_matrix)

# Agreement across all annotators at once.
kappa = fleiss_kappa(fleiss_matrix, method='fleiss')
print("Fleiss' Kappa:", kappa)


Fleiss' Kappa: 0.6023818670764501


In [10]:
from sklearn.metrics import cohen_kappa_score

# Pairwise agreement between every two annotators (Cohen's Kappa)
kappa_j_p = cohen_kappa_score(labels_j, labels_p)
kappa_j_g = cohen_kappa_score(labels_j, labels_g)
kappa_p_g = cohen_kappa_score(labels_p, labels_g)

# Report each pair on its own line
for pair_name, pair_kappa in (("j-p", kappa_j_p), ("j-g", kappa_j_g), ("p-g", kappa_p_g)):
    print(f"Cohen's Kappa ({pair_name}):", pair_kappa)


# Cohen's Kappa (j-p): 0.6276595744680851 on the merged 3rd revision

# Cohen's Kappa (j-p): 0.5376712328767124 on the merged 2nd revision
# Cohen's Kappa (j-g): 0.48630136986301375 on the merged 2nd revision
# Cohen's Kappa (p-g): 0.53125             on the merged 2nd revision

Cohen's Kappa (j-p): 0.6276595744680851
Cohen's Kappa (j-g): 0.47916666666666663
Cohen's Kappa (p-g): 0.7029702970297029
