In [1]:
import numpy as np
import pandas as pd

import os
# Read the API key from the environment; this is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
import openai

# Shared OpenAI client used by get_embeddings below.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items.

    ``seq`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(seq)`` is not a multiple of ``size``.
    """
    # The range is built up front, so an invalid ``size`` (e.g. 0) fails
    # at call time rather than on first iteration.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def get_embeddings(queries, model, **kwargs):
    """Embed text queries through the OpenAI embeddings endpoint.

    Newlines in every query are replaced by single spaces before sending.
    No truncation is applied, so over-long inputs are passed to the API
    unchanged. Queries are sent in batches of at most ``MAX_BATCH_SIZE``.

    Parameters
    ----------
    queries : iterable of str
        Texts to embed.
    model : str
        Model name passed to ``client.embeddings.create``.
    **kwargs
        Extra keyword arguments forwarded to ``client.embeddings.create``.

    Returns
    -------
    pandas.DataFrame
        One row per query, in input order, indexed by the newline-free
        query text; each column is one embedding component.
    """
    # Presumably the API's cap on the number of inputs per request --
    # confirm against the current API reference.
    MAX_BATCH_SIZE = 2048

    # replace newlines with spaces (inputs are NOT truncated)
    queries = [q.replace('\n', ' ') for q in queries]

    embeddings_data = []
    for chunk in chunker(queries, MAX_BATCH_SIZE):
        response = client.embeddings.create(
            input=chunk,
            model=model,
            **kwargs
        )
        # Every returned item carries the position of its input within the
        # request; sort on it so rows stay aligned with `queries` whatever
        # order the items come back in.
        chunk_embeddings = sorted(response.data, key=lambda item: item.index)
        embeddings_data.extend(chunk_embeddings)

    embeddings = pd.DataFrame(
        [x.embedding for x in embeddings_data],
        index=queries
    )

    return embeddings

Make date prompts and get embeddings.

In [3]:
months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
# Days per month for a non-leap year (February has 28; use 29 to include the leap day).
n_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
# Ordinal suffix for each day of the month 1..31 (11th-13th fall in the "th" run).
th_or_st = ["st", "nd", "rd"] + ["th"] * 17 + ["st", "nd", "rd"] + ["th"] * 7 + ["st"]
assert len(th_or_st) == 31
assert len(months) == len(n_days)

# One prompt per calendar day, e.g. "1st January", in calendar order.
prompts = [
    f"{d}{suffix} {m}"
    for m, n in zip(months, n_days)
    for d, suffix in zip(range(1, n + 1), th_or_st[:n])
]
assert len(prompts) == sum(n_days)

In [5]:
# Embed every date prompt: one row per prompt.
X_dates = get_embeddings(
    prompts,
    model="text-embedding-3-large",
)

In [6]:
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("representations", exist_ok=True)

# Write the raw embedding matrix only: no index column and no header row.
X_dates.to_csv(
    "representations/dates_reprs.csv",
    index=False,
    header=False,
)

In [7]:
# Save the date prompts as labels: one per line, no index column, no header row.
pd.DataFrame(prompts, columns=["label"]).to_csv(
    "representations/dates_labels.csv",
    header=False,
    index=False,
)

Make colour prompts and get embeddings.

In [48]:
# Colour names and hex codes from the XKCD colour survey.
f = "https://xkcd.com/color/rgb.txt"
# Tab-separated file: skip the first line, keep the first two fields,
# and index the table by colour name.
df = pd.read_csv(
    f,
    sep="\t",
    skiprows=1,
    names=["name", "hex"],
    usecols=[0, 1],
    index_col=0,
)

In [49]:
# Colour names, taken from the table's index, as a NumPy array.
colors = np.array(df.index.tolist())
# One prompt per colour name.
prompts = [
    f"The color of the object is {name}. What color is the object?"
    for name in colors
]

In [22]:
# Embed every colour prompt: one row per prompt.
X_colours = get_embeddings(
    prompts,
    model="text-embedding-3-large",
)

In [50]:
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("representations", exist_ok=True)

# Write the raw embedding matrix only: no index column and no header row.
X_colours.to_csv(
    "representations/colours_reprs.csv",
    index=False,
    header=False,
)

In [51]:
# Save the colour names as labels. Unlike the dates labels file, this one
# keeps the "label" header row (only the index column is dropped).
pd.DataFrame(colors, columns=["label"]).to_csv(
    "representations/colours_labels.csv",
    index=False,
)