### How well can GPT play Codenames?

In [28]:
# %init
import glob, sys, os, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from yahist import Hist1D, Hist2D
import glob
np.set_printoptions(linewidth=150)

from myhelpers.dataframes import fix_types
from myhelpers.personal import sqlite_magic

import random

In [26]:
# Fetch the Codenames word list as a single headerless column and reduce it to
# a plain Python list of lowercase strings.
all_words = (
    pd.read_csv(
        "https://raw.githubusercontent.com/sagelga/codenames/main/wordlist/en-EN/default/wordlist.txt",
        header=None,
    )
    .squeeze()
    .str.lower()
    .tolist()
)

In [None]:
import os
from openai import OpenAI

# Shared API client used by every request in this notebook. No key is passed
# explicitly, so credentials must come from the client's default configuration
# (presumably the OPENAI_API_KEY environment variable; verify locally).
client = OpenAI()

In [119]:
import json
import ast

def get_spymaster_clue_number(words_with_teams, model="gpt-4"):
    """Ask the model to play blue spymaster and return its clue.

    Parameters
    ----------
    words_with_teams : list of dict
        Board description. It is serialised with ``json.dumps`` and pasted
        verbatim into the prompt, so it must be JSON-serialisable.
    model : str
        Chat model name forwarded to the API.

    Returns
    -------
    (clue, number) : tuple of str
        The lower-cased clue word and the number as a string of digits.

    Raises
    ------
    ValueError
        If the reply is empty or is not a single ``<word> <number>`` pair.
    """
    import re  # local import: only needed for parsing the reply

    words_with_teams_json = json.dumps(words_with_teams)
    content = f"""
    You are playing the codenames game where you are the spymaster on the blue team who knows the secret words and need to give a clue to point
    your teammates at the relevant words and a number for how many words the clue identifies.
    Avoid the enemies' words and avoid clues that could guide your teammates to the assassin word which causes an immediate loss.

    The words and classifications are
    {words_with_teams_json}

    Now give a single word clue and a number (e.g. "animal 4"). Number should be between 3 and 6. Clue word can be super specific if needed.
    Try to be cool and go for a lot of connected words.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model=model,
    )
    output = response.choices[0].message.content

    # Accept exactly one "<word> <number>" pair, tolerating quotes around
    # either part, a comma/colon separator and a trailing period. Anything
    # else (prose, a spelled-out number, several clues) is rejected instead of
    # being mis-parsed into a clue that would corrupt the statistics.
    pattern = r"""["'`]*([^\W_]+(?:[-'][^\W_]+)*)["'`]*[\s,:]+["'`]*(\d+)["'`.]*"""
    match = re.fullmatch(pattern, (output or "").strip().lower())
    if match is None:
        raise ValueError(f"could not parse '<clue> <number>' from model reply: {output!r}")
    clue, number = match.group(1), match.group(2)
    return clue, number

def get_team_choices(clue_word, clue_number, words, model="gpt-4"):
    """Ask the model to play field operative and return its ordered guesses.

    Parameters
    ----------
    clue_word, clue_number
        The spymaster's clue; both are interpolated into the prompt as text.
    words : iterable of str
        All words on the board, without team labels.
    model : str
        Chat model name forwarded to the API.

    Returns
    -------
    list of str
        Lower-cased guesses in the order the model listed them. The words are
        not checked against ``words``, so a guess may be off-board.

    Raises
    ------
    ValueError
        If the reply contains no ``[...]`` list or the list holds non-strings.
    SyntaxError
        Propagated from ``ast.literal_eval`` when the bracketed text is not a
        valid Python literal.
    """
    words_json = json.dumps(list(words))

    content = f"""
    You are playing the codenames game where you will be given a clue and a number. You need to find the relevant words 
    for the clue up to that number. The words are:
    {words_json}

    And the clue and number are "{clue_word} {clue_number}". List the words as a python list starting with most confident/relevant to least (eg. `["foo", "bar", ...]`) with no other text.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model=model,
    )
    output = response.choices[0].message.content or ""

    # Replies are sometimes wrapped in a code fence or a sentence despite the
    # "no other text" instruction, so evaluate only the outermost [...] span.
    start, end = output.find("["), output.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"no list found in model reply: {output!r}")
    choices = ast.literal_eval(output[start:end + 1].lower())

    # Guard against things like a literal `...` echoed from the prompt example.
    if not isinstance(choices, list) or not all(isinstance(c, str) for c in choices):
        raise ValueError(f"model reply is not a list of strings: {output!r}")
    return [c.strip() for c in choices]


In [105]:
# Ad-hoc check of which model ids the API reports as available to this client.
client.models.list()

SyncPage[Model](data=[Model(id='gpt-4-vision-preview', created=1698894917, object='model', owned_by='system'), Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'), Model(id='gpt-3.5-turbo-0613', created=1686587434, object='model', owned_by='openai'), Model(id='text-embedding-3-large', created=1705953180, object='model', owned_by='system'), Model(id='gpt-3.5-turbo-instruct-0914', created=1694122472, object='model', owned_by='system'), Model(id='dall-e-2', created=1698798177, object='model', owned_by='system'), Model(id='whisper-1', created=1677532384, object='model', owned_by='openai-internal'), Model(id='tts-1-hd-1106', created=1699053533, object='model', owned_by='system'), Model(id='tts-1-hd', created=1699046015, object='model', owned_by='system'), Model(id='babbage-002', created=1692634615, object='model', owned_by='system'), Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal'), Model(id='gpt-3.5-turbo-0125', crea

In [166]:
# Run one spymaster -> operative round per (model, seed) and collect one record
# per trial in `data`. This cell makes two API calls per trial.
data = []

for model in ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo-preview"]:
    for seed in tqdm(range(50, 100)):
        d = dict()
        d["seed"] = seed
        d["model"] = model

        # Re-seeding per trial makes the board depend on `seed` only, so every
        # model is evaluated on the same 25 words and team assignment.
        np.random.seed(seed)

        teams = ["blue"]*7 + ["red"]*8 + ["neutral"]*9 + ["assassin"]
        np.random.shuffle(teams)
        words = np.random.choice(all_words, size=25, replace=False)
        words_with_teams = [{"word": word, "team": team} for word, team in zip(words, teams)]

        blue_words = [x["word"] for x in words_with_teams if x["team"] == "blue"]
        red_words = [x["word"] for x in words_with_teams if x["team"] == "red"]
        neutral_words = [x["word"] for x in words_with_teams if x["team"] == "neutral"]
        assassin_words = [x["word"] for x in words_with_teams if x["team"] == "assassin"]

        try:
            clue_word, clue_number = get_spymaster_clue_number(words_with_teams, model=model)
            choices = get_team_choices(clue_word, clue_number, words, model=model)
        except Exception as exc:
            # Keep a stub row so the failure is counted, and store the reason so
            # API errors can be told apart from unparseable replies.
            # `Exception` (not a bare except) lets Ctrl-C still stop the run.
            d["error"] = repr(exc)
            data.append(d)
            continue

        d["blue_words"] = blue_words
        d["red_words"] = red_words
        d["neutral_words"] = neutral_words
        d["assassin_words"] = assassin_words
        d["clue_word"] = clue_word
        d["clue_number"] = clue_number
        d["choices"] = choices

        # Flagged if the assassin appears anywhere in the guess list, even
        # after the first wrong guess.
        d["assassinated"] = any(c in assassin_words for c in choices)

        # Points = number of consecutive blue words at the start of the guess
        # list; the first non-blue guess ends the turn.
        points = 0
        for c in choices:
            if c in blue_words:
                points += 1
            else:
                break
        d["points"] = points
        data.append(d)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [167]:
# One row per (seed, model) trial. Failed trials lack the result columns, so
# those columns are NaN for them.
df = pd.DataFrame(data)

In [168]:
# Persistence is manual: uncomment exactly one line to save the current run.
# The next cell reads both files back and merges them.
# NOTE(review): which seeds each file holds is not recorded here. Presumably
# v1 is an earlier seed range and v2 is range(50, 100); confirm before overwriting.
# df.to_parquet("experiments_v1.parquet")
# df.to_parquet("experiments_v2.parquet")

In [181]:
# Reload both saved runs and merge them, keeping the first row seen for each
# (seed, model) pair.
parquet_paths = ["experiments_v1.parquet", "experiments_v2.parquet"]
df = (
    pd.concat([pd.read_parquet(path) for path in parquet_paths])
    .drop_duplicates(["seed", "model"])
    .reset_index(drop=True)
)
df

Unnamed: 0,seed,model,blue_words,red_words,neutral_words,assassin_words,clue_word,clue_number,choices,assassinated,points
0,0,gpt-3.5-turbo,"[poison, heart, bat, missile, switch, straw, l...","[apple, lap, ship, face, leprechaun, mine, sho...","[capital, scuba diver, time, ivory, pie, park,...",[soul],weapon,4,"[missile, bomb, mine, laser]",False,1.0
1,1,gpt-3.5-turbo,"[jet, spine, amazon, organ, field, net, penguin]","[turkey, hood, pants, plane, skyscraper, time,...","[band, alps, mass, temple, embassy, tower, lio...",[check],space,4,"[moon, belt, space, skyscraper]",False,0.0
2,2,gpt-3.5-turbo,,,,,,,,,
3,3,gpt-3.5-turbo,"[germany, beijing, seal, check, pipe, back, log]","[cover, lab, needle, thumb, rock, mammoth, rac...","[tablet, plate, concert, ball, straw, iron, fe...",[aztec],music,5,"[concert, ball, racket, rock, circle]",False,0.0
4,4,gpt-3.5-turbo,"[kid, princess, ambulance, pitch, plot, lock, ...","[church, theater, pyramid, face, pit, part, nu...","[chest, police, mouth, march, ice cream, lawye...",[pirate],treasure,4,"[chest, pirate, treasure, pyramid]",True,0.0
...,...,...,...,...,...,...,...,...,...,...,...
295,95,gpt-4-turbo-preview,"[bug, pit, switch, bridge, drill, stick, spike]","[cotton, soul, scorpion, carrot, kangaroo, pan...","[copper, ring, strike, square, gold, fish, mod...",[shoe],insect,3,"[bug, scorpion, spike]",False,1.0
296,96,gpt-4-turbo-preview,"[iron, mammoth, helicopter, spine, apple, ice,...","[switch, cap, chocolate, crash, vet, duck, com...","[match, antarctica, america, racket, ball, loc...",[knife],prehistoric,3,"[mammoth, antarctica, ice]",False,1.0
297,97,gpt-4-turbo-preview,"[atlantis, shop, hood, laser, green, pupil, cl...","[server, marble, china, bank, mammoth, casino,...","[ship, scientist, pan, wave, state, tower, pou...",[ambulance],legendary,4,"[atlantis, mammoth, casino, ship]",False,1.0
298,98,gpt-4-turbo-preview,,,,,,,,,


In [182]:
# Derived scoring columns. Safe to re-run: every step gives the same result
# when applied to its own output.
df["clue_number"] = df["clue_number"].astype(float)
df["points"] = df["points"].astype(float)

# Hitting the assassin forfeits the round, so its points are zeroed.
# `assassinated` holds True/False plus nulls for failed trials (object dtype).
# An explicit boolean mask keeps `points` float64 and leaves failed trials as
# NaN, instead of relying on arithmetic over an object column.
hit_assassin = df["assassinated"].eq(True).fillna(False).astype(bool)
df["points"] = df["points"].mask(hit_assassin, 0.0)

# A trial with no score at all is one where either model call or its parsing failed.
df["api_failed"] = df["points"].isna()

# Fraction of the announced clue number that was actually scored.
df["points_norm"] = df["points"] / df["clue_number"]

In [222]:
# Share of trials per model with no recorded score (request or parsing failed).
df.groupby("model")["api_failed"].mean()

model
gpt-3.5-turbo          0.14
gpt-4                  0.00
gpt-4-turbo-preview    0.14
Name: api_failed, dtype: float64

In [184]:
# How many words each model's spymaster clue claims to cover.
df.groupby("model")["clue_number"].agg(["mean","median"])

Unnamed: 0_level_0,mean,median
model,Unnamed: 1_level_1,Unnamed: 2_level_1
gpt-3.5-turbo,3.872093,4.0
gpt-4,3.69,4.0
gpt-4-turbo-preview,3.534884,4.0


In [185]:
# Raw points per trial (leading blue guesses, zeroed when the assassin was hit).
df.groupby("model")["points"].agg(["mean","median"])

Unnamed: 0_level_0,mean,median
model,Unnamed: 1_level_1,Unnamed: 2_level_1
gpt-3.5-turbo,0.802326,1.0
gpt-4,1.11,1.0
gpt-4-turbo-preview,1.127907,1.0


In [221]:
# Points divided by the clue number; `count` is the number of non-null trials.
df.groupby("model")["points_norm"].agg(["mean","std","count","median"])

Unnamed: 0_level_0,mean,std,count,median
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gpt-3.5-turbo,0.213953,0.243797,86,0.225
gpt-4,0.319667,0.259806,100,0.333333
gpt-4-turbo-preview,0.330039,0.263856,86,0.333333


In [187]:
# Share of trials per model whose guess list contained the assassin word.
df.groupby("model")["assassinated"].mean()

model
gpt-3.5-turbo          0.116279
gpt-4                  0.130000
gpt-4-turbo-preview    0.046512
Name: assassinated, dtype: float64

In [188]:
# Wide table of points: one row per seed, one column per model. Missing scores
# are filled with 0, so a failed trial counts as scoring nothing when models
# are compared seed by seed.
s = df.pivot(index="seed", columns="model", values="points").fillna(0)

In [189]:
# Head-to-head matrix: entry (m1, m2) is the fraction of seeds on which m1
# scored strictly more points than m2. Ties count for neither side, so the
# diagonal is 0.
win_records = [
    dict(m1=m1, m2=m2, win_rate=(s[m1] > s[m2]).mean())
    for m1 in s.columns
    for m2 in s.columns
]
df_wins = pd.DataFrame(win_records).pivot(index="m1", columns="m2", values="win_rate")
df_wins

m2,gpt-3.5-turbo,gpt-4,gpt-4-turbo-preview
m1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gpt-3.5-turbo,0.0,0.17,0.22
gpt-4,0.46,0.0,0.35
gpt-4-turbo-preview,0.43,0.23,0.0


In [205]:
# Inspect one record of `df`.
# NOTE(review): `row` is not assigned anywhere above this cell; it presumably
# leaks from a `df.iterrows()` loop run earlier, so this fails on a fresh kernel.
row

seed                                                              0
model                                                 gpt-3.5-turbo
blue_words        [poison, heart, bat, missile, switch, straw, l...
red_words         [apple, lap, ship, face, leprechaun, mine, sho...
neutral_words     [capital, scuba diver, time, ivory, pie, park,...
assassin_words                                               [soul]
clue_word                                                    weapon
clue_number                                                     4.0
choices                                [missile, bomb, mine, laser]
assassinated                                                  False
points                                                          1.0
api_failed                                                    False
points_norm                                                    0.25
Name: 0, dtype: object

In [219]:
# Random-guess baseline: for each successful trial, draw as many distinct board
# words as the model's clue number and score the draw with the same rule as
# the model's guesses (leading blue words, zero if the assassin was drawn).
BASELINE_SEED = 0  # fixed so the baseline number is reproducible across runs
rng = np.random.default_rng(BASELINE_SEED)

baselines = []
for _, row in df.iterrows():
    if row["api_failed"]:
        baselines.append(None)
        continue
    n_guesses = int(row["clue_number"])
    if n_guesses <= 0:
        # A "0" clue has nothing to normalise by; treat it as missing.
        baselines.append(None)
        continue
    # list(...) accepts both numpy arrays (parquet round-trip) and plain lists.
    blue_words = list(row["blue_words"])
    assassin_words = list(row["assassin_words"])
    words = blue_words + list(row["red_words"]) + list(row["neutral_words"]) + assassin_words
    choices = rng.choice(words, n_guesses, replace=False)
    assassinated = any(c in assassin_words for c in choices)
    points = 0
    for c in choices:
        if c in blue_words:
            points += 1
        else:
            break
    if assassinated:
        points = 0
    points_norm = points/n_guesses
    baselines.append(points_norm)

In [220]:
# Mean normalised score of the random baseline. Entries appended as None
# (failed trials) become missing values and are left out of the mean.
pd.Series(baselines).mean()

0.11464460784313724

In [201]:
# Inspect the board words left in the namespace by whichever loop iteration
# last assigned `words`. NOTE(review): this depends on execution order.
words

['moscow',
 'opera',
 'war',
 'america',
 'snowman',
 'unicorn',
 'bridge',
 'cotton',
 'mercury',
 'berry',
 'vacuum',
 'washer',
 'hollywood',
 'copper',
 'pipe',
 'microscope',
 'ice cream',
 'lion',
 'thief',
 'duck',
 'ground',
 'fence',
 'school',
 'eagle',
 'march']