In [155]:
import numpy as np
import random
import json
from tqdm.notebook import tqdm
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer

In [71]:
with open('word_lists/codenames.txt') as f:
    original_words = np.array(f.read().splitlines())
with open('word_lists/duet.txt') as f:
    duet_words = np.array(f.read().splitlines())
with open('word_lists/deep_undercover.txt') as f:
    undercover_words = np.array(f.read().splitlines())

WORD_POOL = duet_words
GRID_SIZE = 5

def generate_grids(word_pool = WORD_POOL):
    word_grid = list(np.random.choice(word_pool, GRID_SIZE**2, replace=False).reshape([GRID_SIZE,GRID_SIZE]))
    key_cards = list(zip(
        ['A','B','B','B','B','B','F','F','F','F','F','F','F','F','F','B','A','B','B','B','B','B','B','B','A'], 
        ['F','F','F','F','F','F','F','F','F','B','B','B','B','B','A','A','A','B','B','B','B','B','B','B','B']
    ))
    random.shuffle(key_cards)
    key_grid_player, key_grid_model = zip(*key_cards)
    key_grid_player = list(np.reshape(key_grid_player, [GRID_SIZE,GRID_SIZE]))
    key_grid_model = list(np.reshape(key_grid_model, [GRID_SIZE,GRID_SIZE]))
    return word_grid, key_grid_player, key_grid_model

word_grid, key_grid_player, key_grid_model = generate_grids()

In [72]:
word_grid

[array(['GENIE', 'PAINT', 'LIP', 'MILL', 'LACE'], dtype='<U11'),
 array(['RUBBER', 'BOIL', 'TEAM', 'SPURS', 'FUEL'], dtype='<U11'),
 array(['CAPTAIN', 'JUDGE', 'DISK', 'BARBECUE', 'RAIL'], dtype='<U11'),
 array(['COACH', 'THUNDER', 'CLOUD', 'MINUTE', 'RAZOR'], dtype='<U11'),
 array(['ANTHEM', 'CHRISTMAS', 'BENCH', 'FOG', 'ARMOR'], dtype='<U11')]

In [73]:
key_grid_player

[array(['F', 'B', 'A', 'F', 'F'], dtype='<U1'),
 array(['B', 'B', 'B', 'F', 'B'], dtype='<U1'),
 array(['F', 'A', 'B', 'F', 'B'], dtype='<U1'),
 array(['B', 'B', 'B', 'F', 'B'], dtype='<U1'),
 array(['A', 'F', 'B', 'F', 'B'], dtype='<U1')]

In [74]:
key_grid_model

[array(['A', 'F', 'F', 'F', 'F'], dtype='<U1'),
 array(['B', 'F', 'B', 'F', 'F'], dtype='<U1'),
 array(['B', 'A', 'F', 'B', 'B'], dtype='<U1'),
 array(['A', 'B', 'B', 'B', 'B'], dtype='<U1'),
 array(['B', 'B', 'F', 'B', 'B'], dtype='<U1')]

In [156]:
def get_word_groups(word_grid, key_grid):
    word_groups = {}
    for i in range(len(key_grid)):
        for j in range(len(key_grid[i])):
            key = key_grid[i][j]
            word = word_grid[i][j]
            
            if key in word_groups:
                word_groups[key].append(word)
            else:
                word_groups[key] = [word]
    
    return word_groups

def filter_dict_by_stemmed_keys(data_dict, word_list):
    stemmer = PorterStemmer()

    # Stem all words in the list
    stemmed_words = {stemmer.stem(word) for word in word_list}

    def should_remove(key):
        """Check if a given key should be removed based on the conditions."""
        stemmed_key = stemmer.stem(key)
        for word in stemmed_words:
            if stemmed_key == word or word in stemmed_key or stemmed_key in word:
                return True
        return False

    # Filter dictionary based on the should_remove condition
    filtered_dict = {key: value for key, value in data_dict.items() if not should_remove(key)}

    return filtered_dict

In [186]:
def normalize(v):
    """Return the unit vector of v."""
    norm = np.linalg.norm(v)
    return v if norm == 0 else v / norm

def cosine_similarity(a, b):
    """Compute cosine similarity between two vectors."""
    return np.dot(a, b)

def nearest_neighbor(v_star, candidate_embs):
    """
    Given an ideal clue vector v_star and a dictionary mapping candidate clue words
    to their precomputed embeddings, return the candidate with the highest cosine similarity.
    """
    best_word = None
    best_sim = -np.inf
    for word, emb in candidate_embs.items():
        sim = cosine_similarity(v_star, emb)
        if sim > best_sim:
            best_sim = sim
            best_word = word
    return best_word

def find_best_cluster(
    friend_words, 
    non_friend_words,
    friend_embs,  
    non_friend_embs, 
    candidate_embs,
    target_N=3, 
    penalty_coeff=0.2,
    verbose=True
):
    """
    ...
    """
    best_score = -np.inf
    best_cluster = None
    best_N = None
    best_v_star = None

    # Loop over all possible cluster sizes.
    for N in range(1, len(friend_words) + 1):
        if verbose:
            print(f"N = {N}")
            print()
        # Try every friendly word as a potential seed for the cluster.
        for seed in friend_words:
            if verbose:
                print(f"Cluster seed = \"{seed}\"")
            cluster = [seed]
            remaining = set(friend_words) - {seed}
            # Greedily grow the cluster until it has N words.
            while len(cluster) < N and remaining:
                best_candidate = None
                best_avg_sim = -np.inf
                for candidate in remaining:
                    # Compute average cosine similarity between candidate and the current cluster.
                    sim_sum = sum(cosine_similarity(friend_embs[candidate], friend_embs[w])
                                  for w in cluster)
                    avg_sim = sim_sum / len(cluster)
                    if avg_sim > best_avg_sim:
                        best_avg_sim = avg_sim
                        best_candidate = candidate
                if best_candidate is not None:
                    cluster.append(best_candidate)
                    remaining.remove(best_candidate)
                else:
                    break
            # Only consider complete clusters of size N.
            if len(cluster) != N:
                continue
            # Compute the centroid of the cluster.
            centroid = normalize(sum(friend_embs[w] for w in cluster))
            # Calculate the minimum similarity from the cluster words to the centroid.
            friend_sims = [cosine_similarity(centroid, friend_embs[w]) for w in cluster]
            min_friend_sim = min(friend_sims)
            # Calculate the maximum similarity from any non-friendly word to the centroid.
            non_friend_sims = [cosine_similarity(centroid, non_friend_embs[w]) for w in non_friend_words]
            max_non_friend_sim = max(non_friend_sims) if non_friend_sims else -np.inf
            # Define a margin that measures the separation.
            margin = min_friend_sim - max_non_friend_sim
            # Optionally, you could add a bonus for larger clusters (e.g., margin + alpha * N).
            score = margin - (penalty_coeff * (N - target_N)**2)
            if verbose:
                print(f"Cluster words = {cluster}")
                print(f"Cluster margin = {margin}")
                print(f"Cluster score = {score}")
                print(f"Cluster clue = {nearest_neighbor(centroid, candidate_embs)}")
                print()
            if score > best_score:
                best_score = score
                best_cluster = list(cluster)
                best_N = N
                best_v_star = centroid
        if verbose:
            print()

    # Once the best centroid is chosen, "round" it to a candidate clue word.
    best_clue = nearest_neighbor(best_v_star, candidate_embs)
    if verbose:
        print(f"BEST N: {best_N}")
        print(f"BEST CLUSTER: {best_cluster}")
        print(f"BEST SCORE: {best_score}")
        print(f"BEST CLUE: {best_clue}")
    return best_cluster, best_N, best_v_star, best_clue

In [119]:
# Initialize the embedding model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print(embedding_model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [193]:
# Re-generate random word and key grids
word_grid, key_grid_player, key_grid_model = generate_grids()
player_word_groups = get_word_groups(word_grid, key_grid_player)
model_word_groups = get_word_groups(word_grid, key_grid_player)

# Separate the friendly words and non-friendlies (bystanders and assassins).
friend_words = player_word_groups['F']
non_friend_words = player_word_groups['B'] + player_word_groups['A']
all_words = friend_words + non_friend_words

# Compute embeddings.
friend_embeddings = {word: embedding_model.encode(word) for word in friend_words}
non_friend_embeddings = {word: embedding_model.encode(word) for word in non_friend_words}
with open('clue_candidates.txt') as f:
    candidate_words = np.array(f.read().splitlines())

# candidate_embeddings = np.array([embedding_model.encode(word) for word in tqdm(candidate_words)])
# np.save("clue_candidate_embeddings.npy", candidate_embeddings)
    
candidate_embeddings = {word:embedding for word, embedding in zip(candidate_words, np.load("clue_candidate_embeddings.npy"))}

candidate_embeddings = filter_dict_by_stemmed_keys(candidate_embeddings, all_words)

# print(json.dumps(player_word_groups, indent=4))
# print()
# print("Friend Words:", friend_words)
# print("Non-Friend Words:", non_friend_words)
# print()
# print(f"Num candidate clues = {len(candidate_embeddings)}")
# print(f"Num candidate clues (filtered) = {len(candidate_embeddings)}")
# print()

best_cluster, best_N, best_v_star, best_clue = \
    find_best_cluster(
        friend_words, 
        non_friend_words, 
        friend_embeddings, 
        non_friend_embeddings, 
        candidate_embeddings,
        target_N=3, 
        penalty_coeff=0.2,
        # verbose=False
    )

N = 1

Cluster seed = "RANCH"
Cluster words = ['RANCH']
Cluster margin = 0.6298887729644775
Cluster score = -0.1701112270355225
Cluster clue = FARMS

Cluster seed = "GYMNAST"
Cluster words = ['GYMNAST']
Cluster margin = 0.6118229627609253
Cluster score = -0.18817703723907475
Cluster clue = ATHLETE

Cluster seed = "CRUSADER"
Cluster words = ['CRUSADER']
Cluster margin = 0.5410640239715576
Cluster score = -0.2589359760284424
Cluster clue = KNIGHTS

Cluster seed = "MINUTE"
Cluster words = ['MINUTE']
Cluster margin = 0.41542887687683105
Cluster score = -0.384571123123169
Cluster clue = SECONDS

Cluster seed = "TIN"
Cluster words = ['TIN']
Cluster margin = 0.642746090888977
Cluster score = -0.157253909111023
Cluster clue = TIFFIN

Cluster seed = "SADDLE"
Cluster words = ['SADDLE']
Cluster margin = 0.6192787885665894
Cluster score = -0.1807212114334107
Cluster clue = SEAT

Cluster seed = "BALLOON"
Cluster words = ['BALLOON']
Cluster margin = 0.6065757870674133
Cluster score = -0.193424212932

In [205]:
model_word_groups

{'B': ['MESS',
  'PEANUT',
  'PACIFIC',
  'WISH',
  'LOCUST',
  'SUN',
  'SCRATCH',
  'QUARTER',
  'LUMBERJACK',
  'BICYCLE',
  'SAW',
  'CAPTAIN',
  'SWORD'],
 'F': ['RANCH',
  'GYMNAST',
  'CRUSADER',
  'MINUTE',
  'TIN',
  'SADDLE',
  'BALLOON',
  'PILLOW',
  'BISCUIT'],
 'A': ['STICKER', 'TANK', 'BLIZZARD']}

In [208]:
generate_clue_sample = {
    "remaining_friendlies": [
        "SECOND",
        "ANTHEM",
        "SMOOTHIE",
        "PARROT",
        "PIZZA",
        "BABY",
        "WIZARD",
        "CHAIN",
        "NERVE"
    ],
    "remaining_bystanders": [
        "BRASS",
        "BRICK",
        "GYMNAST",
        "PADDLE",
        "SOUP",
        "RIVER",
        "GUM",
        "PINE",
        "COFFEE",
        "MUSKETEER",
        "RIFLE",
        "SUGAR",
        "POTTER"
    ],
    "remaining_assassins": [
        "FUEL",
        "DIRECTOR",
        "WONDERLAND"
    ],
    "model_name": "default"
}

generate_guess_sample = {
    "clue_word": "LULLABY",
    "clue_count": 2,
    "remaining_words": [
        'BRASS',
        'FUEL',
        'SECOND',
        'DIRECTOR',
        'ANTHEM',
        'SMOOTHIE',
        'PARROT',
        'BRICK',
        'GYMNAST',
        'PADDLE',
        'SOUP',
        'PIZZA',
        'BABY',
        'WIZARD',
        'RIVER',
        'GUM',
        'PINE',
        'COFFEE',
        'MUSKETEER',
        'WONDERLAND',
        'RIFLE',
        'SUGAR',
        'CHAIN',
        'POTTER',
        'NERVE'
    ],
    "model_name": "default"
}

In [223]:
print(json.dumps(generate_clue_sample, indent=4))

{
    "remaining_friendlies": [
        "SECOND",
        "ANTHEM",
        "SMOOTHIE",
        "PARROT",
        "PIZZA",
        "BABY",
        "WIZARD",
        "CHAIN",
        "NERVE"
    ],
    "remaining_bystanders": [
        "BRASS",
        "BRICK",
        "GYMNAST",
        "PADDLE",
        "SOUP",
        "RIVER",
        "GUM",
        "PINE",
        "COFFEE",
        "MUSKETEER",
        "RIFLE",
        "SUGAR",
        "POTTER"
    ],
    "remaining_assassins": [
        "FUEL",
        "DIRECTOR",
        "WONDERLAND"
    ],
    "model_name": "default"
}


In [224]:
print(json.dumps(generate_guess_sample, indent=4))

{
    "clue_word": "LULLABY",
    "clue_count": 2,
    "remaining_words": [
        "BRASS",
        "FUEL",
        "SECOND",
        "DIRECTOR",
        "ANTHEM",
        "SMOOTHIE",
        "PARROT",
        "BRICK",
        "GYMNAST",
        "PADDLE",
        "SOUP",
        "PIZZA",
        "BABY",
        "WIZARD",
        "RIVER",
        "GUM",
        "PINE",
        "COFFEE",
        "MUSKETEER",
        "WONDERLAND",
        "RIFLE",
        "SUGAR",
        "CHAIN",
        "POTTER",
        "NERVE"
    ],
    "model_name": "default"
}
