# Codenames
#### Aayush Kumar

Motivation: compare different word embeddings in the context of Codenames, a word association game!

### Part 1: Setup
Gensim does most of the heavy lifting here: https://radimrehurek.com/gensim/ ([source code](https://github.com/RaRe-Technologies/gensim))



In [2]:
import gensim

In [85]:
# Load the pretrained GoogleNews word2vec vectors from their binary file.
# NOTE(review): `limit=200000` presumably caps how many vectors are read from
# the file — confirm against gensim's load_word2vec_format documentation.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=200000
)

In [5]:
# Sanity check: the ten nearest neighbours of 'ambulance' in the word2vec model.
model.similar_by_word('ambulance', topn=10)

[('paramedics', 0.7590752243995667),
 ('ambulances', 0.7493595480918884),
 ('Ambulance', 0.7236292362213135),
 ('paramedic', 0.662133514881134),
 ('Ambulance_paramedics', 0.6315338611602783),
 ('Ambulances', 0.6211477518081665),
 ('LifeFlight_helicopter', 0.6147335171699524),
 ('hospital', 0.6099206209182739),
 ('Paramedics', 0.6081751585006714),
 ('Ambulance_Service', 0.6080097556114197)]

In [126]:
# Example game state: each team's remaining words plus the single assassin word.
board = dict(
    blue=['shark', 'lab', 'ambulance', 'helicopter', 'robot', 'spike', 'usa', 'mouse'],
    red=['soldier', 'hollywood', 'opera', 'boom', 'temple', 'force', 'board', 'fighter', 'giant'],
    assassin='plane',
)

In [112]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Convert the GloVe text vectors into word2vec text format so they can be
# loaded with the same KeyedVectors loader used for the word2vec model.
# The converted copy is written to a temporary file.
converted_file = get_tmpfile("glove_converted.txt")
# NOTE(review): `datapath` comes from gensim.test.utils and is given an absolute
# local path here; presumably the path is returned unchanged — confirm, and
# adjust the path for your own machine.
glove_file = datapath('/Users/aayush/Dev/Test/glove.42B.300d.txt')
glove2word2vec(glove_file, converted_file)

In [32]:
# Load the converted GloVe vectors as a second model (`gmodel`) to compare with `model`.
gmodel = gensim.models.KeyedVectors.load_word2vec_format(converted_file)

In [114]:
# Same sanity check as for word2vec: ten nearest neighbours of 'ambulance' in the GloVe model.
gmodel.similar_by_word('ambulance', topn=10)

[('ambulances', 0.7300186157226562),
 ('paramedics', 0.6890668869018555),
 ('paramedic', 0.644303560256958),
 ('hospital', 0.6051512956619263),
 ('emergency', 0.5894628763198853),
 ('medics', 0.572803795337677),
 ('emts', 0.5493403673171997),
 ('ems', 0.5487420558929443),
 ('taxi', 0.5362970232963562),
 ('police', 0.5148166418075562)]

In [137]:
def validClue(clue, target, sim):
    """Decide whether `clue` is an acceptable hint for a single `target` word.

    A clue is accepted only when all of the following hold:
      * its similarity score `sim` is greater than 0.1,
      * it contains neither '_' nor '-',
      * neither word is a substring of the other.

    Both words are lowercased before the substring checks, so the comparison
    is case-insensitive regardless of how the caller passes `target`.

    Returns a truthy value when the clue is acceptable, falsy otherwise.
    """
    clue = clue.lower()
    # Lowercase the target as well: without this, a mixed-case target such as
    # 'Mouse' would never match inside a lowercased clue such as 'mouses'.
    target = target.lower()
    return (
        sim > 0.1
        and '_' not in clue
        and '-' not in clue
        and clue not in target
        and target not in clue
    )

def helper(target, model):
    """Return up to three of the best single-word clues for `target`.

    Asks `model.most_similar` for the 50 nearest neighbours of `target`
    (with `restrict_vocab=100000`), keeps those that pass `validClue`, and
    returns the three with the highest similarity as `(clue, similarity)` pairs.
    """
    neighbours = model.most_similar(
        positive=[target],
        restrict_vocab=100000,
        topn=50
    )
    # The model is queried with the word as given; only the validity check
    # uses the lowercased form.
    lowered_target = target.lower()
    accepted = []
    for word, score in neighbours:
        if validClue(word, lowered_target, score):
            accepted.append((word, score))
    ranked = sorted(accepted, key=lambda pair: pair[1], reverse=True)
    return ranked[:3]
    
def hintForEachWord(board, model, myTeam):
    """Map each word of `myTeam` on `board` to the clues `helper` finds for it."""
    hints = {}
    for word in board[myTeam]:
        hints[word] = helper(word, model)
    return hints

def common_entries(*dcts):
    """Yield `(key, value_1, ..., value_n)` for every key present in all dicts.

    The values appear in the same order as the dicts were passed. Keys are
    visited in set-iteration order, so the order of the yielded tuples is
    not guaranteed.
    """
    shared_keys = set(dcts[0]).intersection(*dcts[1:])
    for key in shared_keys:
        row = [key]
        row.extend(mapping[key] for mapping in dcts)
        yield tuple(row)

In [138]:
# Single-word clues for every blue word from each model, joined per word so the
# word2vec (first list) and GloVe (second list) suggestions can be compared.
w2v = hintForEachWord(board, model, 'blue')
glv = hintForEachWord(board, gmodel, 'blue')
list(common_entries(w2v, glv))

[('mouse',
  [('mice', 0.5896885395050049),
   ('cursor', 0.5472042560577393),
   ('joystick', 0.5258718132972717)],
  [('mice', 0.6726194620132446),
   ('keyboard', 0.6354192495346069),
   ('rat', 0.6319148540496826)]),
 ('robot',
  [('bots', 0.5323755741119385),
   ('android', 0.521201491355896),
   ('bionic', 0.5198897123336792)],
  [('humanoid', 0.5601454377174377),
   ('lego', 0.5381373167037964),
   ('alien', 0.5243661403656006)]),
 ('shark',
  [('whale', 0.7011443972587585),
   ('dolphin', 0.6842623949050903),
   ('stingray', 0.6708774566650391)],
  [('whale', 0.7156961560249329),
   ('dolphin', 0.650036096572876),
   ('fish', 0.6339520215988159)]),
 ('lab',
  [('forensic', 0.5069714188575745),
   ('experiments', 0.48755788803100586),
   ('testing', 0.4635547399520874)],
  [('science', 0.5693014860153198),
   ('research', 0.5681069493293762),
   ('experiments', 0.5630009174346924)]),
 ('usa',
  [('canada', 0.7136157751083374),
   ('india', 0.6836211085319519),
   ('uk', 0.677811

### Part 2: Clue for multiple words
Let's create a more generalized approach to generating clues for `n` target words.

In [189]:
from itertools import combinations

def validClueN(clue, nTargets, sim):
    """Decide whether `clue` is an acceptable hint for all words in `nTargets`.

    The clue is rejected when its similarity `sim` is 0.3 or lower, when it
    contains '_' or '-', or when it and any target contain one another as a
    substring (compared case-insensitively). Returns True otherwise.
    """
    lowered = clue.lower()
    too_weak = sim <= 0.3
    is_compound = '_' in lowered or '-' in lowered
    if too_weak or is_compound:
        return False
    lowered_targets = (target.lower() for target in nTargets)
    return not any(
        lowered in target or target in lowered
        for target in lowered_targets
    )

def helperN(nTargets, model, neg=None):
    """Return up to five of the best clues that relate to all of `nTargets`.

    Parameters:
        nTargets: iterable of target words; passed to `model.most_similar`
            as the `positive` list.
        model: object exposing a gensim-style `most_similar` method.
        neg: optional list passed to `most_similar` as `negative` (words to
            steer away from). Defaults to no negative words.

    Returns:
        A list of at most five `(clue, similarity)` pairs, highest similarity
        first, containing only clues accepted by `validClueN`.
    """
    nTargets = list(nTargets)
    # Use a None sentinel rather than a mutable `[]` default so no list object
    # is shared between calls.
    negatives = [] if neg is None else neg
    allHints = model.most_similar(
        positive=nTargets,
        restrict_vocab=100000,
        negative=negatives,
        topn=50
    )
    candidates = [(clue, sim) for clue, sim in allHints if validClueN(clue, nTargets, sim)]
    candidates.sort(reverse=True, key=lambda c: c[1])
    return candidates[:5]

def hintForNWords(board, model, myTeam, n=2, avoidOpponent = False):
    """Rank every `n`-word combination of `myTeam`'s words by its best clue.

    For each combination of `n` words from `board[myTeam]`, `helperN` is asked
    for clues. When `avoidOpponent` is true, the other team's words are passed
    as negatives with weight -0.1 and the assassin word with weight -1.

    Returns a list of `(words, clues)` pairs sorted by the similarity of each
    combination's top clue, highest first; combinations with no clue count as 0.
    """
    otherTeam = 'blue' if myTeam != 'blue' else 'red'

    # Opponent words are mildly penalised, the game-losing assassin word fully.
    penalties = []
    if avoidOpponent:
        penalties.extend((word, -0.1) for word in board[otherTeam])
        penalties.append((board['assassin'], -1))

    def best_score(entry):
        clues = entry[1]
        return clues[0][1] if len(clues) > 0 else 0

    ranked = []
    for group in combinations(board[myTeam], n):
        ranked.append((group, helperN(group, model, neg=penalties)))
    ranked.sort(key=best_score, reverse=True)
    return ranked
    

In [190]:
# GloVe model: clues for every pair of blue words, with no penalty for opponent or assassin words.
hintForNWords(board, gmodel, 'blue', n=2, avoidOpponent= False)

[(('lab', 'ambulance'),
  [('hospital', 0.6658949851989746),
   ('medical', 0.6454156637191772),
   ('emergency', 0.6095321178436279),
   ('paramedic', 0.6009153127670288),
   ('rescue', 0.5871586799621582)]),
 (('helicopter', 'robot'),
  [('flying', 0.644748330116272),
   ('airplane', 0.6269298195838928),
   ('pilot', 0.586461067199707),
   ('plane', 0.5767312049865723),
   ('aircraft', 0.5684330463409424)]),
 (('helicopter', 'mouse'),
  [('flying', 0.6417096853256226),
   ('airplane', 0.6211909055709839),
   ('fly', 0.5852834582328796),
   ('plane', 0.5674572587013245),
   ('pilot', 0.5391443967819214)]),
 (('shark', 'helicopter'),
  [('flying', 0.633224606513977),
   ('whale', 0.6035275459289551),
   ('boat', 0.5953414440155029),
   ('dolphin', 0.5918889045715332),
   ('airplane', 0.5869830846786499)]),
 (('helicopter', 'usa'),
  [('flying', 0.6278298497200012),
   ('airplane', 0.5965707302093506),
   ('states', 0.5873452425003052),
   ('united', 0.5810209512710571),
   ('aircraft',

In [199]:
# word2vec model: clues for every pair of blue words, with no penalty for opponent or assassin words.
hintForNWords(board, model, 'blue', n=2, avoidOpponent= False)

[(('ambulance', 'helicopter'),
  [('paramedics', 0.6677919030189514),
   ('chopper', 0.6417112350463867),
   ('paramedic', 0.629612922668457),
   ('choppers', 0.5899779796600342),
   ('airlifted', 0.5546751022338867)]),
 (('robot', 'mouse'),
  [('joystick', 0.6251699924468994),
   ('Wiimote', 0.5696859359741211),
   ('hamster', 0.5504462122917175),
   ('stylus', 0.5346438884735107),
   ('cursor', 0.5256814956665039)]),
 (('lab', 'ambulance'),
  [('paramedics', 0.6228340864181519),
   ('hospital', 0.6063573956489563),
   ('paramedic', 0.5515223741531372),
   ('EMTs', 0.5248247385025024),
   ('EMS', 0.5225606560707092)]),
 (('shark', 'robot'),
  [('dolphin', 0.611456036567688),
   ('whale', 0.5870506763458252),
   ('creature', 0.572830855846405),
   ('crocodile', 0.5590041875839233),
   ('reptile', 0.5544845461845398)]),
 (('ambulance', 'robot'),
  [('paramedics', 0.6032423973083496),
   ('paramedic', 0.5591355562210083),
   ('firetruck', 0.5159125328063965),
   ('helicopter', 0.50952595

*airlifted* is a great clue for 2 blue words, *ambulance* and *helicopter*, found by the word2vec model (the second run above)

### Part 3: Avoid the opponent words and the assassin word

Codenames encourages players to avoid clues that point at the opposing team's words, and especially at the assassin word (guessing it loses the game). We can simulate this behavior using gensim's [most similar implementation](https://github.com/RaRe-Technologies/gensim/blob/27c524db60828e5ab1580a46a8ca5520dfd4d352/gensim/models/keyedvectors.py#L475): the opponent's words are given a slightly negative weight, and the game-losing assassin word a fully negative weight.

In [201]:
# GloVe model again, this time penalising the red team's words and the assassin word.
hintForNWords(board, gmodel, 'blue', n=2, avoidOpponent= True)

[(('shark', 'helicopter'),
  [('hammerhead', 0.49671030044555664),
   ('dolphin', 0.43435442447662354),
   ('whale', 0.4111352264881134),
   ('kingfish', 0.38846081495285034),
   ('mackerel', 0.3750736713409424)]),
 (('ambulance', 'helicopter'),
  [('paramedics', 0.4902295768260956),
   ('paramedic', 0.4762934744358063),
   ('emts', 0.46416178345680237),
   ('medics', 0.43225401639938354),
   ('ems', 0.4044076204299927)]),
 (('shark', 'robot'),
  [('octopus', 0.4417080283164978),
   ('hammerhead', 0.38723981380462646),
   ('roomba', 0.37868183851242065),
   ('squid', 0.36537623405456543),
   ('turtle', 0.3636520504951477)]),
 (('lab', 'ambulance'),
  [('veterinary', 0.4237673282623291),
   ('paramedic', 0.4015539884567261),
   ('outpatient', 0.37773406505584717),
   ('paramedics', 0.3736908435821533),
   ('emts', 0.3734685182571411)]),
 (('lab', 'mouse'),
  [('mice', 0.4181728959083557),
   ('assays', 0.4022420644760132),
   ('vitro', 0.37227851152420044),
   ('reagents', 0.36134803295

In [202]:
# word2vec model again, this time penalising the red team's words and the assassin word.
hintForNWords(board, model, 'blue', n=2, avoidOpponent= True)

[(('ambulance', 'helicopter'),
  [('paramedics', 0.6229075193405151),
   ('paramedic', 0.5460344552993774),
   ('EMS', 0.49478739500045776),
   ('Paramedics', 0.49397069215774536),
   ('Paramedic', 0.4594162702560425)]),
 (('helicopter', 'usa'),
  [('uk', 0.4882464110851288),
   ('canada', 0.46909430623054504),
   ('india', 0.4115447402000427),
   ('index.html', 0.41040921211242676),
   ('indian', 0.38346603512763977)]),
 (('shark', 'helicopter'),
  [('dolphin', 0.4840955138206482),
   ('stingray', 0.4716917872428894),
   ('whale', 0.4586286246776581),
   ('crocodile', 0.4392480254173279),
   ('dolphins', 0.43269461393356323)]),
 (('helicopter', 'spike'),
  [('surge', 0.4691990315914154),
   ('uptick', 0.46736574172973633),
   ('upsurge', 0.43392059206962585),
   ('rise', 0.3786761462688446),
   ('skyrocketed', 0.36911505460739136)]),
 (('lab', 'mouse'),
  [('mice', 0.4393245577812195),
   ('PCR', 0.35275256633758545),
   ('rodents', 0.33687907457351685),
   ('rats', 0.3364789187908172

*rat* is a great clue for 2 blue words, *lab* and *mouse*: we can imagine lab rats, and mice and rats are closely associated

### Coming Soon:
We can also try to use embeddings from ConceptNet, which is "designed to help computers understand the meanings of words that people use" beyond distributional semantics.