**FETCHING DATA**

In [3]:
import gensim
import nltk
import numpy as np
from nltk.data import find
import string
import pandas as pd
import pprint
import networkx as nx
import matplotlib
from pyvis.network import Network

Getting the pre-built Word2Vec (W2V) model: https://code.google.com/archive/p/word2vec/

In [4]:
# Download NLTK's 'word2vec_sample' package (NLTK reports when it is already up to date).
nltk.download('word2vec_sample')
# Resolve the path of the pruned vector file inside the NLTK data directory.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
# Load the vectors as gensim KeyedVectors; binary=False selects the text word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package word2vec_sample to C:\Users\Aurora
[nltk_data]     Kynkor\AppData\Roaming\nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


Example Usage

In [5]:
# The ten words the model ranks as most similar to 'man'.
model.most_similar(positive=['man'], topn=10)

[('woman', 0.7664012908935547),
 ('boy', 0.6824869513511658),
 ('teenager', 0.6586930155754089),
 ('girl', 0.5921713709831238),
 ('robber', 0.5585117936134338),
 ('men', 0.5489763617515564),
 ('guy', 0.5420036315917969),
 ('person', 0.5342026948928833),
 ('gentleman', 0.5337991714477539),
 ('Man', 0.5316053032875061)]

Getting List of Codenames words

In [6]:
# Read the Codenames word list, one entry per line. The context manager
# closes the handle even if reading raises, which the manual open()/close()
# pair did not guarantee.
with open("codenames.txt", "r") as my_file:
    content = my_file.read()
codenames_word_list = content.split("\n")

Remove the Codenames words that are not in the model's vocabulary :(

In [7]:
# Lower-case both vocabularies so the comparison ignores case.
model_word_list = [word.lower() for word in model.index_to_key]
codenames_word_list = [word.lower() for word in codenames_word_list]

# A set gives constant-time membership tests; scanning the whole model
# vocabulary list once per Codenames word is needlessly slow.
model_vocabulary = set(model_word_list)

unknown_words = [word for word in codenames_word_list if word not in model_vocabulary]
known_words = [word for word in codenames_word_list if word in model_vocabulary]
print(f"""Unknown Words: {unknown_words}""")
codenames_word_list = known_words

Unknown Words: ['aztec', 'beijing', 'centaur', 'czech', 'horseshoe', 'ice cream', 'kangaroo', 'kiwi', 'laser', 'leprechaun', 'loch ness', 'new york', 'ninja', 'olympus', 'penguin', 'platypus', 'scorpion', 'scuba diver', 'server', 'shark', 'smuggler', 'snowman', 'superhero', 'unicorn', 'whale']


The model's vocabulary is case-sensitive, so each (lower-cased) Codenames word must be mapped back to the casing the model actually stores:

In [6]:
def get_correct_case(input_words, vectors=None):
    """Map each word to the casing under which the model stores it.

    For every word the candidates are tried in order: the word as given,
    ``word.capitalize()`` and ``string.capwords(word)``. The first candidate
    present in the vocabulary is kept.

    Args:
        input_words: Iterable of words to look up.
        vectors: Any object supporting ``candidate in vectors`` (for example
            gensim ``KeyedVectors`` or a set of words). Defaults to the
            notebook-level ``model``.

    Returns:
        The matched spellings, in input order. A word with no matching
        casing is skipped; it no longer discards the words already resolved.
    """
    if vectors is None:
        vectors = model

    return_words = []
    for word in input_words:
        candidates = (word, word.capitalize(), string.capwords(word))
        # A membership test replaces running a full similarity query inside
        # a bare ``except`` just to find out whether the key exists.
        match = next((candidate for candidate in candidates if candidate in vectors), None)
        if match is not None:
            return_words.append(match)
    return return_words
                
# Re-map the filtered Codenames words through get_correct_case and keep the result.
codenames_word_list = get_correct_case(codenames_word_list)

**SIMULATED GAME**

In [7]:
# Tile counts per group; together they fill a 25-tile board.
N_GOOD, N_BAD, N_NEUTRAL, N_DEATH = 9, 8, 7, 1

# Draw every tile in a single sample without replacement so the four groups
# cannot overlap. codenames_word_list is left untouched, so re-running this
# cell always samples from the full pool instead of an ever-shrinking one.
_tile_pool = list(dict.fromkeys(codenames_word_list))  # drop duplicates, keep order
_drawn_tiles = list(
    np.random.choice(_tile_pool, N_GOOD + N_BAD + N_NEUTRAL + N_DEATH, replace=False)
)

good_list = _drawn_tiles[:N_GOOD]
bad_list = _drawn_tiles[N_GOOD:N_GOOD + N_BAD]
neutral_list = _drawn_tiles[N_GOOD + N_BAD:N_GOOD + N_BAD + N_NEUTRAL]
death_list = _drawn_tiles[N_GOOD + N_BAD + N_NEUTRAL:]

game_list = good_list + bad_list + neutral_list + death_list

In [8]:
def _color_board(value):
    """Return the CSS style for a board cell according to the tile list holding ``value``.

    The lists are checked in the order good, neutral, bad, death; a value
    found in none of them gets the purple fallback style.
    """
    styles_by_group = (
        (good_list, 'background-color: green'),
        (neutral_list, 'background-color: #F5F5DC'),
        (bad_list, 'background-color: red'),
        (death_list, 'background-color: black; color: white'),
    )
    for members, css in styles_by_group:
        if value in members:
            return css
    return 'background-color: purple'

# Convert to an array so the tiles can be shuffled in place and reshaped.
game_list = np.array(game_list)
np.random.shuffle(game_list)
# Lay the tiles out as a 5x5 board.
game_board = pd.DataFrame(data=game_list.reshape((5,5)))
# Style every cell with _color_board; as the last expression this is the cell's displayed output.
game_board.style.applymap(_color_board)

Unnamed: 0,0,1,2,3,4
0,net,compound,note,fork,Saturn
1,march,bar,grace,green,star
2,trip,alien,whip,table,hospital
3,deck,chest,dress,satellite,board
4,tie,engine,strike,lab,well


**TRIVIAL ALGORITHM**

In [9]:
# Query the model with all good tiles as positives and the bad and death tiles as negatives.
avoid_list = bad_list + death_list
trivial_clues = model.most_similar(positive=good_list, negative=avoid_list, topn=10)

pp = pprint.PrettyPrinter()
pp.pprint(trivial_clues)

[('dressy', 0.3059610426425934),
 ('haggle', 0.3004680871963501),
 ('tablecloths', 0.2925092875957489),
 ('napkins', 0.2721412181854248),
 ('dine', 0.2683233618736267),
 ('prettily', 0.26317453384399414),
 ('medallions', 0.2630601227283478),
 ('floral', 0.26085275411605835),
 ('spade', 0.2597275972366333),
 ('attractively', 0.2562568485736847)]


**MY FIRST ALGORITHM**

In [10]:
def get_similarity_df(start_list, top_n = 150, vectors=None):
    """Collect the ``top_n`` most similar words for every word in ``start_list``.

    Args:
        start_list: Iterable of words to query.
        top_n: Number of similar words requested per input word.
        vectors: Object exposing ``most_similar(positive=..., topn=...)``
            that returns ``(word, similarity)`` pairs. Defaults to the
            notebook-level ``model``.

    Returns:
        DataFrame with columns 'Input Word', 'Output Word' and 'Strength',
        one row per (input word, similar word) pair and a fresh 0..n-1
        index. Rows are in reverse query order: the last result of the last
        input word comes first.
    """
    if vectors is None:
        vectors = model

    # Gather plain records and build the frame once; concatenating one row
    # at a time copies the whole frame on every iteration (quadratic cost)
    # and starts from an empty frame.
    records = [
        {'Input Word': value, 'Output Word': similar_word, 'Strength': strength}
        for value in start_list
        for similar_word, strength in vectors.most_similar(positive=[value], topn=top_n)
    ]
    # Newest rows first, so downstream cells see the same row order as before.
    records.reverse()
    return pd.DataFrame(records, columns=['Input Word', 'Output Word', 'Strength'])

good_df = get_similarity_df(good_list)

# Per candidate clue: how many good tiles list it and the summed similarity.
_by_clue = good_df.groupby('Output Word')
count_df = _by_clue[['Input Word']].count().rename(columns={"Input Word": "Count"})
count_df['Sum Strength'] = _by_clue['Strength'].sum()
count_df['Avg Strength'] = count_df['Sum Strength'] / count_df['Count']
# Rank by number of tiles covered first, then by average similarity.
count_df = count_df.sort_values(by=['Count', 'Avg Strength'], ascending=False).reset_index(level=0)

# The top-ranked clue and the good tiles that list it.
top_clue = count_df.iloc[0]['Output Word']
words_to_guess = good_df.loc[good_df['Output Word'] == top_clue]['Input Word'].tolist()

print(count_df)
print('')
print('CLUE')
print((top_clue, len(words_to_guess)))
print('')
print('Tiles to Guess:')
print(words_to_guess)


      Output Word  Count  Sum Strength  Avg Strength
0            pink      2      0.878496      0.439248
1            tray      2      0.834319      0.417160
2           patio      2      0.826315      0.413157
3           plaid      2      0.805388      0.402694
4           khaki      2      0.801213      0.400607
...           ...    ...           ...           ...
1290  suspensions      1      0.255195      0.255195
1291        blitz      1      0.254608      0.254608
1292      napping      1      0.253615      0.253615
1293     reprisal      1      0.253601      0.253601
1294     skirmish      1      0.252935      0.252935

[1295 rows x 4 columns]

CLUE
('pink', 2)

Tiles to Guess:
['green', 'dress']


**MY SECOND ALGORITHM**

In [11]:
# Similarity tables (200 similar words per tile) for the good, bad and death tiles.
g_df, b_df, d_df = (
    get_similarity_df(tiles, top_n=200)
    for tiles in (good_list, bad_list, death_list)
)



def get_best_guess(good_df, bad_df, death_df):
    """Pick the clue with the highest total reward across good, bad and death tiles.

    Every row is scored from its 'Strength': +100x for good tiles, -100x for
    bad tiles and -500x for the death tile. Rewards are summed per
    'Output Word' and the candidates are ranked by total reward, then by
    frequency. The ranking table is printed.

    Args:
        good_df: Similarity table for the good tiles, with columns
            'Input Word', 'Output Word' and 'Strength'.
        bad_df: Same layout, for the bad tiles.
        death_df: Same layout, for the death tile.

    Returns:
        ``(best_clue, words_to_guess)``, where ``words_to_guess`` lists the
        good tiles linked to the clue.
    """
    # assign() returns new frames, so the caller's inputs are never mutated.
    full_df = pd.concat(
        [
            good_df.assign(Reward=100 * good_df['Strength']),
            bad_df.assign(Reward=-100 * bad_df['Strength']),
            death_df.assign(Reward=-500 * death_df['Strength']),
        ],
        axis=0,
    )

    per_clue = full_df.groupby('Output Word')
    freq_df = per_clue[['Input Word']].count().rename(columns={"Input Word": "Frequency"})
    freq_df['Total Reward'] = per_clue['Reward'].sum()
    freq_df = freq_df.sort_values(by=['Total Reward', 'Frequency'], ascending=False).reset_index(level=0)

    print(freq_df)

    best_clue = freq_df.iloc[0]['Output Word']
    # Only good tiles are valid guesses. Looking them up in the combined frame
    # would also return bad/death tiles that share the clue and inflate the count.
    words_to_guess = good_df.loc[good_df['Output Word'] == best_clue, 'Input Word'].to_list()

    return best_clue, words_to_guess

    
clue, words_to_guess = get_best_guess(g_df, b_df, d_df)

# Report the clue with its tile count, then the tiles it targets.
print('CLUE', (clue, len(words_to_guess)), sep='\n')
print('WORDS TO GUESS', words_to_guess, sep='\n')

     Output Word  Frequency  Total Reward
0           tray          3    115.640759
1         tables          2    101.402375
2           pink          2     87.849602
3        dresses          1     83.203125
4          patio          2     82.631499
...          ...        ...           ...
3341  conjugates          1   -189.607039
3342     hideout          1   -189.724892
3343       cells          2   -193.232375
3344        cell          2   -198.309225
3345   compounds          1   -343.781114

[3346 rows x 3 columns]
CLUE
('tray', 3)
WORDS TO GUESS
['deck', 'table', 'fork']


**MY THIRD ALGORITHM**

In [12]:
# Similarity tables (200 similar words per tile) for the good and death tiles only.
g_df, d_df = (
    get_similarity_df(tiles, top_n=200)
    for tiles in (good_list, death_list)
)

def get_best_guess(good_df, death_df):
    """Pick the clue with the highest total reward, weighing only good and death tiles.

    Every row is scored from its 'Strength': +100x for good tiles and -500x
    for the death tile. Rewards are summed per 'Output Word' and the
    candidates are ranked by total reward, then by frequency. The ranking
    table is printed.

    Note:
        This definition replaces the three-argument ``get_best_guess``
        defined earlier in the notebook; once this cell has run, the earlier
        three-argument call no longer works.

    Args:
        good_df: Similarity table for the good tiles, with columns
            'Input Word', 'Output Word' and 'Strength'.
        death_df: Same layout, for the death tile.

    Returns:
        ``(best_clue, words_to_guess)``, where ``words_to_guess`` lists the
        good tiles linked to the clue.
    """
    # assign() returns new frames, so the caller's inputs are never mutated.
    full_df = pd.concat(
        [
            good_df.assign(Reward=100 * good_df['Strength']),
            death_df.assign(Reward=-500 * death_df['Strength']),
        ],
        axis=0,
    )

    per_clue = full_df.groupby('Output Word')
    freq_df = per_clue[['Input Word']].count().rename(columns={"Input Word": "Frequency"})
    freq_df['Total Reward'] = per_clue['Reward'].sum()
    freq_df = freq_df.sort_values(by=['Total Reward', 'Frequency'], ascending=False).reset_index(level=0)

    print(freq_df)

    best_clue = freq_df.iloc[0]['Output Word']
    # Only good tiles are valid guesses. Looking them up in the combined frame
    # could return the death tile as a word to guess.
    words_to_guess = good_df.loc[good_df['Output Word'] == best_clue, 'Input Word'].to_list()

    return best_clue, words_to_guess


    
clue, words_to_guess = get_best_guess(g_df, d_df)

# Report the clue with its tile count, then the tiles it targets.
print('CLUE', (clue, len(words_to_guess)), sep='\n')
print('WORDS TO GUESS', words_to_guess, sep='\n')

     Output Word  Frequency  Total Reward
0           tray          3    115.640759
1         tables          2    101.402375
2           pink          2     87.849602
3        dresses          1     83.203125
4          patio          2     82.631499
...          ...        ...           ...
1908      palace          1   -182.259187
1909  guardhouse          1   -184.241399
1910  conjugates          1   -189.607039
1911     hideout          1   -189.724892
1912   compounds          1   -343.781114

[1913 rows x 3 columns]
CLUE
('tray', 3)
WORDS TO GUESS
['deck', 'table', 'fork']


**VISUALIZATION**

In [13]:
# Rebuild the similarity table for the good tiles using get_similarity_df's default top_n.
good_df = get_similarity_df(good_list)

def nx2pyvis(nx_graph, pyvisnet, color):
    """Copy every edge of a networkx graph into a pyvis network.

    For each edge the first endpoint is added as a node in ``color``, the
    second endpoint is added without an explicit colour, and the edge itself
    is drawn in ``color`` with its 'Strength' attribute as the title.

    Args:
        nx_graph: An ``nx.Graph`` whose edges carry a 'Strength' attribute.
        pyvisnet: Target object exposing ``add_node`` and ``add_edge``.
        color: Colour applied to first-endpoint nodes and to the edges.

    Raises:
        AssertionError: If ``nx_graph`` is not an ``nx.Graph``.
        KeyError: If an edge has no 'Strength' attribute.
    """
    assert isinstance(nx_graph, nx.Graph)

    # NOTE(review): for an undirected graph the first endpoint of an edge is
    # presumably not always the original input word, so which node receives
    # ``color`` may depend on edge iteration order -- confirm.
    for first, second, attrs in nx_graph.edges(data=True):
        pyvisnet.add_node(first, color=color)
        pyvisnet.add_node(second)
        pyvisnet.add_edge(first, second, color=color, title=str(attrs['Strength']))
            
# Create the pyvis network in notebook mode.
net = Network(notebook = True)  
# One graph edge per (input word, similar word) row, carrying the 'Strength' value.
good_graph = nx.from_pandas_edgelist(good_df, source = 'Input Word', target = 'Output Word', edge_attr = 'Strength' )
nx2pyvis(good_graph, net, 'green')
# Rendering is disabled here; net.show is called in the complete-visualization cell.
#net.show('example.html')


**COMPLETE VISUALIZATION**

In [14]:
# Similarity tables for the remaining tile groups.
death_df, bad_df, neutral_df = (
    get_similarity_df(tiles) for tiles in (death_list, bad_list, neutral_list)
)

# Column mapping shared by every edge-list conversion.
_edge_columns = dict(source='Input Word', target='Output Word', edge_attr='Strength')
death_graph = nx.from_pandas_edgelist(death_df, **_edge_columns)
bad_graph = nx.from_pandas_edgelist(bad_df, **_edge_columns)
#neutral_graph = nx.from_pandas_edgelist(neutral_df, **_edge_columns)

# Add the death tile's graph first, then the bad tiles', to the shared network.
for _graph, _tile_color in ((death_graph, 'black'), (bad_graph, 'red')):
    nx2pyvis(_graph, net, _tile_color)
#nx2pyvis(neutral_graph, net, '#F5F5DC')
net.toggle_physics(True)
net.show('example.html')

Resources:
https://www.kaggle.com/code/jihyeseo/word2vec-gensim-play-look-for-similar-words/notebook

https://anvaka.github.io/pm/#/galaxy/word2vec-wiki?cx=-6732&cy=-8924&cz=-15294&lx=0.0731&ly=-0.9418&lz=0.2447&lw=0.2185&ml=300&s=1.75&l=1&v=d50_clean_small

https://towardsdatascience.com/visualizing-networks-in-python-d70f4cbeb259
