In [18]:
import numpy as np
import pandas as pd
from scipy import spatial
from random import randint
import nbconvert.filters.strings
# Let pandas display up to 500 rows before truncating a rendered DataFrame.
pd.set_option('display.max_rows', 500)

In [8]:
# Load the pre-processed word-embedding tables, one pickled pandas object per model.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
glove_vectors, google_vectors, ft_vectors = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './processing/data/glove_vectors.pkl',
        './processing/data/google_vectors.pkl',
        './processing/data/fasttext_vectors.pkl',
    )
)

In [9]:
# Candidate vocabulary: the distinct level-0 labels of the GloVe index,
# kept in the order in which they first appear.
all_words = glove_vectors.index.get_level_values(0).drop_duplicates().tolist()

In [10]:
# Board under test: each list holds the card words for one role.
# A previous board (friends superhero/cook/cricket/calf; foes
# nut/mountain/straw/millionaire; neutrals mug/fence/cliff/brush; assassin
# monkey) used to be assigned here and was overwritten immediately, so it
# never took effect; it is kept only as this note.
friends = ['apple', 'screen', 'bear']
foes = ['fence', 'cable', 'pitch']
# NOTE(review): 'fence' appears as both a foe and a neutral -- confirm which
# role is intended. The value is left unchanged here.
neutrals = ['mug', 'fence']
assassin = ['rabbit']

# Every word on the board; a clue may not be, or contain, any of these.
board_words = friends + foes + neutrals + assassin
# Every word that is not a friend.
bad = foes + neutrals + assassin

In [14]:
# Multiplier applied to the summed friend distances in the goodness score:
# the larger it is, the more closeness to friends outweighs distance from foes.
constant = 4


def distance(source, target, vectors=glove_vectors):
    """Return the cosine distance between the vectors of two words.

    Args:
        source: Label of the first word, looked up with ``vectors.loc``.
        target: Label of the second word, looked up with ``vectors.loc``.
        vectors: Embedding table to look the words up in. Defaults to the
            GloVe table as bound when this function was defined.

    Returns:
        The value of ``scipy.spatial.distance.cosine`` for the two vectors.
    """
    source_vec = vectors.loc[source].to_numpy()
    target_vec = vectors.loc[target].to_numpy()
    return spatial.distance.cosine(source_vec, target_vec)

def get_scores(row, vectors):
    """Score one candidate clue word against the current board.

    Args:
        row: Row of the candidates frame; only ``row.word`` is read.
        vectors: Embedding table passed through to ``distance``.

    Returns:
        ``pd.Series`` of five values, in this order:
        ``goodness`` (summed foe + assassin distances minus ``constant`` times
        the summed friend distances), ``assassin_minimax``, ``foes_minimax``,
        ``neutrals_minimax`` (closest card of that role minus the farthest
        friend) and ``variance`` (farthest friend minus closest friend).
        All five are -1000 when the word is rejected.
    """
    # Start from the rejection sentinel so every path reaches the return with
    # all five values bound. Previously a word that was too close to the
    # assassin left them unassigned and the return raised UnboundLocalError.
    rejected = -1000
    goodness = assassin_minimax = foes_minimax = neutrals_minimax = variance = rejected

    word = row.word
    # A clue may not be a board word, nor contain one as a substring.
    on_board = word in board_words or any(bw in word for bw in board_words)
    if not on_board:
        assassin_dist = [distance(word, a, vectors) for a in assassin]
        # Skip the remaining lookups when the word is (almost) identical to the
        # first assassin word; it keeps the rejection scores in that case.
        if abs(assassin_dist[0]) > 0.001:
            friends_dist = [distance(word, fr, vectors) for fr in friends]
            foes_dist = [distance(word, f, vectors) for f in foes]
            neutrals_dist = [distance(word, n, vectors) for n in neutrals]

            goodness = sum(foes_dist + assassin_dist) - constant * sum(friends_dist)

            min_friends_dist = min(friends_dist)
            max_friends_dist = max(friends_dist)
            # Positive margin: even the closest card of that role is farther
            # away than the farthest friend.
            assassin_minimax = min(assassin_dist) - max_friends_dist
            foes_minimax = min(foes_dist) - max_friends_dist
            neutrals_minimax = min(neutrals_dist) - max_friends_dist
            variance = max_friends_dist - min_friends_dist

    return pd.Series([goodness, assassin_minimax, foes_minimax, neutrals_minimax, variance])

In [15]:
def get_candidates_df(vectors):
    """Build the ranked table of clue candidates for one embedding model.

    Every word in ``all_words`` gets a ``frequency`` equal to its 1-based
    position in that list, plus the five scores returned by ``get_scores``.

    Args:
        vectors: Embedding table forwarded to ``get_scores``.

    Returns:
        DataFrame sorted in descending order by goodness, assassin_minimax,
        foes_minimax, frequency, neutrals_minimax and variance (in that
        priority), with a fresh 0-based index.
    """
    score_columns = ['goodness', 'assassin_minimax', 'foes_minimax', 'neutrals_minimax', 'variance']
    sort_by_columns = ['goodness', 'assassin_minimax', 'foes_minimax', 'frequency', 'neutrals_minimax', 'variance']

    candidates = pd.DataFrame({
        'word': all_words,
        'frequency': list(range(1, len(all_words) + 1)),
    })
    # Row-wise scoring; `vectors` is handed to get_scores as its second argument.
    scores = candidates.apply(get_scores, axis=1, args=(vectors,))
    candidates[score_columns] = scores

    ranked = candidates.sort_values(sort_by_columns, ascending=False)
    return ranked.reset_index(drop=True)

In [16]:
%timeit
glove_candidates = get_candidates_df(glove_vectors)
# google_candidates = get_candidates_df(google_vectors)
# ft_candidates = get_candidates_df(ft_vectors)

In [19]:
# Show the 200 best-ranked GloVe candidates.
glove_candidates.head(200)

Unnamed: 0,word,frequency,goodness,assassin_minimax,foes_minimax,neutrals_minimax,variance
0,blackberry,6805,-3.594208,0.018955,0.007611,0.022579,0.477825
1,touch,849,-3.782625,0.005293,-0.026799,0.023856,0.31606
2,mac,4232,-4.108315,0.012968,0.021072,0.07138,0.358913
3,picture,448,-4.171143,-0.058891,-0.035989,-0.100276,0.272161
4,lion,5635,-4.188666,-0.283256,-0.044617,-0.060587,0.425631
5,android,6156,-4.237552,-0.018183,-0.041134,-0.014956,0.373727
6,display,733,-4.303379,0.089963,-0.040631,-0.011962,0.462142
7,icon,2849,-4.316889,0.052958,0.100625,-0.014466,0.270919
8,fruit,1657,-4.330634,-0.155839,-0.091948,-0.13841,0.493808
9,orange,2289,-4.336476,-0.073605,0.04475,-0.136417,0.28911


In [282]:
# Show the 20 best-ranked candidates from the Google vectors.
google_candidates.head(20)

Unnamed: 0,word,frequency,goodness,assassin_minimax,foes_minimax,neutrals_minimax,variance
0,batsman,14611,-6.940535,0.063072,0.026439,0.013721,0.501947
1,bowler,13222,-7.027919,0.110902,0.044435,0.019828,0.408483
2,spinner,12995,-7.370297,0.054011,-0.073032,-0.009958,0.416151
3,bowling,5977,-7.424214,0.07588,-0.049342,-0.098777,0.488076
4,niggle,28002,-7.525158,0.100064,-0.063116,-0.048933,0.324056
5,seamer,28612,-7.5448,0.071529,0.047814,-0.005497,0.308641
6,footy,18541,-7.595697,0.112113,-0.042881,0.03863,0.408539
7,wicketkeeper,27613,-7.645818,0.076541,0.030102,-0.029576,0.424293
8,netball,21545,-7.717496,0.026143,-0.027253,-0.01227,0.47774
9,rib,8036,-7.72406,0.041011,-0.175342,-0.093872,0.51681


In [283]:
# Show the 20 best-ranked candidates from the fastText vectors.
ft_candidates.head(20)

Unnamed: 0,word,frequency,goodness,assassin_minimax,foes_minimax,neutrals_minimax,variance
0,boy,905,-5.545131,0.104411,0.001008,0.019578,0.059462
1,team,119,-5.563708,0.005348,-0.036763,0.029504,0.103775
2,sport,1828,-5.622678,-0.003239,-0.117062,-0.031805,0.313987
3,football,1048,-5.710014,-0.028103,-0.094281,-0.03444,0.325899
4,batsman,14611,-5.726556,-0.033539,-0.017532,-0.041926,0.356554
5,baseball,2093,-5.83487,-0.067122,-0.107385,-0.11367,0.319541
6,costume,3277,-5.840353,0.002004,-0.022604,-0.014251,0.257787
7,wicketkeeper,27613,-5.879942,0.006377,0.070069,-0.02273,0.267105
8,tennis,2900,-5.888146,-0.030796,-0.043412,-0.005119,0.24723
9,cow,4886,-5.902571,0.091878,-0.106594,-0.104676,0.337549


In [284]:
def get_final_metrics(word, candidates=None):
    """Combine a word's rank and friend-distance spread across several rankings.

    Args:
        word: Word to look up in the ``word`` column of every frame.
        candidates: Iterable of ranked candidate frames, each with ``word`` and
            ``variance`` columns. When omitted, the current
            ``glove_candidates``, ``google_candidates`` and ``ft_candidates``
            globals are used. They are looked up at call time, so recomputed
            frames are picked up without re-defining this function.

    Returns:
        ``pd.Series([total_rank, total_variance])``: the sum over all frames of
        the word's index label, and the sum of its ``variance`` values.

    Raises:
        IndexError: If ``word`` is missing from one of the frames.
    """
    if candidates is None:
        candidates = [glove_candidates, google_candidates, ft_candidates]

    total_rank = 0
    total_variance = 0
    for candidate_df in candidates:
        word_select = (candidate_df['word'] == word).to_numpy()
        # The index label of the first match serves as the rank.
        total_rank += candidate_df.index[word_select][0]
        # The score column is named 'variance'.
        total_variance += candidate_df.loc[word_select, 'variance'].iloc[0]
    return pd.Series([total_rank, total_variance])

In [293]:
# Number of top-ranked words taken from each model's ranking.
size = 25
top_candidate_words = (
    glove_candidates.word.tolist()[:size]
    + google_candidates.word.tolist()[:size]
    + ft_candidates.word.tolist()[:size]
)
# De-duplicate before scoring so each word is looked up only once; the
# original row labels of the first occurrences are kept.
final_candidates = pd.DataFrame({'word': top_candidate_words}).drop_duplicates(subset=['word']).copy()
final_candidates[['rank', 'variance']] = final_candidates.word.apply(get_final_metrics)

In [296]:
# Order the shortlist by combined rank (ascending), breaking ties by combined variance (descending).
final_candidates.sort_values(['rank', 'variance'], ascending=[True,False]).reset_index(drop=True)

Unnamed: 0,word,rank,variance
0,batsman,12.0,1.379098
1,bowler,24.0,1.172499
2,bowling,42.0,1.333801
3,thigh,65.0,1.507738
4,comic,68.0,1.513774
5,bat,69.0,0.852601
6,chicken,70.0,0.943422
7,lamb,73.0,1.140618
8,batting,89.0,1.016583
9,footy,99.0,1.134243
