# Computing the Optimal First Word

In [30]:
# we have to use some previous code
import sys
sys.path.append('..')

In [2]:
# make sure we reload the code
%load_ext autoreload
%autoreload 2

In [36]:
# helpful utilities
from parse_data import read_parsed_words, DEFAULT_PARSED_WORDS_FILE
from play import UNSAFE_eval_guess, eval_guess

In [37]:
words = read_parsed_words()
print(f'Loaded {len(words)} words')

Loaded 12972 words


In [8]:
from tqdm import tqdm
import itertools

## Creating the possibility matrix

In [9]:
num_combos = len(words) * len(words)
print(f'# combinations of all words = {num_combos:,}')

# combinations of all words = 168,272,784


In [10]:
from possibilities_table import array_to_integer

In [11]:
# create the table of possible outcomes

import numpy as np
from typing import Tuple

num_words = len(words)

word_range_1 = np.arange(num_words)
word_range_2 = np.arange(num_words)

# create an nxn matrix
# each entry is an unsigned 8-bit integer (dtype 'uint8'), i.e. one byte per (guess, answer) pair
# the row will denote the index of the guessing word
# the column will denote the index of the answer word
# np.empty leaves the entries uninitialized; the fill loop in a later cell writes every one of them
table = np.empty(shape=(num_words, num_words), dtype='uint8')

print(f'Shape of the matrix: {table.shape}')
print(f'Matrix entry type: {table.dtype}')

Shape of the matrix: (12972, 12972)
Matrix entry type: uint8


In [12]:
# fill out one entry in the matrix

def f_eval_guess(guess_i: int, answer_i: int) -> int:
    """Evaluate one (guess, answer) pair given as indices into ``words``.

    Looks both words up in the module-level ``words`` list, scores the guess
    against the answer with ``UNSAFE_eval_guess`` and packs the resulting
    array into a single integer with ``array_to_integer``.

    NOTE(review): the exact packing is defined in ``possibilities_table``;
    presumably base-3 over the five letter scores - confirm there.
    """
    # the per-letter numbers are guaranteed to be 0, 1, 2
    outcome = UNSAFE_eval_guess(guess=words[guess_i], answer=words[answer_i])
    return array_to_integer(outcome)

In [31]:
# there is probably a more efficient numpy way to create this matrix
# but this is fast enough

combos = itertools.product(word_range_1, word_range_2)

# itertools.product has no len(), so tqdm cannot infer the number of
# iterations; pass it explicitly to get a percentage and an ETA instead of
# a bare iteration counter
for guess_i, answer_i in tqdm(combos, total=num_words * num_words):
    table[guess_i, answer_i] = f_eval_guess(guess_i, answer_i)


168272784it [08:26, 332052.26it/s]


In [31]:
from possibilities_table import TABLE_PATH

In [32]:
# now save this matrix of hard-won computation

np.save(TABLE_PATH, table)

In [14]:
# NOTE: for the blog post, this computation took about 8 minutes on a fairly old laptop
# I am sure there is a more efficient way to do it, but why?

## Shrinking the Possibilities Matrix

In [19]:
import pandas as pd

In [15]:
table = np.load(TABLE_PATH)

In [16]:
table.shape

(12972, 12972)

In [17]:
table.dtype

dtype('uint8')

In [20]:
df = pd.DataFrame(table, index=words, columns=words)
df

Unnamed: 0,aahed,aalii,aargh,aarti,abaca,abaci,aback,abacs,abaft,abaka,...,zulus,zupan,zupas,zuppa,zurfs,zuzim,zygal,zygon,zymes,zymic
aahed,242,8,17,8,5,5,5,5,5,5,...,0,4,4,4,0,0,4,0,54,0
aalii,8,242,8,197,5,194,5,5,5,5,...,18,4,4,4,0,135,13,0,0,135
aargh,89,8,242,26,5,5,5,5,5,5,...,0,4,4,4,18,0,31,27,0,0
aarti,8,170,26,242,5,167,5,5,32,5,...,0,4,4,4,18,81,4,0,0,81
abaca,92,92,92,92,242,161,161,161,107,188,...,0,91,91,172,0,0,91,0,0,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuzim,0,54,0,27,0,27,0,0,0,0,...,17,17,17,17,17,242,11,11,92,146
zygal,27,108,36,27,27,27,27,27,27,27,...,83,56,56,29,2,2,242,26,8,8
zygon,0,0,9,0,0,0,0,0,0,0,...,2,164,2,2,2,2,26,242,8,8
zymes,54,0,0,0,0,0,0,162,0,0,...,164,2,164,2,164,11,8,8,242,26


In [77]:
# save to parquet-gzip to save space
df.to_parquet(
    '../data-parsed/possibilities-table-base-3.parquet.gzip',
    engine='pyarrow',
    compression='gzip',
    index=False,
)

## Evaluating the First Word Candidates



In [32]:
import numpy as np
import pandas as pd

In [33]:
table = np.load(TABLE_PATH)
print(table.shape)
print(table.dtype)

(12972, 12972)
uint8


In [23]:
# is our matrix symmetric?
# no, because positioning matters in the way we calculate the values (unfortunately)
# if I could do it again, I might use a different value system that is invariant to that

# the entries are integer outcome codes, not measurements: two codes that are
# numerically "close" still denote different outcomes, so compare exactly
# instead of using a floating-point tolerance
np.array_equal(table, table.T)

False

In [38]:
# after this, we go through the table and reverse it to get the list of possibilities for each guess
# for each (guess, possibility) what's the number of answers for that guess?

# NOTE: indexes are rows (guesses)
# columns are answers

df = pd.DataFrame(table, index=words, columns=words)
df

Unnamed: 0,aahed,aalii,aargh,aarti,abaca,abaci,aback,abacs,abaft,abaka,...,zulus,zupan,zupas,zuppa,zurfs,zuzim,zygal,zygon,zymes,zymic
aahed,242,8,17,8,5,5,5,5,5,5,...,0,4,4,4,0,0,4,0,54,0
aalii,8,242,8,197,5,194,5,5,5,5,...,18,4,4,4,0,135,13,0,0,135
aargh,89,8,242,26,5,5,5,5,5,5,...,0,4,4,4,18,0,31,27,0,0
aarti,8,170,26,242,5,167,5,5,32,5,...,0,4,4,4,18,81,4,0,0,81
abaca,92,92,92,92,242,161,161,161,107,188,...,0,91,91,172,0,0,91,0,0,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuzim,0,54,0,27,0,27,0,0,0,0,...,17,17,17,17,17,242,11,11,92,146
zygal,27,108,36,27,27,27,27,27,27,27,...,83,56,56,29,2,2,242,26,8,8
zygon,0,0,9,0,0,0,0,0,0,0,...,2,164,2,2,2,2,26,242,8,8
zymes,54,0,0,0,0,0,0,162,0,0,...,164,2,164,2,164,11,8,8,242,26


### Mean Partition

In [39]:
def get_mean_partition(row) -> float:
    """Return the mean partition size for one row of the outcome table.

    Each distinct value in ``row`` defines one partition (all the entries
    that share that value). The result is the average number of entries per
    partition, i.e. the mean of the row's value counts.

    Args:
        row: a pandas Series of outcome codes (one row of the table, as
            passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        The mean partition size as a float (``nan`` for an empty row).
    """
    # value_counts() already yields one count per distinct value, so its mean
    # is the answer; no need to round-trip through a dict and a numpy array
    return float(row.value_counts().mean())

mean_part_df = df.apply(get_mean_partition, axis=1)

In [40]:
mean_part_df = pd.DataFrame(mean_part_df, columns=['mean_partition'])
mean_part_df

Unnamed: 0,mean_partition
aahed,147.409091
aalii,249.461538
aargh,156.289157
aarti,128.435644
abaca,249.461538
...,...
zuzim,212.655738
zygal,127.176471
zygon,136.547368
zymes,112.800000


In [27]:
mean_part_df.mean_partition.min()

61.18867924528302

In [28]:
mean_part_df[mean_part_df.mean_partition < 62]

Unnamed: 0,mean_partition
tares,61.188679


In [29]:
mean_part_df[mean_part_df.mean_partition < 75].sort_values('mean_partition').head(15)

Unnamed: 0,mean_partition
tares,61.188679
teras,62.066986
tears,63.588235
pelas,64.217822
pares,64.537313
dores,64.86
peats,64.86
pores,65.18593
tores,65.18593
bares,65.18593


In [30]:
mean_part_df.loc['serai']

mean_partition    74.982659
Name: serai, dtype: float64

In [31]:
mean_part_df.loc['arise']

mean_partition    72.066667
Name: arise, dtype: float64

In [32]:
mean_part_df.loc['aesir']

mean_partition    77.676647
Name: aesir, dtype: float64

In [33]:
mean_part_df.loc['adieu']

mean_partition    107.206612
Name: adieu, dtype: float64

In [34]:
mean_part_df.loc['adieu'] / mean_part_df.loc['tares']

mean_partition    1.752066
dtype: float64

In [35]:
mean_part_df.idxmax()

mean_partition    qajaq
dtype: object

### Worst Partitions

In [36]:
def get_worst_partition(row) -> int:
    """Return the size of the largest partition in one row of the outcome table.

    Entries of ``row`` that share a value belong to the same partition; the
    result is the count of the most frequent value.
    """
    partition_sizes = row.value_counts().tolist()
    return max(partition_sizes)

# apply function to each row
part_df = df.apply(get_worst_partition, axis=1)

In [37]:
part_df = pd.DataFrame(part_df, columns=['worst_partition'])
part_df

Unnamed: 0,worst_partition
aahed,2543
aalii,3890
aargh,3955
aarti,2609
abaca,5655
...,...
zuzim,6081
zygal,4070
zygon,5081
zymes,2113


In [38]:
part_df.sort_values('worst_partition').head(15)

Unnamed: 0,worst_partition
serai,697
reais,769
soare,769
paseo,776
aeros,801
kaies,821
nares,823
reans,823
nears,823
stoae,825


In [39]:
part_df.worst_partition.min()

697

In [40]:
part_df[part_df.worst_partition == 697]

Unnamed: 0,worst_partition
serai,697


In [41]:
part_df.loc['adieu']

worst_partition    1709
Name: adieu, dtype: int64

In [42]:
part_df.loc['adieu'] / 697

worst_partition    2.451937
Name: adieu, dtype: float64

In [43]:
part_df.loc['arise'] / 697

worst_partition    1.265423
Name: arise, dtype: float64

In [44]:
part_df.loc['tares']

worst_partition    858
Name: tares, dtype: int64

In [45]:
part_df.loc['aesir']

worst_partition    868
Name: aesir, dtype: int64

In [46]:
part_df.loc['arise']

worst_partition    882
Name: arise, dtype: int64

In [47]:
part_df.idxmax()

worst_partition    gyppy
dtype: object

## Solver Evaluation

After running our solver over the past answers, let's see how we did.

In [4]:
import json
import pandas as pd

In [59]:
def load_data(word, strategy, n) -> pd.DataFrame:
    """Load one solver-evaluation result file into a DataFrame.

    The file read is
    ``../data-parsed/solver-eval/solver-eval-strat-{strategy}-past-answers-{n}-{word}.json``
    (relative to the current working directory). Its ``per_word_results``
    mapping is flattened into one row per answer.

    Args:
        word: word component of the file name (the notebook passes the
            solver's first word, e.g. ``'serai'``).
        strategy: strategy component of the file name (e.g.
            ``'worst_partition'``).
        n: numeric component of the file name - presumably the number of
            past answers evaluated; confirm against the solver that writes
            these files.

    Returns:
        A DataFrame with columns ``answer``, ``num_guesses`` and
        ``is_solved``. The columns are present even when the file contains
        no results.

    Raises:
        FileNotFoundError: if the result file does not exist.
        KeyError: if the JSON lacks ``per_word_results`` or an entry lacks
            ``num_guesses`` / ``is_solved``.
    """
    path = f'../data-parsed/solver-eval/solver-eval-strat-{strategy}-past-answers-{n}-{word}.json'
    # JSON is UTF-8 by specification; do not depend on the platform default
    with open(path, 'r', encoding='utf-8') as fp:
        per_word_results = json.load(fp)['per_word_results']

    rows = [
        {
            'answer': answer,
            'num_guesses': v['num_guesses'],
            'is_solved': v['is_solved'],
        }
        for answer, v in per_word_results.items()
    ]

    # fix the column set explicitly so an empty result still has the columns
    # that downstream code accesses by attribute
    return pd.DataFrame(rows, columns=['answer', 'num_guesses', 'is_solved'])

In [85]:
def print_stats(solver_df: pd.DataFrame):
    """Print summary statistics for one solver-evaluation DataFrame.

    Reports the number of solved puzzles, the mean number of guesses over
    the solved puzzles only, and then displays the answers that were not
    solved. Expects the columns ``answer``, ``num_guesses`` and ``is_solved``.
    """
    solved_mask = solver_df.is_solved
    print('# solved successfully', solved_mask.sum())

    # average over solved puzzles only
    mean_guesses = solver_df.loc[solved_mask, 'num_guesses'].mean()
    print(f'avg # guesses per puzzle (solved) {mean_guesses:.2f}')

    unsolved_answers = solver_df.loc[~solver_df.is_solved, ['answer']]
    print('unsolved:')
    display(unsolved_answers)

### serai - worst partition

In [75]:
solver_df = load_data('serai', 'worst_partition', n=219)

In [76]:
solver_df

Unnamed: 0,answer,num_guesses,is_solved
0,ABACK,4,True
1,ABASE,4,True
2,ABATE,4,True
3,ABBEY,4,True
4,ABYSS,4,True
...,...,...,...
214,WINCE,4,True
215,WOOER,5,True
216,WORLD,3,True
217,WROTE,6,False


In [77]:
print_stats(solver_df)

# solved successfully 212
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer,num_guesses,is_solved
29,BOOZY,6,False
51,CRAZE,6,False
99,GRIME,6,False
149,PROVE,6,False
188,STORE,6,False
189,STOUT,6,False
217,WROTE,6,False


### tares - worst partition

In [78]:
solver_df = load_data('tares', 'worst_partition', n=219)
solver_df

Unnamed: 0,answer,num_guesses,is_solved
0,ABACK,4,True
1,ABASE,4,True
2,ABATE,3,True
3,ABBEY,4,True
4,ABYSS,5,True
...,...,...,...
214,WINCE,5,True
215,WOOER,5,True
216,WORLD,4,True
217,WROTE,4,True


In [79]:
print_stats(solver_df)

# solved successfully 213
avg # guesses per puzzle (solved) 4.25
unsolved:


Unnamed: 0,answer,num_guesses,is_solved
29,BOOZY,6,False
51,CRAZE,6,False
93,GONER,6,False
120,LUSTY,6,False
149,PROVE,6,False
177,SOWER,6,False


### tares - mean partition

In [80]:
solver_df = load_data('tares', 'mean_partition', n=219)

In [81]:
print_stats(solver_df)

# solved successfully 208
avg # guesses per puzzle (solved) 4.02
unsolved:


Unnamed: 0,answer,num_guesses,is_solved
29,BOOZY,6,False
49,CRASS,6,False
51,CRAZE,6,False
75,FERRY,6,False
78,FIXER,6,False
93,GONER,6,False
145,POUND,6,False
149,PROVE,6,False
184,START,6,False
189,STOUT,6,False


### stats for everything

In [87]:
for word in ['serai', 'tares', 'adieu']:
    for strat in ['mean_partition', 'worst_partition']:
        print(f'word: {word}, strat: {strat}')
        solver_df = load_data(word, strat, n=219)
        print_stats(solver_df)

word: serai, strat: mean_partition
# solved successfully 208
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer
29,BOOZY
51,CRAZE
75,FERRY
78,FIXER
93,GONER
99,GRIME
149,PROVE
150,PROXY
184,START
188,STORE


word: serai, strat: worst_partition
# solved successfully 212
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer
29,BOOZY
51,CRAZE
99,GRIME
149,PROVE
188,STORE
189,STOUT
217,WROTE


word: tares, strat: mean_partition
# solved successfully 208
avg # guesses per puzzle (solved) 4.02
unsolved:


Unnamed: 0,answer
29,BOOZY
49,CRASS
51,CRAZE
75,FERRY
78,FIXER
93,GONER
145,POUND
149,PROVE
184,START
189,STOUT


word: tares, strat: worst_partition
# solved successfully 213
avg # guesses per puzzle (solved) 4.25
unsolved:


Unnamed: 0,answer
29,BOOZY
51,CRAZE
93,GONER
120,LUSTY
149,PROVE
177,SOWER


word: adieu, strat: mean_partition
# solved successfully 204
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer
14,AWAKE
29,BOOZY
49,CRASS
51,CRAZE
78,FIXER
99,GRIME
105,HATCH
134,PANEL
145,POUND
149,PROVE


word: adieu, strat: worst_partition
# solved successfully 204
avg # guesses per puzzle (solved) 4.26
unsolved:


Unnamed: 0,answer
14,AWAKE
29,BOOZY
49,CRASS
51,CRAZE
83,FLOSS
93,GONER
100,GRIPE
119,LOOPY
136,PAPER
137,PARRY
