In [9]:
# Add the parent directory to the module search path so that the project
# modules imported in later cells (`parse_data`, `play`) can be found --
# presumably they live one level above this notebook; confirm against the repo layout.
import sys
sys.path.append('..')

In [10]:
# make sure we reload the code
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from parse_data import read_parsed_words, DEFAULT_PARSED_WORDS_FILE

In [12]:
# Display the imported path constant as a sanity check; it should be an absolute path.
DEFAULT_PARSED_WORDS_FILE

'/Users/Daniel_Kats/prog/wordle/data-parsed/wordle-words.pickle'

In [13]:
# Load the word collection with the project's `read_parsed_words` helper
# (called with its defaults) and report how many entries came back.
words = read_parsed_words()
print(f'Loaded {len(words)} words')

Loaded 12972 words


In [23]:
from play import UNSAFE_eval_guess, eval_guess

In [15]:
from tqdm import tqdm
import itertools

## Creating the possibility matrix

In [16]:
# Number of ordered (guess, answer) pairs: every word paired with every word.
num_combos = len(words) ** 2
print(f'# combinations of all words = {num_combos:,}')

# combinations of all words = 168,272,784


In [17]:
# each evaluation can be represented by a relatively small integer
# each position is a 2-bit value totalling a 10-bit integer

from typing import List

def array_to_integer(array: List[int]) -> int:
    """
    Pack a 5-element evaluation array into a single integer.

    Element i of `array` describes letter i of the guess:
    0 = absent, 1 = partial match, 2 = full match.
    Element i is used as the base-4 digit of weight 4**i, so the first
    element is the least significant digit. With digits in 0..3 the result
    is at least 0 and strictly below 4**5.
    """
    assert isinstance(array, list)
    assert len(array) == 5
    # sum of digit * weight, least significant position first
    return sum((4 ** place) * digit for place, digit in enumerate(array))


In [33]:
# create the table of possible outcomes

import numpy as np
from typing import Tuple

num_words = len(words)

word_range_1 = np.arange(num_words)
word_range_2 = np.arange(num_words)

# create an nxn matrix
# each entry will be an unsigned 32-bit integer
# the row will denote the index of the guessing word
# the column will denote the index of the answer word
# to be honest, we could have done 16-bit and saved some space
#
# np.zeros instead of np.empty: np.empty returns uninitialized memory, and the
# table is displayed in later cells before it has been completely filled in.
# Zero-initializing makes those intermediate views deterministic.
table = np.zeros(shape=(num_words, num_words), dtype='uint32')

print(f'Shape of the matrix: {table.shape}')
print(f'Matrix entry type: {table.dtype}')


Shape of the matrix: (12972, 12972)
Matrix entry type: uint32


In [21]:
# test indices: write a placeholder value at row 0 (guess), column 1 (answer)
# and display the table to confirm where it lands
table[0,1] = 3
table

array([[0, 3, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint32)

In [28]:
def f_eval_guess(guess_i: int, answer_i: int) -> int:
    """
    Evaluate one (guess, answer) pair given as indices and return the packed code.

    Both indices are looked up in the module-level `words`; the evaluation
    returned by `UNSAFE_eval_guess` is packed with `array_to_integer`.
    """
    # the evaluation entries are guaranteed to be 0, 1, 2
    return array_to_integer(
        UNSAFE_eval_guess(guess=words[guess_i], answer=words[answer_i])
    )

# Smoke test: store the packed evaluation of word 0 (guess) against word 1 (answer)
# in the matching table entry, then display the table.
table[0, 1] = f_eval_guess(0, 1)
table

array([[ 0, 10,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=uint32)

In [29]:
# Show the two words used in the smoke test (one per line), followed by
# the unpacked evaluation of the first guessed against the second.
print(words[0], words[1], sep='\n')
eval_guess(guess=words[0], answer=words[1])

aahed
aalii


[2, 2, 0, 0, 0]

In [31]:
# we can convert the entire output to a matrix to save space
# however it may take longer

combos = itertools.product(word_range_1, word_range_2)

# itertools.product has no len(), so tqdm can only show a running count unless
# we tell it the total; with `total` it renders a real progress bar and an ETA.
total_pairs = len(word_range_1) * len(word_range_2)

for guess_i, answer_i in tqdm(combos, total=total_pairs):
    table[guess_i, answer_i] = f_eval_guess(guess_i, answer_i)


168272784it [08:26, 332052.26it/s]


In [32]:
# now save this matrix of hard-won computation
# so it can be loaded from disk instead of being recomputed

np.save('../data-parsed/possibilities-table.npy', table)

In [36]:
# NOTE: for the blog post, this computation took about 8 minutes on a fairly old laptop
# I am sure there is a more efficient way to do it, but why?

## Evaluating the First Word Candidates



In [35]:
# Load the table back from the .npy file and confirm its shape and dtype.
table = np.load('../data-parsed/possibilities-table.npy')
print(table.shape)
print(table.dtype)

(12972, 12972)
uint32


In [38]:
import pandas as pd

In [62]:
# is our matrix symmetric?
# no, because positioning matters in the way we calculate the values (unfortunately)
# if I could do it again, I might use a different value system that is invariant to that

# The entries are exact integer codes, so compare them exactly. A tolerance-based
# comparison (np.allclose with rtol/atol) would accept codes that merely lie
# within ~10% of each other, which is meaningless for packed evaluation codes.
np.array_equal(table, table.T)

False

In [75]:
# after this, we go through the table and reverse it to get the list of possibilities for each guess
# for each (guess, possibility) what's the number of answers for that guess?

# NOTE: the index labels the rows, which are the guesses
# the columns are the answers
# both axes are labelled with the words themselves, so rows can be looked up by word

df = pd.DataFrame(table, index=words, columns=words)
df

Unnamed: 0,aahed,aalii,aargh,aarti,abaca,abaci,aback,abacs,abaft,abaka,...,zulus,zupan,zupas,zuppa,zurfs,zuzim,zygal,zygon,zymes,zymic
aahed,682,10,26,10,6,6,6,6,6,6,...,0,5,5,5,0,0,5,0,128,0
aalii,10,682,10,586,6,582,6,6,6,6,...,32,5,5,5,0,384,21,0,0,384
aargh,266,10,682,42,6,6,6,6,6,6,...,0,5,5,5,32,0,69,64,0,0
aarti,10,522,42,682,6,518,6,6,70,6,...,0,5,5,5,32,256,5,0,0,256
abaca,274,274,274,274,682,426,426,426,298,554,...,0,273,273,529,0,0,273,0,0,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuzim,0,128,0,64,0,64,0,0,0,0,...,26,26,26,26,26,682,18,18,274,402
zygal,64,320,80,64,64,64,64,64,64,64,...,258,130,130,66,2,2,682,42,10,10
zygon,0,0,16,0,0,0,0,0,0,0,...,2,514,2,2,2,2,42,682,10,10
zymes,128,0,0,0,0,0,0,512,0,0,...,514,2,514,2,514,18,10,10,682,42


In [73]:
def integer_to_arr(rval: int):
    """
    Unpack an evaluation code into its 5-element array.

    This is the inverse of `array_to_integer`: position i of the result is the
    base-4 digit of weight 4**i, so the first element is the least significant
    digit, e.g. 16 -> [0, 0, 1, 0, 0].

    :param rval: integer code, at least 0 and strictly below 4**5
    :return: list of 5 builtin ints, each between 0 and 3
    :raises ValueError: if `rval` is negative or >= 4**5
    """
    num_positions = 5
    # int() also turns numpy integer scalars into builtin ints, so the
    # digits returned below are always plain Python integers
    remaining = int(rval)
    if not 0 <= remaining < 4 ** num_positions:
        raise ValueError(
            f'expected a value in [0, {4 ** num_positions}), got {rval!r}'
        )

    arr = []
    for _ in range(num_positions):
        # peel off the least significant base-4 digit using exact integer math
        remaining, digit = divmod(remaining, 4)
        arr.append(digit)
    return arr

# 16 == 1 * 4**2, so only position 2 should be set
integer_to_arr(16)

[0, 0, 1, 0, 0]

In [77]:
# we guess soare
# what are the various frequencies

# map each return value in the 'soare' row to the number of answers producing it
x = df.loc['soare'].value_counts().to_dict()
m = max(x.values())
print(m)
# the max count for each return value is 769

# print only the return value(s) whose count reaches that maximum
for rval, count in x.items():
    if count < m:
        continue
    # specifically, 16
    print(f"{rval} -> {count}")

769
16 -> 769


In [78]:
# answers for which guessing 'soare' yields return value 16
df.loc['soare'].loc[lambda codes: codes == 16]

aalii    16
abuna    16
abuzz    16
acidy    16
acing    16
         ..
zigan    16
zilla    16
zupan    16
zuppa    16
zygal    16
Name: soare, Length: 769, dtype: uint32

In [79]:
# decode each return value back into its 5-element array and print it next to its count
for rval, count in x.items():
    arr = integer_to_arr(rval)
    print(f'{arr} -> {count}')

[0, 0, 1, 0, 0] -> 769
[1, 0, 0, 0, 1] -> 640
[0, 0, 0, 0, 1] -> 608
[1, 0, 1, 0, 0] -> 601
[0, 0, 0, 0, 0] -> 577
[1, 0, 0, 0, 0] -> 505
[0, 0, 0, 1, 1] -> 401
[1, 2, 0, 0, 0] -> 400
[0, 2, 0, 0, 0] -> 369
[0, 0, 1, 0, 1] -> 349
[0, 0, 0, 0, 2] -> 302
[0, 0, 1, 1, 1] -> 257
[0, 1, 0, 0, 0] -> 254
[0, 0, 1, 1, 0] -> 251
[2, 0, 0, 0, 0] -> 231
[1, 0, 1, 0, 1] -> 219
[2, 0, 0, 0, 1] -> 211
[1, 0, 0, 1, 1] -> 198
[0, 1, 1, 0, 0] -> 195
[1, 0, 0, 1, 0] -> 178
[0, 2, 0, 0, 1] -> 177
[0, 0, 0, 1, 0] -> 175
[1, 1, 0, 0, 0] -> 170
[0, 0, 2, 0, 0] -> 168
[1, 0, 1, 1, 0] -> 161
[0, 0, 1, 0, 2] -> 156
[2, 0, 2, 0, 0] -> 155
[1, 0, 2, 0, 0] -> 149
[0, 1, 0, 1, 0] -> 143
[2, 1, 0, 0, 0] -> 141
[1, 2, 0, 0, 1] -> 138
[0, 2, 1, 0, 0] -> 132
[0, 2, 0, 0, 2] -> 128
[0, 0, 0, 1, 2] -> 127
[0, 1, 0, 0, 1] -> 123
[2, 0, 1, 0, 0] -> 119
[0, 0, 2, 1, 0] -> 107
[1, 2, 0, 1, 0] -> 91
[0, 2, 0, 1, 1] -> 88
[0, 2, 0, 1, 0] -> 80
[1, 1, 1, 0, 0] -> 80
[0, 1, 1, 1, 0] -> 79
[0, 0, 1, 2, 0] -> 74
[2, 0, 0, 0, 2] -

In [81]:
def get_worst_partition(row) -> int:
    """
    Return the size of the largest group of identical values in `row`.

    `row` must provide `value_counts()` (e.g. a pandas Series); the result is
    the highest of those counts.
    """
    partition_sizes = row.value_counts().tolist()
    return max(partition_sizes)

# apply function to each row (axis=1), i.e. once per guess word
df.apply(get_worst_partition, axis=1)

aahed    2543
aalii    3890
aargh    3955
aarti    2609
abaca    5655
         ... 
zuzim    6081
zygal    4070
zygon    5081
zymes    2113
zymic    5269
Length: 12972, dtype: int64

In [82]:
# Compute the per-guess worst partition explicitly rather than grabbing IPython's
# last-output variable `_`: `_` only holds the right value when this cell is run
# immediately after the cell that displayed the result, so it breaks under
# out-of-order execution or Restart & Run All.
part_df = df.apply(get_worst_partition, axis=1)

In [89]:
# wrap the per-guess values in a DataFrame with a single 'worst_partition' column, then display it
part_df = pd.DataFrame(part_df, columns=['worst_partition'])
part_df

Unnamed: 0,worst_partition
aahed,2543
aalii,3890
aargh,3955
aarti,2609
abaca,5655
...,...
zuzim,6081
zygal,4070
zygon,5081
zymes,2113


In [90]:
# smallest worst-case partition size over all guess words
part_df.worst_partition.min()

697

In [93]:
# guess word(s) that achieve the smallest worst-case partition; the minimum is
# derived from the data instead of being copied by hand from an earlier output
part_df[part_df.worst_partition == part_df.worst_partition.min()]

Unnamed: 0,worst_partition
serai,697


In [96]:
# how much larger is adieu's worst-case partition than the best achievable one?
# (the minimum is computed rather than hard-coded from an earlier output)
part_df.loc['adieu'] / part_df.worst_partition.min()

worst_partition    2.451937
Name: adieu, dtype: float64

In [97]:
# how much larger is arise's worst-case partition than the best achievable one?
# (the minimum is computed rather than hard-coded from an earlier output)
part_df.loc['arise'] / part_df.worst_partition.min()

worst_partition    1.265423
Name: arise, dtype: float64

In [99]:
import json

# Rewrite the solver evaluation JSON in place with a normalized layout
# (4-space indent, sorted keys).
solver_eval_path = '../data-parsed/solver-eval.json'

# `with` closes the handle even if parsing raises
with open(solver_eval_path, 'r') as fp:
    d = json.load(fp)

# Serialize fully BEFORE opening the file for writing: mode 'w' truncates the
# file immediately, so a failure during serialization would otherwise leave it
# empty or half-written.
serialized = json.dumps(d, indent=4, sort_keys=True)
with open(solver_eval_path, 'w') as fp:
    fp.write(serialized)