# Computing the Optimal First Words

We are trying to find some candidate first words based on our heuristics: smallest worst partition and smallest mean partition.

In [6]:
# we have to use some previous code
import sys
sys.path.append('..')

In [7]:
# make sure we reload the code
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# helpful utilities
from parse_data import read_parsed_words, DEFAULT_PARSED_WORDS_FILE, read_all_answers
from play import UNSAFE_eval_guess, eval_guess

In [12]:
words = read_parsed_words()
print(f'Loaded {len(words)} words')

Loaded 12972 words


In [13]:
answers = read_all_answers()
print(f'Loaded {len(answers)} answers')

Loaded 2315 answers


In [10]:
from tqdm import tqdm
import itertools

## Using the Possibilities Matrix

In [22]:
from possibilities_table import TABLE_PATH_ASYMMETRIC

Because the possibilities matrix stores the words in a different order, load its row (guess) and column (answer) keys from disk.

In [44]:
import pickle
# Load the two key sequences stored alongside the matrix; they are used below
# as the row labels (guess_words) and column labels (answer_words).
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this is only safe if the file was produced by this project — confirm its origin.
with open('../data-parsed/possibilities-keys-asymmetric.pickle', 'rb') as fp:
    guess_words, answer_words = pickle.load(fp)

In [45]:
num_combos = len(guess_words) * len(answer_words)
print(f'Matrix size = {num_combos:,}')

Matrix size = 30,030,180


Load the pre-computed matrix from disk

In [46]:
import numpy as np
table = np.load('../data-parsed/possibilities-table-asymmetric-base-3.npy')
table.shape

(12972, 2315)

In [47]:
table.dtype

dtype('uint8')

## Shrinking the Possibilities Matrix

In [124]:
import pandas as pd

In [125]:
# NOTE(review): TABLE_PATH is not imported in any visible cell (only
# TABLE_PATH_ASYMMETRIC is), so unless it is defined elsewhere this line raises
# NameError on a fresh kernel. It also rebinds `table`: the recorded shape here is
# (12972, 12972), whereas the DataFrame built afterwards from guess_words and
# answer_words expects the (12972, 2315) array loaded earlier — confirm whether
# this cell should be removed or should load the asymmetric table.
table = np.load(TABLE_PATH)

In [126]:
table.shape

(12972, 12972)

In [127]:
table.dtype

dtype('uint8')

In [48]:
df = pd.DataFrame(table, index=guess_words, columns=answer_words)
df

Unnamed: 0,aback,abase,abate,abbey,abbot,abhor,abide,abled,abode,abort,...,wryly,yacht,yearn,yeast,yield,young,youth,zebra,zesty,zonal
aback,242,26,26,17,17,17,17,17,17,17,...,0,37,19,19,0,0,0,13,0,10
abase,26,242,188,98,17,17,179,98,179,17,...,0,10,100,154,81,0,0,94,108,10
abate,26,188,242,98,44,17,179,98,179,44,...,0,37,100,127,81,0,54,94,135,10
abbey,17,44,44,242,26,17,44,71,44,17,...,162,82,109,109,108,81,81,49,189,1
abbot,17,17,98,26,242,71,17,17,44,206,...,0,163,1,163,0,27,108,22,81,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuzim,0,0,0,0,0,0,27,0,0,0,...,0,0,0,0,27,3,3,11,11,11
zygal,27,27,27,30,27,27,27,108,27,27,...,84,30,30,30,84,12,3,29,5,218
zygon,0,0,0,3,54,54,0,0,27,27,...,3,3,165,3,3,120,30,2,5,110
zymes,0,108,27,57,0,0,27,54,27,0,...,3,3,30,111,30,3,3,29,113,2


In [49]:
# save to parquet-gzip to save space
df.to_parquet(
    '../data-parsed/possibilities-table-asymmetric-base-3.parquet.gzip',
    engine='pyarrow',
    compression='gzip',
    index=True,
)

## Evaluating the First Word Candidates



In [30]:
import numpy as np
import pandas as pd

In [50]:
df = pd.read_parquet('../data-parsed/possibilities-table-asymmetric-base-3.parquet.gzip')
df

Unnamed: 0,aback,abase,abate,abbey,abbot,abhor,abide,abled,abode,abort,...,wryly,yacht,yearn,yeast,yield,young,youth,zebra,zesty,zonal
aback,242,26,26,17,17,17,17,17,17,17,...,0,37,19,19,0,0,0,13,0,10
abase,26,242,188,98,17,17,179,98,179,17,...,0,10,100,154,81,0,0,94,108,10
abate,26,188,242,98,44,17,179,98,179,44,...,0,37,100,127,81,0,54,94,135,10
abbey,17,44,44,242,26,17,44,71,44,17,...,162,82,109,109,108,81,81,49,189,1
abbot,17,17,98,26,242,71,17,17,44,206,...,0,163,1,163,0,27,108,22,81,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuzim,0,0,0,0,0,0,27,0,0,0,...,0,0,0,0,27,3,3,11,11,11
zygal,27,27,27,30,27,27,27,108,27,27,...,84,30,30,30,84,12,3,29,5,218
zygon,0,0,0,3,54,54,0,0,27,27,...,3,3,165,3,3,120,30,2,5,110
zymes,0,108,27,57,0,0,27,54,27,0,...,3,3,30,111,30,3,3,29,113,2


### Mean Partition

In [51]:
def get_mean_partition(row: "pd.Series | np.ndarray") -> float:
    """Return the mean partition size for one row of result codes.

    Every distinct value in ``row`` is treated as one partition whose size is
    the number of times that value occurs; the result is the mean of those
    occurrence counts.

    Args:
        row: One-dimensional collection of result codes. ``DataFrame.apply``
            with ``axis=1`` passes a ``pd.Series``; a plain ``np.ndarray`` (or
            any other sequence ``pd.Series`` accepts) works as well.

    Returns:
        The mean occurrence count as a Python float, or ``nan`` when ``row``
        has no values to count.
    """
    # Wrapping in pd.Series makes the function work for real ndarrays too;
    # previously only objects exposing .value_counts() were accepted despite
    # the np.ndarray annotation.
    partition_sizes = pd.Series(row).value_counts()
    return float(partition_sizes.mean())

mean_part_df = df.apply(get_mean_partition, axis=1)

In [52]:
mean_part_df = pd.DataFrame(mean_part_df, columns=['mean_partition'])
mean_part_df

Unnamed: 0,mean_partition
aback,48.229167
abase,35.075758
abate,31.712329
abbey,39.237288
abbot,34.044118
...,...
zuzim,72.343750
zygal,37.338710
zygon,38.583333
zymes,47.244898


In [57]:
mean_part_df.sort_values('mean_partition').head(15)

Unnamed: 0,mean_partition
trace,15.433333
crate,15.641892
salet,15.641892
slate,15.748299
reast,15.748299
carte,15.856164
parse,15.856164
peart,15.965517
caret,15.965517
carle,16.076389


In [58]:
mean_part_df.loc['adieu']

mean_partition    28.9375
Name: adieu, dtype: float64

In [59]:
mean_part_df.loc['adieu'] / mean_part_df.loc['trace']

mean_partition    1.875
dtype: float64

In [60]:
mean_part_df.idxmax()

mean_partition    qajaq
dtype: object

### Worst Partitions

In [61]:
def get_worst_partition(row) -> int:
    """Return the size of the largest partition in one row of result codes.

    Each distinct value in ``row`` is one partition; its size is the number of
    times the value occurs.

    Args:
        row: Object exposing ``value_counts()`` — a ``pd.Series`` when this is
            called through ``DataFrame.apply(..., axis=1)``.

    Returns:
        The largest occurrence count.

    Raises:
        ValueError: If ``row`` has no values to count.
    """
    bucket_sizes = row.value_counts().tolist()
    return max(bucket_sizes)

# apply function to each row
part_df = df.apply(get_worst_partition, axis=1)

In [62]:
part_df = pd.DataFrame(part_df, columns=['worst_partition'])
part_df

Unnamed: 0,worst_partition
aback,925
abase,423
abate,419
abbey,442
abbot,543
...,...
zuzim,1092
zygal,717
zygon,874
zymes,544


In [63]:
part_df.sort_values('worst_partition').head(15)

Unnamed: 0,worst_partition
arise,168
reais,168
aesir,168
serai,168
raise,168
aiery,171
ayrie,171
raile,173
ariel,173
aloes,174


In [42]:
part_df.worst_partition.min()

168

In [40]:
# NOTE(review): 697 is a hard-coded value that does not match the minimum worst
# partition shown by the neighbouring cells (168). This cell's recorded output
# and execution count look stale, as do the later cells that divide by 697 —
# confirm the intended value or derive it from part_df.worst_partition.min().
part_df[part_df.worst_partition == 697]

Unnamed: 0,worst_partition
serai,697


In [64]:
part_df.loc['adieu']

worst_partition    284
Name: adieu, dtype: int64

In [65]:
part_df.loc['adieu'] / part_df.loc['arise']

worst_partition    1.690476
dtype: float64

In [42]:
part_df.loc['adieu'] / 697

worst_partition    2.451937
Name: adieu, dtype: float64

In [43]:
part_df.loc['arise'] / 697

worst_partition    1.265423
Name: arise, dtype: float64

In [44]:
part_df.loc['tares']

worst_partition    858
Name: tares, dtype: int64

In [45]:
part_df.loc['aesir']

worst_partition    868
Name: aesir, dtype: int64

In [46]:
part_df.loc['arise']

worst_partition    882
Name: arise, dtype: int64

In [47]:
part_df.idxmax()

worst_partition    gyppy
dtype: object

## Solver Evaluation

After running our solver over the past answers, let's see how it performed.

In [4]:
import json
import pandas as pd

In [59]:
def load_data(word, strategy, n, data_dir='../data-parsed/solver-eval') -> pd.DataFrame:
    """Load one solver-evaluation result file into a DataFrame.

    The file read is
    ``<data_dir>/solver-eval-strat-<strategy>-past-answers-<n>-<word>.json``.
    Its ``per_word_results`` mapping (answer -> result dict) is flattened into
    one row per answer.

    Args:
        word: Value substituted into the ``<word>`` part of the file name.
        strategy: Value substituted into the ``<strategy>`` part of the file name.
        n: Value substituted into the ``<n>`` part of the file name.
        data_dir: Directory containing the result files. Defaults to the
            location that was previously hard-coded.

    Returns:
        DataFrame with the columns ``answer``, ``num_guesses`` and
        ``is_solved`` (in that order). The columns are present even when the
        file contains no per-word results.

    Raises:
        FileNotFoundError: If the result file does not exist.
        KeyError: If ``per_word_results`` or one of the per-answer fields is
            missing from the JSON.
    """
    # Local import so this cell does not depend on an import made elsewhere.
    from pathlib import Path

    results_path = Path(data_dir) / f'solver-eval-strat-{strategy}-past-answers-{n}-{word}.json'
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(results_path, 'r', encoding='utf-8') as fp:
        payload = json.load(fp)

    rows = [
        {
            'answer': answer,
            'num_guesses': result['num_guesses'],
            'is_solved': result['is_solved'],
        }
        for answer, result in payload['per_word_results'].items()
    ]

    # Explicit columns keep the schema stable for an empty result set.
    return pd.DataFrame(rows, columns=['answer', 'num_guesses', 'is_solved'])

In [85]:
def print_stats(solver_df: pd.DataFrame):
    """Print summary statistics for one solver run and show the failures.

    Prints the number of solved puzzles and the average number of guesses over
    the solved puzzles only, then displays the ``answer`` column of the rows
    that were not solved.

    Args:
        solver_df: Frame with ``answer``, ``num_guesses`` and a boolean
            ``is_solved`` column.
    """
    solved_mask = solver_df.is_solved

    print('# solved successfully', solved_mask.sum())

    # Average only over puzzles the solver actually finished.
    avg_solved_guesses = solver_df.loc[solved_mask].num_guesses.mean()
    print(f'avg # guesses per puzzle (solved) {avg_solved_guesses:.2f}')

    print('unsolved:')
    display(solver_df.loc[~solved_mask][['answer']])

### serai - worst partition

In [75]:
solver_df = load_data('serai', 'worst_partition', n=219)

In [76]:
solver_df

Unnamed: 0,answer,num_guesses,is_solved
0,ABACK,4,True
1,ABASE,4,True
2,ABATE,4,True
3,ABBEY,4,True
4,ABYSS,4,True
...,...,...,...
214,WINCE,4,True
215,WOOER,5,True
216,WORLD,3,True
217,WROTE,6,False


In [77]:
print_stats(solver_df)

# solved successfully 212
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer,num_guesses,is_solved
29,BOOZY,6,False
51,CRAZE,6,False
99,GRIME,6,False
149,PROVE,6,False
188,STORE,6,False
189,STOUT,6,False
217,WROTE,6,False


### tares - worst partition

In [78]:
solver_df = load_data('tares', 'worst_partition', n=219)
solver_df

Unnamed: 0,answer,num_guesses,is_solved
0,ABACK,4,True
1,ABASE,4,True
2,ABATE,3,True
3,ABBEY,4,True
4,ABYSS,5,True
...,...,...,...
214,WINCE,5,True
215,WOOER,5,True
216,WORLD,4,True
217,WROTE,4,True


In [79]:
print_stats(solver_df)

# solved successfully 213
avg # guesses per puzzle (solved) 4.25
unsolved:


Unnamed: 0,answer,num_guesses,is_solved
29,BOOZY,6,False
51,CRAZE,6,False
93,GONER,6,False
120,LUSTY,6,False
149,PROVE,6,False
177,SOWER,6,False


### tares - mean partition

In [80]:
solver_df = load_data('tares', 'mean_partition', n=219)

In [81]:
print_stats(solver_df)

# solved successfully 208
avg # guesses per puzzle (solved) 4.02
unsolved:


Unnamed: 0,answer,num_guesses,is_solved
29,BOOZY,6,False
49,CRASS,6,False
51,CRAZE,6,False
75,FERRY,6,False
78,FIXER,6,False
93,GONER,6,False
145,POUND,6,False
149,PROVE,6,False
184,START,6,False
189,STOUT,6,False


### stats for everything

In [87]:
# Report solver statistics for every (first word, strategy) combination,
# iterating strategies fastest so each word's two runs appear together.
for word, strat in itertools.product(
    ['serai', 'tares', 'adieu'],
    ['mean_partition', 'worst_partition'],
):
    print(f'word: {word}, strat: {strat}')
    solver_df = load_data(word, strat, n=219)
    print_stats(solver_df)

word: serai, strat: mean_partition
# solved successfully 208
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer
29,BOOZY
51,CRAZE
75,FERRY
78,FIXER
93,GONER
99,GRIME
149,PROVE
150,PROXY
184,START
188,STORE


word: serai, strat: worst_partition
# solved successfully 212
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer
29,BOOZY
51,CRAZE
99,GRIME
149,PROVE
188,STORE
189,STOUT
217,WROTE


word: tares, strat: mean_partition
# solved successfully 208
avg # guesses per puzzle (solved) 4.02
unsolved:


Unnamed: 0,answer
29,BOOZY
49,CRASS
51,CRAZE
75,FERRY
78,FIXER
93,GONER
145,POUND
149,PROVE
184,START
189,STOUT


word: tares, strat: worst_partition
# solved successfully 213
avg # guesses per puzzle (solved) 4.25
unsolved:


Unnamed: 0,answer
29,BOOZY
51,CRAZE
93,GONER
120,LUSTY
149,PROVE
177,SOWER


word: adieu, strat: mean_partition
# solved successfully 204
avg # guesses per puzzle (solved) 4.20
unsolved:


Unnamed: 0,answer
14,AWAKE
29,BOOZY
49,CRASS
51,CRAZE
78,FIXER
99,GRIME
105,HATCH
134,PANEL
145,POUND
149,PROVE


word: adieu, strat: worst_partition
# solved successfully 204
avg # guesses per puzzle (solved) 4.26
unsolved:


Unnamed: 0,answer
14,AWAKE
29,BOOZY
49,CRASS
51,CRAZE
83,FLOSS
93,GONER
100,GRIPE
119,LOOPY
136,PAPER
137,PARRY
