# Cheating

How well do we do when we train the (same) solver using the set of all answers, rather than the set of all words?

In [1]:
# we have to use some previous code
import sys
sys.path.append('..')

In [24]:
import numpy as np
import pandas as pd

In [9]:
# make sure we reload the code
%load_ext autoreload
%autoreload 2

In [10]:
from possibilities_table import compute_possibilities_table

In [11]:
from parse_data import read_parsed_words, read_parsed_answers

In [17]:
# Load the parsed answers; the cell's last expression displays the row count.
answers_df = read_parsed_answers()
len(answers_df)

2315

In [19]:
# Lower-case the `answer` column and take its values as a NumPy array;
# the bare name on the last line displays the array.
answers = answers_df.answer.str.lower().values
answers

array(['cigar', 'rebut', 'sissy', ..., 'artsy', 'rural', 'shave'],
      dtype=object)

In [20]:
# Build the possibilities table from the answers only (the progress log of
# this cell reports a 2315x2315 matrix, i.e. answers against answers).
table = compute_possibilities_table(answers)

21399it [00:00, 213979.49it/s]

computing 2315x2315 possibilities matrix...


5359225it [00:17, 314137.36it/s]


In [23]:
# Persist the raw table to disk in NumPy's .npy format.
np.save('../data-parsed/possibilities-table-cheating-base-3.npy', table)

In [25]:
# Label both axes with the answer words so rows and columns can be looked up by word.
df = pd.DataFrame(table, index=answers, columns=answers)
df

Unnamed: 0,cigar,rebut,sissy,humph,awake,blush,focal,evade,naval,serve,...,rival,untie,refit,aorta,adult,judge,rower,artsy,rural,shave
cigar,242,81,6,0,27,0,55,27,54,81,...,141,3,84,108,27,9,162,108,135,27
rebut,1,242,0,27,3,36,0,3,0,7,...,2,111,170,82,189,30,5,82,29,3
sissy,6,0,242,0,0,64,0,0,0,38,...,6,3,3,0,0,0,0,226,0,38
humph,0,3,0,242,0,166,0,0,0,0,...,0,3,0,0,3,6,0,0,6,82
awake,10,81,0,0,242,0,10,181,10,162,...,10,162,81,11,11,162,84,11,10,181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
judge,27,84,0,6,162,3,0,171,0,162,...,0,165,81,0,12,242,81,0,6,162
rower,163,110,0,0,36,0,6,27,0,109,...,83,27,110,88,0,27,242,82,83,27
artsy,4,12,216,0,2,54,1,1,1,30,...,4,18,12,14,11,0,3,242,4,28
rural,64,14,0,6,27,84,216,27,216,19,...,227,3,11,46,111,6,11,37,242,27


In [29]:
# Also store the labelled table as gzip-compressed parquet to save space;
# index=True keeps the row labels (the answer words) in the file.
df.to_parquet(
    '../data-parsed/possibilities-table-cheating-base-3.parquet.gzip',
    engine='pyarrow',
    compression='gzip',
    index=True,
)

In [26]:
def get_worst_partition(row) -> int:
    """Return the size of the largest group of identical values in ``row``.

    Every distinct value found in the row is treated as one partition; the
    "worst" partition is the one that occurs most often.

    Parameters
    ----------
    row : pd.Series
        One row of the possibilities table.

    Returns
    -------
    int
        Number of occurrences of the most frequent value in ``row``.

    Raises
    ------
    ValueError
        If ``row`` contains no countable values.
    """
    group_sizes = row.value_counts()
    # An empty row yields NaN here, which int() rejects with ValueError.
    return int(group_sizes.max())

# Apply the function to each row (axis=1): one worst-partition size per row word.
# NOTE: despite the `_df` name the result is a Series at this point.
part_df = df.apply(get_worst_partition, axis=1)

In [27]:
# Wrap the per-row values in a one-column DataFrame named `worst_partition`.
part_df = pd.DataFrame(part_df, columns=['worst_partition'])
part_df

Unnamed: 0,worst_partition
cigar,360
rebut,368
sissy,939
humph,1121
awake,561
...,...
judge,681
rower,514
artsy,302
rural,409


In [28]:
# Show the 15 words whose worst-case partition is smallest.
part_df.sort_values('worst_partition').head(15)

Unnamed: 0,worst_partition
raise,168
arise,168
alone,182
arose,183
atone,191
ratio,192
irate,194
alter,196
alert,196
aisle,196


## Results

In [33]:
import json

In [34]:
def load_eval_results(path: str) -> pd.DataFrame:
    """Load a solver evaluation JSON file into a per-answer DataFrame.

    The file must contain a top-level ``per_word_results`` object that maps
    each answer to an object with ``num_guesses`` and ``is_solved`` entries.

    Parameters
    ----------
    path : str
        Location of the evaluation JSON file.

    Returns
    -------
    pd.DataFrame
        One row per answer with the columns ``answer``, ``num_guesses`` and
        ``is_solved``. The columns are present even when the file lists no
        answers at all.

    Raises
    ------
    KeyError
        If ``per_word_results`` or one of the per-answer fields is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as fp:
        results = json.load(fp)

    rows = [
        {
            'answer': answer,
            'num_guesses': stats['num_guesses'],
            'is_solved': stats['is_solved'],
        }
        for answer, stats in results['per_word_results'].items()
    ]

    # Name the columns explicitly so that an empty result set still yields a
    # frame with the expected schema instead of a column-less one.
    return pd.DataFrame(rows, columns=['answer', 'num_guesses', 'is_solved'])

In [35]:
# Load the saved evaluation run (per the file name: worst_partition strategy,
# 2315 past answers, 'raise', custom matrix).
solver_df = load_eval_results('../data-parsed/solver-eval/solver-eval-strat-worst_partition-past-answers-2315-raise-custom-matrix.json')

In [36]:
def print_stats(solver_df: pd.DataFrame):
    """Print summary statistics for one solver evaluation run.

    Reports how many puzzles were solved, the mean number of guesses over
    the solved puzzles only, and then displays the answers that were not
    solved.

    Parameters
    ----------
    solver_df : pd.DataFrame
        Frame with the columns ``answer``, ``num_guesses`` and ``is_solved``.
    """
    solved_mask = solver_df.is_solved
    print('# solved successfully', solved_mask.sum())

    # Average only over solved puzzles so failed runs do not skew the mean.
    avg_guesses_solved = solver_df.loc[solved_mask, 'num_guesses'].mean()
    print(f'avg # guesses per puzzle (solved) {avg_guesses_solved:.2f}')

    failed = solver_df.loc[~solved_mask]
    print('unsolved:')
    display(failed[['answer']])

In [37]:
print_stats(solver_df)

# solved successfully 2302
avg # guesses per puzzle (solved) 3.65
unsolved:


Unnamed: 0,answer
158,BASTE
159,BATCH
257,BOXER
799,FOYER
905,GRAZE
995,HOUND
1205,MATCH
1714,SHAVE
1827,SNORE
2003,TAFFY
