In [1]:
import os
import sys
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# dropped the old names, so pick whichever spelling this installation provides
# (falling back to the default style if neither exists).
_ticks_style = next(
    (name for name in ('seaborn-v0_8-ticks', 'seaborn-ticks') if name in plt.style.available),
    'default',
)
plt.style.use(_ticks_style)
# Keep text as text (not outlined paths) in exported SVG figures
plt.rcParams['svg.fonttype'] = 'none'
import seaborn as sns

# Allow imports from project dir: put the parent of the current working directory
# on sys.path (once) so that the `src` package imported below can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.Wordle import Wordle

In [3]:
def calc_lfl(word_list, bow, colname):
    """
    Score every word by the product of its per-position letter frequencies.

    word_list: iterable of words to score
    bow: bag-of-letters matrix, indexed as bow[str(position)][letter]
         (see BJA-01.2-make-word-lists)
    colname: name of the likelihood column in the returned frame

    Returns a DataFrame indexed by word (index name 'w') with the single column
    `colname`, ordered from highest to lowest likelihood.
    """
    scored = []
    for candidate in word_list:
        score = 1
        for position, char in enumerate(candidate):
            score = score * bow[str(position)][char]
        scored.append((candidate, score))

    # Stable descending sort on the likelihood value
    scored.sort(key=lambda pair: pair[1], reverse=True)
    frame = pd.DataFrame(scored, columns=['w', colname])
    return frame.set_index('w')
    
    
def contains_any(word1, word2, max_repeat=0):
    """
    Report whether at least one letter of word2 occurs in word1.

    max_repeat is accepted for call compatibility but is not used by this check.
    """
    for candidate in word2:
        if candidate in word1:
            return True
    return False


def total_green_letter_prob(word, bow):
    """
    Sum, over each position of `word`, the frequency bow[str(position)][letter]
    of that letter in that position.

    The total is used as a score for how likely guessing `word` is to produce
    green letters.
    """
    per_position = [bow[str(position)][char] for position, char in enumerate(word)]
    total = 0
    for value in per_position:
        total = total + value
    return total

def prob_of_2_greens(word):
    """Placeholder for the probability of getting two greens; not implemented, returns None."""
    # TODO: work out how this should be calculated
    return None

def total_yellow_letter_prob(word):
    """
    Placeholder for a yellow-letter counterpart of total_green_letter_prob.

    Not implemented yet: the function performs no calculation and returns None.
    """
    # TODO: implement; the unused `prob` accumulator was removed until there is logic to fill it
    return None
    
    
# Allow for repeat letters with max_repeats
def contains_repeats(word, search, max_repeats):
    """
    Tell whether `word` has more than `max_repeats` letters that also occur in `search`.

    Every occurrence in `word` is counted, so a letter that appears twice in
    `word` and at least once in `search` contributes two.
    """
    searched_letters = set(search)
    overlap = sum(1 for char in word if char in searched_letters)
    return overlap > max_repeats

In [4]:
# Build paths with os.path.join so the notebook runs on any OS
# (the previous r'..\data\...' literals only resolved on Windows).
PROCESSED_DIR = os.path.join('..', 'data', 'processed')

# word list with frequencies
wl = pd.read_csv(os.path.join(PROCESSED_DIR, 'wordle_google_freq_word_list.csv'), index_col=0)
alphabet = 'abcdefghijklmnopqrstuvwxyz'
# Read in Bags of Letters matrices (AKA bag-of-words = bow)
# NOTE: json.load's parse_int must be a callable; the earlier parse_int=False
# would raise TypeError as soon as the file contained an integer literal.
with open(os.path.join(PROCESSED_DIR, 'bags_of_words.json'), 'r') as infile:
    all_bow = json.load(infile)
bow17 = all_bow['combined_wordle_google']
bowwordle = all_bow['all_wordle']
bowcur = all_bow['wordle_curated']

# Letter-frequency likelihood of each word, scored against each bag of letters
bow17lfl = calc_lfl(wl.index, bow17, colname='all')
bowwordlelfl = calc_lfl(wl.loc[wl['in_wordle']].index, bowwordle, colname='wordle')
bowcurlfl = calc_lfl(wl.loc[wl['in_wordle_curated']].index, bowcur, colname='curated')

# One frame with a column per word list, aligned on word
lfl = pd.concat([bow17lfl, bowwordlelfl, bowcurlfl], axis=1, sort=True)
# lfl.to_csv(os.path.join(PROCESSED_DIR, 'letter_freq_likelihoods.csv'))

## Question:

## What sequence of words provides the highest chance of finding greens and yellows, while also not repeating any letters (i.e. maximizing grays)? 

### Make a function that iterates through the top 200 words with highest letter-frequency probability and finds the sequence of 5 words that has no repeats 
### If no next word without repeated letters can be found, fall back to the best word with exactly one repeated letter (i.e. `len(set(word)) == 4`).
### Then sum the calculated letter-freq probabilities of all the words 

### This will give a probability of finding a green but it doesn't include the helpful contribution from simply covering more letters and finding more grays. 
### To account for grays, include a measure of the form: "if you have X gray letters, what % of words are excluded?"

## Important! The dataset for this optimization must use the entire wordle list, but with their letter-freq probabilities calculated from the curated list. 

## Then use the top 200 from curated as starting points for the calculation.

In [9]:
# Work on an explicit copy: adding columns to a .loc slice of `wl` can trigger
# SettingWithCopyWarning and makes it unclear whether `wl` is being modified.
data = wl.loc[wl['in_wordle'], ['in_wordle_curated']].copy()
# wlh = Word Likelihood from letter frequency (scored against the curated bag of letters).
# calc_lfl returns a one-column frame; select that column so a Series is assigned
# and aligned on the word index.
data['wlh'] = calc_lfl(data.index, bowcur, colname='wlh')['wlh']

# tglp = Total Green Letter Probability
# sum of all probabilities of finding that letter in that position
data['tglp'] = data.index.map(lambda x: total_green_letter_prob(x, bow=bowcur))
# Curated words only, best tglp first
curated_sorted = data.sort_values(['in_wordle_curated', 'tglp'], ascending=False)
curated_words = curated_sorted.loc[curated_sorted['in_wordle_curated']].index
display(curated_words)
# Full table ranked by tglp, with the number of distinct letters per word
data = data.sort_values('tglp', ascending=False)
data['unique_letters'] = data.index.map(lambda x: len(set(x)))
data.head(10)

Index(['slate', 'sauce', 'slice', 'shale', 'saute', 'share', 'sooty', 'shine',
       'suite', 'crane',
       ...
       'lymph', 'jumbo', 'igloo', 'ethic', 'unzip', 'umbra', 'affix', 'ethos',
       'inbox', 'nymph'],
      dtype='object', name='w', length=2315)

Unnamed: 0_level_0,in_wordle_curated,wlh,tglp,unique_letters
w,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
saree,False,3.7e-05,0.680346,4
sooey,False,4.3e-05,0.678618,4
soree,False,3.4e-05,0.669546,4
saine,False,3.4e-05,0.666091,5
soare,False,3e-05,0.660043,5
saice,False,2.9e-05,0.653132,5
sease,False,3e-05,0.652268,3
seare,False,2.6e-05,0.64406,4
seine,False,2.7e-05,0.639309,4
slane,False,2.6e-05,0.639309,5


In [21]:
def calc_summed_tglp_and_repeats(sequence):
    """
    Score a sequence of guesses against the module-level `bowcur` bag of letters.

    For each position 0-4, the frequency of a letter is added only the first time
    that letter is seen in that position, so repeating a letter in the same
    position in a later word adds nothing.

    Returns a tuple (summed tglp, number of distinct letters used in the sequence).
    Note: despite the function name, the second value is the count of distinct
    letters, not the count of repeated ones.
    """
    distinct_letters = set()
    seen_at = {0: set(), 1: set(), 2: set(), 3: set(), 4: set()}
    total = 0
    for guess in sequence:
        for position, char in enumerate(guess):
            distinct_letters.add(char)
            if char not in seen_at[position]:
                total += bowcur[str(position)][char]
                seen_at[position].add(char)
    return total, len(distinct_letters)
# Sanity check on a hand-picked sequence: prints (summed tglp, distinct-letter count)
test_sequence = ['saint', 'cored', 'bulky', 'whomp']
print(calc_summed_tglp_and_repeats(test_sequence))

def test_word(word, pos_dict, combined_words, max_num_repeat_prev_guesses, max_num_repeats_next_guess):
    """
    Optimize the guess sequence search algorithm with following conditions:
    1. Do not repeat letters previously guessed (maximize letter information based on grays/yellows/greens)
    2. Next gues should maximize number of new letters, up to num_repeats_allowed
    3. If you must guess with repeat letters, do not repeat letters in the same position as before
    
    Do these tests with word list ranked by TGLP, and again ranked by word likelihood. 
    
    Return True if the word being tested meets all the given criteria
    
    word: string
    pos_dict: dict of index: list of letters used in this position
    combined_words: string of combined words in sequence
    max_num_repeat_prev_guesses: int of maximum number of repeat letters allowed from previous guesses
    max_num_repeats_next_guess: int of maximum number of repeat letters allowed in next guess
    """
    
    # Check for condition #3
    for i, letter in word:
        if letter in pos_dict[i]:
            return False
    
    if not contains_any(word, combined_words, max_repeat=num_repeats_allowed) and \
        len(set(word) == (5-))
    
    

(1.698488120950324, 19)


In [12]:
# Rank every word by tglp (best first). Words and scores are taken from the SAME
# sorted frame so the two lists are guaranteed to line up.
data_by_tglp = data.sort_values('tglp', ascending=False)
words = data_by_tglp.index
tglp_list = data_by_tglp['tglp'].to_list()
# Materialised as a list so it can be scanned repeatedly (a bare zip is exhausted after one pass)
word_tglp_zip = list(zip(words, tglp_list))

start_words_list = data.loc[data['in_wordle_curated']].index
result = []

# List of tuples of (number of repeats from previous guesses, number of repeats in next guess)
# Increment each one, corresponding with a looser search tolerance
# (not used by the search below yet)
priority_list = [(0, 0), (1, 0), (1, 1), (2, 1), (2, 2), (3, 2), (3, 3),]

N_START_WORDS = 10        # how many top-ranked words to try as the first guess
N_FOLLOW_UP_GUESSES = 5   # guesses to add after the start word

# Search tiers, strictest first:
# (max letters shared with earlier guesses, min distinct letters in the word, repeat penalty)
SEARCH_TIERS = [
    (0, 5, 0),   # all-new letters, no duplicate inside the word
    (0, 4, 1),   # all-new letters, one duplicate inside the word allowed
    (1, 4, 1),   # one letter may repeat an earlier guess
]


def _find_next_guess(ranked_words, combined_words):
    """Return (word, tglp, repeat_penalty) for the best-ranked word passing the strictest tier, or None."""
    for max_shared, min_unique, repeat_penalty in SEARCH_TIERS:
        for candidate, candidate_tglp in ranked_words:
            if len(set(candidate)) >= min_unique and \
                    not contains_repeats(candidate, combined_words, max_shared):
                return candidate, candidate_tglp, repeat_penalty
    return None


for start_word in words[:N_START_WORDS]:
    num_repeat_letters = 5 - len(set(start_word))  # Start with number of repeats in start_word
    combined_words = start_word
    # Initialise the running total here; it was previously only set by a commented-out line,
    # so the loop raised NameError on a fresh kernel.
    summed_tglp = data.loc[start_word, 'tglp']
    sequence = [start_word]

    for _ in range(N_FOLLOW_UP_GUESSES):
        found = _find_next_guess(word_tglp_zip, combined_words)
        if found is None:
            # No word satisfies even the loosest tier; further passes would find nothing either
            break
        next_word, next_tglp, repeat_penalty = found
        num_repeat_letters += repeat_penalty
        sequence.append(next_word)
        summed_tglp += next_tglp
        combined_words += next_word

    result.append(sequence)
#     result.append((start_word, summed_tglp, num_repeat_letters, sequence))

(1.698488120950324, 19)

In [15]:
# for thing in word_tglp:
#     print(thing)

In [11]:
# Show the sequences collected by the search cell.
# NOTE(review): the saved output below holds (start_word, summed_tglp, num_repeat_letters, sequence)
# tuples, which matches the commented-out append in the search cell rather than the active
# `result.append(sequence)` — re-run the notebook top to bottom to refresh it.
result

[('saree', 1.5904967602591793, 1, ['bliny', 'chout']),
 ('sooey', 1.7442764578833694, 2, ['brant', 'child', 'jugum']),
 ('soree', 1.9416846652267816, 2, ['gaily', 'bundt', 'chack']),
 ('saine', 1.722246220302376, 1, ['borty', 'pluck', 'whiff']),
 ('soare', 1.4742980561555075, 0, ['bliny', 'dutch']),
 ('saice', 1.7624190064794816, 1, ['drony', 'flump', 'thigh']),
 ('sease', 1.9287257019438444, 3, ['crony', 'built', 'khaph']),
 ('seare', 1.5542116630669547, 1, ['bliny', 'chout']),
 ('seine', 1.8855291576673867, 2, ['coaly', 'brugh', 'tempt']),
 ('slane', 1.524406047516199, 0, ['pricy', 'fouth'])]