In [3]:
import requests
import pandas as pd
import math

# Wordle

When you play wordle, you guess a five letter word at random. Then your guess is secretly compared against the
actual answer. The game will return some information about your guess. Each letter of your guessed word will either
be 

1. grey - this letter doesn't appear in the answer 
2. yellow - this letter appears in the answer but in a different position 
3. green - this letter is in the answer AND it's in the correct location. 
    
For simplicity we have taken this convention: 
1. 2 --> letter is in the word and is in the right location
2. 1 --> letter is in the word, but not in the right location
3. 0 --> letter is not in the word

CAVEAT: if you guess a word that has a letter repeated, THEN WHAT!?

### Example 1. 
guess = 'taste'

answer = 'stamp'

response = 11100 <--- note that the first is a one. The first 't' from 'taste' is in the answer, but in the wrong
position. The second 't' in 'taste' is in the answer, but also in the wrong position. The letter 't' only
appears once in the answer, so the first 't' receives the 1, indicating there is a 't' but not in that location,
while the second 't' receives the 0, indicating there is exactly one 't' in the word and neither 't' from
'taste' is in the right position.

### Example 2
guess = 'taste'

answer = 'month'

response = 00020 <-- note that the first is a zero. The second 't' from 'taste' matched exactly, and the
answer has only one 't'. Thus we return a zero for the first entry indicating there is no additional 't' and a
two for the second 't' indicating the only 't' in the answer is in that position

### Example 3
guess = 'taste'

answer = 'stint'

response = 10110 <-- note that both the 't's from taste correspond to 1. This indicates that there are two 't's
in the answer, and neither are in the right position


Wordle guesses are five letters long, and there are 3 possible outcomes (0, 1 or 2) at each position in the word,
so there are 3**5 = 243 possible responses to a guess.
'''

In [18]:
def get_freq(word):
    '''
    DESC:
    Builds two dictionaries keyed by the distinct letters of word.

    INPUTS:
    word: string (or any iterable of hashable items) to be counted

    OUTPUT:
    (counts, tally): counts maps each letter to the number of times it occurs in word;
    tally maps each of the same letters to 0 so a caller can use it as a running counter.
    Keys of both dictionaries are in order of first appearance in word.
    '''
    counts = {}
    for ch in word:
        counts[ch] = counts.get(ch, 0) + 1

    # a separate dict with the same keys, every counter starting at zero
    tally = dict.fromkeys(counts, 0)
    return counts, tally

def labeler(guessed_word,another_word):
    '''
    DESC:
    Returns the information with the wordle convention: how guessed_word would be scored
    if another_word were the answer.
    
    INPUTS:
    guessed_word: string of length five where entries are lower case letters
    another_word: string of length five where entries are lower case letters
    
    OUTPUT:
    output: string with one character per letter of guessed_word, where entries are
        2 --> letter is in another_word and in the right location
        1 --> letter is in another_word, but not in the right location
        0 --> letter is not in another_word (or every copy of it is already accounted for)
    
    APPROACH:
    Two passes, so that repeated letters are handled without patching earlier results.
        1) Greens: every position where the two words agree is marked 2. Every letter of 
        another_word that was NOT matched this way is counted in a "remaining" budget.
        
        2) Yellows: walking guessed_word left to right, each non-green letter is marked 1 
        if the budget still holds a copy of that letter (which is then used up), and 0 
        otherwise. Earlier occurrences therefore claim the yellows first.
    
    If the words differ in length, positions of guessed_word beyond the end of another_word 
    can never be green; they are scored 1 or 0 from the remaining budget like any other letter.
    
    Example:
    guessed_word = "mmmmm"
    another_word = "mummy"
    pass 1 marks positions 0, 2 and 3 green  --> "2?22?" and remaining = {"u": 1, "y": 1}
    pass 2 finds no "m" left in the budget   --> "20220"
    '''
    
    marks = ['0'] * len(guessed_word)
    remaining = {}

    # Pass 1: greens, plus a count of the answer letters left unmatched
    for index,letter in enumerate(another_word):
        if index < len(guessed_word) and guessed_word[index] == letter:
            marks[index] = '2'
        else:
            remaining[letter] = remaining.get(letter, 0) + 1

    # Pass 2: yellows are handed out left to right while unmatched copies remain
    for index,letter in enumerate(guessed_word):
        if marks[index] == '0' and remaining.get(letter, 0) > 0:
            marks[index] = '1'
            remaining[letter] -= 1

    return ''.join(marks)

def entropy(word,df,bank = '_BANK'):
    '''
    DESC:
    Computes the entropy (in bits) of the labeler response obtained by guessing word, 
    when every entry of df[bank] is treated as an equally likely answer.
    
    INPUTS:
    word: the guessed word, passed to labeler as its first argument
    df: pandas DataFrame holding the candidate answers; it is only read, never modified
    bank: name of the column of df that holds the candidate answers
    
    OUTPUT:
    ev: - sum over distinct responses of p * log2(p), where p is the fraction of rows of 
    df that produce that response. 0.0 when df has no rows.
    '''
    total = df.shape[0]
    if total == 0:
        # no candidate answers --> no response distribution to measure
        return 0.0

    # Apply labeler; the result lives in its own Series so the caller's df keeps its columns
    responses = df[bank].apply(lambda x: labeler(word,x))
    
    # count each distinct response and convert to percentage
    prob = responses.value_counts() / total
    
    # compute entropy
    ev = prob * prob.apply(math.log2)
    ev = - ev.sum()
    return ev

In [25]:
if __name__ == "__main__":
    # Read wordle data from link below
    link = r"https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"
    # timeout so a stalled connection cannot hang the script forever
    f = requests.get(link, timeout = 30)
    # stop on an HTTP error instead of scoring the lines of an error page as words
    f.raise_for_status()
    # one word per line; blank lines (e.g. from a trailing newline) are dropped so that
    # an empty string is never scored as a word or counted as a candidate answer
    text = [line.strip() for line in f.text.splitlines() if line.strip()]
    
    # cast text as pandas DataFrame
    df = pd.DataFrame(text,columns = ['_BANK'])
    # initialize entropy column as float, the dtype of the values stored in it below
    df['entropy'] = 0.0
    
    # compute entropies
    entropies = []
    for word in text:
        # entropy() gets a throwaway copy, so nothing it writes to its argument
        # can accumulate on df between iterations
        entropies.append(entropy(word,df.copy()))
    # rows of df are in the same order as text, so one positional assignment suffices
    df['entropy'] = entropies
        
    # save
    df.to_csv('entropies_computed.csv', index = False)