# Stats

Calculate various statistics and letter frequencies for the words in Wordle.
This should help with making guesses.

In [45]:
import numpy as np
import pandas as pd
from collections import Counter

In [1]:
# Add the parent directory to the import path so that code written earlier in
# the project can be reused (presumably the parse_data module imported in the
# next cell lives there).
import sys
sys.path.append('..')

In [2]:
from parse_data import read_parsed_words, DEFAULT_PARSED_WORDS_FILE

In [3]:
# Load the word list with the project helper (called with its default
# arguments) and show how many entries it contains.
words = read_parsed_words()
len(words)

12972

## Letter Frequencies

In how many words does each letter appear? A letter that occurs more than once in a word is counted only once for that word.

In [5]:
# For each letter, count the number of words that contain it at least once.
d = Counter()
for word in words:
    # set(word) collapses repeated letters, so a word contributes at most one
    # count per letter. Counter.update adds 1 for every element it is given,
    # and a Counter treats missing keys as 0, so no setdefault is needed.
    d.update(set(word))

In [15]:
# Turn the letter -> count mapping into a table indexed by letter.
df = (
    pd.DataFrame(d.items(), columns=['letter', 'count'])
    .set_index('letter')
)
df

Unnamed: 0_level_0,count
letter,Unnamed: 1_level_1
a,5330
e,5705
h,1708
d,2298
l,3114
i,3589
g,1543
r,3909
t,3033
c,1920


Are there any letters not in the dataset?

In [46]:
# Number of rows in df, i.e. how many distinct letters occur in at least one word.
len(df)

26

No, all 26 letters appear in the dataset.

In [17]:
# Share of all words that contain each letter at least once.
df['freq'] = df['count'].div(len(words))

In [18]:
# Rank the letters from most to least common (by number of words containing them).
df.sort_values('count', ascending=False)

Unnamed: 0_level_0,count,freq
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
s,5936,0.457601
e,5705,0.439793
a,5330,0.410885
o,3911,0.301496
r,3909,0.301341
i,3589,0.276673
l,3114,0.240056
t,3033,0.233811
n,2787,0.214847
u,2436,0.187789


The most frequent letter in these words is actually 's': it appears in about 46% of them, just ahead of 'e' (44%) and 'a' (41%).

## Most Frequent Letters by Position

Letter frequencies vary with the position in the word, so count each letter separately for each of the five positions.

In [47]:
# m[i, p] is the number of words whose letter at position p (0-based) is the
# i-th letter of the alphabet (row 0 = 'a', ..., row 25 = 'z').
m = np.zeros(shape=(26, 5), dtype='uint32')
for word in words:
    for pos, letter in enumerate(word):
        # Map 'a'..'z' onto row indices 0..25.
        letter_index = ord(letter) - ord('a')
        # Fail loudly on anything that is not a lowercase ASCII letter.
        assert 0 <= letter_index < 26, f'unexpected character {letter!r} in word {word!r}'
        m[letter_index, pos] += 1

m

array([[ 737, 2263, 1236, 1074,  680],
       [ 909,   81,  335,  243,   59],
       [ 922,  176,  392,  411,  127],
       [ 685,   84,  390,  471,  823],
       [ 303, 1628,  882, 2327, 1522],
       [ 598,   24,  178,  233,   82],
       [ 638,   76,  364,  423,  143],
       [ 489,  546,  120,  235,  370],
       [ 165, 1383, 1051,  880,  280],
       [ 202,   11,   46,   29,    3],
       [ 376,   95,  272,  503,  259],
       [ 577,  699,  848,  771,  476],
       [ 693,  188,  511,  402,  182],
       [ 325,  345,  964,  788,  530],
       [ 262, 2096,  993,  698,  389],
       [ 859,  231,  364,  418,  147],
       [  78,   15,   13,    2,    4],
       [ 628,  940, 1198,  719,  673],
       [1565,   93,  533,  516, 3958],
       [ 815,  239,  616,  898,  727],
       [ 189, 1187,  667,  401,   67],
       [ 242,   52,  240,  156,    4],
       [ 413,  163,  271,  128,   64],
       [  16,   57,  133,   12,   70],
       [ 181,  271,  213,  108, 1301],
       [ 105,   29,  142,

In [48]:
# The lowercase alphabet, in order.
letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]
letters

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [32]:
# Label the count matrix: one row per letter, one column per position in the word.
df_pos = pd.DataFrame(
    data=m,
    index=letters,
    columns=['pos_1', 'pos_2', 'pos_3', 'pos_4', 'pos_5'],
)

In [33]:
# Show the letter-by-position count table.
df_pos

Unnamed: 0,pos_1,pos_2,pos_3,pos_4,pos_5
a,737,2263,1236,1074,680
b,909,81,335,243,59
c,922,176,392,411,127
d,685,84,390,471,823
e,303,1628,882,2327,1522
f,598,24,178,233,82
g,638,76,364,423,143
h,489,546,120,235,370
i,165,1383,1051,880,280
j,202,11,46,29,3


In [39]:
# Five most common letters in the first position.
df_pos.sort_values(by='pos_1', ascending=False).head(5)

Unnamed: 0,pos_1,pos_2,pos_3,pos_4,pos_5
s,1565,93,533,516,3958
c,922,176,392,411,127
b,909,81,335,243,59
p,859,231,364,418,147
t,815,239,616,898,727


In [40]:
# Five most common letters in the second position.
df_pos.sort_values(by='pos_2', ascending=False).head(5)

Unnamed: 0,pos_1,pos_2,pos_3,pos_4,pos_5
a,737,2263,1236,1074,680
o,262,2096,993,698,389
e,303,1628,882,2327,1522
i,165,1383,1051,880,280
u,189,1187,667,401,67


In [41]:
# Five most common letters in the third position.
df_pos.sort_values(by='pos_3', ascending=False).head(5)

Unnamed: 0,pos_1,pos_2,pos_3,pos_4,pos_5
a,737,2263,1236,1074,680
r,628,940,1198,719,673
i,165,1383,1051,880,280
o,262,2096,993,698,389
n,325,345,964,788,530


In [42]:
# Five most common letters in the fourth position.
df_pos.sort_values(by='pos_4', ascending=False).head(5)

Unnamed: 0,pos_1,pos_2,pos_3,pos_4,pos_5
e,303,1628,882,2327,1522
a,737,2263,1236,1074,680
t,815,239,616,898,727
i,165,1383,1051,880,280
n,325,345,964,788,530


In [43]:
# Five most common letters in the fifth (last) position.
df_pos.sort_values(by='pos_5', ascending=False).head(5)

Unnamed: 0,pos_1,pos_2,pos_3,pos_4,pos_5
s,1565,93,533,516,3958
e,303,1628,882,2327,1522
y,181,271,213,108,1301
d,685,84,390,471,823
t,815,239,616,898,727


It turns out the most common letter in both the first and last positions is 's' (by a wide margin in the last position: 3958 words, versus 1522 for 'e'). In positions 2 and 3 the most common letter is 'a', and in position 4 it is 'e'.