See README.md

Meant to explore statistics on 5-letter words in order to pick a good starting word for Wordle.

In [1]:
import collections
import itertools 
import pprint

import numpy as np
import pandas as pd

## Read in the Scrabble words and keep just the 5-letter words

In [2]:
# Load the Scrabble word list as a single string column named 'words'; the
# first two lines of the file are skipped.
# keep_default_na=False: without it pandas turns any entry that matches one of
# its default missing-value markers (presumably real words such as "NA" or
# "NULL") into NaN, which drops those words and upcasts 'len' to float (5.0).
words = pd.read_csv(
    './Collins Scrabble Words (2019).zip',
    header=None,
    skiprows=2,
    names=['words'],
    dtype=str,
    keep_default_na=False,
)
words['len'] = words['words'].str.len()
# Keep only the 5-letter words. .copy() yields an independent frame so that
# adding columns to it later does not trigger SettingWithCopyWarning.
words = words.loc[words['len'] == 5].copy()
words

Unnamed: 0,words,len
2,AAHED,5.0
6,AALII,5.0
13,AARGH,5.0
16,AARTI,5.0
24,ABACA,5.0
...,...,...
279367,ZUZIM,5.0
279383,ZYGAL,5.0
279421,ZYGON,5.0
279449,ZYMES,5.0


## Reorganize into columns for really simple stats work

In [3]:
# Split every word into one column per letter. Columns are labelled with the
# 1-based letter position (integers 1..5), while the string offset is 0-based.
for position in (1, 2, 3, 4, 5):
    words[position] = words['words'].str.get(position - 1)
words


Unnamed: 0,words,len,1,2,3,4,5
2,AAHED,5.0,A,A,H,E,D
6,AALII,5.0,A,A,L,I,I
13,AARGH,5.0,A,A,R,G,H
16,AARTI,5.0,A,A,R,T,I
24,ABACA,5.0,A,B,A,C,A
...,...,...,...,...,...,...,...
279367,ZUZIM,5.0,Z,U,Z,I,M
279383,ZYGAL,5.0,Z,Y,G,A,L
279421,ZYGON,5.0,Z,Y,G,O,N
279449,ZYMES,5.0,Z,Y,M,E,S


## Get the 4 most common letters in each position

In [4]:
# For each letter position, count how often every letter occurs there and keep
# the four most frequent ones, most common first.
pos = {}
for column in range(1, 6):
    letter_counts = words[column].value_counts()
    pos[column] = letter_counts.head(4).index.tolist()
pos

{1: ['S', 'C', 'B', 'P'],
 2: ['A', 'O', 'E', 'I'],
 3: ['A', 'R', 'I', 'O'],
 4: ['E', 'A', 'T', 'I'],
 5: ['S', 'E', 'Y', 'D']}

## Make words from these letters

In [5]:
# Index the frame by the word itself so words can be looked up by label.
# Rebinding instead of inplace=True, together with the column guard, keeps this
# cell safe to re-run: once 'words' has become the index the column no longer
# exists, and a second unguarded set_index call would raise KeyError.
if 'words' in words.columns:
    words = words.set_index('words')

In [6]:
# Build every 5-letter string that takes one of the top letters for each
# position, and keep the ones that are real words.
#
# The product runs over the letter lists themselves, so each position
# contributes exactly as many candidates as it has (the previous index-based
# loop sized every position from pos[1]). Iteration order is unchanged: the
# last position varies fastest.
#
# Membership is tested against the frame's index (the frame is expected to be
# indexed by word at this point) instead of catching KeyError from a .loc
# lookup, which built a throwaway row for every hit.
#
# NOTE: the misspelled name `possbile_words` is kept because later cells read it.
known_words = words.index
letter_choices = [pos[slot] for slot in range(1, 6)]

possbile_words = [
    candidate
    for candidate in map(''.join, itertools.product(*letter_choices))
    if candidate in known_words  # otherwise: not a Scrabble word
]
print(len(possbile_words))
print(possbile_words)

79
['SAREE', 'SARED', 'SARIS', 'SORES', 'SOREE', 'SORED', 'SORAS', 'SORTS', 'SOOEY', 'SOOTS', 'SOOTE', 'SOOTY', 'SEATS', 'SERES', 'SERED', 'SEITY', 'SIRES', 'SIREE', 'SIRED', 'SIRIS', 'CAAED', 'CARES', 'CARED', 'CARTS', 'CARTE', 'COATS', 'COATE', 'CORES', 'COREY', 'CORED', 'COITS', 'COOEE', 'COOEY', 'COOED', 'COOTS', 'CERES', 'CERED', 'CERTS', 'CERTY', 'CIRES', 'BAAED', 'BARES', 'BARED', 'BAITS', 'BOATS', 'BORES', 'BOREE', 'BORED', 'BORAS', 'BORTS', 'BORTY', 'BOITE', 'BOOED', 'BOOAY', 'BOOTS', 'BOOTY', 'BEATS', 'BEATY', 'BERES', 'BERAY', 'PARES', 'PARED', 'PARAS', 'PARAE', 'PARTS', 'PARTY', 'PARIS', 'PORES', 'PORED', 'PORAE', 'PORTS', 'PORTY', 'POOED', 'POOTS', 'PEATS', 'PEATY', 'PERES', 'PERTS', 'PERIS']


## Cull the list so that we don't have repeated letters


In [7]:
# A candidate survives only when it has five distinct letters, i.e. no letter
# is used twice.
good_words = list(filter(lambda word: len(set(word)) == 5, possbile_words))
print(len(good_words))
pprint.pprint(good_words)


44
['SARED',
 'SORED',
 'SEITY',
 'SIRED',
 'CARES',
 'CARED',
 'CARTS',
 'CARTE',
 'COATS',
 'COATE',
 'CORES',
 'COREY',
 'CORED',
 'COITS',
 'CERTS',
 'CERTY',
 'CIRES',
 'BARES',
 'BARED',
 'BAITS',
 'BOATS',
 'BORES',
 'BORED',
 'BORAS',
 'BORTS',
 'BORTY',
 'BOITE',
 'BEATS',
 'BEATY',
 'BERAY',
 'PARES',
 'PARED',
 'PARTS',
 'PARTY',
 'PARIS',
 'PORES',
 'PORED',
 'PORAE',
 'PORTS',
 'PORTY',
 'PEATS',
 'PEATY',
 'PERTS',
 'PERIS']


## Score each word by the rank of its letter in each position

In [8]:
# Each letter is ranked by where it sits in its position's most-common list
# (0 = most common). A word's score is the sum of its five ranks, so a lower
# score means more common letters.
per_position_ranks = [
    [pos[slot].index(letter) for slot, letter in enumerate(word, start=1)]
    for word in good_words
]

scores = np.sum(per_position_ranks, axis=1)
ans = pd.DataFrame({'words': good_words, 'score': scores})
ans.sort_values('score').reset_index(drop=True)

Unnamed: 0,words,score
0,CARES,2
1,BARES,3
2,CORES,3
3,SARED,4
4,PARES,4
5,COATS,4
6,BORES,4
7,CARTS,4
8,CARTE,5
9,CARED,5


I have no idea whether this is actually a good way to think about the problem, but it does give words with the most common letters in each position. The lower the score, the more common the word's letters are in their positions.