# COMAP 2023 Wordle (Problem C) # 2316611

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
import urllib

In [3]:
DATA_FILE = '../data/Problem_C_Data_Wordle.xlsx'
df = pd.read_excel(DATA_FILE,header=1)
df.drop(columns=['Unnamed: 0'],inplace=True)
df.head()

Unnamed: 0,Date,Contest number,Word,Number of reported results,Number in hard mode,1 try,2 tries,3 tries,4 tries,5 tries,6 tries,7 or more tries (X)
0,2022-12-31,560,manly,20380,1899,0,2,17,37,29,12,2
1,2022-12-30,559,molar,21204,1973,0,4,21,38,26,9,1
2,2022-12-29,558,havoc,20001,1919,0,2,16,38,30,12,2
3,2022-12-28,557,impel,20160,1937,0,3,21,40,25,9,1
4,2022-12-27,556,condo,20879,2012,0,2,17,35,29,14,3


In [4]:
df.rename(columns={'Date':'date','Contest number':'contest_num','Word':'word','Number of  reported results':'num_results','Number in hard mode':'num_hardmode','1 try':'in1','2 tries':'in2','3 tries':'in3','4 tries':'in4','5 tries':'in5','6 tries':'in6','7 or more tries (X)':'over6'},inplace=True)

In [6]:
df['day_of_week'] = df.date.dt.day_of_week

In [7]:
df[~(df['word'].apply(len) == 5)]

Unnamed: 0,date,contest_num,word,num_results,num_hardmode,in1,in2,in3,in4,in5,in6,over6,day_of_week
15,2022-12-16,545,rprobe,22853,2160,0,6,24,32,24,11,3,4
35,2022-11-26,525,clen,26381,2424,1,17,36,31,12,3,0,5
246,2022-04-29,314,tash,106652,7001,2,19,34,27,13,4,1,4
353,2022-01-12,207,favor,137586,3073,1,4,15,26,29,21,4,2


In [8]:
df.loc[15,'word'] = 'probe'
df.loc[35,'word'] = 'clean'
df.loc[246,'word'] = 'trash'
df.loc[353,'word'] = 'favor'
df.loc[20,'word'] = 'naive'

In [9]:
df['letter1'] = df.word.apply(lambda x: x[0])
df['letter2'] = df.word.apply(lambda x: x[1])
df['letter3'] = df.word.apply(lambda x: x[2])
df['letter4'] = df.word.apply(lambda x: x[3])
df['letter5'] = df.word.apply(lambda x: x[4])

df['letter1_int'] = df.word.apply(lambda x: ord(x[0])-96)
df['letter2_int'] = df.word.apply(lambda x: ord(x[1])-96)
df['letter3_int'] = df.word.apply(lambda x: ord(x[2])-96)
df['letter4_int'] = df.word.apply(lambda x: ord(x[3])-96)
df['letter5_int'] = df.word.apply(lambda x: ord(x[4])-96)

In [10]:
# source: https://github.com/tabatkins/wordle-list
ALLOWED_WORDS_FILE = 'https://raw.githubusercontent.com/tabatkins/wordle-list/main/words'
allowed_words = urllib.request.urlopen(ALLOWED_WORDS_FILE).read().decode().split()

In [33]:
# getting the letter frequencies
letter_freq = {}
for word in allowed_words:
    for char in word:
        if char in letter_freq:
            letter_freq[char] += 1
        else:
            letter_freq[char] = 1

for key in letter_freq.keys():
    letter_freq[key] /= len(allowed_words*5)

print(letter_freq)

{'r': 0.06346684617973747, 'o': 0.07017165937394816, 's': 0.09853921238640188, 'a': 0.09596768764725681, 'j': 0.004604510265903736, 'e': 0.10037024570851565, 't': 0.04990912150790979, 'y': 0.032312352743184114, 'w': 0.015173342308986874, 'i': 0.058983507236620665, 'z': 0.006772130595759004, 'c': 0.030238976775496466, 'u': 0.039407606866374956, 'p': 0.03279703803433188, 'h': 0.026832716257152472, 'g': 0.02509592729720633, 'k': 0.02360148098283406, 'q': 0.0019522046449007068, 'd': 0.03682261864692023, 'b': 0.024893975092561426, 'f': 0.016694715583978458, 'l': 0.05089195557051498, 'm': 0.03250084146751935, 'n': 0.04682598451699765, 'x': 0.0043890945809491756, 'v': 0.010784247728037698}


In [26]:
# give each word a score based on letter frequencies in allowed word list
df['word_score'] = df.word.apply(lambda word: sum([letter_freq[letter] for letter in word]))

In [28]:
# average number of guesses (counting over 6 as 7 guesses)
df['avg_num_guesses'] = (df['in1']+df['in2']*2+df['in3']*3+df['in4']*4+df['in5']*5+df['in6']*6+df['over6']*7)/100

In [29]:
df.head()

Unnamed: 0,date,contest_num,word,num_results,num_hardmode,in1,in2,in3,in4,in5,...,letter3,letter4,letter5,letter1_int,letter2_int,letter3_int,letter4_int,letter5_int,word_score,avg_num_guesses
0,2022-12-31,560,manly,20380,1899,0,2,17,37,29,...,n,l,y,13,1,14,12,25,0.258499,4.34
1,2022-12-30,559,molar,21204,1973,0,4,21,38,26,...,l,a,r,13,15,12,1,18,0.312999,4.14
2,2022-12-29,558,havoc,20001,1919,0,2,16,38,30,...,v,o,c,8,1,22,15,3,0.233995,4.4
3,2022-12-28,557,impel,20160,1937,0,3,21,40,25,...,p,e,l,9,13,16,5,12,0.275544,4.15
4,2022-12-27,556,condo,20879,2012,0,2,17,35,29,...,n,d,o,3,15,14,4,15,0.254231,4.45


In [30]:
df.to_csv('../data/cleaned.csv')