# COMAP 2023 Wordle (Problem C) # 2316611

In [54]:
import pandas as pd
from pandarallel import pandarallel
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
import urllib

In [55]:
# Set up pandarallel's worker pool (the recorded run reports 6 workers).
# The only parallel call in this notebook is the `parallel_apply` in the
# commented-out word-occurrence cell further down.
pandarallel.initialize()

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [56]:
# Location of the COMAP Problem C spreadsheet, relative to this notebook.
DATA_FILE = '../data/Problem_C_Data_Wordle.xlsx'

# header=1: the column names are on the second row of the sheet.
# The column pandas labels 'Unnamed: 0' is discarded immediately.
df = pd.read_excel(DATA_FILE, header=1).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Date,Contest number,Word,Number of reported results,Number in hard mode,1 try,2 tries,3 tries,4 tries,5 tries,6 tries,7 or more tries (X)
0,2022-12-31,560,manly,20380,1899,0,2,17,37,29,12,2
1,2022-12-30,559,molar,21204,1973,0,4,21,38,26,9,1
2,2022-12-29,558,havoc,20001,1919,0,2,16,38,30,12,2
3,2022-12-28,557,impel,20160,1937,0,3,21,40,25,9,1
4,2022-12-27,556,condo,20879,2012,0,2,17,35,29,14,3


In [57]:
# Map the spreadsheet's verbose headers onto short snake_case names.
# NOTE: the 'Number of  reported results' key contains two consecutive spaces.
# Keep it verbatim: rename() silently ignores keys that match no column, and
# the later outputs show `num_results`, so this spelling is the one that matches.
COLUMN_RENAMES = {
    'Date': 'date',
    'Contest number': 'contest_num',
    'Word': 'word',
    'Number of  reported results': 'num_results',
    'Number in hard mode': 'num_hardmode',
    '1 try': 'in1',
    '2 tries': 'in2',
    '3 tries': 'in3',
    '4 tries': 'in4',
    '5 tries': 'in5',
    '6 tries': 'in6',
    '7 or more tries (X)': 'over6',
}
df.rename(columns=COLUMN_RENAMES, inplace=True)

In [58]:
# Weekday index of each puzzle date (pandas convention: Monday=0 ... Sunday=6).
df['day_of_week'] = df['date'].dt.day_of_week

In [59]:
# Sanity check: list every row whose answer is not exactly five characters long.
word_lengths = df['word'].apply(len)
df[word_lengths != 5]

Unnamed: 0,date,contest_num,word,num_results,num_hardmode,in1,in2,in3,in4,in5,in6,over6,day_of_week
15,2022-12-16,545,rprobe,22853,2160,0,6,24,32,24,11,3,4
35,2022-11-26,525,clen,26381,2424,1,17,36,31,12,3,0,5
246,2022-04-29,314,tash,106652,7001,2,19,34,27,13,4,1,4
353,2022-01-12,207,favor,137586,3073,1,4,15,26,29,21,4,2


In [60]:
# Manual corrections to the answer column, keyed by row label.
# Rows 15, 35, 246 and 353 are the ones listed by the length check in the
# recorded run. Row 353 displays as 'favor' yet was flagged, so it presumably
# carries stray whitespace (TODO confirm). Row 20 was not flagged by the
# length check, so its original value presumably differed in another way,
# e.g. a non-ASCII character (TODO confirm against the raw sheet).
WORD_FIXES = {
    15: 'probe',
    35: 'clean',
    246: 'trash',
    353: 'favor',
    20: 'naive',
}
for row_label, corrected in WORD_FIXES.items():
    df.loc[row_label, 'word'] = corrected

In [61]:
# One column per letter position (letter1..letter5) ...
for pos in range(5):
    df[f'letter{pos + 1}'] = df.word.apply(lambda w, i=pos: w[i])

# ... followed by the matching integer code, ord(ch) - 96, which maps
# lowercase ASCII letters to a=1 ... z=26.
for pos in range(5):
    df[f'letter{pos + 1}_int'] = df.word.apply(lambda w, i=pos: ord(w[i]) - 96)

In [62]:
# Download the list of allowed Wordle guesses (one word per line).
# source: https://github.com/tabatkins/wordle-list
#
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded, so import it explicitly here. The `with` block makes sure the
# HTTP response is closed once the body has been read.
import urllib.request

ALLOWED_WORDS_FILE = 'https://raw.githubusercontent.com/tabatkins/wordle-list/main/words'
with urllib.request.urlopen(ALLOWED_WORDS_FILE) as response:
    # decode() uses UTF-8 by default; split() breaks on any whitespace.
    allowed_words = response.read().decode().split()

In [63]:
# getting the letter frequencies
# letter_freq = {}
# for word in allowed_words:
#     for char in word:
#         if char in letter_freq:
#             letter_freq[char] += 1
#         else:
#             letter_freq[char] = 1

# for key in letter_freq.keys():
#     letter_freq[key] /= len(allowed_words*5)

# print(letter_freq)

In [64]:
# `word_freqs` comes from the project-local util module; its scoring rule is
# not visible in this notebook. Presumably it returns one frequency-based
# score per word, aligned with the input Series (TODO confirm in util.py).
from util import word_freqs
df['word_score'] = word_freqs(df.word)

In [65]:
# Average number of guesses per puzzle. Each distribution column is weighted
# by its guess count, with failures ("over6") counted as 7 guesses. The
# columns look like percentages (they sum to roughly 100 in the sample rows),
# hence the division by 100.
GUESS_WEIGHTS = {
    'in1': 1,
    'in2': 2,
    'in3': 3,
    'in4': 4,
    'in5': 5,
    'in6': 6,
    'over6': 7,
}
df['avg_num_guesses'] = sum(df[col] * weight for col, weight in GUESS_WEIGHTS.items()) / 100

In [66]:
# NOTE(review): in the recorded run this cell fails with ImportError because
# util.py does not define `occurrence_score`. The result of the apply is also
# not assigned to any column, so even with a working import it would only be
# displayed, not stored in `df`.
from util import occurrence_score
df.word.apply(lambda word: occurrence_score(word))

ImportError: cannot import name 'occurrence_score' from 'util' (/Users/miarodgers/Documents/GitHub/COMAP2023/src/util.py)

In [None]:
# NOTE(review): the recorded output includes a `letter_score` column that no
# visible cell above creates (its values equal `word_score`). This is likely
# leftover kernel state; confirm before relying on it.
df.head()

Unnamed: 0,date,contest_num,word,num_results,num_hardmode,in1,in2,in3,in4,in5,...,letter4,letter5,letter1_int,letter2_int,letter3_int,letter4_int,letter5_int,word_score,avg_num_guesses,letter_score
0,2022-12-31,560,manly,20380,1899,0,2,17,37,29,...,l,y,13,1,14,12,25,1.292494,4.34,1.292494
1,2022-12-30,559,molar,21204,1973,0,4,21,38,26,...,a,r,13,15,12,1,18,1.564995,4.14,1.564995
2,2022-12-29,558,havoc,20001,1919,0,2,16,38,30,...,o,c,8,1,22,15,3,1.169976,4.4,1.169976
3,2022-12-28,557,impel,20160,1937,0,3,21,40,25,...,e,l,9,13,16,5,12,1.377718,4.15,1.377718
4,2022-12-27,556,condo,20879,2012,0,2,17,35,29,...,d,o,3,15,14,4,15,1.271154,4.45,1.271154


In [None]:
# Write the cleaned, feature-augmented table to disk (row index omitted).
df.to_csv('../data/cleaned.csv',index=False)

In [None]:
# DON'T RUN THIS AGAIN: IT TAKES 43 MINUTES
# from util import get_freq
# allowed = pd.DataFrame({'word':allowed_words})
# allowed['word_occurrences']=allowed.word.parallel_apply(get_freq)

In [None]:
# Load the per-word table for the allowed guess list. The recorded output
# shows `word`, `word_occurrences` and `freqs` columns. Presumably this is the
# saved result of the slow, commented-out computation above (TODO confirm).
allowed = pd.read_csv('../data/allowed_words.csv')

In [None]:
# Display the loaded table (pandas elides the middle rows of a long frame).
allowed

Unnamed: 0,word,word_occurrences,freqs
0,rossa,45555,4.346454e-05
1,jetty,230229,2.196641e-04
2,wizzo,1091,1.040935e-06
3,cuppa,14677,1.400349e-05
4,cohoe,4427,4.223851e-06
...,...,...,...
14850,dunny,11477,1.095033e-05
14851,decal,63175,6.027598e-05
14852,fungs,695,6.631073e-07
14853,cadgy,271,2.585641e-07


In [None]:
# Summary statistics of the raw occurrence counts.
allowed['word_occurrences'].describe()

count    1.485500e+04
mean     1.962186e+06
std      1.788186e+07
min      0.000000e+00
25%      1.960500e+03
50%      1.590100e+04
75%      1.662200e+05
max      1.048096e+09
Name: word_occurrences, dtype: float64

In [None]:
# NOTE(review): repeats the earlier display of `allowed`; no cell in between
# modifies the frame.
allowed

Unnamed: 0,word,word_occurrences,freqs
0,rossa,45555,4.346454e-05
1,jetty,230229,2.196641e-04
2,wizzo,1091,1.040935e-06
3,cuppa,14677,1.400349e-05
4,cohoe,4427,4.223851e-06
...,...,...,...
14850,dunny,11477,1.095033e-05
14851,decal,63175,6.027598e-05
14852,fungs,695,6.631073e-07
14853,cadgy,271,2.585641e-07


In [None]:
# Spot check: the `freqs` value stored for the word 'trash'
# (raises IndexError if the word is absent).
allowed.loc[allowed['word'] == 'trash', 'freqs'].values[0]

0.0016336464631694

In [None]:

# Relative frequency of each letter across the allowed-word list.
from collections import Counter

# Count every character of every word in a single pass.
letter_counts = Counter(char for word in allowed_words for char in word)

# Normalise by the total number of letter slots (five per word, as before).
# The denominator is computed once here; the previous form,
# len(allowed_words*5), built a five-times copy of the whole list on every
# loop iteration.
total_letters = len(allowed_words) * 5
letter_freq = {char: count / total_letters for char, count in letter_counts.items()}

In [None]:
# Rescale `freqs` to the [0, 1] range, overwriting the column in place.
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler(feature_range=(0,1))
# Double brackets pass a one-column DataFrame rather than a Series,
# because the scaler is fitted on 2-D input.
allowed['freqs'] = minmax.fit_transform(allowed[['freqs']])
# Summary statistics of the rescaled column.
allowed.freqs.describe()

count    14855.000000
mean         0.001872
std          0.017061
min          0.000000
25%          0.000002
50%          0.000015
75%          0.000159
max          1.000000
Name: freqs, dtype: float64

In [None]:
# NOTE(review): this overwrites the same '../data/allowed_words.csv' file that
# was read earlier in the notebook, so the stored `freqs` column is replaced by
# the rescaled values.
allowed.to_csv('../data/allowed_words.csv',index=False)