For the final model, I decided to pick logistic regression because it looked very promising and seemed to give good results.

In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from itertools import combinations
from itertools import permutations

import random

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier



In [20]:
word_data = pd.read_csv('../data/unigram_freq.csv')
word_data_top1000 = word_data.head(1000)

wheel_data = pd.read_csv('../data/wheeldata.csv')
wheel_data_nodup = wheel_data.drop_duplicates(subset=['Puzzle'])


In [21]:
# regression set up - duplicated from other file

wheel_data_nodup['Puzzle'] = wheel_data_nodup.Puzzle.astype('str')

excluded = {'R', 'S', 'T', 'L', 'N', 'E', ' '}
letters = [chr(i) for i in range(65, 91) if chr(i) not in excluded]

vowels = {'A', 'E', 'I', 'O', 'U'}
consonants = set(letters) - vowels


def get_feature_vector(puzzle, guess_set):
    puzzle = puzzle.upper()
    letter_counts = [puzzle.count(l) for l in letters]
    guess_vector = [1 if l in guess_set else 0 for l in letters]
    return letter_counts + guess_vector

def get_reveal_score(puzzle, guess_set): # we make a guess with a combo of letters 
    return sum(1 for c in puzzle.upper() if c in guess_set and c not in excluded)/len(puzzle)

X_combinations = []
y_regress = []

guess_list = [] # this list stores all the guesses we used

for idx, row in wheel_data_nodup.iterrows():
    puzzle = row['Puzzle']
    puzzle_letters = set(puzzle.upper()) - excluded

    guesses = []
    for i in range(1000):
        cons = random.sample(list(consonants), 3)
        vwls = random.sample(list(vowels), 1)
        guesses.append(cons+vwls)

    for guess in guesses:
        guess_set = set(guess)
        features = get_feature_vector(puzzle, guess_set)
        score = get_reveal_score(puzzle, guess_set)
        
        X_combinations.append(features)
        y_regress.append(score)
        guess_list.append(''.join(guess))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wheel_data_nodup['Puzzle'] = wheel_data_nodup.Puzzle.astype('str')


In [76]:
# linear regression model set up and training
# x_arr and y_arr reused for logistic code
X_arr = np.array(X_combinations)
y_arr = np.array(y_regress)

X_train, X_test, y_train, y_test = train_test_split(X_arr, y_arr, test_size=0.2, random_state=42)

lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

y_pred = lin_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(X_arr[0])
print(y_arr)

print(y_pred)

print(f"Linear Regression MSE: {mse:.2f}")

# logistic regression model code

# classification target
y_class = [1 if score >= 0.5 else 0 for score in y_regress] 

X_train, X_test, y_train, y_test = train_test_split(X_combinations, y_class, test_size=0.2, random_state=42)

# fit to log model
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)


# Confusion Matrix 
conf_matrix = confusion_matrix(y_test, log_model.predict(X_test))
# Accuracy
print("Confusion Matrix:\n", conf_matrix)



[0 0 0 1 0 0 2 1 0 0 0 1 1 0 2 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0
 0 0 0]
[0.         0.05882353 0.05882353 ... 0.15789474 0.21052632 0.05263158]
[0.09563837 0.11987457 0.13613771 ... 0.1325568  0.13045804 0.1134842 ]
Linear Regression MSE: 0.01
Confusion Matrix:
 [[286137      0]
 [   263      0]]


In [None]:

# Predict separately
y_pred_reg = lin_model.predict(X_test)                      # regression: expected letters revealed
y_pred_log_proba = log_model.predict_proba(X_test)[:, 1]    # logistic: probability of "good guess"

weights = []
for i in range(10):
    a = random.random() 
    b = random.random()
    a = a/(a+b)
    b = b/(a+b)
    weights.append((a, b))

for i in range(len(weights)):

    # Weighted average (tune weights as needed)
    ensemble_score = weights[i][0] * y_pred_log_proba + weights[i][1] * (y_pred_reg * 100 / (max(y_pred_reg) * 100))  # scale regression if needed

    # Final binary prediction with a threshold
    ensemble_pred_class = [1 if score > 0.45 else 0 for score in ensemble_score]

    # Evaluation
    print("Ensemble Accuracy:", accuracy_score(y_test, ensemble_pred_class))
    print("Ensemble MSE:", mean_squared_error(y_test, ensemble_pred_class))

    # Confusion Matrix 
    conf_matrix = confusion_matrix(y_test, ensemble_pred_class)
    if (conf_matrix[0][0] >= conf_matrix[0][1]) and (conf_matrix[1][1] >= conf_matrix[1][0]):
        print("Good!")
    else:
        print("Bad!")
        
    print("Confusion Matrix:\n", conf_matrix)
    # Feature importance            
    feature_importance = log_model.coef_[0]
    feature_importance = np.abs(feature_importance)  # Use absolute values for importance

Ensemble Accuracy: 0.9990817039106146
Ensemble MSE: 0.0009182960893854749
Bad!
Confusion Matrix:
 [[286137      0]
 [   263      0]]
Ensemble Accuracy: 0.9990817039106146
Ensemble MSE: 0.0009182960893854749
Bad!
Confusion Matrix:
 [[286137      0]
 [   263      0]]
Ensemble Accuracy: 0.8057157821229051
Ensemble MSE: 0.19428421787709496
Good!
Confusion Matrix:
 [[230608  55529]
 [   114    149]]
Ensemble Accuracy: 0.9990817039106146
Ensemble MSE: 0.0009182960893854749
Bad!
Confusion Matrix:
 [[286137      0]
 [   263      0]]
Ensemble Accuracy: 0.2587709497206704
Ensemble MSE: 0.7412290502793296
Bad!
Confusion Matrix:
 [[ 73850 212287]
 [     1    262]]
Ensemble Accuracy: 0.9990817039106146
Ensemble MSE: 0.0009182960893854749
Bad!
Confusion Matrix:
 [[286137      0]
 [   263      0]]
Ensemble Accuracy: 0.3506040502793296
Ensemble MSE: 0.6493959497206704
Bad!
Confusion Matrix:
 [[100159 185978]
 [     9    254]]
Ensemble Accuracy: 0.5830237430167597
Ensemble MSE: 0.4169762569832402
Good!

In [None]:
def predict_with_ensemble(letter_combination, puzzle):
    # Convert the letter combination to a set
    guess_set = set(letter_combination)
    
    # Generate the feature vector
    feature_vector = get_feature_vector(puzzle, guess_set)
    
    # Predict using the regression model
    reg_pred = lin_model.predict([feature_vector])[0]
    
    # Predict using the logistic model
    log_proba = log_model.predict_proba([feature_vector])[0][1]
    
    # Compute the ensemble score, this weight was chosen because it was what worked.
    ensemble_score = 0.7 * log_proba + 0.3 * (reg_pred / max(y_pred_reg))
    
    return {
        "ensemble_score": ensemble_score,
        "regression_prediction": reg_pred,
        "logistic_probability": log_proba
    }


{'ensemble_score': np.float64(0.27177938181131067), 'regression_prediction': np.float64(0.18974735174813356), 'logistic_probability': np.float64(0.00037425437066680777)}


In [105]:
# put the new model to use
best_combination = None
best_score = 0

hyp1_avg = 0
hyp2_avg = 0


for idx, row in wheel_data_nodup.iterrows():
    puzzle = row['Puzzle']
    puzzle_letters = set(puzzle.upper()) - excluded

    for i in range(10):
        guesses = []
        cons = random.sample(list(consonants), 3)
        vwls = random.sample(list(vowels), 1)
        guesses.append(cons + vwls)

    hyp1 = predict_with_ensemble(["D", "C", "M", "A"], puzzle)
    hyp2 = predict_with_ensemble(["G", "H", "P", "O"], puzzle)
    hyp1_avg += hyp1["ensemble_score"]
    hyp2_avg += hyp2["ensemble_score"]
    for guess in guesses:
        prediction = predict_with_ensemble(guess, puzzle)
        ensemble_score = prediction["ensemble_score"]
        print(ensemble_score)
        if ensemble_score > best_score:
            best_combination = guess
            best_score = ensemble_score
print("Best Combination: ", best_combination)
print("Avg Score for DCMA: ", hyp1_avg / len(wheel_data_nodup))
print("Avg Score for GHPO: ", hyp2_avg / len(wheel_data_nodup))

print("Best Score: ",best_score)

0.17137053793330823
0.19498883964005492
0.19792264443018398
0.14264327341920668
0.07244711079991754
0.13916596220862776
0.1082205268742715
0.23437123748034572
0.07044296557862322
0.20263027782675777
0.0753080432860716
0.09702890733074873
0.1819268087956631
0.24593137535261703
0.21286568784259646
0.08095182176282116
0.1574764592129688
0.18604769118551964
0.12439749100518982
0.23339339664760075
0.10564375960983326
0.05046450821676467
0.15387112078846515
0.1918753205340318
0.2538376186141956
0.15453864304250153
0.26049815890215583
0.18845398587156642
0.18004051705453988
0.24505169582918232
0.16178928229797132
0.22631939103896082
0.09546041818587056
0.13251302162161277
0.22021870077751976
0.17722617872679539
0.19608033242112047
0.1741473050889924
0.08796079436436051
0.16143633643070832
0.2269831914628247
0.07643617237548997
0.2316143996455419
0.1591901637065684
0.1600396313364003
0.20850126805482408
0.181837733987664
0.14284166728245826
0.20368205134655432
0.203816780223467
0.2096811472040