In [1]:
import os
import pickle
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from feature_engineering.data_processing import load_scrabble_data
from feature_engineering.quadrant_features import count_tiles_in_quadrants
from feature_engineering.utils import pretty_print_board

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Locations of the raw simulation CSV and of the pickled cache of the loaded data.
data_path = "../data/magpie-sims-400k.csv"
cached_path = "../data/cached_raw.pkl"

# Only run the loader when no cache file exists yet; otherwise reuse the pickle.
# NOTE(review): pickle.load executes arbitrary code from the file, so the cache
# should only ever be a file written by this notebook itself.
if not os.path.exists(cached_path):
    print("Processing and caching data...")
    df_raw = load_scrabble_data(data_path)

    # Persist the loaded data so later runs can take the cached branch
    with open(cached_path, "wb") as cache_file:
        pickle.dump(df_raw, cache_file)
else:
    print("Loading cached data...")
    with open(cached_path, "rb") as cache_file:
        df_raw = pickle.load(cache_file)

# Sanity check: report how many rows ended up in df_raw
print(f"Data loaded with {len(df_raw)} rows")

Processing and caching data...
Data loaded in 19.7471 seconds
Data loaded with 395715 rows


In [3]:
# Apply quadrant analysis to each row of the "board" column
quadrant_features = df_raw["board"].apply(count_tiles_in_quadrants)

# Convert the dictionary output into separate DataFrame columns.
# The new frame reuses df_raw's index so the column-wise concat below lines up
# row-for-row even if df_raw's index is not a plain 0..n-1 range.
df_quadrants = pd.DataFrame(
    quadrant_features.tolist(), index=df_raw.index
).rename(columns={
    "upper_left": "upper_left_tile_count",
    "upper_right": "upper_right_tile_count",
    "lower_left": "lower_left_tile_count",
    "lower_right": "lower_right_tile_count"
})

# Merge back into the original dataframe.
# Any quadrant columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not create duplicated column names.
df_raw = pd.concat(
    [df_raw.drop(columns=df_quadrants.columns, errors="ignore"), df_quadrants],
    axis=1,
)

In [4]:
# Summary statistics for the four per-quadrant tile-count columns
quadrant_count_cols = [
    "upper_left_tile_count",
    "upper_right_tile_count",
    "lower_left_tile_count",
    "lower_right_tile_count",
]
df_raw[quadrant_count_cols].describe()

Unnamed: 0,upper_left_tile_count,upper_right_tile_count,lower_left_tile_count,lower_right_tile_count
count,395715.0,395715.0,395715.0,395715.0
mean,8.213404,11.06819,11.166309,10.270723
std,7.835962,8.870918,8.709405,8.943973
min,0.0,0.0,0.0,0.0
25%,1.0,2.0,3.0,1.0
50%,6.0,11.0,11.0,9.0
75%,14.0,19.0,19.0,18.0
max,35.0,35.0,35.0,37.0


## Training

In [5]:
# Step 1: Prepare the features (X) and the target (y)

# Filter and drop in a single chain so df_filtered is a brand-new frame.
# (Dropping with inplace=True on a filtered slice of df_raw raises
# SettingWithCopyWarning and leaves it unclear which object was modified.)
df_filtered = (
    df_raw[df_raw.total_unseen_tiles >= 0]
    .drop(columns=["board", "board_rep"])
)

# Both target-like columns are removed from the features; only winProb is predicted
X = df_filtered.drop(columns=['winProb', 'expPointDiff'])
y = df_filtered['winProb']  # Predicting win probability as a continuous value

# Step 2: Split the data into train (80%) and test (20%) sets
# shuffle=False keeps the original row order: the first 80% of rows train, the
# last 20% test. random_state is not used while shuffle=False; it is kept only
# so the split stays seeded if shuffling is ever turned on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)


In [6]:
# Preview the first ten training rows
X_train.head(10)

Unnamed: 0,score_diff,total_unseen_tiles,leave_A,leave_B,leave_C,leave_D,leave_E,leave_F,leave_G,leave_H,...,unseen_V,unseen_W,unseen_X,unseen_Y,unseen_Z,unseen_?,upper_left_tile_count,upper_right_tile_count,lower_left_tile_count,lower_right_tile_count
0,12,93,1,0,0,0,1,0,0,0,...,2,2,1,1,1,1,0,0,0,0
1,0,97,1,0,0,0,1,0,0,0,...,2,2,1,2,1,1,0,0,0,0
2,12,93,1,0,0,0,2,0,0,0,...,2,2,1,1,1,1,0,0,0,0
3,12,93,1,0,0,0,2,0,0,0,...,2,2,1,1,1,1,0,0,0,0
4,6,93,1,0,0,0,1,0,0,0,...,2,2,1,1,1,1,0,0,0,0
5,12,93,1,0,0,0,1,0,0,0,...,2,2,1,1,1,1,0,0,0,0
6,12,93,1,0,0,0,1,0,0,0,...,2,2,1,1,1,1,0,0,0,0
7,12,93,1,0,0,0,1,0,0,0,...,2,2,1,1,1,1,0,0,0,0
8,6,93,1,0,0,0,1,0,0,0,...,2,2,1,1,1,1,0,0,0,0
9,6,93,1,0,0,0,1,0,0,0,...,2,2,1,1,1,1,0,0,0,0


In [13]:
# Hyper-parameter search for the win-probability regressor.
# XGBoost is allowed to use every core (n_jobs=-1), so GridSearchCV itself runs
# the candidates one at a time (n_jobs=1). Asking both layers for all cores
# makes their threads compete for the same CPUs and slows the search down.
model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse', n_jobs=-1)

param_grid = {
    'learning_rate': [0.02, 0.05, 0.1, 0.2],
    'max_depth': [6, 9, 12],
    'n_estimators': [100, 500, 1000]
}

# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=1,
)
grid_search.fit(X_train, y_train)

end_time = time.perf_counter()

training_time = end_time - start_time

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score (negative MSE): {grid_search.best_score_}")

print(f"Training time: {training_time:.4f} seconds")

# best_estimator_ is the best candidate refit on the whole training set
best_model = grid_search.best_estimator_

# Evaluate on the held-out test split (all positions, predictions unclipped)
y_test_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = test_mse ** 0.5
test_r2 = r2_score(y_test, y_test_pred)

print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R^2 Score: {test_r2:.4f}")


Best parameters: {'learning_rate': 0.02, 'max_depth': 9, 'n_estimators': 1000}
Best score (negative MSE): -0.005333922615043891
Training time: 172.6729 seconds
Test MSE: 0.0044
Test RMSE: 0.0667
Test R^2 Score: 0.9562


In [14]:
# Re-score the test predictions on positions outside the end game only.
# NOTE: test_mse / test_rmse / test_r2 are reassigned here, so after this cell
# they describe the filtered subset, not the full test split.
MIN_UNSEEN_TILES = 20  # rows with fewer unseen tiles are excluded from the metrics below

X_test_mod = X_test.copy()
X_test_mod["win_label"] = y_test
# The target is a win probability, so clamp predictions into [0, 1].
# np.clip is vectorized, unlike a per-element apply(lambda ...).
X_test_mod["win_pred"] = np.clip(y_test_pred, 0.0, 1.0)
# Absolute prediction error per row
X_test_mod["diff"] = (X_test_mod["win_pred"] - X_test_mod["win_label"]).abs()
X_test_mod_sans_end_game = X_test_mod[X_test_mod.total_unseen_tiles >= MIN_UNSEEN_TILES]

test_mse = mean_squared_error(X_test_mod_sans_end_game["win_label"], X_test_mod_sans_end_game["win_pred"])
test_rmse = test_mse ** 0.5
test_r2 = r2_score(X_test_mod_sans_end_game["win_label"], X_test_mod_sans_end_game["win_pred"])

print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R^2 Score: {test_r2:.4f}")

Test MSE: 0.0005
Test RMSE: 0.0227
Test R^2 Score: 0.9924


In [10]:
# Cross-validate and refit one fixed configuration (single-candidate grid).
# XGBoost is allowed to use every core (n_jobs=-1), so GridSearchCV itself runs
# the folds one at a time (n_jobs=1). Asking both layers for all cores makes
# their threads compete for the same CPUs and slows the fit down.
model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse', n_jobs=-1)

param_grid = {
    'learning_rate': [0.1],
    'max_depth': [9],
    'n_estimators': [1000]
}

# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=1,
)
grid_search.fit(X_train, y_train)

end_time = time.perf_counter()

training_time = end_time - start_time

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score (negative MSE): {grid_search.best_score_}")

print(f"Training time: {training_time:.4f} seconds")

# best_estimator_ is the configuration refit on the whole training set
best_model = grid_search.best_estimator_

# Evaluate on the held-out test split (all positions, predictions unclipped)
y_test_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = test_mse ** 0.5
test_r2 = r2_score(y_test, y_test_pred)

print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R^2 Score: {test_r2:.4f}")


Best parameters: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 1000}
Best score (negative MSE): -0.005450316423544121
Training time: 26.3745 seconds
Test MSE: 0.0044
Test RMSE: 0.0667
Test R^2 Score: 0.9562


In [12]:
# Re-score the test predictions on positions outside the end game only.
# NOTE: test_mse / test_rmse / test_r2 are reassigned here, so after this cell
# they describe the filtered subset, not the full test split.
MIN_UNSEEN_TILES = 20  # rows with fewer unseen tiles are excluded from the metrics below

X_test_mod = X_test.copy()
X_test_mod["win_label"] = y_test
# The target is a win probability, so clamp predictions into [0, 1].
# np.clip is vectorized, unlike a per-element apply(lambda ...).
X_test_mod["win_pred"] = np.clip(y_test_pred, 0.0, 1.0)
# Absolute prediction error per row
X_test_mod["diff"] = (X_test_mod["win_pred"] - X_test_mod["win_label"]).abs()
X_test_mod_sans_end_game = X_test_mod[X_test_mod.total_unseen_tiles >= MIN_UNSEEN_TILES]

test_mse = mean_squared_error(X_test_mod_sans_end_game["win_label"], X_test_mod_sans_end_game["win_pred"])
test_rmse = test_mse ** 0.5
test_r2 = r2_score(X_test_mod_sans_end_game["win_label"], X_test_mod_sans_end_game["win_pred"])

print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R^2 Score: {test_r2:.4f}")

Test MSE: 0.0005
Test RMSE: 0.0225
Test R^2 Score: 0.9925
