In [1]:
import os
import pickle
import time
import importlib
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from features.data_processing import load_scrabble_data
from features.quadrant_features import count_tiles_in_quadrants
from game_logic.utils import pretty_print_board

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the precomputed numerical feature table from its on-disk pickle cache.
# NOTE(review): unpickling can execute arbitrary code — only load cache files
# that were produced by this project.
cached_features_path = "../data/numerical_features.pkl"
df = pd.read_pickle(cached_features_path)

In [10]:
# Export the feature table to CSV.
# NOTE(review): to_csv defaults to index=True, so the row index is written as
# an unnamed first column; confirm downstream readers expect that (otherwise
# pass index=False). This cell's execution count (10) is also out of order
# relative to the cells around it.
df.to_csv("../data/numerical_features.csv")

In [4]:
# Preview the first rows of the feature table as a sanity check.
df.head()

Unnamed: 0,score_diff,total_unseen_tiles,leave_A,leave_B,leave_C,leave_D,leave_E,leave_F,leave_G,leave_H,...,accessible_TWS_count,accessible_DWS_count,available_TWS_TWS,available_DWS_TLS,available_DWS_DWS,available_DLS_TWS,quadrant_counts_upper_left,quadrant_counts_upper_right,quadrant_counts_lower_left,quadrant_counts_lower_right
0,12,93,1,0,0,0,1,0,0,0,...,2,2,0,0,1,0,0,0,0,0
1,0,97,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12,93,1,0,0,0,2,0,0,0,...,2,0,0,0,0,0,0,0,0,0
3,12,93,1,0,0,0,2,0,0,0,...,2,2,0,0,1,0,0,0,0,0
4,6,93,1,0,0,0,1,0,0,0,...,2,0,0,0,0,0,0,0,0,0


In [6]:
# Step 1: Separate the feature matrix (X) from the regression target (y)

print(len(df))

# Both outcome columns are removed from the features; only winProb is modelled.
X = df.drop(['winProb', 'expPointDiff'], axis="columns")
y = df['winProb']  # win probability, treated as a continuous target

# Step 2: Hold out the last 20% of rows as the test set (first 80% for training).
# shuffle=False keeps the original row order, so random_state has no effect here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)


395715


In [8]:
import time
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Step 3: Configure the base XGBoost regressor (squared-error objective,
# RMSE as the evaluation metric, single-threaded estimator).
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_jobs=1,
)

# Hyperparameter candidates for the grid search: 2 x 2 x 2 = 8 combinations
param_grid = dict(
    learning_rate=[0.1, 0.2],
    max_depth=[9, 12],
    n_estimators=[500, 1000],
)

# Custom function to summarise the cross-validation results
def print_progress(grid_search):
    """Print the mean cross-validated MSE for every parameter combination.

    The search in this notebook is scored with ``neg_mean_squared_error``, so
    ``cv_results_['mean_test_score']`` holds *negated* MSE values. The sign is
    flipped here so the number printed under the "Mean MSE" label is the
    actual (non-negative) mean squared error.

    Parameters
    ----------
    grid_search : object
        A fitted search object exposing ``cv_results_`` with the keys
        ``"params"`` and ``"mean_test_score"``.
    """
    print("\nTraining Progress:")
    results = grid_search.cv_results_
    for params, neg_mse in zip(results["params"], results["mean_test_score"]):
        # Undo the sign convention of the neg_mean_squared_error scorer.
        print(f"Params: {params} - Mean MSE: {-neg_mse:.4f}")

# Step 4: Time the grid search process.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration is not distorted by system clock adjustments during the long fit.
start_time = time.perf_counter()

# Step 5: Perform grid search with 3-fold cross-validation.
# Candidates are evaluated in parallel (n_jobs=-1); scores are negated MSE,
# so a higher (closer to zero) score is better.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2  # This will print each iteration
)
grid_search.fit(X_train, y_train)

# Record the end time after training
end_time = time.perf_counter()
training_time = end_time - start_time

# Print the per-candidate cross-validation summary
print_progress(grid_search)

# Print the best parameters and score
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best score (negative MSE): {grid_search.best_score_}")

# Print training time
print(f"Training time: {training_time:.4f} seconds")

# Step 6: Use the best model from grid search and evaluate on the test set
best_model = grid_search.best_estimator_

# Predict on the held-out test set
y_test_pred = best_model.predict(X_test)

# Evaluate performance on the test set
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = test_mse ** 0.5
test_r2 = r2_score(y_test, y_test_pred)

print("\nTest Set Evaluation:")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R^2 Score: {test_r2:.4f}")

# Step 7: Visualize feature importance.
# The feature table is wide, so only the 20 most important features are drawn
# on an explicitly sized axes to keep the labels legible (remove
# max_num_features to plot every feature).
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(best_model, ax=ax, max_num_features=20)
plt.show()


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END .learning_rate=0.1, max_depth=12, n_estimators=1000; total time=  50.4s
[CV] END .learning_rate=0.1, max_depth=12, n_estimators=1000; total time=  50.3s
[CV] END .learning_rate=0.1, max_depth=12, n_estimators=1000; total time=  50.0s
[CV] END ...learning_rate=0.1, max_depth=9, n_estimators=500; total time=  17.0s
[CV] END ...learning_rate=0.2, max_depth=9, n_estimators=500; total time=  16.3s
[CV] END ..learning_rate=0.1, max_depth=12, n_estimators=500; total time=  26.0s



KeyboardInterrupt

