In [1]:
import pandas as pd
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV

In [2]:
file_path = "../data/magpie-sims-400k.csv"

TILE_DIST = {
    "A": 9, "B": 2, "C": 2, "D": 4, "E": 12, "F": 2, "G": 3, "H": 2, "I": 9,
    "J": 1, "K": 1, "L": 4, "M": 2, "N": 6, "O": 8, "P": 2, "Q": 1, "R": 6,
    "S": 4, "T": 6, "U": 4, "V": 2, "W": 2, "X": 1, "Y": 2, "Z": 1, "?": 2
}

BASE_LEAVE = {k: 0 for k in TILE_DIST.keys()}

## Data Ingestion + Feature Engineering

In [6]:
def parse_run_tile_representation(data):
    """
    Parse a Scrabble run-tile representation into a 15x15 board.

    Args:
        data (str): The run-tile representation with rows separated by slashes.

    Returns:
        list[list[None|str]]: A 15x15 board where each cell is either None, a letter (A-Z), or a blank (a-z).
    """
    board = [[None for _ in range(15)] for _ in range(15)]  # Initialize 15x15 board with None

    rows = data.split('/')
    for row_index, row_data in enumerate(rows):
        col_index = 0
        i = 0
        
        while i < len(row_data):
            # Read consecutive empty squares (numbers)
            if row_data[i].isdigit():
                num_empty = 0
                while i < len(row_data) and row_data[i].isdigit():
                    num_empty = num_empty * 10 + int(row_data[i])
                    i += 1
                col_index += num_empty
            
            # Read tiles
            elif row_data[i].isalpha():
                board[row_index][col_index] = row_data[i]
                col_index += 1
                i += 1
            
            # Ignore unexpected characters
            else:
                i += 1

    return board

In [27]:
# Open the file and iterate through each line
cnt = 0
training_data = []

start_time = time.time()

with open(file_path, 'r') as file:
    for line in file:
        cnt += 1
        # Strip the line of leading/trailing whitespace
        line = line.strip()
        
        # Split the line by spaces (or any other delimiter as needed)
        parts = line.split()
        
        board_state = parts[0]
        board = parse_run_tile_representation(board_state)
        leave = parts[1].replace("/", "")
        opp_score, player_score = map(int, parts[2].split("/"))
        score_diff = player_score - opp_score
        _, winProb, expDiff = map(float, parts[3].split(","))
        
        unseen_tiles = dict(TILE_DIST)
        leave_dict = dict(BASE_LEAVE)
        
        for el in board_state:
            if not el.isalpha():
                continue
            if el.islower():
                unseen_tiles["?"] -= 1
            else:
                unseen_tiles[el] -= 1
        
        for el in leave:
            unseen_tiles[el] -= 1
            leave_dict[el] += 1
        
        leave_flattened = [leave_dict[letter] for letter in TILE_DIST.keys()]
        unseen_tiles_flattened = [unseen_tiles[letter] for letter in TILE_DIST.keys()]
        
        # Total number of unseen tiles remaining
        total_unseen_tiles = sum(unseen_tiles.values())
        
        # Create a row for this data point with columns for each letter
        row = {
            "board": board,
            "board_rep": board_state, 
            "score_diff": score_diff,
            "total_unseen_tiles": total_unseen_tiles,
            **{f"leave_{letter}": leave_dict[letter] for letter in TILE_DIST.keys()},
            **{f"unseen_{letter}": unseen_tiles[letter] for letter in TILE_DIST.keys()},
            "winProb": winProb,
            "expPointDiff": expDiff
        }
        
        # Append the row to the training data list
        training_data.append(row)
        
# Record the end time
end_time = time.time()

# Calculate the total time taken
elapsed_time = end_time - start_time
print(f"Total execution time: {elapsed_time:.4f} seconds")

Total execution time: 19.9459 seconds


In [17]:
def extract_board_features(board):
    new_board = np.zeros([15, 15])
    for i in range(15):
        for j in range(15):
            if(board[i][j]):
                new_board[i][j] = 1
    return new_board

In [19]:
df = pd.DataFrame(training_data)
df["board"] = df["board"].apply(extract_board_features)

In [30]:
df.columns

Index(['board', 'score_diff', 'total_unseen_tiles', 'leave_A', 'leave_B',
       'leave_C', 'leave_D', 'leave_E', 'leave_F', 'leave_G', 'leave_H',
       'leave_I', 'leave_J', 'leave_K', 'leave_L', 'leave_M', 'leave_N',
       'leave_O', 'leave_P', 'leave_Q', 'leave_R', 'leave_S', 'leave_T',
       'leave_U', 'leave_V', 'leave_W', 'leave_X', 'leave_Y', 'leave_Z',
       'leave_?', 'unseen_A', 'unseen_B', 'unseen_C', 'unseen_D', 'unseen_E',
       'unseen_F', 'unseen_G', 'unseen_H', 'unseen_I', 'unseen_J', 'unseen_K',
       'unseen_L', 'unseen_M', 'unseen_N', 'unseen_O', 'unseen_P', 'unseen_Q',
       'unseen_R', 'unseen_S', 'unseen_T', 'unseen_U', 'unseen_V', 'unseen_W',
       'unseen_X', 'unseen_Y', 'unseen_Z', 'unseen_?', 'winProb',
       'expPointDiff'],
      dtype='object')

In [25]:
df["board"][1]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.

## Training

In [31]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame
num_rows = len(df)
split_index = int(0.8 * num_rows)

train_data = df.iloc[:split_index]
test_data = df.iloc[split_index:]

print(f"Training data size: {len(train_data)}, Test data size: {len(test_data)}")

ModuleNotFoundError: No module named 'tensorflow'