In [115]:
import os
import pickle
import time
import importlib
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from features.data_processing import load_scrabble_data
from features.quadrant_features import count_tiles_in_quadrants
from game_logic.utils import pretty_print_board

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
data_path = "../data/magpie-sims-400k.csv"
cached_path = "../data/cached_raw.pkl"
# Check if cached file exists
if os.path.exists(cached_path):
    print("Loading cached data...")
    with open(cached_path, "rb") as f:
        df_raw = pickle.load(f)
else:
    print("Processing and caching data...")
    df_raw = load_scrabble_data(data_path)
    
    # Save to cache
    with open(cached_path, "wb") as f:
        pickle.dump(df_raw, f)

# Verify DataFrame loaded
print(f"Data loaded with {len(df_raw)} rows")

Loading cached data...
Data loaded with 395715 rows


In [3]:
from game_logic.dawg import DAWG

# Define dictionary path
dict_path = "../data/serialized_dawg_CSW24.bin"

# Initialize DAWG
dawg = DAWG()

# Load and Deserialize
with open(dict_path, "rb") as f:
    dawg_data = f.read()  # Read entire binary file
    dawg.deserialize(dawg_data)  # Deserialize into DAWG structure

print("DAWG loaded successfully!")


DAWG loaded successfully!


In [4]:
words = dawg.get_all_words()

In [77]:
import game_logic.crosschecks  # Import the full module
import importlib

importlib.reload(game_logic.crosschecks)

# Now call the function
from game_logic.crosschecks import find_anchors_with_cross_checks

from typing import List
import numpy as np

import game_logic.utils

importlib.reload(game_logic.utils)

from game_logic.types import Board, CrossCheckBoard
from game_logic.utils import transpose, pretty_print_board_with_crosschecks


In [58]:
start_time = time.time()
df_raw["cs_h"] = df_raw["board"].apply(lambda b: find_anchors_with_cross_checks(b, dawg))
end_time = time.time()

elapsed_time = time.time() - start_time
print(f"Data loaded in {elapsed_time:.4f} seconds")

Data loaded in 242.7373 seconds


In [59]:
start_time = time.time()
df_raw["cs_v"] = df_raw["board"].apply(lambda b: transpose(find_anchors_with_cross_checks(transpose(b), dawg)))
end_time = time.time()

elapsed_time = time.time() - start_time
print(f"Data loaded in {elapsed_time:.4f} seconds")

Data loaded in 280.4659 seconds


In [86]:
row = 214751
board = df_raw["board"][row]
cs_v = df_raw["cs_v"][row]
cs_h = df_raw["cs_h"][row]

In [87]:
pretty_print_board(board)

 ★  .  .  ◆  .  .  .  ★  A  .  .  ◆  .  .  ★ 
 .  ✦  .  .  .  ▲  .  .  N  ▲  .  .  .  ✦  . 
 .  .  ✦  .  .  .  ◆  .  O  F  .  .  ✦  .  . 
 ◆  .  .  ✦  .  .  .  ◆  A  R  P  A  .  .  ◆ 
 .  .  .  .  ✦  .  .  .  .  I  ✦  .  .  .  . 
 .  ▲  .  .  .  ▲  .  .  .  V  .  .  .  ▲  . 
 .  .  ◆  .  .  .  ◆  .  ◆  O  H  .  ◆  .  . 
 ★  .  .  ◆  .  .  G  A  L  L  O  W  .  .  ★ 
 .  .  ◆  .  .  .  ◆  .  ◆  E  M  .  ◆  .  . 
 .  ▲  .  .  .  ▲  .  .  .  D  I  .  .  ▲  . 
 .  .  .  .  ✦  .  .  .  .  .  E  .  .  .  . 
 ◆  .  .  ✦  .  .  .  ◆  .  .  .  ✦  .  .  ◆ 
 .  .  ✦  .  .  .  ◆  .  ◆  .  .  .  ✦  .  . 
 .  ✦  .  .  .  ▲  .  .  .  ▲  .  .  .  ✦  . 
 ★  .  .  ◆  .  .  .  ★  .  .  .  ◆  .  .  ★ 


In [88]:
pretty_print_board_with_crosschecks(board, cs_h)

 ★  .  .  ◆  .  .  .  ○  A  ○  .  ◆  .  .  ★ 
 .  ✦  .  .  .  ▲  .  ○  N  0  .  .  .  ✦  . 
 .  .  ✦  .  .  .  ◆  ○  O  F  2  15 ✦  .  . 
 ◆  .  .  ✦  .  .  .  ○  A  R  P  A  ○  .  ◆ 
 .  .  .  .  ✦  .  .  .  1  I  4  16 .  .  . 
 .  ▲  .  .  .  ▲  .  .  ○  V  0  .  .  ▲  . 
 .  .  ◆  .  .  .  2  15 2  O  H  3  ◆  .  . 
 ★  .  .  ◆  .  ○  G  A  L  L  O  W  ○  .  ★ 
 .  .  ◆  .  .  .  3  16 3  E  M  2  ◆  .  . 
 .  ▲  .  .  .  ▲  .  .  ○  D  I  ○  .  ▲  . 
 .  .  .  .  ✦  .  .  .  .  0  E  ○  .  .  . 
 ◆  .  .  ✦  .  .  .  ◆  .  .  2  ✦  .  .  ◆ 
 .  .  ✦  .  .  .  ◆  .  ◆  .  .  .  ✦  .  . 
 .  ✦  .  .  .  ▲  .  .  .  ▲  .  .  .  ✦  . 
 ★  .  .  ◆  .  .  .  ★  .  .  .  ◆  .  .  ★ 


In [89]:
pretty_print_board_with_crosschecks(board, cs_v)

 ★  .  .  ◆  .  .  .  15 A  16 .  ◆  .  .  ★ 
 .  ✦  .  .  .  ▲  .  5  N  5  .  .  .  ✦  . 
 .  .  ✦  .  .  .  ◆  3  O  F  2  ○  ✦  .  . 
 ◆  .  .  ✦  .  .  .  0  A  R  P  A  1  .  ◆ 
 .  .  .  .  ✦  .  .  .  14 I  6  ○  .  .  . 
 .  ▲  .  .  .  ▲  .  .  0  V  0  .  .  ▲  . 
 .  .  ◆  .  .  .  ○  ○  8  O  H  3  ◆  .  . 
 ★  .  .  ◆  .  0  G  A  L  L  O  W  1  .  ★ 
 .  .  ◆  .  .  .  ○  ○  6  E  M  4  ◆  .  . 
 .  ▲  .  .  .  ▲  .  .  0  D  I  11 .  ▲  . 
 .  .  .  .  ✦  .  .  .  .  15 E  13 .  .  . 
 ◆  .  .  ✦  .  .  .  ◆  .  .  ○  ✦  .  .  ◆ 
 .  .  ✦  .  .  .  ◆  .  ◆  .  .  .  ✦  .  . 
 .  ✦  .  .  .  ▲  .  .  .  ▲  .  .  .  ✦  . 
 ★  .  .  ◆  .  .  .  ★  .  .  .  ◆  .  .  ★ 


In [94]:
from typing import List, Tuple
from game_logic.types import Board, CrossCheckBoard

def compute_8_letter_bingo_lanes(
    board: Board, crosscheck_board_h: CrossCheckBoard, crosscheck_board_v: CrossCheckBoard
) -> List[Tuple[int, int, str, int]]:
    """
    Computes 8-letter bingo lanes by checking spaces around existing tiles.

    Args:
        board (Board): The Scrabble board (15x15 grid).
        crosscheck_board_h (CrossCheckBoard): Horizontal cross-check constraints.
        crosscheck_board_v (CrossCheckBoard): Vertical cross-check constraints.

    Returns:
        List[Tuple[int, int, str, int]]: A list of (row, col, direction, lane_size) tuples.
    """
    bingo_lanes = []

    def is_valid_extension(r, c, r_next, c_next, crosscheck_board):
        """Returns True if we can extend a word into this space and the next space."""
        if not (0 <= r < 15 and 0 <= c < 15):
            return False  # Out of bounds
        if board[r][c] is not None:
            return False  # Occupied
        if crosscheck_board[r][c] is not None and len(crosscheck_board[r][c].valid_letters) == 0:
            return False  # Cross-check restriction
        if 0 <= r_next < 15 and 0 <= c_next < 15 and (
            board[r_next][c_next] is not None
        ):
            return False  # Next space blocked
        return True

    def has_adjacent_tile_in_direction(r, c, direction):
        """Checks if a tile has an adjacent tile in the given direction."""
        if direction == "H":
            return (c > 0 and board[r][c - 1] is not None) or (c < 14 and board[r][c + 1] is not None)
        if direction == "V":
            return (r > 0 and board[r - 1][c] is not None) or (r < 14 and board[r + 1][c] is not None)
        return False

    # Scan board for existing tiles
    for row in range(15):
        for col in range(15):
            if board[row][col] is None:
                continue  # Skip empty spaces

            # Horizontal playability: Skip if adjacent horizontal tile exists
            if not has_adjacent_tile_in_direction(row, col, "H"):
                left_spaces = 0
                right_spaces = 0

                for i in range(1, 8):
                    if is_valid_extension(row, col - i, row, col - i - 1, crosscheck_board_h):
                        left_spaces += 1
                    else:
                        break

                for i in range(1, 8):
                    if is_valid_extension(row, col + i, row, col + i + 1, crosscheck_board_h):
                        right_spaces += 1
                    else:
                        break

                lane_size_h = max(0, left_spaces + right_spaces - 6)
                if lane_size_h > 0:
                    bingo_lanes.append((row, col, "H", lane_size_h))

            # Vertical playability: Skip if adjacent vertical tile exists
            if not has_adjacent_tile_in_direction(row, col, "V"):
                up_spaces = 0
                down_spaces = 0

                for i in range(1, 8):
                    if is_valid_extension(row - i, col, row - i - 1, col, crosscheck_board_v):
                        up_spaces += 1
                    else:
                        break

                for i in range(1, 8):
                    if is_valid_extension(row + i, col, row + i + 1, col, crosscheck_board_v):
                        down_spaces += 1
                    else:
                        break

                lane_size_v = max(0, up_spaces + down_spaces - 6)
                if lane_size_v > 0:
                    bingo_lanes.append((row, col, "V", lane_size_v))

    return bingo_lanes


In [95]:
compute_8_letter_bingo_lanes(board, cs_h, cs_v)

[(0, 8, 'H', 7),
 (1, 8, 'H', 1),
 (4, 9, 'H', 6),
 (5, 9, 'H', 1),
 (7, 6, 'V', 8),
 (7, 7, 'V', 4),
 (7, 11, 'V', 3)]

In [101]:
pretty_print_board(board)

 ★  .  .  ◆  .  .  .  ★  A  .  .  ◆  .  .  ★ 
 .  ✦  .  .  .  ▲  .  .  N  ▲  .  .  .  ✦  . 
 .  .  ✦  .  .  .  ◆  .  O  F  .  .  ✦  .  . 
 ◆  .  .  ✦  .  .  .  ◆  A  R  P  A  .  .  ◆ 
 .  .  .  .  ✦  .  .  .  .  I  ✦  .  .  .  . 
 .  ▲  .  .  .  ▲  .  .  .  V  .  .  .  ▲  . 
 .  .  ◆  .  .  .  ◆  .  ◆  O  H  .  ◆  .  . 
 ★  .  .  ◆  .  .  G  A  L  L  O  W  .  .  ★ 
 .  .  ◆  .  .  .  ◆  .  ◆  E  M  .  ◆  .  . 
 .  ▲  .  .  .  ▲  .  .  .  D  I  .  .  ▲  . 
 .  .  .  .  ✦  .  .  .  .  .  E  .  .  .  . 
 ◆  .  .  ✦  .  .  .  ◆  .  .  .  ✦  .  .  ◆ 
 .  .  ✦  .  .  .  ◆  .  ◆  .  .  .  ✦  .  . 
 .  ✦  .  .  .  ▲  .  .  .  ▲  .  .  .  ✦  . 
 ★  .  .  ◆  .  .  .  ★  .  .  .  ◆  .  .  ★ 


In [97]:
pretty_print_board_with_crosschecks(board, cs_v)

 ★  .  .  ◆  .  .  .  15 A  16 .  ◆  .  .  ★ 
 .  ✦  .  .  .  ▲  .  5  N  5  .  .  .  ✦  . 
 .  .  ✦  .  .  .  ◆  3  O  F  2  ○  ✦  .  . 
 ◆  .  .  ✦  .  .  .  0  A  R  P  A  1  .  ◆ 
 .  .  .  .  ✦  .  .  .  14 I  6  ○  .  .  . 
 .  ▲  .  .  .  ▲  .  .  0  V  0  .  .  ▲  . 
 .  .  ◆  .  .  .  ○  ○  8  O  H  3  ◆  .  . 
 ★  .  .  ◆  .  0  G  A  L  L  O  W  1  .  ★ 
 .  .  ◆  .  .  .  ○  ○  6  E  M  4  ◆  .  . 
 .  ▲  .  .  .  ▲  .  .  0  D  I  11 .  ▲  . 
 .  .  .  .  ✦  .  .  .  .  15 E  13 .  .  . 
 ◆  .  .  ✦  .  .  .  ◆  .  .  ○  ✦  .  .  ◆ 
 .  .  ✦  .  .  .  ◆  .  ◆  .  .  .  ✦  .  . 
 .  ✦  .  .  .  ▲  .  .  .  ▲  .  .  .  ✦  . 
 ★  .  .  ◆  .  .  .  ★  .  .  .  ◆  .  .  ★ 


In [98]:
from tqdm import tqdm

# Compute bingo lanes for each board in the dataset
df_raw["8_letter_bingo_lanes_list"] = [
    compute_8_letter_bingo_lanes(board, cs_v, cs_h)
    for board, cs_v, cs_h in tqdm(zip(df_raw["board"], df_raw["cs_v"], df_raw["cs_h"]), total=len(df_raw))
]

df_raw["8_letter_bingos"] = df_raw["8_letter_bingo_lanes_list"].apply(
    lambda lanes: sum(lane[3] for lane in lanes) if lanes else 0
)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 395715/395715 [00:45<00:00, 8698.27it/s]


In [107]:
from typing import List, Tuple
from game_logic.types import Board, CrossCheckBoard

def compute_7_letter_bingo_lanes(
    board: Board, crosscheck_board_h: CrossCheckBoard, crosscheck_board_v: CrossCheckBoard
) -> List[Tuple[int, int, str, int]]:
    """
    Computes 7-letter bingo lanes by checking valid cross-check spaces.
    A play can only go through one cross-check constraint before stopping.

    Args:
        board (Board): The Scrabble board (15x15 grid).
        crosscheck_board_h (CrossCheckBoard): Horizontal cross-check constraints.
        crosscheck_board_v (CrossCheckBoard): Vertical cross-check constraints.

    Returns:
        List[Tuple[int, int, str, int]]: A list of (row, col, direction, lane_size) tuples.
    """

    # If board is empty, the only valid lane is (7, 7, "H", 1)
    if board[7][7] is None:
        return [(7, 7, "H", 1)]

    bingo_lanes = []

    def is_valid_bingo_start(r, c, crosscheck_board, direction):
        """Checks if a space is a valid starting point for a 7-letter bingo."""
        if board[r][c] is not None:
            return False  # Cannot play through an existing tile
        if crosscheck_board[r][c] is None or crosscheck_board[r][c].is_open_square:
            return False  # Must be a constrained space
        if len(crosscheck_board[r][c].valid_letters) == 0:
            return False  # No valid letters to play through

        # Ensure the space is not trapped between two tiles
        if direction == "H":
            if (c > 0 and board[r][c - 1] is not None) or (c < 14 and board[r][c + 1] is not None):
                return False  # Tile on both left & right
        elif direction == "V":
            if (r > 0 and board[r - 1][c] is not None) or (r < 14 and board[r + 1][c] is not None):
                return False  # Tile above & below

        return True  # Valid starting point

    def count_empty_spaces(r, c, dr, dc, crosscheck_board):
        """Counts empty spaces until hitting a tile or another cross-check."""
        count = 0

        for i in range(1, 7):  # Maximum 7-letter word placement
            r_next, c_next = r + i * dr, c + i * dc

            # Out of bounds check
            if not (0 <= r_next < 15 and 0 <= c_next < 15):
                break  

            # Stop at the first occupied tile
            if board[r_next][c_next] is not None:
                break  

            # Stop at the first cross-check (we only play through one)
            if crosscheck_board[r_next][c_next] is not None:
                break  

            count += 1  # Valid empty space

        return count

    # Iterate through the board looking for valid cross-check spaces
    for row in range(15):
        for col in range(15):
            # Check horizontal lanes
            if is_valid_bingo_start(row, col, crosscheck_board_h, "H"):
                left_spaces = count_empty_spaces(row, col, 0, -1, crosscheck_board_h)
                right_spaces = count_empty_spaces(row, col, 0, 1, crosscheck_board_h)
                lane_size_h = max(0, left_spaces + right_spaces - 5)  # Need 6 open spaces around a tile
                if lane_size_h > 0:
                    bingo_lanes.append((row, col, "H", lane_size_h))

            # Check vertical lanes
            if is_valid_bingo_start(row, col, crosscheck_board_v, "V"):
                up_spaces = count_empty_spaces(row, col, -1, 0, crosscheck_board_v)
                down_spaces = count_empty_spaces(row, col, 1, 0, crosscheck_board_v)
                lane_size_v = max(0, up_spaces + down_spaces - 5)  # Need 6 open spaces around a tile
                if lane_size_v > 0:
                    bingo_lanes.append((row, col, "V", lane_size_v))

    return bingo_lanes


In [112]:
row = 324751
board = df_raw["board"][row]
cs_v = df_raw["cs_v"][row]
cs_h = df_raw["cs_h"][row]

compute_7_letter_bingo_lanes(board, cs_h, cs_v)

[(4, 1, 'V', 1)]

In [113]:
pretty_print_board(board)

 ★  F  .  B  .  .  .  D  I  S  S  o  L  V  E 
 .  A  K  A  .  ▲  .  .  .  T  .  .  .  ✦  . 
 .  G  A  T  .  .  U  N  B  O  I  L  E  D  . 
 ◆  .  O  H  M  .  G  Y  A  N  .  ✦  .  .  ◆ 
 .  .  N  .  I  .  .  .  .  Y  ✦  .  .  .  . 
 .  ▲  .  .  g  ▲  .  .  .  I  .  .  N  ▲  . 
 .  .  ◆  .  R  .  ◆  A  W  N  .  .  A  .  . 
 ★  .  .  W  A  R  T  H  O  G  .  ◆  I  .  ★ 
 .  .  ◆  .  I  .  ◆  .  P  .  .  .  L  .  . 
 .  ▲  .  .  N  ▲  .  .  .  J  A  P  E  ▲  . 
 .  .  .  D  E  V  I  A  T  O  R  .  R  .  . 
 ◆  .  .  ✦  .  .  .  U  .  E  C  U  S  .  ◆ 
 .  .  ✦  .  .  .  ◆  R  ◆  .  .  .  ✦  .  . 
 .  ✦  .  .  .  ▲  .  E  .  ▲  .  .  .  ✦  . 
 ★  .  .  ◆  .  .  .  I  .  .  .  ◆  .  .  ★ 


In [114]:

# Compute bingo lanes for each board in the dataset
df_raw["7_letter_bingo_lanes_list"] = [
    compute_7_letter_bingo_lanes(board, cs_v, cs_h)
    for board, cs_v, cs_h in tqdm(zip(df_raw["board"], df_raw["cs_v"], df_raw["cs_h"]), total=len(df_raw))
]

df_raw["7_letter_bingos"] = df_raw["7_letter_bingo_lanes_list"].apply(
    lambda lanes: sum(lane[3] for lane in lanes) if lanes else 0
)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 395715/395715 [00:44<00:00, 8882.81it/s]


In [117]:
df_raw.to_pickle("scrabble_data_rnd.pkl")

In [118]:
df_raw.memory_usage(deep=True).sum() / (1024**2)  # Convert bytes to MB

np.float64(501.7918510437012)

In [120]:
df_raw.memory_usage(deep=True).sort_values(ascending=False)[:10] / (1024**2)  # MB per column


board                        72.457581
cs_v                         72.457581
cs_h                         72.457581
board_rep                    61.036774
8_letter_bingo_lanes_list    58.829193
7_letter_bingo_lanes_list    24.166458
unseen_J                      3.019066
unseen_O                      3.019066
unseen_N                      3.019066
unseen_M                      3.019066
dtype: float64