In [1]:
import os
import io
import chess
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from chess_dataset import ChessDataImporter
from mech_interp.fixTL import make_official

In [3]:
# Directory that holds every CSV/pickle artifact this notebook reads or writes.
DATA_DIR = "chess_data/"
# Filename prefix shared by the checkpoint and train/test files used in later cells.
prefix = "lichess_"

# Raw UCI transcripts; the next cell downloads and caches them here when the file is missing.
input_file = f'{DATA_DIR}lichess_uci.csv'
# Filtered dataset written by the processing cells further down.
output_file = f'{DATA_DIR}lichess_6gb_filtered.csv'

In [4]:
# Download the raw UCI transcripts once and cache them locally as CSV.
if not os.path.exists(input_file):
    # to_csv does not create missing parent directories, so make sure the
    # data directory exists before writing (e.g. on a fresh checkout).
    os.makedirs(os.path.dirname(input_file) or ".", exist_ok=True)

    dataset_path = "austindavis/chess_mi"
    file_path = "lichess_uci.zip"
    dataset = load_dataset(dataset_path, data_files=file_path)
    df = pd.DataFrame(dataset['train'])
    df.to_csv(input_file, index=False)

In [None]:
# Load the game table that the following cells tokenize.
# NOTE(review): this reads the "100mb_checkpoint" file rather than `input_file`, and no
# visible cell writes it — confirm which source is intended.
df = pd.read_csv(f'{DATA_DIR}{prefix}100mb_checkpoint.csv')

In [None]:
# Resolve the model identifier and load its tokenizer.
# `make_official()` is a project helper; what it returns is not shown here —
# presumably a model name or path accepted by `from_pretrained` (TODO confirm).
# NOTE(review): `PreTrainedTokenizerFast` is not imported in any visible cell
# (presumably `from transformers import PreTrainedTokenizerFast`), so this cell
# would raise NameError on a fresh kernel — confirm and add the import up top.
MODEL_NAME = make_official()
tokenizer: PreTrainedTokenizerFast = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME)

In [None]:
# Tokenize every transcript in a single batch. The per-token character offsets are
# requested as well; a later cell uses them to cut each transcript at a token boundary.
transcripts = df['transcript'].tolist()
encoded = tokenizer.batch_encode_plus(
    transcripts,
    add_special_tokens=True,
    return_offsets_mapping=True,
)

In [None]:
# Attach the tokenizer output (token ids and per-token character offsets) to each game.
df['input_ids'] = encoded['input_ids']
df['offsets'] = encoded['offset_mapping']

# Token-count distribution before any filtering.
len_df = df['input_ids'].apply(len)
print(len_df.describe())

# Data setup: every game must end up with the same number of tokens.
# Author's note: 50% of games are >= 134 long (presumably read off the summary above),
# so games shorter than 126 tokens are discarded and the rest are cut to exactly 126.
game_length_in_tokens = 126

is_long_enough = df['input_ids'].apply(lambda ids: len(ids) >= game_length_in_tokens)
filtered_df = df[is_long_enough].copy()
filtered_df.loc[:, 'input_ids'] = filtered_df['input_ids'].apply(
    lambda ids: ids[:game_length_in_tokens]
)

# Token-count distribution after filtering and truncation.
len_df = filtered_df['input_ids'].apply(len)
print(len_df.describe())

# Every remaining game is now encoded to exactly `game_length_in_tokens` (126) tokens. The transcripts must be truncated to match.
def truncate_transcript(row):
    """Cut a row's transcript so it ends where its last kept token ends.

    ``row`` is indexed by ``'input_ids'``, ``'offsets'`` and ``'transcript'``.
    The cut position is the last element of the offset entry that belongs to
    the final token in ``input_ids``. When ``input_ids`` is empty, or longer
    than ``offsets``, the transcript is returned unchanged.
    """
    n_tokens = len(row['input_ids'])

    # Guard: nothing to align against, so leave the text as it is.
    if n_tokens == 0 or n_tokens > len(row['offsets']):
        return row['transcript']

    last_span = row['offsets'][n_tokens - 1]
    return row['transcript'][:last_span[-1]]

# Build the transcript text that matches the truncated token ids.
filtered_df['truncated_transcript'] = filtered_df.apply(truncate_transcript, axis=1)

# NOTE(review): this summarizes the *untruncated* 'transcript' column. If the intent
# was to inspect the result of the truncation it should read 'truncated_transcript' — confirm.
len_df = filtered_df['transcript'].apply(len)
print(len_df.describe())

# Finally, rule out games with promotions because
# 1. those games' token positions are slightly offset, and
# 2. promotion tokens leak information to the model.
# A UCI promotion move carries a fifth character, so any 5-character word marks one.
has_promote = filtered_df['truncated_transcript'].apply(
    lambda x: any(len(word) == 5 for word in x.split())
)
# .copy() yields an independent frame, so the column assignments made by later cells
# do not raise SettingWithCopyWarning (same pattern as the length filter earlier in this cell).
filtered_df = filtered_df[~has_promote].copy()
len_df = filtered_df['truncated_transcript'].apply(len)
print(len_df.describe())


So, there are 144,220 games whose truncated transcript is exactly 312 characters long. None of these games includes a pawn promotion, and every one of them encodes to exactly 126 token ids.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG.
# NOTE(review): no visible cell draws from it — the train/test split further down passes
# its own random_state — so this may be vestigial; confirm before relying on it.
np.random.seed(0)

# Function to create binned columns and bin index columns
def create_binned_columns(df, column_name, num_bins=6):
    """Add quantile-bin label and bin-index columns for ``column_name`` to ``df`` in place.

    Two columns are written:

    * ``f'{column_name}Binned'`` — the bin each value falls in, formatted as a
      ``'(left, right]'`` string.
    * ``f'{column_name}BinIndex'`` — the position of that bin, as returned by
      ``pd.qcut(..., labels=False)``.

    Args:
        df: DataFrame to modify; must contain ``column_name``.
        column_name: Name of the numeric column to bin.
        num_bins: Number of quantile bins requested (defaults to 6). Fewer bins
            result when quantile edges coincide, because duplicate edges are dropped.

    Returns:
        None. ``df`` is mutated.
    """
    binned_column_name = f'{column_name}Binned'
    bin_index_column_name = f'{column_name}BinIndex'

    # Quantile-based bins; duplicates='drop' merges identical edges instead of raising.
    intervals = pd.qcut(df[column_name], q=num_bins, duplicates='drop')

    # Store the interval labels as plain strings.
    df[binned_column_name] = intervals.apply(lambda x: f'({x.left}, {x.right}]')

    # labels=False returns the integer position of each value's bin.
    df[bin_index_column_name] = pd.qcut(df[column_name], q=num_bins, labels=False, duplicates='drop')

# Bin both players' ratings, then persist the augmented frame.
for elo_column in ('WhiteElo', 'BlackElo'):
    create_binned_columns(filtered_df, elo_column)

filtered_df.to_csv(output_file, index=False)

# Two stacked panels: the raw rating histogram on top, per-bin counts underneath.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
hist_ax, bar_ax = axes

# Top panel: distribution of White's rating.
hist_ax.hist(filtered_df['WhiteElo'], bins=30, color='blue', alpha=0.7)
hist_ax.set(title='WhiteElo Distribution', xlabel='WhiteElo', ylabel='Frequency')

# Bottom panel: number of games in each WhiteElo quantile bin.
bin_counts = filtered_df['WhiteEloBinned'].value_counts()
bar_ax.bar(bin_counts.index.astype(str), bin_counts.values, color='green', alpha=0.7)
bar_ax.set(title='WhiteElo Binned Distribution', xlabel='WhiteElo Bins', ylabel='Count')
# plt.xticks acts on the current axes, i.e. the last subplot created (the bar chart).
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()



In [None]:
# From here on 'transcript' holds the truncated text; the full game text is kept
# under 'complete_transcript'.
column_renames = {
    'transcript': 'complete_transcript',
    'truncated_transcript': 'transcript',
}
filtered_df = filtered_df.rename(columns=column_renames)
print(filtered_df['WhiteEloBinned'].value_counts())
print(filtered_df.head())

In [None]:
# Overwrite the CSV saved earlier so that it carries the renamed transcript columns.
filtered_df.to_csv(output_file,index=False)

# Add FEN board State

In [5]:
# Rebuild the full dataset from the saved train and test splits.
df = pd.read_csv(f'{DATA_DIR}{prefix}train.csv')
print(len(df))
# ignore_index=True renumbers the rows 0..n-1. Without it both halves keep their own
# 0-based labels, and the duplicated labels break label-based steps later on
# (e.g. df.drop(train.index) in the split cell would remove rows from both halves).
df = pd.concat([df, pd.read_csv(f'{DATA_DIR}{prefix}test.csv')], ignore_index=True)
print(len(df))



129798
144220


In [6]:
from mech_interp.utils import uci_to_board
# Replay every transcript and record the FEN string of each board in the returned stack.
# uci_to_board is a project helper; what it yields for an illegal move when
# fail_silent=True is not visible here — TODO confirm.
skinny_df = df[['transcript', 'input_ids']]
fen_stack = [
    [
        board.fen()
        for board in uci_to_board(
            transcript.strip(),
            force=False,
            fail_silent=True,
            verbose=False,
            as_board_stack=True,
        )
    ]
    for transcript in tqdm(skinny_df['transcript'])
]

100%|██████████| 144220/144220 [23:16<00:00, 103.24it/s]


In [24]:
# Attach the per-game FEN lists (a plain list is assigned row by row, in order) and save.
# Writing to CSV stores list-valued columns as text; later cells parse them back.
df['fen_stack'] = fen_stack
df.to_csv(output_file, index=False)

# Split Dataset 

In [56]:
import pandas as pd
# df = pd.read_csv(output_file)

print(len(df))

# 90/10 split: draw the training rows at random with a fixed seed; whatever is left
# forms the test set. Dropping by label assumes df's index labels are unique.
train = df.sample(frac=0.9, random_state=200)
test = df.drop(index=train.index)

for split in (train, test):
    print(len(split))

# CSV export of the splits is disabled; the pickles below are what this cell writes.
# train.to_csv(f'{DATA_DIR}{prefix}train.csv', index=False)
# test.to_csv(f'{DATA_DIR}{prefix}test.csv', index=False)

train.to_pickle(f'{DATA_DIR}{prefix}train.pkl')
test.to_pickle(f'{DATA_DIR}{prefix}test.pkl')

144220
129798
14422


In [3]:
# Reload the saved CSV. Columns that held lists come back as strings and are
# parsed into Python objects by the cells below.
df = pd.read_csv(output_file)


In [5]:
# Save the frame as a pickle next to the CSV (same path, .pkl extension).
# NOTE(review): in top-to-bottom order this runs before the list columns are parsed
# in the cells below, so the pickle would hold them as strings — confirm the intended order.
pd.to_pickle(df, output_file.replace(".csv",".pkl"))

Index(['WhiteElo', 'BlackElo', 'Result', 'complete_transcript', 'input_ids',
       'offsets', 'transcript', 'WhiteEloBinned', 'WhiteEloBinIndex',
       'BlackEloBinned', 'BlackEloBinIndex', 'fen_stack'],
      dtype='object')

In [24]:
def _parse_token_ids(serialized):
    """Turn the CSV text form of a token-id list (e.g. "[1, 2, 3]") into a list of ints."""
    inner = serialized.strip("[]")
    return [int(piece) for piece in inner.split(', ')]


# Restore the token ids, which the CSV round trip left as strings.
df['input_ids'] = df['input_ids'].apply(_parse_token_ids)


In [38]:
import ast
# Parse the stringified offset lists back into Python objects.
df['offsets'] = df['offsets'].apply(ast.literal_eval)

In [46]:
# Parse the stringified FEN lists back into Python lists.
parsed_fens = df['fen_stack'].map(ast.literal_eval)
df['fen_stack'] = parsed_fens

In [59]:
# Spot-check the Python type stored in each column, using the first row.
# Iterates df's own columns so the cell does not depend on any other frame being
# defined, and reads the first row by position so it works whatever the index labels are.
for col in df.columns:
    print(f"{col}: {type(df[col].iloc[0])}")

WhiteElo: <class 'numpy.int64'>
BlackElo: <class 'numpy.int64'>
Result: <class 'str'>
complete_transcript: <class 'str'>
input_ids: <class 'list'>
offsets: <class 'list'>
transcript: <class 'str'>
WhiteEloBinned: <class 'str'>
WhiteEloBinIndex: <class 'numpy.int64'>
BlackEloBinned: <class 'str'>
BlackEloBinIndex: <class 'numpy.int64'>
fen_stack: <class 'list'>
