In [None]:
import pandas as pd
import re

I use this notebook to manipulate the outputs of my chess_gpt_eval repository and to run various experiments on them. The outputs are game records, generally produced by playing Stockfish vs. Stockfish or Chess-GPT vs. Stockfish. For standard use, you shouldn't need this notebook.

In [None]:
# Directory holding the CSV files this notebook reads and writes. Paths below
# are built by plain string concatenation, so the trailing "/" is required.
DATA_DIR = "data/"
# Base name (without the ".csv" extension) of the dataset to process.
prefix = "rand_test_2"

# Source CSV and the destination CSV for its filtered version.
input_file = f'{DATA_DIR}{prefix}.csv'
output_file = f'{DATA_DIR}filtered_{prefix}.csv'

In [None]:
# Load the raw game records from the input CSV.
df = pd.read_csv(input_file)
# NOTE(review): `grouped` is not referenced again in the visible notebook —
# confirm it is still needed.
grouped = df.groupby('player_two')


def format_transcript(game: str) -> str:
    """Reduce a raw game record to a compact, ';'-prefixed move string.

    The record is split on blank lines and only the second section (index 1)
    is kept; anything before or after it is discarded. Presumably that
    section is the move list following a header block — confirm against the
    data. The single space after every "<digits>." token is then removed,
    so "1. e4 e5" becomes "1.e4 e5".

    Raises:
        IndexError: if ``game`` contains no blank-line separator.
    """
    sections = game.split('\n\n')
    prefixed = ';' + sections[1]
    return re.sub(r"(\d+\.) ", r"\1", prefixed)

def format_player_name(name: str) -> str:
    """Return the second space-separated token of ``name``.

    Example: "Stockfish 0" becomes "0".

    Raises:
        IndexError: if ``name`` contains no space.
    """
    tokens = name.split(' ')
    return tokens[1]


# Rewrite both columns with their formatted values.
df['transcript'] = df['transcript'].apply(format_transcript)
df['player_two'] = df['player_two'].apply(format_player_name)

# Preview the first few formatted transcripts, each followed by a blank line.
for game in df['transcript'].head():
    print(game, end='\n\n')

In [None]:
# Distribution of transcript lengths, measured in characters.
len_df = df['transcript'].str.len()
print(len_df.describe())

# Every game that is kept ends up exactly this many characters long.
game_length_in_chars = 356

# Data setup. All games must have the same length (in characters): discard
# every game shorter than game_length_in_chars and truncate the rest to
# exactly game_length_in_chars.
filtered_df = df[len_df >= game_length_in_chars].copy()
filtered_df.loc[:, 'transcript'] = filtered_df['transcript'].str.slice(stop=game_length_in_chars)

# Sanity check: after truncation every length should equal game_length_in_chars.
len_df = filtered_df['transcript'].str.len()
print(len_df.describe())

# Number of whitespace-separated tokens in each truncated transcript, used
# here as the game's move count.
move_count_df = filtered_df['transcript'].str.split().str.len()
move_count = move_count_df.describe()
print("move count", move_count)
quarter_percentile = move_count['25%']
print("quarter percentile", quarter_percentile)

# Now filter out games with too few moves: discard every game whose move
# count is below the 25th percentile.
filtered_df = filtered_df[move_count_df >= quarter_percentile]
print(filtered_df.describe())
print(filtered_df.head())

filtered_df.to_csv(output_file, index=False)

# Move-count distribution of the games that were written out.
move_count_df = filtered_df['transcript'].str.split().str.len()
print(move_count_df.describe())

In [None]:
# Total number of games kept, then the row count per distinct `player_two` value.
print(filtered_df.shape[0])
player_two_group_sizes = filtered_df.groupby(by='player_two').size()
print(player_two_group_sizes)

In [None]:
# Shuffle every row of the filtered dataset with a fixed seed, renumber the
# index from zero, and overwrite the file in place.
df = (
    pd.read_csv(output_file)
    .sample(frac=1, random_state=200)
    .reset_index(drop=True)
)
df.to_csv(output_file, index=False)

In [None]:
import pandas as pd
df = pd.read_csv(output_file)
print(len(df))

# Draw a reproducible 50% sample of the rows for training; every row that was
# not sampled forms the test set.
train = df.sample(frac=0.5, random_state=200)
test = df.drop(train.index)

print(len(train), len(test), sep='\n')

# Write both splits to CSV.
# NOTE(review): `prefix` and "train"/"test" are joined without a separator
# (e.g. "rand_test_2train.csv") — confirm this naming is intended.
train.to_csv(f'{DATA_DIR}{prefix}train.csv', index=False)
test.to_csv(f'{DATA_DIR}{prefix}test.csv', index=False)