In [1]:
# import pandas as pd

# # For testing: Create a small df

# # Read the CSV file
# df = pd.read_csv('mac_dataset.csv')

# # Randomly sample 10000 rows
# sampled_df = df.sample(n=10000)

# # Sort the sampled DataFrame by the length of the 'transcript' column
# # sampled_df.sort_values(by='transcript', key=lambda x: x.str.len(), ascending=True, inplace=True)

# # Save the sampled and sorted DataFrame to a new CSV file
# sampled_df.to_csv('test_input_dataset.csv', index=False)


In [14]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

# Initialise pandarallel so the `.parallel_apply` calls in later cells are available.
pandarallel.initialize(progress_bar=True)

# NOTE: the second assignment overrides the first, so every later cell that
# reads `input_file` works on 'test_input_dataset.csv'. Comment that line out
# to run on 'mac_dataset.csv' instead.
input_file = 'mac_dataset.csv'
input_file = 'test_input_dataset.csv'

def dedup_dataset(input_file: str):
    """Step 1: remove games with a duplicate transcript from a CSV file, in place.

    The file at ``input_file`` is read, every row whose ``transcript`` value
    already appeared in an earlier row is dropped (the first occurrence is
    kept), and the result is written back to the same path without the index
    column. The row counts before and after are printed.

    Args:
        input_file: Path of the CSV file to deduplicate. It must contain a
            ``transcript`` column; the file is overwritten.
    """
    df = pd.read_csv(input_file)

    print(f"before: {len(df)}")

    # Vectorised replacement for walking the rows and tracking seen transcripts.
    df = df.drop_duplicates(subset='transcript', keep='first')

    print(f"after: {len(df)}")

    df.to_csv(input_file, index=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [15]:
import pandas as pd
import random

# Step 2: Add player skill to beginning of every transcript
# At the end of this, we will save only the 'transcript' column to 'input_dataset.csv', as other info is no longer needed

def map_skill_to_int(skill: int) -> str:
    """Convert a raw skill number into a one-character label.

    The rules are applied in this order:
      * 20 always becomes '9'.
      * Otherwise a single ``random.random()`` draw is made; a value above
        0.5 yields '?'.
      * -2 becomes '0'.
      * Any other value is scaled linearly from [-1, 19] onto [1, 8], rounded,
        clamped to [1, 8] and returned as a string.
    """
    if skill == 20:
        return '9'

    draw = random.random()
    if draw > 0.5:
        return '?'

    if skill == -2:
        return '0'

    # Source interval of the raw skill and the interval it is projected onto.
    src_lo, src_hi = -1, 19
    dst_lo, dst_hi = 1, 8

    fraction = (skill - src_lo) / (src_hi - src_lo)
    scaled = fraction * (dst_hi - dst_lo) + dst_lo

    # Round, then clamp so out-of-range skills stay inside the target interval.
    label = round(scaled)
    if label < dst_lo:
        label = dst_lo
    if label > dst_hi:
        label = dst_hi
    return str(label)

def transform_row(row):
    """Replace a row's transcript with a skill-prefixed version of its move text.

    The second whitespace-separated token of ``row['player_one']`` and of
    ``row['player_two']`` is parsed as an integer and mapped through
    ``map_skill_to_int``. The transcript is split on blank lines and only the
    second section is kept, prefixed with ``[<one>,<two>]``. ``row`` is
    updated in place and returned.
    """
    # Parse both skill numbers first, then map them (player one before player two).
    levels = [int(row[column].split()[1]) for column in ('player_one', 'player_two')]
    label_one, label_two = [map_skill_to_int(level) for level in levels]

    sections = row['transcript'].split('\n\n')
    moves = sections[1]

    row['transcript'] = "[{},{}]{}".format(label_one, label_two, moves)
    return row

# skills = range(-2,21)

# for skill in skills:
#     print(skill, map_skill_to_int(skill))
# df_subset = df.sample(10)
# df_transformed = df_subset.apply(transform_row, axis=1)
# print(df_transformed)

In [16]:
# Run Step 2: apply transform_row to every row of `input_file` via pandarallel.
df = pd.read_csv(input_file)
df = df.parallel_apply(transform_row, axis=1)
# Only the 'transcript' column is written out; all other columns are left behind here.
df['transcript'].to_csv('input_dataset.csv', index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1250), Label(value='0 / 1250'))), …

In [17]:
import chess
from stockfish import Stockfish
import random
import multiprocessing

def terminate_stockfish():
    """Tell the Stockfish worker to stop and block until its process has ended.

    Uses the module-level ``input_queue`` and ``stockfish_process``.
    """
    # The worker leaves its loop when the first element of a message is "EXIT".
    shutdown_message = ("EXIT", None)
    input_queue.put(shutdown_message)
    stockfish_process.join()

def stockfish_worker(input_queue, output_queue, engine_path="stockfish", depth=10):
    """Serve Stockfish evaluations to another process over two queues.

    Loops until it receives a message whose first element is ``"EXIT"``.
    Every other message is a ``(fen, move)`` pair: the FEN position is loaded
    into the engine, evaluated, and ``(move, evaluation)`` is put on
    ``output_queue`` so the requester can match the reply to its request.

    Args:
        input_queue: Queue delivering ``(fen, move)`` requests, or
            ``("EXIT", None)`` to stop the worker.
        output_queue: Queue that receives ``(move, evaluation)`` replies.
        engine_path: Stockfish executable to launch. Defaults to
            ``"stockfish"`` (the value previously used on macOS);
            ``"/usr/games/stockfish"`` was the recorded Linux location.
        depth: Search depth configured on the engine (previously fixed at 10).
    """
    stockfish = Stockfish(engine_path)
    stockfish.set_depth(depth)
    while True:
        fen, move = input_queue.get()
        if fen == "EXIT":
            break
        stockfish.set_fen_position(fen)
        evaluation = stockfish.get_evaluation()
        # Echo the move back so the caller can verify request/response pairing.
        output_queue.put((move, evaluation))

# Step 3: Randomly insert centipawn
def map_eval_to_int(evaluation: dict) -> int:
    """Compress an engine evaluation into an integer bucket in [-9, 9].

    Args:
        evaluation: Dict with a ``'type'`` key and a numeric ``'value'`` key.
            When ``'type'`` is ``'mate'`` the value is a mate distance (for
            example 3 is mate in 3 for white, -2 is mate in 2 for black);
            any other type is treated as a centipawn score.

    Returns:
        9 / -9 for a mate with positive / non-positive value, 8 / -8 for a
        centipawn score above 700 / below -700, otherwise the score scaled
        linearly from [-700, 700] onto [-7, 7] and rounded. The result is
        always an ``int``, as the annotation promises; previously the scaled
        branch returned a ``str``.
    """
    value = evaluation['value']

    if evaluation['type'] == 'mate':
        return 9 if value > 0 else -9

    # Not mate, so the value is a centipawn advantage.
    if value > 700:
        return 8
    if value < -700:
        return -8

    original_min, original_max = -700, 700
    target_min, target_max = -7, 7

    original_range = original_max - original_min
    target_range = target_max - target_min

    # Scale the centipawn value into the target interval.
    scaled_value = ((value - original_min) / original_range) * target_range + target_min

    # Round and clamp so the result never leaves the target interval.
    return min(target_max, max(target_min, round(scaled_value)))
    
def game_over_to_value(board_result: str) -> int:
    """Translate a finished game's result string into its evaluation value.

    "1-0" gives 9, "0-1" gives -9 and "1/2-1/2" gives 0. Any other string
    raises ``KeyError``.
    """
    outcomes = ("1-0", "0-1", "1/2-1/2")
    values = (9, -9, 0)
    lookup = dict(zip(outcomes, values))
    return lookup[board_result]

def insert_centipawn(moves_string: str, depth: int = 9, frequency: float = 0.03) -> str:
    """Replay a game and, after randomly chosen moves, insert an evaluation token.

    Each whitespace-separated token of ``moves_string`` is played on a fresh
    board and copied to the output followed by a space. After each move, with
    probability ``frequency``, an evaluation token is appended:

      * if the game is over, ``" <V> "`` where V comes from
        ``game_over_to_value``;
      * otherwise ``"<V "`` where V is the Stockfish evaluation of the
        position mapped through ``map_eval_to_int``. The evaluation is
        requested from the worker via the module-level ``input_queue`` /
        ``output_queue``.

    Each token is appended exactly once. Previously the engine-evaluation
    token was added twice because of a duplicated ``+=``.

    NOTE(review): the two token formats differ (the engine token has no
    closing ``>``). This is kept as-is; confirm which format is intended.

    Args:
        moves_string: Space-separated moves; tokens containing a '.' (for
            example a move number glued to the move) have everything up to
            the first '.' ignored when the move is played.
        depth: Currently unused by this function; the search depth is
            configured in the Stockfish worker.
        frequency: Probability of inserting an evaluation after each move.

    Returns:
        The move string with evaluation tokens inserted.
    """
    board = chess.Board()
    pieces = []

    for move in moves_string.split():
        # Drop the part before the '.' (move number) when playing the move.
        san = move.split(".")[1] if '.' in move else move
        board.push_san(san)

        pieces.append(move + " ")
        if random.random() < frequency:
            result = board.result()
            if result != "*":
                # Game finished: use the fixed value for the final result.
                token = " <" + str(game_over_to_value(result)) + "> "
            else:
                input_queue.put((board.fen(), move))
                eval_move, evaluation = output_queue.get()
                assert eval_move == move  # Ensure correct correspondence
                token = "<" + str(map_eval_to_int(evaluation)) + " "

            # Single append shared by both branches.
            pieces.append(token)

    return "".join(pieces)

# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Request/response channels shared with the Stockfish worker process.
input_queue = multiprocessing.Queue()
output_queue = multiprocessing.Queue()

stockfish_process = multiprocessing.Process(target=stockfish_worker, args=(input_queue, output_queue))
stockfish_process.start()

try:
    df = pd.read_csv('input_dataset.csv')
    df['transcript'] = df['transcript'].progress_apply(lambda x: insert_centipawn(x, depth=10, frequency=0.03))
finally:
    # Always stop the worker, even if the run fails or is interrupted,
    # so no orphaned Stockfish process is left behind.
    terminate_stockfish()

# your_list = range(-800, 800, 40)
# mapped_list = [map_eval_to_int({"type":"cp", "value":x}) for x in your_list]

# for i in range(len(mapped_list)):
#     print(mapped_list[i], your_list[i])

  5%|▌         | 532/10000 [00:45<13:26, 11.75it/s]


KeyboardInterrupt: 

In [18]:
# Manual cleanup: send the "EXIT" message and wait for the Stockfish worker process to finish.
terminate_stockfish()

In [45]:
# For testing centipawn

# import matplotlib.pyplot as plt

# df = pd.read_csv(input_file)
# index = 1
# selected_rows = df.groupby('result').first().reset_index()
# selected_rows = selected_rows.apply(transform_row, axis=1)
# print(selected_rows.iloc[index]['transcript'])
# selected_rows['transcript'] = selected_rows['transcript'].apply(lambda x: insert_centipawn(x, depth=10))


# For graphing results

# print(selected_rows.iloc[index]['transcript'])
# print(selected_rows.iloc[index])

# new_moves = insert_centipawn(selected_rows.iloc[index]['transcript'], depth=10)
# print(new_moves)

# Make insert centipawn return list to graph evals
# ten = insert_centipawn(selected_rows.iloc[index]['transcript'], depth=10)
# nine = insert_centipawn(selected_rows.iloc[index]['transcript'], depth=9)
# ten = [int(i) for i in ten]
# nine = [int(i) for i in nine]


# plt.plot(ten, label='ten', color='blue')
# plt.plot(nine, color='red')

# plt.show()

 30%|███       | 3023/10000 [00:09<00:21, 325.81it/s]


KeyboardInterrupt: 

In [55]:
def transform_text(
    text: str,
    full_chunk_size: int = 1023,
    header_size: int = 5,
    min_first_chunk: int = 511,
) -> list[str]:
    """Split a long transcript into header-prefixed chunks of bounded length.

    Text shorter than ``full_chunk_size`` is returned unchanged as a
    one-element list. The function now always returns a list, matching its
    annotation; ``DataFrame.explode`` yields the same rows either way.

    Longer text is split into its first ``header_size`` characters (the
    header) and the rest. The rest is cut into pieces of
    ``full_chunk_size - header_size`` characters counted from the END, so only
    the first piece can be short. Every piece is prefixed with the header.
    If the first chunk, header included, is shorter than ``min_first_chunk``
    it is dropped.

    Args:
        text: The transcript to split.
        full_chunk_size: Maximum length of a returned chunk, header included.
        header_size: Number of leading characters repeated on every chunk.
        min_first_chunk: Minimum length for the first chunk to be kept.

    Returns:
        The chunks in their original order.

    Raises:
        ValueError: If ``header_size`` is not smaller than ``full_chunk_size``.
    """
    chunk_size = full_chunk_size - header_size
    if chunk_size <= 0:
        raise ValueError("header_size must be smaller than full_chunk_size")

    if len(text) < full_chunk_size:
        return [text]

    header = text[:header_size]
    body = text[header_size:]

    # Walk backwards from the end so that any short remainder lands in the
    # first chunk rather than the last one.
    chunks = [
        header + body[max(end - chunk_size, 0):end]
        for end in range(len(body), 0, -chunk_size)
    ]
    # Restore chronological order.
    chunks.reverse()

    if chunks and len(chunks[0]) < min_first_chunk:
        chunks.pop(0)

    return chunks

In [87]:
# Split long transcripts with transform_text, then give every chunk its own row.
# `df` here is whatever the kernel currently holds from an earlier cell.
df['transcript'] = df['transcript'].parallel_apply(transform_text)
df = df.explode('transcript')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1250), Label(value='0 / 1250'))), …

In [89]:
# Report the row count after chunking and overwrite 'input_dataset.csv'.
print(len(df))
# index=False is not passed, so the DataFrame index is written as an extra column.
df.to_csv('input_dataset.csv')

10344


In [59]:
# import pandas as pd

# # For testing transform text

# df = pd.read_csv(input_file)
# df = df.apply(transform_row, axis=1)

# filtered_df = df[df['transcript'].apply(len) > 2500]

# text = filtered_df.iloc[0]['transcript']

# print(len(df))
# print(len(filtered_df))
# print(len(filtered_df)/len(df))

# chunks = transform_text(text)
# text1 = chunks[0]

# for chunk in chunks:
#     print(len(chunk))
#     print(chunk)

# chunks2 = transform_text(text1)
# print(len(chunks2))
# print(chunks2)

In [90]:
import pandas as pd

# Sort df by transcript length for batching

# Reload only the 'transcript' column; any other column in the file is ignored.
df = pd.read_csv('input_dataset.csv', usecols=['transcript'])
# Character count of every transcript, computed in parallel.
df['length'] = df['transcript'].parallel_apply(len)

# Ascending sort: shortest transcripts first, longest last.
df.sort_values(by='length', inplace=True)
# df.to_csv('input_dataset.csv', index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1293), Label(value='0 / 1293'))), …

In [91]:
# Derive the blocks file name from `input_file`, e.g. 'x.csv' -> 'x_blocks.csv'.
output_filename = input_file.replace('.csv', '_blocks.csv')

In [92]:
import pandas as pd
from collections import deque
from tqdm import tqdm  # Import tqdm
import random

# Read the CSV file
# df = pd.read_csv('input_dataset.csv')

# Prepare the new dataset for blocks
# Pack the transcripts into ';'-separated blocks of exactly `block_size` characters.
blocks = []
block_size = 1024

# A deque allows cheap removal from both ends of the list of games.
remaining_games = deque(df['transcript'].tolist())
original_length = len(remaining_games)  # fixed total for the progress bar

with tqdm(total=original_length, desc="Processing") as pbar:
    while remaining_games:
        # Seed the block with the game at the right end of the deque
        # (the longest one if `df` is still sorted by length).
        next_game = remaining_games.pop()
        block = ';' + next_game

        # Top the block up with games taken from the left end.
        while remaining_games and len(block) < block_size:
            next_game = remaining_games.popleft()
            block = block + ';' + next_game
            if len(block) <= block_size:
                continue
            # Overflow: the block is truncated below, so the game is put back
            # at index 99 for reuse unless 100 or fewer games remain.
            if len(remaining_games) > 100:
                remaining_games.insert(99, next_game)
            break

        # Blocks that never reached `block_size` are discarded.
        if len(block) >= block_size:
            blocks.append(block[:block_size])

        # Advance the bar by the number of games consumed so far.
        pbar.update(original_length - len(remaining_games) - pbar.n)

# One block per row, written to `output_filename`.
blocks_df = pd.DataFrame(blocks, columns=['transcript'])
blocks_df.to_csv(output_filename, index=False)


Processing: 100%|██████████| 10344/10344 [00:00<00:00, 386349.42it/s]


In [93]:
import pandas as pd

# Sanity check: reload the blocks file and summarise the block lengths.
df = pd.read_csv(output_filename)
df['length'] = df['transcript'].apply(len)
print(df['length'].describe())

count    7199.0
mean     1024.0
std         0.0
min      1024.0
25%      1024.0
50%      1024.0
75%      1024.0
max      1024.0
Name: length, dtype: float64


In [94]:
import pandas as pd

# Shuffle dataset

# Only referenced by the commented-out override on the next line.
temp_output_filename = "full2_dataset.csv"
# output_filename = temp_output_filename

df = pd.read_csv(output_filename)
# frac=1 returns every row in random order; no random_state is given,
# so the order differs between runs.
df_shuffled = df.sample(frac=1).reset_index(drop=True)

# Overwrite the blocks file with the shuffled rows.
df_shuffled.to_csv(output_filename, index=False)