In [1]:
import pandas as pd

# Import player data prepared previously
df = pd.read_csv('players_data.csv')

In [2]:
# Show the first row to check the column layout of the player table
df.head(1)

Unnamed: 0.1,Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,k,rapid_rating,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag
0,0,10688862,"A Abdel Maabod, Hoda",EGY,F,,,,,,,,,,,,,,2009.0,w


In [None]:
# Create a dictionary that maps a player's name to their FIDE id.
# NOTE: `lookup_table` is rebound further down to a fideid -> sex mapping, so this
# cell has to be (re)run before read_games() is called.
lookup_table = (
    df[['name', 'fideid']]
    .set_index('name')
    .to_dict()['fideid']
)

In [None]:
import chess.pgn
import glob
from tqdm import tqdm
import pickle

# Sort the matches: glob returns paths in arbitrary order, and the processing order
# decides which copy of a duplicated game is kept and which chunk file it lands in.
pgn_files = sorted(glob.glob('unzipped/unzipped/*'))

class MyGameBuilder(chess.pgn.GameBuilder):
    """Game builder that overrides handle_error with a no-op."""

    def handle_error(self, error: Exception) -> None:
        # Deliberately do nothing with the reported error
        return None

already_seen = set()
def read_games(pgn_files, chunk_size=100_000):
    """Parse every PGN file and pickle the de-duplicated games to games/ in chunks.

    Games are read with MyGameBuilder. A missing "WhiteFideId" / "BlackFideId"
    header is filled in from the module-level `lookup_table` (player name ->
    FIDE id); games whose player name is not in that table are skipped.
    A game counts as a duplicate when its (white id, black id, date) triple is
    already in the module-level `already_seen` set.

    Parameters:
        pgn_files: iterable of paths to PGN files.
        chunk_size: number of games collected before a chunk is written to
            games/games-<n>.pkl (chunks are written because everything at once
            is too big for RAM).

    Returns:
        None. The result is the set of pickle files written to games/.
    """
    global already_seen
    file_count = 0
    games = []
    for pgn_file in tqdm(pgn_files):
        with open(pgn_file, errors='ignore') as f:
            while True:
                try:
                    pgn_game = chess.pgn.read_game(f, Visitor=MyGameBuilder)
                    if not pgn_game:
                        break
                    headers = pgn_game.headers
                    # If a fide id is not found, use the player's name to find it.
                    # Only the missing id is filled in; an id that is already present is kept.
                    if "WhiteFideId" not in headers:
                        headers['WhiteFideId'] = str(lookup_table[headers['White']])
                    if "BlackFideId" not in headers:
                        headers['BlackFideId'] = str(lookup_table[headers['Black']])
                    # A tuple key cannot collide the way concatenated ids can
                    # (e.g. "12" + "345" and "123" + "45" are the same string).
                    key = (headers['WhiteFideId'], headers['BlackFideId'], headers['Date'])
                    if key in already_seen:
                        # Games collected from multiple sources, some might appear multiple times, ignore duplicates
                        continue
                    already_seen.add(key)
                    games.append(pgn_game)
                    if len(games) >= chunk_size:
                        # Save to drive when the chunk is full, too big for RAM
                        file_count += 1
                        chunk_path = f"games/games-{file_count}.pkl"
                        try:
                            with open(chunk_path, "wb") as file:
                                pickle.dump(games, file)
                        except Exception as e:
                            # Best effort: keep parsing, but report the lost chunk instead of hiding it
                            print(f"failed to save {chunk_path}: {e}")
                        games = []
                except KeyError:
                    # Player name not present in lookup_table: skip this game silently
                    continue
                except Exception as e:
                    print(e)
    # Write whatever is left over; skip the write when there is nothing to save
    if games:
        file_count += 1
        with open(f"games/games-{file_count}.pkl", "wb") as file:
            pickle.dump(games, file)
    games = []
    #return games

# Parse every PGN file and write the de-duplicated games to games/ as pickled chunks
read_games(pgn_files)

In [1]:
import chess.polyglot
import chess
#Load opening book to be able to ignore opening theory moves
opening_book_file = 'M11.2.bin'
# The reader stays open at module level; get_Xs_boards_ys uses it through this global name
opening_book = chess.polyglot.open_reader(opening_book_file)

In [4]:
# Show the first row again; the next cell uses the fideid and sex columns
df.head(1)

Unnamed: 0.1,Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,k,rapid_rating,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag
0,0,10688862,"A Abdel Maabod, Hoda",EGY,F,,,,,,,,,,,,,,2009.0,w


In [None]:
# Table that maps a player's FIDE id to their gender.
# NOTE: this rebinds `lookup_table`, which earlier in the notebook held the
# name -> fideid mapping.
lookup_table = (
    df[['fideid', 'sex']]
    .set_index('fideid')
    .to_dict()['sex']
)

In [None]:
import glob
import pickle
from tqdm import tqdm
#Convert the chess games saved to the drive to arrays of Xs and Ys
def get_Xs_boards_ys(games, max_boards=41, min_boards=4):
    """Turn parsed games into out-of-book board sequences and gender labels.

    Each game is replayed from a fresh chess.Board(). Positions are ignored
    while the module-level `opening_book` still has an entry for them; from the
    first position without a book entry onwards a copy of every board is kept,
    up to `max_boards` boards per game.

    A game is skipped when it lacks a "WhiteFideId" or "BlackFideId" header,
    when fewer than `min_boards` boards were collected, or when anything fails
    while processing it (for example a FIDE id that is missing from the
    module-level `lookup_table`, which maps FIDE id -> gender here).

    Parameters:
        games: iterable of parsed games exposing `.headers` and `.mainline_moves()`.
        max_boards: maximum number of out-of-book boards kept per game.
        min_boards: minimum number of out-of-book boards a game needs to be kept.

    Returns:
        (Xs_boards, ys): Xs_boards holds one list of boards per kept game;
        ys holds two labels per kept game, white's gender followed by black's.
    """
    Xs_boards = []
    ys = []
    for game in tqdm(games):

        if "WhiteFideId" not in game.headers or "BlackFideId" not in game.headers:
            continue

        try:
            board_list = []
            out = False #Out of opening book
            temp_board = chess.Board()

            for move in game.mainline_moves():
                temp_board.push(move)
                if not out:
                    try:
                        opening_book.find(temp_board)
                    except IndexError: #No longer found in books (find raises IndexError)
                        out = True
                if out: #Add moves that are out of theory
                    board_list.append(temp_board.copy())
                    if len(board_list) >= max_boards:
                        break

            if len(board_list) < min_boards:
                continue

            # Look up both genders before appending anything so Xs_boards and ys stay aligned
            whitegender = lookup_table[int(game.headers["WhiteFideId"])]
            blackgender = lookup_table[int(game.headers["BlackFideId"])]
        except Exception:
            # Best effort: a game that cannot be processed is dropped.
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            continue

        ys.append(whitegender) #Each game has two samples, first predict white's gender and then black's
        ys.append(blackgender)
        Xs_boards.append(board_list)
    return Xs_boards, ys
#For each file of games, convert to list of boards and target values
for path in glob.glob("games/*"):
    try:
        # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook
        with open(path, "rb") as games_file:
            print(f'loading {path}')
            games = pickle.load(games_file)

        # The input file is closed before the long conversion starts
        Xs_boards, ys = get_Xs_boards_ys(games)

        name = path.split('/')[-1]
        with open(f"boards/Xs_boards-{name}", "wb") as file:
            pickle.dump(Xs_boards, file)
        with open(f"boards/ys_boards-{name}", "wb") as file:
            pickle.dump(ys, file)
    except Exception as e:
        # Any failure in this iteration (load, convert or save) lands here;
        # print the actual error so it is not mistaken for a missing file
        print(e)
        print(f'failed to open file {path}')

In [1]:
import numpy as np
import chess

def convert_board_to_one_hot(board):
    """Encode a board as a (17, 8, 8) int8 array of feature planes.

    Planes are indexed as [plane, rank, file]:
        0-5    pawn, knight, bishop, rook, queen, king of the side whose
               piece.color is falsy
        6-11   the same piece order for the side whose piece.color is truthy
        12     white castling rights, marked on the h1 / a1 corner
        13     black castling rights, marked on the h8 / a8 corner
        14     all ones when it is White's turn, zeros otherwise
        15     a single one on the en passant square, if there is one
        16     the halfmove clock, repeated on every square

    The halfmove clock is clamped to the int8 maximum (127) so that a large
    clock cannot overflow the array's dtype.

    Parameters:
        board: a chess.Board.

    Returns:
        numpy.ndarray of shape (17, 8, 8) and dtype int8.
    """
    piece_mapping = {
        chess.PAWN: 0,
        chess.KNIGHT: 1,
        chess.BISHOP: 2,
        chess.ROOK: 3,
        chess.QUEEN: 4,
        chess.KING: 5
    }

    one_hot_board = np.zeros((17, 8, 8), dtype=np.int8)

    # Fill piece positions
    for square in chess.SQUARES:
        piece = board.piece_at(square)
        if piece is not None:
            channel = piece_mapping[piece.piece_type] + (6 if piece.color else 0)
            one_hot_board[channel, chess.square_rank(square), chess.square_file(square)] = 1

    # Fill castling rights
    castling_rights = board.castling_rights
    if castling_rights & chess.BB_H1:
        one_hot_board[12, 0, 7] = 1  # White kingside
    if castling_rights & chess.BB_A1:
        one_hot_board[12, 0, 0] = 1  # White queenside
    if castling_rights & chess.BB_H8:
        one_hot_board[13, 7, 7] = 1  # Black kingside
    if castling_rights & chess.BB_A8:
        one_hot_board[13, 7, 0] = 1  # Black queenside

    # Fill active color
    if board.turn == chess.WHITE:
        one_hot_board[14, :, :] = 1

    # Fill en passant square
    if board.ep_square is not None:
        rank = chess.square_rank(board.ep_square)
        file = chess.square_file(board.ep_square)
        one_hot_board[15, rank, file] = 1

    # Fill halfmove clock, clamped so it always fits into int8
    int8_max = int(np.iinfo(np.int8).max)
    one_hot_board[16, :, :] = min(board.halfmove_clock, int8_max)

    return one_hot_board

def boards_to_vectors(boards):
    """Encode every board in `boards` with convert_board_to_one_hot and return the results as a list."""
    return [convert_board_to_one_hot(board) for board in boards]

In [3]:
import numpy as np
from tqdm import tqdm
import pickle
import glob
import gc

#Convert games to the model's input format

def boards_to_xs(Xs_boards):
    """Build two move-by-move samples (white, then black) for every board sequence.

    Every board list is encoded with boards_to_vectors. Consecutive encodings
    are concatenated along their first axis into (board before the move, board
    after the move) pairs: pairs that start at an even index form one sample,
    pairs that start at an odd index form the other. The side to move on the
    first board owns the even-index pairs.

    Returns a flat list with two entries per input board list: white's pairs
    followed by black's pairs.
    """
    Xs = []
    for board_list in tqdm(Xs_boards):
        vectors = boards_to_vectors(board_list)
        last_start = len(board_list) - 1

        # Pairs (k, k + 1) starting at even k: moves of the side to move on the first board
        even_pairs = [
            np.concatenate((vectors[k], vectors[k + 1]))
            for k in range(0, last_start, 2)
        ]
        # Pairs starting at odd k: moves of the other side
        odd_pairs = [
            np.concatenate((vectors[k], vectors[k + 1]))
            for k in range(1, last_start, 2)
        ]

        # White is the first y that corresponds to these boards, black is the second
        if board_list[0].turn:
            Xs.append(even_pairs)
            Xs.append(odd_pairs)
        else:
            Xs.append(odd_pairs)
            Xs.append(even_pairs)
    return Xs


def pad_xs(Xs):
    """Left-pad every sample with zeros to the longest sample and stack them channels-last.

    Parameters:
        Xs: non-empty list of samples. Each sample is a non-empty sequence of
            equally shaped 3-D arrays (channels, rows, cols), one per step.

    Returns:
        numpy.ndarray of shape (len(Xs), max_steps, rows, cols, channels).
        Shorter samples are padded with all-zero steps at the front.

    Raises:
        ValueError: if Xs is empty, or if a sample has no steps.
    """
    max_shape = max(len(data) for data in Xs)
    padded_Xs = []
    for data in tqdm(Xs):
        data = np.asarray(data)
        # Pad only the step axis, and only at the front
        pad_width = [(max_shape - data.shape[0], 0), (0, 0), (0, 0), (0, 0)]
        padded_Xs.append(np.pad(data, pad_width, mode='constant'))

    padded_Xs = np.asarray(padded_Xs)
    # Move the channel axis last: (N, T, C, H, W) -> (N, T, H, W, C).
    # This has to be an axis permutation; reshape would keep the memory order
    # and scramble which value belongs to which square and channel.
    return np.transpose(padded_Xs, (0, 1, 3, 4, 2))


# Encode each pickled list of board sequences and save it as one padded array
for i, path in enumerate(glob.glob("boards/Xs*")):
    try:
        with open(path, "rb") as boards_file:
            print(f'loading {path}')
            boards = pickle.load(boards_file)
            
            
        Xs = boards_to_xs(boards)
        gc.collect()
        Xs = pad_xs(Xs)
        gc.collect()
        # e.g. "boards/Xs_boards-games-17.pkl" -> "17"
        file_num = path.split('-')[-1].split('.')[0]
        name = f'/sise/liorrk-group/DataSets/Datamining/xs_data/xs-{file_num}'
        np.save(name, Xs)
        # Drop the large objects before the next file is loaded
        del Xs
        del boards_file
        del boards
        gc.collect()
        #with open(f"xs_data/{name}.pkl", "wb") as file:
            #pickle.dump(Xs, file)
                
    except Exception as e:
        # Any failure in this iteration (load, encode, pad or save) lands here
        print(e)
        print(f'failed to open file {path}')

loading boards/Xs_boards-games-17.pkl


100%|██████████| 97074/97074 [03:52<00:00, 418.38it/s]
100%|██████████| 194148/194148 [00:36<00:00, 5326.75it/s]


loading boards/Xs_boards-games-18.pkl


100%|██████████| 97491/97491 [03:40<00:00, 441.26it/s]
100%|██████████| 194982/194982 [00:33<00:00, 5907.87it/s]


loading boards/Xs_boards-games-19.pkl


100%|██████████| 96972/96972 [03:43<00:00, 433.58it/s]
100%|██████████| 193944/193944 [00:30<00:00, 6293.49it/s]


loading boards/Xs_boards-games-20.pkl


100%|██████████| 97424/97424 [03:46<00:00, 430.50it/s]
100%|██████████| 194848/194848 [00:32<00:00, 5917.30it/s]


loading boards/Xs_boards-games-21.pkl


100%|██████████| 97480/97480 [03:47<00:00, 429.31it/s]
100%|██████████| 194960/194960 [00:33<00:00, 5859.85it/s]


loading boards/Xs_boards-games-22.pkl


100%|██████████| 61078/61078 [02:17<00:00, 445.45it/s]
100%|██████████| 122156/122156 [00:14<00:00, 8596.86it/s]


In [7]:
#Convert target array of 'M' and 'F' to 0 for Male and 1 for Female

def convert_ys(ys):
    """Map gender labels to integers: 'M' becomes 0, every other value becomes 1.

    Returns a new list with one integer per element of `ys`.
    """
    return [0 if label == 'M' else 1 for label in ys]


# Convert every pickled label list to 0/1 integers and save it next to the xs arrays
for i, path in enumerate(glob.glob("boards/ys*")):
    try:
        with open(path, "rb") as ys_file:
            print(f'loading {path}')
            ys = pickle.load(ys_file)
            
        # e.g. "boards/ys_boards-games-17.pkl" -> "17"
        file_name = path.split('-')[-1].split('.')[0]
        name = f'/sise/liorrk-group/DataSets/Datamining/xs_data/ys-{file_name}'
        ys = convert_ys(ys)
        np.save(name, ys)
                
    except Exception as e:
        # Any failure in this iteration (load, convert or save) lands here
        print(e)
        print(f'failed to open file {path}')

loading boards/ys_boards-games-1.pkl
loading boards/ys_boards-games-3.pkl
loading boards/ys_boards-games-4.pkl
loading boards/ys_boards-games-5.pkl
loading boards/ys_boards-games-6.pkl
loading boards/ys_boards-games-7.pkl
loading boards/ys_boards-games-8.pkl
loading boards/ys_boards-games-10.pkl
loading boards/ys_boards-games-11.pkl
loading boards/ys_boards-games-12.pkl
loading boards/ys_boards-games-13.pkl
loading boards/ys_boards-games-14.pkl
loading boards/ys_boards-games-15.pkl
loading boards/ys_boards-games-16.pkl
loading boards/ys_boards-games-17.pkl
loading boards/ys_boards-games-18.pkl
loading boards/ys_boards-games-19.pkl
loading boards/ys_boards-games-20.pkl
loading boards/ys_boards-games-21.pkl
loading boards/ys_boards-games-22.pkl
