In [14]:
import os
import time

import chess
import chess.pgn

import chess_encode
import chess_decode

# Input/output locations, relative to the notebook's working directory.
# Built with os.path.join so the separators are correct on every platform
# (on Windows this yields the same "..\\data_pgn\\..." strings as before).
# PGN dump that the cells below read games from.
filepath = os.path.join("..", "data_pgn", "lichess_db_standard_rated_2014-07.pgn")
# One line per kept game: the encoded moves joined by single spaces.
intermediate_filepath = os.path.join("..", "data_v2", "intermediate_non_bullet.txt")
# Sorted list of every distinct encoded move, one per line.
tokens_filepath = os.path.join("..", "data_v2", "tokens.txt")

In [6]:

# Peek at the first two games of the dump to sanity-check header parsing.
# The context manager closes the PGN file as soon as both games are parsed;
# the returned game objects are used only after the file is closed.
# NOTE(review): the dump is assumed to be UTF-8 encoded -- confirm for other sources.
with open(filepath, encoding="utf-8") as pgn:
    first_game = chess.pgn.read_game(pgn)
    second_game = chess.pgn.read_game(pgn)

print(first_game.headers["Event"])
print(int(first_game.headers["BlackElo"]))
print(int(first_game.headers["WhiteElo"]))


Rated Blitz game
1518
1567


In [7]:
def encode_game_with_check(game):
    """Encode the mainline of ``game`` and verify every token round-trips.

    Each mainline move is passed to a fresh ``chess_encode.EncoderBoard``.
    The resulting token is immediately fed to a fresh
    ``chess_decode.DecoderBoard`` and the decoded move must equal the
    original move.

    Args:
        game: object exposing ``mainline_moves()`` (for example the value
            returned by ``chess.pgn.read_game``).

    Returns:
        list: encoded moves in playing order. Encoding stops at the first
        move for which the encoder returns ``None``; the tokens collected
        up to that point are still returned.

    Raises:
        AssertionError: if a decoded token does not equal the move it was
            produced from.
    """
    encoder = chess_encode.EncoderBoard()
    decoder = chess_decode.DecoderBoard()
    moves = []

    for move in game.mainline_moves():
        enc_move = encoder.EncodeMove(move)
        if enc_move is None:
            # The encoder could not represent this move: keep what we have.
            break
        moves.append(enc_move)
        dec_move = decoder.DecodeMove(enc_move)
        # Explicit raise (instead of a bare assert) so the round-trip check
        # is not stripped when Python runs with -O.
        if not (move == dec_move):
            raise AssertionError(
                f"decoded move {dec_move!r} does not match original {move!r}"
            )
    return moves

In [None]:
# Smoke test: encode the first game of the dump and display its tokens.
# The PGN file is closed as soon as the game has been parsed.
# NOTE(review): the dump is assumed to be UTF-8 encoded -- confirm for other sources.
with open(filepath, encoding="utf-8") as pgn:
    first_game = chess.pgn.read_game(pgn)

# Kept outside the with-block so it stays the cell's last expression and is displayed.
encode_game_with_check(first_game)


In [9]:
def try_parse_int(s, val):
    """Return ``int(s)``, or ``val`` when ``s`` cannot be converted.

    Args:
        s: value to convert, typically a string.
        val: fallback returned when the conversion fails.

    Returns:
        The parsed integer, or ``val`` if ``int(s)`` raises ``ValueError``
        (malformed text such as ``""``) or ``TypeError`` (e.g. ``None``).
    """
    try:
        return int(s)
    except (TypeError, ValueError):
        return val

def clean_elo_str(elo):
    """Convert an Elo header string to an int, returning 0 when it is not numeric.

    Every ``?`` character is removed before parsing, so ``"1500?"`` parses
    as 1500 and a bare ``"?"`` yields the fallback 0.
    """
    without_marks = elo.replace('?', '')
    return try_parse_int(without_marks, 0)

In [16]:
from collections import defaultdict

MAX_GAMES = 100000      # upper bound on the number of games read from the dump
REPORT_EVERY = 5000     # print progress and running statistics every N games
# Optional rating filter on Elo buckets (rating // 100); e.g. 18 keeps only
# games where both players are rated 1800+. None disables the filter.
MIN_ELO_BUCKET = None

event = defaultdict(int)  # kept games, keyed by the first two words of the Event header
elos = defaultdict(int)   # players per Elo bucket (rating // 100); unparseable ratings land in 0

seconds = time.time()  # start of the current reporting interval
total = time.time()    # start of the whole run

token_set = set()      # every distinct encoded move seen so far

# NOTE(review): the dump is assumed to be UTF-8 encoded -- confirm for other sources.
with open(filepath, encoding="utf-8") as pgn, open(intermediate_filepath, "w") as out_file:
    for i in range(MAX_GAMES):
        if i % REPORT_EVERY == 0:
            print(i, round(time.time() - seconds, 2), "sec")
            print("total", round(time.time() - total, 2), "sec")
            seconds = time.time()
            print(event)
            print(elos)
            print("")

        game = chess.pgn.read_game(pgn)
        if game is None:
            # read_game returns None at end of file: the dump holds fewer
            # than MAX_GAMES games, so stop instead of crashing on .headers.
            break

        # "Rated Blitz game" -> "Rated Blitz"
        ev = " ".join(game.headers["Event"].split()[:2])
        if 'Bullet' in ev:
            continue

        # A missing Elo header is treated like an unknown rating ("?" -> bucket 0).
        el1 = clean_elo_str(game.headers.get("BlackElo", "?")) // 100
        el2 = clean_elo_str(game.headers.get("WhiteElo", "?")) // 100

        if MIN_ELO_BUCKET is not None and (el1 < MIN_ELO_BUCKET or el2 < MIN_ELO_BUCKET):
            continue

        move_seq = encode_game_with_check(game)
        token_set.update(move_seq)
        out_file.write(" ".join(move_seq) + "\n")

        event[ev] += 1
        elos[el1] += 1
        elos[el2] += 1

# sorted() accepts any iterable, so no intermediate list is needed.
token_list = sorted(token_set)

with open(tokens_filepath, "w") as out_file:
    out_file.write("\n".join(token_list) + "\n")

0 0.0 sec
total 0.0 sec
defaultdict(<class 'int'>, {})
defaultdict(<class 'int'>, {})

5000 12.08 sec
total 12.08 sec
defaultdict(<class 'int'>, {'Rated Blitz': 1805, 'Rated Classical': 1055, 'Rated Correspondence': 18})
defaultdict(<class 'int'>, {15: 1064, 13: 565, 14: 790, 18: 479, 19: 230, 12: 431, 16: 948, 11: 172, 17: 859, 21: 28, 20: 96, 8: 4, 9: 18, 7: 4, 10: 58, 23: 2, 24: 4, 25: 4})

10000 11.99 sec
total 24.07 sec
defaultdict(<class 'int'>, {'Rated Blitz': 3856, 'Rated Classical': 2386, 'Rated Correspondence': 31})
defaultdict(<class 'int'>, {15: 2362, 13: 1167, 14: 1770, 18: 1026, 19: 428, 12: 796, 16: 2176, 11: 358, 17: 2002, 21: 47, 20: 198, 8: 19, 9: 27, 7: 11, 10: 138, 23: 2, 24: 4, 25: 4, 22: 11})

15000 11.79 sec
total 35.85 sec
defaultdict(<class 'int'>, {'Rated Blitz': 5750, 'Rated Classical': 3678, 'Rated Correspondence': 46})
defaultdict(<class 'int'>, {15: 3439, 13: 1794, 14: 2691, 18: 1584, 19: 692, 12: 1165, 16: 3354, 11: 476, 17: 3045, 21: 103, 20: 331, 8: 19,