In [153]:
import chess
import chess.pgn
import time

# NOTE(review): these are machine-specific absolute Windows paths; adjust them before running elsewhere.
# Input: the PGN dump that the cells below parse with chess.pgn.read_game.
filepath = "C:\\Users\\anton\\Documents\\code\\chessGPT\\data\\lichess_db_standard_rated_2014-07.pgn"
# Output: one line per game, holding that game's space-separated encoded moves.
intermediate_filepath = "C:\\Users\\anton\\Documents\\code\\chessGPT\\data\\intermediate.txt"
# Output: the sorted set of distinct move tokens, one token per line.
tokens_filepath = "C:\\Users\\anton\\Documents\\code\\chessGPT\\data\\tokens.txt"

In [2]:
# Sanity check of the python-chess install: show the legal moves of a freshly created board.
board = chess.Board()
board.legal_moves

<LegalMoveGenerator at 0x1b178e553a0 (Nh3, Nf3, Nc3, Na3, h3, g3, f3, e3, d3, c3, b3, a3, h4, g4, f4, e4, d4, c4, b4, a4)>

In [3]:
# Count the lines of the PGN dump by streaming it in binary mode (no text decoding).
with open(filepath, "rb") as f:
    num_lines = 0
    for _line in f:
        num_lines += 1
print (num_lines)


18878980


In [4]:

# Read the first two games of the dump. The file is opened in a `with` block so
# the handle is closed as soon as both games are parsed (it was previously leaked).
with open(filepath) as pgn:
    first_game = chess.pgn.read_game(pgn)
    second_game = chess.pgn.read_game(pgn)

# Display the "Event" header of the first game.
first_game.headers["Event"]


'Rated Blitz game'

In [5]:
# Rough throughput check: wall-clock time needed to parse the first 1000 games.
seconds = time.time()

with open(filepath) as pgn:
    for i in range(1000):
        game = chess.pgn.read_game(pgn)

elapsed = time.time() - seconds
print (round(elapsed, 2), "sec")

2.2 sec


In [49]:
def initial_setup():
    """Build the lookup tables for the sixteen named (non-pawn) pieces.

    Labels have the form "<colour>:<piece>", with colour "W" or "B" and piece
    one of R1, N1, BD, Q, K, BL, N2, R2. BD/BL appear to denote the dark- and
    light-squared bishop (white: c1/f1, black: f8/c8). Pawns are not included.

    Returns:
        (piece_to_square, square_to_piece): two dicts that are inverses of
        each other, mapping label -> square index and square index -> label.
    """
    piece_to_square = {
        # White back rank, a1..h1.
        "W:R1": chess.A1,
        "W:N1": chess.B1,
        "W:BD": chess.C1,
        "W:Q": chess.D1,
        "W:K": chess.E1,
        "W:BL": chess.F1,
        "W:N2": chess.G1,
        "W:R2": chess.H1,
        # Black back rank, a8..h8.
        "B:R1": chess.A8,
        "B:N1": chess.B8,
        "B:BL": chess.C8,
        "B:Q": chess.D8,
        "B:K": chess.E8,
        "B:BD": chess.F8,
        "B:N2": chess.G8,
        "B:R2": chess.H8,
    }

    # Invert the mapping; every square is unique, so nothing is lost.
    square_to_piece = {square: label for label, square in piece_to_square.items()}

    return piece_to_square, square_to_piece

# Display both lookup tables to eyeball the starting layout.
initial_setup()


({'W:R1': 0,
  'W:N1': 1,
  'W:BD': 2,
  'W:Q': 3,
  'W:K': 4,
  'W:BL': 5,
  'W:N2': 6,
  'W:R2': 7,
  'B:R1': 56,
  'B:N1': 57,
  'B:BL': 58,
  'B:Q': 59,
  'B:K': 60,
  'B:BD': 61,
  'B:N2': 62,
  'B:R2': 63},
 {0: 'W:R1',
  1: 'W:N1',
  2: 'W:BD',
  3: 'W:Q',
  4: 'W:K',
  5: 'W:BL',
  6: 'W:N2',
  7: 'W:R2',
  56: 'B:R1',
  57: 'B:N1',
  58: 'B:BL',
  59: 'B:Q',
  60: 'B:K',
  61: 'B:BD',
  62: 'B:N2',
  63: 'B:R2'})

In [7]:
# Peek at the python-chess objects of the first game: two squares of its
# starting board and the from/to squares of its first mainline move.
# The file is opened in a `with` block so the handle is closed (it was
# previously leaked), and the first move is fetched once instead of
# rebuilding the whole move list for every print.
with open(filepath) as pgn:
    game = chess.pgn.read_game(pgn)

start_board = game.board()
print(start_board.piece_at(chess.A1))
print(start_board.piece_at(chess.A3))

first_move = next(iter(game.mainline_moves()))
print(first_move)
print(first_move.from_square)
print(first_move.to_square)


R
None
e2e3
12
20


In [46]:
def encode_square(sq):
    """Return the name python-chess gives to square index ``sq`` (e.g. 'a3')."""
    name = chess.square_name(sq)
    return name

# Example: encode the square chess.A3.
encode_square(chess.A3)

'a3'

In [47]:
def encode_relative(move):
    """Encode a move as its displacement, "<file delta>:<rank delta>".

    Both deltas are computed as destination minus origin, so the sign does not
    depend on which side is moving.
    """
    origin, target = move.from_square, move.to_square
    file_delta = chess.square_file(target) - chess.square_file(origin)
    rank_delta = chess.square_rank(target) - chess.square_rank(origin)
    return "{}:{}".format(file_delta, rank_delta)

# Example: b4 -> c3 is one file to the right and one rank down.
encode_relative(chess.Move(chess.B4, chess.C3))

'1:-1'

In [42]:
def encode_pawn(move):
    """Encode a pawn move as "Pawn:<from square>:<action>".

    The action is chosen in this order:
      * "Promote<symbol>" when ``move.promotion`` is set (the direction of the
        promoting move is not encoded);
      * "CaptureLeft" / "CaptureRight" when the file changes. Left means towards
        the a-file and right towards the h-file, whichever side is moving;
      * "Push" for a one-rank move and "TwoPush" for a two-rank move.

    Returns None when none of the above applies (the move stays on its file
    and does not change rank by one or two).
    """
    origin = move.from_square
    file_shift = chess.square_file(move.to_square) - chess.square_file(origin)
    rank_distance = abs(chess.square_rank(move.to_square) - chess.square_rank(origin))

    head = "Pawn:{}:".format(chess.square_name(origin))

    if move.promotion is not None:
        return head + "Promote" + chess.piece_symbol(move.promotion)

    if file_shift != 0:
        return head + ("CaptureLeft" if file_shift < 0 else "CaptureRight")

    straight_action = {1: "Push", 2: "TwoPush"}.get(rank_distance)
    if straight_action is None:
        return None
    return head + straight_action


In [61]:
def encode_non_pawn(piece, move):
    """Encode a move of a named piece as "<piece without colour>:<target>".

    ``piece`` is a colour-prefixed label such as "W:BD"; the first two
    characters are dropped. Bishops and queens (labels containing ":B" or
    ":Q") get the destination square name; every other piece gets the relative
    displacement produced by ``encode_relative``.
    """
    label = piece[2:]
    uses_absolute_target = ":B" in piece or ":Q" in piece
    if uses_absolute_target:
        target = encode_square(move.to_square)
    else:
        target = encode_relative(move)
    return label + ":" + target

In [120]:
def is_castle(piece, move):
    """Return True when ``piece`` is a king label (contains ":K") moving exactly two files."""
    if ":K" not in piece:
        return False
    file_span = chess.square_file(move.to_square) - chess.square_file(move.from_square)
    return abs(file_span) == 2
    
def get_castled_rook_position(move):
    """Return (rook label, rook destination square) for a castling king move.

    The king moving towards a higher file selects rook "R2" (short castling),
    otherwise "R1" (long castling). A king starting on rank index 0 is treated
    as white, any other rank as black. The rook lands on the square midway
    between the king's origin and destination files, on the king's rank.
    """
    king_from_file = chess.square_file(move.from_square)
    king_to_file = chess.square_file(move.to_square)
    rank = chess.square_rank(move.from_square)

    colour = "W:" if rank == 0 else "B:"
    wing = "R2" if king_to_file > king_from_file else "R1"
    rook_file = (king_to_file + king_from_file) // 2

    return colour + wing, chess.square(rook_file, rank)
    

In [113]:
class EncoderBoard:
    """Tracks where the named (non-pawn) pieces are so moves can be tokenised.

    ``square_to_piece`` maps square index -> piece label. Squares missing from
    the map are treated as holding a pawn when a move starts from them.
    """

    def __init__(self):
        # Only the square -> label direction is needed for encoding.
        self.square_to_piece = initial_setup()[1]

    def EncodeMove(self, move):
        """Return the token for ``move`` and update the tracked piece squares.

        Returns None for any promotion move; the callers in this notebook stop
        encoding the game at that point.
        """
        if move.promotion is not None:
            return None

        origin, target = move.from_square, move.to_square
        piece = self.square_to_piece.get(origin)

        if piece is None:
            # No named piece starts here, so this is handled as a pawn move.
            # If it lands on a tracked square, drop that entry.
            self.square_to_piece.pop(target, None)
            return encode_pawn(move)

        # Named piece: record it on the destination square. The origin entry is
        # deliberately left in place; it is overwritten or deleted when a later
        # move lands on that square.
        self.square_to_piece[target] = piece
        if is_castle(piece, move):
            # Castling also relocates the matching rook.
            rook, rook_sq = get_castled_rook_position(move)
            self.square_to_piece[rook_sq] = rook
        return encode_non_pawn(piece, move)

In [114]:
# Encode the first game of the dump and print each move next to its token.
with open(filepath) as pgn:
    game = chess.pgn.read_game(pgn)

encoder = EncoderBoard()

for move in game.mainline_moves():
    enc_move = encoder.EncodeMove(move)
    if enc_move is None:
        # EncodeMove returns None for promotions; stop encoding this game there.
        break
    print (move, enc_move)

e2e3 Pawn:e2:Push
e7e5 Pawn:e7:TwoPush
b1c3 N1:1:2
g8f6 N2:-1:-2
f1c4 BL:c4
d7d5 Pawn:d7:TwoPush
c4b3 BL:b3
c7c5 Pawn:c7:TwoPush
b3a4 BL:a4
c8d7 BL:d7
a4d7 BL:d7
d8d7 Q:d7
d2d4 Pawn:d2:TwoPush
e5d4 Pawn:e5:CaptureLeft
e3d4 Pawn:e3:CaptureLeft
c5d4 Pawn:c5:CaptureRight
d1d4 Q:d4
d7e6 Q:e6
c1e3 BD:e3
f6e4 N2:-1:-2
g1f3 N2:-1:2
e4c3 N2:-2:-1
d4c3 Q:c3
b8c6 N1:1:-2
f3d4 N2:-2:1
c6d4 N1:1:-2
c3d4 Q:d4
a7a5 Pawn:a7:TwoPush
a2a3 Pawn:a2:Push
b7b5 Pawn:b7:TwoPush
e1g1 K:2:0
f8e7 BD:e7
a1d1 R1:3:0
e8g8 K:2:0
d4d5 Q:d5
e6d5 Q:d5
d1d5 R1:0:4
b5b4 Pawn:b5:Push
d5d7 R1:0:2
f8e8 R2:-1:0
a3b4 Pawn:a3:CaptureRight
e7b4 BD:b4
c2c3 Pawn:c2:Push
b4e7 BD:e7
f1d1 R2:-2:0
a5a4 Pawn:a5:Push
d1a1 R2:-3:0
h7h6 Pawn:h7:Push
d7d4 R1:0:-3
e7f6 BD:f6
d4a4 R1:-3:0
a8a4 R1:0:-4
a1a4 R2:0:3
g7g5 Pawn:g7:TwoPush
g2g3 Pawn:g2:Push
h6h5 Pawn:h6:Push
g1g2 K:0:1
g8g7 K:0:-1
h2h4 Pawn:h2:TwoPush
g5h4 Pawn:g5:CaptureRight
g3h4 Pawn:g3:CaptureRight
f6h4 BD:h4
a4h4 R2:7:0
g7g6 K:0:-1
b2b4 Pawn:b2:TwoPush
e8e5 R2:0:-3
e3d4 BD:

In [73]:
def RelativePosToAbsolute(curr_pos, rel_pos_x, rel_pos_y):
    """Return the square reached from ``curr_pos`` after a file/rank displacement.

    ``rel_pos_x`` (file delta) and ``rel_pos_y`` (rank delta) are passed through
    ``int()``, so the string fields of an encoded token can be given directly.
    """
    target_file = chess.square_file(curr_pos) + int(rel_pos_x)
    target_rank = chess.square_rank(curr_pos) + int(rel_pos_y)
    return chess.square(target_file, target_rank)

# Example: from b3, one file to the left and one rank up.
chess.square_name(RelativePosToAbsolute(chess.B3, "-1", "1"))

'a4'

In [95]:
def DecodePawnMove(curr_pos, enc, is_white):
    """Return the destination square of a pawn move described by an action name.

    Args:
        curr_pos: square index the pawn starts on.
        enc: one of "Push", "TwoPush", "CaptureLeft", "CaptureRight".
        is_white: True when white is moving; for black the rank direction is
            reversed while the file direction is kept.

    Raises:
        AssertionError: if ``enc`` is not one of the four actions above. It is
            raised explicitly rather than through ``assert`` so the check is
            not removed under ``python -O``, where an unknown action would
            otherwise be decoded silently as a one-square push.
    """
    # (file delta, rank delta) from white's point of view.
    offsets = {
        "Push": (0, 1),
        "TwoPush": (0, 2),
        "CaptureLeft": (-1, 1),
        "CaptureRight": (1, 1),
    }
    if enc not in offsets:
        raise AssertionError(enc)

    dx, dy = offsets[enc]
    if not is_white:
        dy = -dy
    return chess.square(
        chess.square_file(curr_pos) + dx,
        chess.square_rank(curr_pos) + dy)

# Example: a black pawn on b7 making a two-square push.
chess.square_name(DecodePawnMove(chess.B7, "TwoPush", False))

'b5'

In [122]:
class DecoderBoard:
    """Turns encoded move tokens back into ``chess.Move`` objects.

    ``piece_to_square`` maps each named piece label to its current square and
    ``is_white`` records whose turn it is. White moves first, and the turn
    flips after every successfully decoded move.
    """

    def __init__(self):
        self.piece_to_square, _ = initial_setup()
        self.is_white = True

    def AddColor(self, piece):
        """Prefix ``piece`` with "W:" or "B:" according to the side to move."""
        if self.is_white:
            return "W:" + piece
        else:
            return "B:" + piece

    def WhoseMove(self):
        """Return "White" or "Black" for the side to move."""
        if self.is_white:
            return "White"
        else:
            return "Black"

    def NextMove(self):
        """Hand the turn to the other side."""
        self.is_white = not self.is_white

    def DecodeMove(self, move):
        """Decode one token, update the piece positions and flip the turn.

        Args:
            move: token such as "Q:d4", "N1:1:2" or "Pawn:e2:Push".

        Returns:
            The corresponding ``chess.Move`` (from-square, to-square).

        Raises:
            AssertionError: if the token's piece field is not recognised. It is
                raised explicitly so the check also holds under ``python -O``.
            KeyError: if a named piece is not present in ``piece_to_square``.
        """
        # Keep the parsed fields separate from the ``move`` argument.
        fields = move.split(":")
        piece = fields[0]

        if piece in ("Q", "BD", "BL"):
            # Queen and bishops carry the destination square name.
            piece = self.AddColor(piece)
            curr_pos = self.piece_to_square[piece]
            next_pos = chess.parse_square(fields[1])
        elif piece in ("R1", "R2", "K", "N1", "N2"):
            # Rooks, king and knights carry a file:rank displacement.
            piece = self.AddColor(piece)
            curr_pos = self.piece_to_square[piece]
            next_pos = RelativePosToAbsolute(curr_pos, fields[1], fields[2])
        elif piece == "Pawn":
            # Pawns carry their origin square and an action name.
            curr_pos = chess.parse_square(fields[1])
            next_pos = DecodePawnMove(curr_pos, fields[2], self.is_white)
        else:
            raise AssertionError(piece)

        if piece != "Pawn":
            # The lookups above already raised KeyError for an untracked
            # piece, so no membership check is needed here.
            self.piece_to_square[piece] = next_pos

        decoded = chess.Move(curr_pos, next_pos)

        if is_castle(piece, decoded):
            # Castling also relocates the matching rook.
            rook, rook_sq = get_castled_rook_position(decoded)
            self.piece_to_square[rook] = rook_sq

        self.NextMove()

        return decoded


In [123]:
# Round-trip check on the first game: encode every move, decode the token
# again and assert that the original move comes back.
with open(filepath) as pgn:
    game = chess.pgn.read_game(pgn)

encoder = EncoderBoard()
decoder = DecoderBoard()

# Whitespace-separated tokens of the SAN rendering of the game. Indices that
# are multiples of 3 are skipped below, presumably because those tokens are
# the move numbers ("1.", "2.", ...) - TODO confirm.
alg = (chess.Board().variation_san(game.mainline_moves())).split()
i = 1

for move in game.mainline_moves():
    enc_move = encoder.EncodeMove(move)
    if enc_move is None:
        # EncodeMove returns None for promotions; stop checking this game there.
        break
    print (decoder.WhoseMove(), alg[i])
    dec_move = decoder.DecodeMove(enc_move)
    print (move, enc_move)
    print (dec_move)
    assert move == dec_move
    i += 1
    if i % 3 == 0:
        i += 1
    


White e3
e2e3 Pawn:e2:Push
e2e3
Black e5
e7e5 Pawn:e7:TwoPush
e7e5
White Nc3
b1c3 N1:1:2
b1c3
Black Nf6
g8f6 N2:-1:-2
g8f6
White Bc4
f1c4 BL:c4
f1c4
Black d5
d7d5 Pawn:d7:TwoPush
d7d5
White Bb3
c4b3 BL:b3
c4b3
Black c5
c7c5 Pawn:c7:TwoPush
c7c5
White Ba4+
b3a4 BL:a4
b3a4
Black Bd7
c8d7 BL:d7
c8d7
White Bxd7+
a4d7 BL:d7
a4d7
Black Qxd7
d8d7 Q:d7
d8d7
White d4
d2d4 Pawn:d2:TwoPush
d2d4
Black exd4
e5d4 Pawn:e5:CaptureLeft
e5d4
White exd4
e3d4 Pawn:e3:CaptureLeft
e3d4
Black cxd4
c5d4 Pawn:c5:CaptureRight
c5d4
White Qxd4
d1d4 Q:d4
d1d4
Black Qe6+
d7e6 Q:e6
d7e6
White Be3
c1e3 BD:e3
c1e3
Black Ne4
f6e4 N2:-1:-2
f6e4
White Nf3
g1f3 N2:-1:2
g1f3
Black Nxc3
e4c3 N2:-2:-1
e4c3
White Qxc3
d4c3 Q:c3
d4c3
Black Nc6
b8c6 N1:1:-2
b8c6
White Nd4
f3d4 N2:-2:1
f3d4
Black Nxd4
c6d4 N1:1:-2
c6d4
White Qxd4
c3d4 Q:d4
c3d4
Black a5
a7a5 Pawn:a7:TwoPush
a7a5
White a3
a2a3 Pawn:a2:Push
a2a3
Black b5
b7b5 Pawn:b7:TwoPush
b7b5
White O-O
e1g1 K:2:0
e1g1
Black Be7
f8e7 BD:e7
f8e7
White Rad1
a1d1 R1:3:0
a1d1
Black

In [127]:
def check_game(game, should_print):
    """Encode and decode every mainline move of ``game``, asserting a round trip.

    Checking stops at the first move ``EncoderBoard.EncodeMove`` returns None
    for (promotions). With ``should_print`` set, the SAN rendering of the game
    is printed first, followed by side to move, SAN token, original move,
    token and decoded move for each step.

    Raises:
        AssertionError: if a decoded move differs from the original move.
    """
    encoder, decoder = EncoderBoard(), DecoderBoard()

    san_text = chess.Board().variation_san(game.mainline_moves())
    if should_print:
        print (san_text)
    san_tokens = san_text.split()

    # Index into san_tokens; multiples of 3 are skipped, presumably because
    # those tokens are the move numbers ("1.", "2.", ...) - TODO confirm.
    token_idx = 1
    for move in game.mainline_moves():
        enc_move = encoder.EncodeMove(move)
        if enc_move is None:
            break
        if should_print:
            print (decoder.WhoseMove(), san_tokens[token_idx])
        dec_move = decoder.DecodeMove(enc_move)
        if should_print:
            print (move, enc_move)
            print (dec_move)
        assert move == dec_move
        token_idx += 1
        if token_idx % 3 == 0:
            token_idx += 1

In [132]:
# Round-trip check every game in the dump, reporting the time per 1000 games.
i = 0
seconds = time.time()
with open(filepath) as pgn:
    while True:
        game = chess.pgn.read_game(pgn)
        if game is None:
            # read_game returns None once the file is exhausted; without this
            # guard check_game(None, ...) would fail at the end of the dump.
            break
        check_game(game, False)
        i = i + 1
        if i % 1000 == 0:
            print (i, round(time.time() - seconds, 2), "sec")
            seconds = time.time()


1000 4.67 sec
2000 4.83 sec
3000 4.75 sec
4000 4.63 sec
5000 4.69 sec
6000 4.82 sec
7000 4.86 sec
8000 4.6 sec
9000 4.47 sec
10000 4.52 sec
11000 4.72 sec
12000 4.63 sec
13000 4.66 sec
14000 4.44 sec
15000 4.52 sec
16000 4.7 sec
17000 4.66 sec
18000 4.74 sec
19000 4.66 sec
20000 4.76 sec
21000 4.76 sec
22000 4.89 sec
23000 4.81 sec
24000 4.7 sec
25000 4.59 sec
26000 4.81 sec
27000 4.75 sec
28000 4.72 sec
29000 4.79 sec
30000 4.76 sec
31000 4.68 sec
32000 4.62 sec
33000 4.65 sec
34000 4.68 sec
35000 4.71 sec
36000 4.64 sec
37000 4.85 sec
38000 4.71 sec
39000 4.59 sec
40000 4.46 sec
41000 4.51 sec
42000 4.4 sec
43000 4.56 sec
44000 4.7 sec
45000 4.57 sec
46000 4.55 sec
47000 4.8 sec
48000 4.61 sec
49000 4.7 sec
50000 4.79 sec
51000 4.75 sec
52000 4.61 sec
53000 4.72 sec
54000 4.96 sec
55000 4.84 sec
56000 4.72 sec
57000 4.77 sec
58000 4.91 sec
59000 4.69 sec
60000 4.83 sec
61000 4.84 sec
62000 4.67 sec
63000 4.98 sec


KeyboardInterrupt: 

In [144]:
def encode_game_with_check(game):
    """Return the list of move tokens for ``game``, verifying each by decoding it.

    Encoding stops at the first move ``EncoderBoard.EncodeMove`` returns None
    for (promotions); the tokens collected up to that point are returned.

    Raises:
        AssertionError: if a token does not decode back to the original move.
    """
    encoder, decoder = EncoderBoard(), DecoderBoard()
    encoded = []

    for move in game.mainline_moves():
        token = encoder.EncodeMove(move)
        if token is None:
            break
        encoded.append(token)
        decoded = decoder.DecodeMove(token)
        assert move == decoded
    return encoded


In [150]:
# Smoke test of the output format: write the encoded first game as one line.
with open(filepath) as pgn:
    game = chess.pgn.read_game(pgn)

# `out_filepath` is not defined anywhere in this notebook, so this cell raised
# NameError on a fresh kernel; write to the configured intermediate file instead.
# NOTE(review): confirm this is the intended target - the bulk-export cell
# below rewrites the same file.
with open(intermediate_filepath, "w") as out_file:
    out_file.write(" ".join(encode_game_with_check(game)) + "\n")


In [164]:
# Encode games from the dump into the intermediate file (one game per line)
# and collect the set of distinct tokens, which is then written out sorted.
i = 0
seconds = time.time()

token_set = set()
with open(filepath) as pgn, open(intermediate_filepath, "w") as out_file:
    while True:
        game = chess.pgn.read_game(pgn)
        if game is None:
            # read_game returns None once the file is exhausted; stop cleanly
            # if the dump holds fewer games than the cap below.
            break
        move_seq = encode_game_with_check(game)
        token_set.update(move_seq)
        out_file.write(" ".join(move_seq) + "\n")
        i = i + 1
        if i % 1000 == 0:
            print (i, round(time.time() - seconds, 2), "sec")
            seconds = time.time()
        # NOTE: the counter is checked after the write, so this stops once
        # 50001 games have been written.
        if i > 50000:
            break

# sorted() accepts any iterable, so no intermediate list is needed.
token_list = sorted(token_set)

with open(tokens_filepath, "w") as out_file:
    out_file.write("\n".join(token_list) + "\n")


1000 2.62 sec
2000 2.68 sec
3000 2.64 sec
4000 2.59 sec
5000 2.62 sec
6000 2.72 sec
7000 2.7 sec
8000 2.62 sec
9000 2.46 sec
10000 2.52 sec
11000 2.65 sec
12000 2.65 sec
13000 2.58 sec
14000 2.5 sec
15000 2.54 sec
16000 2.64 sec
17000 2.68 sec
18000 2.69 sec
19000 2.67 sec
20000 2.6 sec
21000 2.67 sec
22000 2.72 sec
23000 2.66 sec
24000 2.65 sec
25000 2.61 sec
26000 2.65 sec
27000 2.64 sec
28000 2.64 sec
29000 2.7 sec
30000 2.66 sec
31000 2.67 sec
32000 2.62 sec
33000 2.64 sec
34000 2.58 sec
35000 2.67 sec
36000 2.62 sec
37000 2.66 sec
38000 2.63 sec
39000 2.57 sec
40000 2.46 sec
41000 2.53 sec
42000 2.47 sec
43000 2.62 sec
44000 2.72 sec
45000 2.58 sec
46000 2.55 sec
47000 2.63 sec
48000 2.57 sec
49000 2.7 sec
50000 2.62 sec
['BD:a1', 'BD:a3', 'BD:a5', 'BD:a7', 'BD:b2', 'BD:b4', 'BD:b6', 'BD:b8', 'BD:c1', 'BD:c3', 'BD:c5', 'BD:c7', 'BD:d2', 'BD:d4', 'BD:d6', 'BD:d8', 'BD:e1', 'BD:e3', 'BD:e5', 'BD:e7', 'BD:f2', 'BD:f4', 'BD:f6', 'BD:f8', 'BD:g1', 'BD:g3', 'BD:g5', 'BD:g7', 'BD:h2', 'B