In [5]:
!python3 -m pip install tensorflow
%pip install faiss-cpu
%pip install h5py
%pip install zstandard
%pip install python-chess
%pip install pinecone-client
!git clone https://github.com/anirudhajith/chesspos.git
!python -m pip install ./chesspos


fatal: destination path 'chesspos' already exists and is not an empty directory.
Processing ./chesspos
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: chesspos
  Building wheel for chesspos (setup.py) ... [?25l[?25hdone
  Created wheel for chesspos: filename=chesspos-0.1.2-py3-none-any.whl size=27169 sha256=a1a713687172a38151abb4c56bc4814b5b483274f3d31f127442d50c6e354d87
  Stored in directory: /tmp/pip-ephem-wheel-cache-5w9pln71/wheels/ff/c3/0e/3d12135a0b60cff5f73ee74213865e7d71a52ef31b00bbb49f
Successfully built chesspos
Installing collected packages: chesspos
  Attempting uninstall: chesspos
    Found existing installation: chesspos 0.1.2
    Uninstalling chesspos-0.1.2:
      Successfully uninstalled chesspos-0.1.2
Successfully installed chesspos-0.1.2


In [6]:
import os
from google.colab import drive
drive.mount('/gdrive')

gdrive_path_dataset = "/gdrive/My Drive/lichess-dataset/"
gdrive_path_pruned = "/gdrive/My Drive/lichess-pruned/"

if not os.path.exists(gdrive_path_dataset):
    os.makedirs(gdrive_path_dataset)

if not os.path.exists(gdrive_path_pruned):
    os.makedirs(gdrive_path_pruned)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import chess.pgn
import chess
import zstandard
import os
import io
from tqdm.auto import tqdm
from multiprocessing import Pool, Queue, Manager
import pinecone
import itertools
import numpy as np
import tensorflow as tf
import pathlib
import shutil
from google.colab import userdata

thread_count = 2
start_skip = 0

def board_to_bitboard(board):
	embedding = np.array([], dtype=bool)
	for color in [1, 0]:
		for i in range(1, 7): # P N B R Q K / white
			bmp = np.zeros(shape=(64,)).astype(bool)
			for j in list(board.pieces(i, color)):
				bmp[j] = True
			embedding = np.concatenate((embedding, bmp))
	additional = np.array([
		bool(board.turn),
		bool(board.castling_rights & chess.BB_A1),
		bool(board.castling_rights & chess.BB_H1),
		bool(board.castling_rights & chess.BB_A8),
		bool(board.castling_rights & chess.BB_H8)
	])
	embedding = np.concatenate((embedding, additional))
	return embedding


def upload_pgn(pgn_file):
    pgn_file, mp_queue, start_skip = pgn_file
    model = tf.keras.models.load_model("model_encoder.h5")
    fh = open(pgn_file, "rb")
    print("Processing " + pgn_file + "...")
    filesize = os.fstat(fh.fileno()).st_size
    dctx = zstandard.ZstdDecompressor()
    stream_reader = dctx.stream_reader(fh, read_size=min(1073741824>>6, filesize))
    # stream_reader = dctx.stream_reader(fh)
    pgn = io.TextIOWrapper(stream_reader, encoding="cp1252")
    pbar = tqdm(total=filesize, unit="B", unit_scale=True)

    # Start a loop to iterate through all games in the pgn file
    game_count = 0
    documents = []
    boards = []
    if (start_skip > 0):
      print("Skipping", start_skip)
    for i in range(start_skip):
        chess.pgn.skip_game(pgn)
        game_count += 1
        if game_count % 2000 == 0:
            pbar.update(fh.tell() - pbar.n)
            print("Skipped " + str(game_count) + " games")
            pbar.set_postfix_str("Skipped " + str(game_count) + " games")

    while True:
        # Read a game from the pgn file
        game = chess.pgn.read_game(pgn)
        if game is None:
            break
        game_count += 1
        if game_count % 1000 == 0:
            # Insert the documents into the database

            query = np.array([board_to_bitboard(board) for board in boards])
            embeddings = model.predict_on_batch(query)
            mp_queue.put((documents, embeddings))
            documents.clear()
            boards.clear()
            pbar.update(fh.tell() - pbar.n)
            pbar.set_postfix_str("Processed " + str(game_count) + " games")
            yield (documents, embeddings)

        board = game.board()


        for move in game.mainline_moves():
            # Extract the board position as a base64 encoded string
            board_string = board.fen()
            # Extract the move that was made
            move_string = move.uci()
            # Create a document to insert into the database
            boards.append(board.copy(stack=False))
            documents.append([ board_string, move_string])
            # Make the move on the board
            board.push(move)

    query = np.array([board_to_bitboard(board) for board in boards])
    embeddings = model.predict_on_batch(query)
    mp_queue.put((documents, embeddings))

    pbar.update(fh.tell() - pbar.n)
    pbar.set_postfix_str("Processed " + str(game_count) + " games")
    pbar.close()


    # Close the pgn file
    pgn.close()

    # Print the number of games processed
    print("Processed " + str(game_count) + " games.")



def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))


# Want to run this upload_pgn function for each pgn.
def main():
    # Download the model and dataset from drive
    if not pathlib.Path("model_encoder.h5").exists():
      try:
        shutil.copyfile("/gdrive/Shareddrives/chess-ai/chesspos-codebase/deep64/model_encoder.h5", "model_encoder.h5")
      except:
        pass

    dataset_files = os.listdir(gdrive_path_pruned)
    for file in dataset_files:
      shutil.copyfile(gdrive_path_pruned + file, file)

    m = Manager()
    mp_queue = m.Queue(maxsize=20)

    # You will need to set up your secret key to connect to your pinecone index
    pinecone.init(api_key=userdata.get('pinecone_api_key'), environment="us-east4-gcp")
    index = pinecone.Index("chesspos-lichess-embeddings", pool_threads=2)

    dataset_files.sort()
    pgn_files = []
    for file in tqdm(dataset_files):
        if file.endswith(".zst"):
            pgn_file = file
            pgn_files.append((pgn_file, mp_queue, start_skip))


    for pgn_file in pgn_files:
      for _ in upload_pgn(pgn_file):
          boards, embeddings = mp_queue.get()
          vectors = [(boards[i][0], embeddings[i].tolist(), {"move": boards[i][1]}) for i in range(len(boards))]
          index.upsert(vectors=vectors, batch_size=100, show_progress=True)
    index.close()


if __name__ == "__main__":
    main()


  0%|          | 0/10 [00:00<?, ?it/s]



Processing lichess_db_standard_rated_2013-01.pgn.zst...


  0%|          | 0.00/48.3k [00:00<?, ?B/s]



Processed 195 games.
Processing lichess_db_standard_rated_2013-02.pgn.zst...


  0%|          | 0.00/119k [00:00<?, ?B/s]



Processed 476 games.
Processing lichess_db_standard_rated_2013-03.pgn.zst...


  0%|          | 0.00/140k [00:00<?, ?B/s]



Processed 575 games.
Processing lichess_db_standard_rated_2013-04.pgn.zst...


  0%|          | 0.00/103k [00:00<?, ?B/s]