In [None]:
!python3 -m pip install tensorflow[and-cuda]
%pip install faiss-cpu
%pip install h5py
%pip install zstandard
%pip install python-chess
!git clone https://github.com/anirudhajith/chesspos.git
!python -m pip install ./chesspos


Collecting nvidia-cuda-runtime-cu11==11.8.89 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cublas-cu11==11.11.3.6 (from tensorflow[and-cuda])
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux1_x86_64.whl (417.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cufft-cu11==10.9.0.58 (from tensorflow[and-cuda])
  Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.4/168.4 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu11==8.7.0.84 (from tensorflow[and-cuda])
  Downloading nvidia_cudnn_cu11-8.7.0.84-py3-none-manylinux1_x86_64.whl (728.5 MB)

In [None]:
"""## Setup imports and globals"""

import chess.pgn
import chess
import zstandard
import os
import io
from tqdm.auto import tqdm
from multiprocessing import Pool, Queue, Manager
import pinecone
import itertools
import numpy as np
import tensorflow as tf
from chesspos.search.binary_index import board_to_bitboard
from

# Number of worker processes in the multiprocessing Pool created by main().
thread_count = 8
# Number of games upload_pgn() skips at the start of each pgn file before it
# begins encoding positions.
start_skip = 0 # Adjust this to skip games in the pgn file

def _flush_batch(model, boards, documents, mp_queue):
    """Embed the pending positions and queue them for upload.

    Args:
        model: Loaded Keras encoder; invoked through ``predict_on_batch``.
        boards: ``chess.Board`` positions waiting to be encoded.
        documents: ``[fen, uci_move]`` pairs, parallel to ``boards``.
        mp_queue: Queue that receives one ``(documents, embeddings)`` tuple.
    """
    if not boards:
        # Nothing pending (e.g. a file without games): do not run the
        # encoder on an empty array or queue an empty batch.
        return
    query = np.array([board_to_bitboard(board) for board in boards])
    embeddings = model.predict_on_batch(query)
    mp_queue.put((documents, embeddings))


def upload_pgn(pgn_file):
    """Encode every mainline position of one zstd-compressed PGN file.

    For each move of each game the position before the move is recorded as
    a ``[fen, uci_move]`` document together with its board. Every 2000
    games, and once more at the end of the file, the pending boards are
    embedded with the encoder model and ``(documents, embeddings)`` is put
    on the queue for the consumer in ``main``.

    Args:
        pgn_file: ``(path, mp_queue)`` tuple; ``path`` is the ``.zst`` file
            to read and ``mp_queue`` the queue shared with the consumer.
            A single tuple is used so the function works with ``Pool.map``.
    """
    pgn_path, mp_queue = pgn_file
    model = tf.keras.models.load_model("model_encoder.h5") # Change this to the path of your model
    print("Processing " + pgn_path + "...")

    game_count = 0
    documents = []
    boards = []

    # Context managers guarantee the file, the text wrapper and the progress
    # bar are released even when parsing or encoding raises.
    with open(pgn_path, "rb") as fh:
        filesize = os.fstat(fh.fileno()).st_size
        dctx = zstandard.ZstdDecompressor()
        stream_reader = dctx.stream_reader(fh, read_size=min(1073741824 >> 6, filesize))
        # PGN is ASCII/UTF-8 text; decode explicitly instead of relying on
        # the locale's default encoding.
        with io.TextIOWrapper(stream_reader, encoding="utf-8") as pgn:
            with tqdm(total=filesize, unit="B", unit_scale=True) as pbar:
                for _ in range(start_skip):
                    if not chess.pgn.skip_game(pgn):
                        # End of file reached before start_skip games.
                        break
                    game_count += 1
                    if game_count % 2000 == 0:
                        # Progress is measured in compressed bytes consumed.
                        pbar.update(fh.tell() - pbar.n)
                        pbar.set_postfix_str("Skipped " + str(game_count) + " games")

                while True:
                    game = chess.pgn.read_game(pgn)
                    if game is None:
                        break
                    game_count += 1
                    if game_count % 2000 == 0:
                        _flush_batch(model, boards, documents, mp_queue)
                        # Bind fresh lists rather than clearing in place so
                        # the batch just queued can never be mutated.
                        documents = []
                        boards = []
                        pbar.update(fh.tell() - pbar.n)
                        pbar.set_postfix_str("Processed " + str(game_count) + " games")

                    board = game.board()
                    for move in game.mainline_moves():
                        # Record the position (as FEN) *before* the move,
                        # paired with the move that was played from it.
                        boards.append(board.copy(stack=False))
                        documents.append([board.fen(), move.uci()])
                        board.push(move)

                # Flush the tail batch through the same encoder as the
                # periodic flushes.
                _flush_batch(model, boards, documents, mp_queue)

    # Print the number of games processed
    print("Processed " + str(game_count) + " games.")



def chunks(iterable, batch_size=100):
    """Yield consecutive tuples of at most ``batch_size`` items from ``iterable``.

    The last tuple may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        batch = tuple(itertools.islice(source, batch_size))
        if not batch:
            # Source exhausted (or batch_size is 0): stop the generator.
            return
        yield batch


# Want to run this upload_pgn function for each pgn.
def _upsert_batch(index, documents, embeddings):
    """Upsert one ``(documents, embeddings)`` batch into the Pinecone index.

    Args:
        index: Pinecone index object exposing ``upsert(..., async_req=True)``.
        documents: ``[fen, uci_move]`` pairs; the FEN is used as vector id.
        embeddings: One embedding per document, in the same order.
    """
    vectors = [
        (document[0], embedding.tolist(), {"move": document[1]})
        for document, embedding in zip(documents, embeddings)
    ]
    # Fire all chunk requests first, then wait, so they run concurrently on
    # the index's thread pool.
    pending = [
        index.upsert(vectors=ids_vectors_chunk, async_req=True)
        for ids_vectors_chunk in chunks(vectors, batch_size=100)
    ]
    for request in pending:
        request.get()


def main():
    """Encode every ``.zst`` PGN file in ``pruned2`` and upload the embeddings.

    Worker processes run ``upload_pgn`` over the files and push
    ``(documents, embeddings)`` batches onto a shared queue; this process
    consumes the queue and upserts the batches into Pinecone.

    Raises:
        Exception: Whatever a worker raised, re-raised once all workers
            have finished.
    """
    import queue  # local import: provides queue.Empty for the timed get below

    # NOTE(security): the fallback key below is committed in source. Rotate it
    # and supply the key through the PINECONE_API_KEY environment variable.
    api_key = os.environ.get("PINECONE_API_KEY", "38132697-8f87-4930-a355-376bd93394a3")
    pinecone.init(api_key=api_key, environment="us-east4-gcp")
    index = pinecone.Index("chesspos-lichess-embeddings", pool_threads=30)

    try:
        with Manager() as m:
            # Bounded queue: workers block on put() when the uploader lags.
            mp_queue = m.Queue(maxsize=20)

            # Files are processed in reverse-sorted name order.
            files = sorted(os.listdir("pruned2"), reverse=True)
            pgn_files = [
                ("pruned2/" + file, mp_queue)
                for file in tqdm(files)
                if file.endswith(".zst")
            ]

            with Pool(thread_count) as p:
                async_res = p.map_async(upload_pgn, pgn_files)

                while True:
                    # Check readiness *before* polling: if the workers were
                    # already done and the queue is then found empty, every
                    # batch has been consumed and it is safe to stop.
                    workers_done = async_res.ready()
                    try:
                        documents, embeddings = mp_queue.get(timeout=1)
                    except queue.Empty:
                        if workers_done:
                            break
                        continue
                    _upsert_batch(index, documents, embeddings)

                # Re-raise any exception from a worker instead of hanging
                # or silently ignoring it.
                async_res.get()
    finally:
        index.close()


# Run the upload only when this file is executed as a script. main() starts a
# multiprocessing Pool, so the guard also keeps worker processes that
# re-import this module (e.g. under the "spawn" start method) from launching
# the whole pipeline again.
if __name__ == "__main__":
    main()
