# To run this, you should use you Princeton google account on google colab, as it has unlimited google drive storage.

## First we need to install all dependencies

In [31]:
%pip install wget
%pip install zstandard
%pip install python-chess
!apt install zstd

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
zstd is already the newest version (1.4.8+dfsg-3build1).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


## We will mount google drive to store our dataset on.

In [32]:
from google.colab import drive
drive.mount('/gdrive')
gdrive_path_dataset = "/gdrive/My Drive/lichess-dataset/" #Adjust this path to your own Google Drive folder
gdrive_path_pruned = "/gdrive/My Drive/lichess-pruned/" #Adjust this path to your own Google Drive folder

if not os.path.exists(gdrive_path_dataset):
    os.makedirs(gdrive_path_dataset)

if not os.path.exists(gdrive_path_pruned):
    os.makedirs(gdrive_path_pruned)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


## Adjust `thread_count` as needed for potential speed improvements.


In [33]:
# Setup dataset
import wget
import zstandard
import pathlib
from tqdm.auto import tqdm
import os
import io
from multiprocessing import Pool, Queue, Process, Manager
import random
import subprocess
import shutil

# Setup globals
compressed = list() # List of compressed files
thread_count = 4 # Number of threads to use

## Downloading the entire dataset will take on the order of 10s of hours. If needed, you can instead download only a part of the dataset by slicing the download list.

In [34]:
def setup():
    # Setup lichess dataset index
    download_list_path = wget.download('https://database.lichess.org/standard/list.txt')
    download_list_file = open(download_list_path, "r")
    download_list = download_list_file.read().splitlines()
    download_list_file.close()
    pathlib.Path(download_list_path).unlink()
    compressed.clear()

    download_list = download_list[-10:] # Slice this to use a smaller dataset.

    # Download entire dataset
    for url in tqdm(list(reversed(download_list))):
        download_path = gdrive_path_dataset + pathlib.PurePath(url).name
        compressed.append(download_path)
        # Skip pruning if already pruned
        if pathlib.Path(gdrive_path_pruned + pathlib.PurePath(url).stem ).exists():
            compressed.pop()
        elif pathlib.Path(gdrive_path_pruned + pathlib.PurePath(url).stem + ".zst" ).exists():
            compressed.pop()
        # Skip downloading if already downloaded
        if pathlib.Path(download_path).exists():
            continue
        print("Downloading " + url + "...")
        filepath = wget.download(url)
        # Move to drive
        shutil.move(filepath, gdrive_path_dataset + pathlib.Path(filepath).name)

# This WILL take a long time when running initially.
setup()


  0%|          | 0/10 [00:00<?, ?it/s]

Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-01.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-02.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-03.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-04.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-05.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-06.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-07.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-08.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-09.pgn.zst...
Downloading https://database.lichess.org/standard/lichess_db_standard_rated_2013-10.pgn.zst...


In [35]:
def filter_compressed_file(compressed_gamebook_index):
    index, compressed_gamebook, mp_queue = compressed_gamebook_index
    print(index)
    fh = open(compressed_gamebook, "rb")
    filesize = os.fstat(fh.fileno()).st_size
    dctx = zstandard.ZstdDecompressor()
    stream_reader = dctx.stream_reader(fh, read_size=min(1073741824>>6, filesize))
    pgn = io.TextIOWrapper(stream_reader, encoding="utf-8")
    gamebook_path = pathlib.PurePath(compressed_gamebook).stem
    pruned_gamebook = pathlib.PurePath(gamebook_path).stem + "-prune" + pathlib.PurePath(gamebook_path).suffix
    pgn2 = open(pruned_gamebook, mode="w", buffering=1073741824//2)
    game_count = 0
    written_count = 0
    game = list() # List of lines in the game
    while True:
        game.clear()
        white_elo = 0
        black_elo = 0
        bullet = False
        while True:
            line = pgn.readline()
            game.append(line)
            if line == "":
                break
            elif len(line) > 6 and line[6] == "E":
                if line[1] == "W": # line.startswith("[WhiteElo")
                    try:
                        white_elo = int(line[11:-3])
                    except:
                        pass
                elif line[1] == "B": # line.startswith("[BlackElo")
                    try:
                        black_elo = int(line[11:-3])
                    except:
                        pass
            elif line.startswith("[Event ") and "Bullet" in line:
                bullet = True
            elif line == "\n" and game[-3] == "\n":
                break
        game_count += 1
        if line == "":
            break
        elif white_elo < 2000 or black_elo < 2000 or bullet == True:
            continue
        # Write the game to the pruned gamebook
        pgn2.writelines(game)
        written_count += 1
        if written_count % 211 == 0:
            mp_queue.put((index, fh.tell(), filesize, written_count, game_count))

    mp_queue.put((index, fh.tell(), filesize, written_count, game_count))
    pgn.close()
    pgn2.close()
    stream_reader.close()
    fh.close()
    print("Finished " + compressed_gamebook + " with " + str(game_count) + " games", end="")

    # Rename file to original name
    shutil.move(pruned_gamebook, gdrive_path_dataset + pathlib.Path(gdrive_path_pruned + gamebook_path).name)
    print(", renamed.")

    #Run zstandard on the pruned gamebook
    # subprocess.run(["zstd", "pruned/" + gamebook_path], capture_output=True)
    # pathlib.Path("pruned/" + gamebook_path).unlink()

    return compressed_gamebook


In [36]:
"""Progress bar tracking"""
def process_queue(queue, compressed):
    pbars = [None for _ in range(queue.get())]
    pbars_updated = list()

    # Want to exit the loop when all progress bars are finished
    remaining = len(pbars) * [True]

    overall_pbar = tqdm(desc="Overall progress: ", total=len(pbars),
                        bar_format="{desc}{percentage:3.0f}%|{bar}|[{elapsed}<{remaining}, {rate_fmt}{postfix}]")
    overall_progress = [0] * len(pbars)

    for i, filename in enumerate(compressed):
        compressed[i] = pathlib.PurePath(filename).stem[0:-4]

    while any(remaining):
        index, fh_tell, filesize, written_count, game_count = queue.get()
        if not remaining[index]:
            continue
        if pbars[index] is None:
            pbars[index] = tqdm(total=filesize / 1073741824, desc=compressed[index] + ":",
                                unit="GB", unit_scale=True,
                                bar_format="{desc}{percentage:3.0f}%|{bar}| [{n_fmt}/{total_fmt}GB {elapsed}<{remaining}]{postfix}")
            pbars_updated.append(pbars[index])
        pbar = pbars[index]
        pbar.update(fh_tell / 1073741824 - pbars[index].n)
        pbar.set_postfix_str(f"Found {written_count} of {game_count}.")
        overall_progress[index] = fh_tell / filesize
        overall_pbar.update(sum(overall_progress) - overall_pbar.n)
        if fh_tell == filesize:
            remaining[index] = False
            pbar.close()
            pbars_updated.remove(pbar)


In [37]:
def main():
    setup()

    # Shuffle the list of compressed files to distribute the load
    # random.shuffle(compressed)

    with Pool(thread_count) as p:
        m = Manager()
        mp_queue2 = m.Queue()
        mp_queue2.put(len(compressed))
        print(compressed)
        r = p.map_async(filter_compressed_file, zip(range(len(compressed)), compressed, [mp_queue2] * len(compressed)))
        process_queue(mp_queue2, compressed)
        r.wait()

if __name__ == "__main__":
  main()

  0%|          | 0/10 [00:00<?, ?it/s]

0
1
32['/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-01.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-02.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-03.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-04.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-05.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-06.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-07.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-08.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-09.pgn.zst', '/gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-10.pgn.zst']




Overall progress:   0%|          |[00:00<?, ?it/s]

lichess_db_standard_rated_2013-03:  0%|          | [0.00/0.02GB 00:00<?]

lichess_db_standard_rated_2013-02:  0%|          | [0.00/0.02GB 00:00<?]

lichess_db_standard_rated_2013-04:  0%|          | [0.00/0.02GB 00:00<?]

Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-02.pgn.zst with 123962 games, renamed.
4
Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-01.pgn.zst with 121333 games, renamed.


lichess_db_standard_rated_2013-01:  0%|          | [0.00/0.02GB 00:00<?]

5
Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-03.pgn.zst with 158636 games, renamed.
6
Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-04.pgn.zst with 157872 games, renamed.
7


lichess_db_standard_rated_2013-05:  0%|          | [0.00/0.02GB 00:00<?]

lichess_db_standard_rated_2013-06:  0%|          | [0.00/0.03GB 00:00<?]

lichess_db_standard_rated_2013-07:  0%|          | [0.00/0.04GB 00:00<?]

lichess_db_standard_rated_2013-08:  0%|          | [0.00/0.04GB 00:00<?]

Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-05.pgn.zst with 179551 games, renamed.
8
Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-06.pgn.zst with 224680 games, renamed.
9


lichess_db_standard_rated_2013-09:  0%|          | [0.00/0.04GB 00:00<?]

lichess_db_standard_rated_2013-10:  0%|          | [0.00/0.06GB 00:00<?]

Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-07.pgn.zst with 293460 games, renamed.
Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-08.pgn.zst with 325526 games, renamed.
Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-09.pgn.zst with 325099 games, renamed.
Finished /gdrive/My Drive/lichess-dataset3/lichess_db_standard_rated_2013-10.pgn.zst with 411040 games, renamed.
