In [31]:
# Monthly Lichess dump of rated standard games (zstd-compressed PGN) to stream.
DATABASE_URL = (
    "https://database.lichess.org/standard/"
    "lichess_db_standard_rated_2023-02.pgn.zst"
)
# Rating cutoff: a game is kept only when BOTH players are rated strictly above this.
COLLECT_GAMES_OVER_ELO = 2_500

In [3]:
import requests
import zstandard as zstd
import io
import chess.pgn

In [60]:
def _parse_elo(raw_value):
    """Return a rating header as an int, or None when it is missing or non-numeric (e.g. "?")."""
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        return None


def _read_game_if_strong(game_pgn, min_elo):
    """Parse one game's PGN text; return the game only if both players are rated above min_elo."""
    game = chess.pgn.read_game(io.StringIO(game_pgn))
    if game is None:
        return None
    white_elo = _parse_elo(game.headers.get("WhiteElo"))
    black_elo = _parse_elo(game.headers.get("BlackElo"))
    if white_elo is None or black_elo is None:
        # An unusable rating must not abort a scan over the whole dump.
        return None
    if white_elo > min_elo and black_elo > min_elo:
        return game
    return None


def get_games_over_2000(min_elo=None):
    """Stream the Lichess dump at DATABASE_URL and collect the high-rated games.

    The function name is historical: the cutoff actually applied is
    ``COLLECT_GAMES_OVER_ELO`` unless ``min_elo`` is passed explicitly.

    The compressed file is decompressed and parsed on the fly, so it is never
    held in memory or written to disk as a whole.

    Args:
        min_elo: Optional rating cutoff. A game is kept only when both
            ``WhiteElo`` and ``BlackElo`` are strictly greater than it.
            Defaults to ``COLLECT_GAMES_OVER_ELO``.

    Returns:
        list: The parsed ``chess.pgn`` games that passed the cutoff, in file order.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    Side effects:
        Increments the module-level ``game_count`` once per ``[Event`` line
        seen (i.e. once per game scanned, kept or not). The counter is created
        at 0 if it does not exist yet.
    """
    global game_count
    # Tolerate a fresh kernel where the counter cell has not been run yet.
    game_count = globals().get("game_count", 0)

    if min_elo is None:
        min_elo = COLLECT_GAMES_OVER_ELO

    games = []
    pending_lines = []  # lines of the game currently being accumulated

    with requests.get(DATABASE_URL, stream=True, timeout=60) as r:
        # Fail early instead of feeding an HTML error page to the zstd decoder.
        r.raise_for_status()
        dctx = zstd.ZstdDecompressor()

        with io.BufferedReader(r.raw, buffer_size=16 * 1024) as reader:
            with dctx.stream_reader(reader) as decompressed_stream:
                for line in io.TextIOWrapper(decompressed_stream, encoding="utf-8"):
                    if not line.strip():
                        continue
                    if line.startswith("[Event"):
                        # A new "[Event" header marks the end of the previous game.
                        game_count += 1
                        if pending_lines:
                            game = _read_game_if_strong("".join(pending_lines), min_elo)
                            if game is not None:
                                games.append(game)
                        pending_lines = [line]
                    else:
                        pending_lines.append(line)

    # The final game has no following "[Event" line to trigger its evaluation.
    if pending_lines:
        game = _read_game_if_strong("".join(pending_lines), min_elo)
        if game is not None:
            games.append(game)

    return games

In [47]:
# Reset the module-level counter before the scan: get_games_over_2000()
# increments it once for every "[Event" line it reads.
game_count = 0
games = get_games_over_2000()
# Number of games scanned in the dump, not the number kept in `games`.
print(game_count)

Headers(Event='Rated Bullet game', Site='https://lichess.org/yceQKND6', Date='2023.02.01', Round='-', White='sebthebest', Black='beforeafter1', Result='1-0', BlackElo='2513', BlackRatingDiff='-5', ECO='C25', Opening='Vienna Game: Max Lange Defense', Termination='Normal', TimeControl='60+0', UTCDate='2023.02.01', UTCTime='00:00:12', WhiteElo='2510', WhiteRatingDiff='+6')
Headers(Event='Rated Bullet game', Site='https://lichess.org/i2D9cMn6', Date='2023.02.01', Round='-', White='Thunder-In-Paradise', Black='BEAST_MODE_ON', Result='1-0', BlackElo='2633', BlackRatingDiff='-7', ECO='D22', Opening="Queen's Gambit Accepted: Alekhine Defense, Haberditz Variation", Termination='Normal', TimeControl='60+0', UTCDate='2023.02.01', UTCTime='00:00:24', WhiteElo='2593', WhiteRatingDiff='+6', WhiteTitle='NM')
Headers(Event='Rated Blitz game', Site='https://lichess.org/yxEGth3Y', Date='2023.02.01', Round='-', White='ForeverColle', Black='CRmessi05', Result='1-0', BlackElo='2530', BlackRatingDiff='-5', 

In [49]:

# Render the second collected game as PGN text to check headers and moves.
str(games[1])

'[Event "Rated Bullet game"]\n[Site "https://lichess.org/i2D9cMn6"]\n[Date "2023.02.01"]\n[Round "-"]\n[White "Thunder-In-Paradise"]\n[Black "BEAST_MODE_ON"]\n[Result "1-0"]\n[BlackElo "2633"]\n[BlackRatingDiff "-7"]\n[ECO "D22"]\n[Opening "Queen\'s Gambit Accepted: Alekhine Defense, Haberditz Variation"]\n[Termination "Normal"]\n[TimeControl "60+0"]\n[UTCDate "2023.02.01"]\n[UTCTime "00:00:24"]\n[WhiteElo "2593"]\n[WhiteRatingDiff "+6"]\n[WhiteTitle "NM"]\n\n1. d4 { [%clk 0:01:00] } 1... d5 { [%clk 0:01:00] } 2. c4 { [%clk 0:00:59] } 2... dxc4 { [%clk 0:01:00] } 3. Nf3 { [%clk 0:00:58] } 3... a6 { [%clk 0:01:00] } 4. e3 { [%clk 0:00:56] } 4... b5 { [%clk 0:00:59] } 5. a4 { [%clk 0:00:55] } 5... Bb7 { [%clk 0:00:58] } 6. b3 { [%clk 0:00:52] } 6... e6 { [%clk 0:00:55] } 7. bxc4 { [%clk 0:00:51] } 7... bxc4 { [%clk 0:00:55] } 8. Bxc4 { [%clk 0:00:49] } 8... Nf6 { [%clk 0:00:55] } 9. O-O { [%clk 0:00:47] } 9... c5 { [%clk 0:00:55] } 10. Ba3 { [%clk 0:00:46] } 10... Nbd7 { [%clk 0:00:54] }

With a 2500 rating cutoff we get 15588 games, of which 100 are within the cutoff. This is out of 100M games. The resulting file is 300kB. For all games we get a roughly 1.9GB file to keep only 2500-rated games.

In [54]:
def write_games_to_pgn_file(games, path="games.pgn", encoding="utf-8"):
    """Write games to a PGN file, one after another.

    Args:
        games: Iterable of objects whose ``str()`` is the text to store for
            one game (the notebook passes parsed ``chess.pgn`` games).
        path: Destination file. Defaults to ``"games.pgn"``; an existing file
            is overwritten.
        encoding: Text encoding of the output. Defaults to UTF-8, matching the
            encoding the source dump is decoded with, so the result does not
            depend on the platform's locale.
    """
    with open(path, "w", encoding=encoding) as f:
        for game in games:
            # Two newlines leave a blank line between consecutive games.
            f.write(str(game) + "\n\n")

In [55]:
# Persist the collected games so they can be reloaded without re-scanning the dump.
write_games_to_pgn_file(games)

In [56]:
# Sanity check: read the first game back from the file that was just written.
# The handle is deliberately left open because the next cell keeps reading from it.
# NOTE(review): opened with the platform default encoding; confirm it matches
# the encoding games.pgn was written with.
pgn = open("games.pgn")
game = chess.pgn.read_game(pgn)
game

<Game at 0x10bdad590 ('sebthebest' vs. 'beforeafter1', '2023.02.01')>

In [59]:
# Read again from the same open handle; each call continues from where the previous read stopped.
chess.pgn.read_game(pgn)

<Game at 0x10b6a86d0 ('IMFAR' vs. 'White_7_Tiger', '2023.02.01')>