In [2]:
from peewee import *
import base64

# SQLite file holding the evaluation rows that every query in this notebook reads.
DB_FILENAME = '2021-07-31-lichess-evaluations-37MM.db'
db = SqliteDatabase(DB_FILENAME)

In [3]:
class Evaluations(Model):
  """Peewee model for one stored position together with its evaluation.

  Rows are looked up by ``id`` elsewhere in this notebook.
  """

  # Integer key used for lookups (``Evaluations.id == ...``).
  id = IntegerField()
  # Position text; this notebook feeds it to ``chess.Board``.
  fen = TextField()
  # Raw bytes for the position; the encoding is not visible here -- TODO confirm.
  binary = BlobField()
  # Numeric evaluation; units and sign convention are not visible here -- TODO confirm.
  eval = FloatField()

  class Meta:
    # Bind the model to the module-level database handle.
    database = db

  def binary_base64(self):
    """Return the ``binary`` blob base64-encoded, as ``bytes``."""
    encoded = base64.b64encode(self.binary)
    return encoded
# peewee raises OperationalError when connect() is called on a connection that
# is already open, which is exactly what happens when this cell is re-run in a
# live kernel. reuse_if_open=True makes the cell safe to execute repeatedly.
db.connect(reuse_if_open=True)

# Hard-coded size of the id space sampled from later in the notebook
# (presumably the number of rows in the table -- TODO confirm against the DB).
LABEL_COUNT = 37164639
print(LABEL_COUNT)

# Sanity check: fetch the row with id 1 and show its blob as base64.
# Named `first_row` rather than `eval` so the builtin eval() is not shadowed.
first_row = Evaluations.get(Evaluations.id == 1)
print(first_row.binary_base64())

37164639
b'CAAAAAAAAAAQAAAAAAAAAIEAAAAAAAAAJAAAAAAAAABCAAAAAAAAAADvABAAAAAAAAAAAAAAAAgAAAAAAAAAEAAAAAAAAACBAAAAAAAAACQAAAAAAAAAQgAAAAAAAP8AAAABEz8='


In [5]:
# Pick 1M distinct row ids, without repeats, from 1..LABEL_COUNT.
# The ids are treated as 1-based (this notebook fetches id == 1 as its first
# row), so the 0-based draw is shifted by one: sampling 0..LABEL_COUNT-1
# directly could request a non-existent id 0 and could never reach the last id.
# NOTE(review): assumes ids are contiguous in 1..LABEL_COUNT -- TODO confirm.
import numpy as np

SAMPLE_SIZE = 1000000
SEED = 0  # fixed seed so a kernel restart reproduces the same sample

rng = np.random.default_rng(SEED)
index = rng.choice(LABEL_COUNT, SAMPLE_SIZE, replace=False) + 1

In [6]:

        
from collections import Counter
import chess
from tqdm import tqdm
def _position_features(fen):
    """Summarise one FEN string as a flat dict of piece counts and metadata.

    Keys, in insertion order:
      'fen'            the input string, unchanged
      <piece symbol>   one key per piece symbol present on the board
                       (str() of each python-chess piece), mapped to its count
      'num_pieces'     total number of pieces on the board
      'bucket'         int((num_pieces - 1) / 4), i.e. 1-4 pieces -> 0,
                       5-8 -> 1, and so on
      'white_to_move'  board.turn (True when White is to move in python-chess)

    Symbols that do not occur on the board get no key at all.
    """
    board = chess.Board(fen)
    pieces = dict(Counter(str(piece) for piece in board.piece_map().values()))
    num_pieces = sum(pieces.values())

    features = {'fen': fen, **pieces}
    features['num_pieces'] = num_pieces
    features['bucket'] = int((num_pieces - 1) / 4)
    features['white_to_move'] = board.turn
    return features


results = []
for idx in tqdm(index):
    # One point lookup per sampled id. Bound to `row` rather than `eval` so the
    # builtin eval() is not shadowed for the rest of the session.
    row = Evaluations.get(Evaluations.id == idx)
    results.append(_position_features(row.fen))

100%|██████████| 1000000/1000000 [03:34<00:00, 4658.32it/s]


In [7]:
import pandas as pd
# One row per sampled position. A piece symbol missing from a position has no
# key in that position's dict, so pandas leaves NaN there; treat it as count 0.
df = pd.DataFrame(results)
df = df.fillna(value=0)
print(df.shape)

# Keep only the first row for each distinct FEN string, then show the frame
# and its size after de-duplication.
df = df.drop_duplicates(subset='fen', keep='first')
print(df.head(50))
print(df.shape)

(1000000, 16)
                                                  fen  k    r    q    b    n  \
0   rnbq1rk1/2p2ppp/p2bpn2/1p6/1P1P4/P1N1PN2/B4PPP...  1  2.0  1.0  2.0  2.0   
1   2kr2nr/2p2pq1/p6p/1Pb3p1/Q3Bp2/2P2P1b/1P1N3P/R...  1  2.0  1.0  2.0  1.0   
2   1k6/p1q4Q/Pn2n2p/N2pP3/3P1p1P/5Pp1/6P1/2R3K1 b...  1  0.0  1.0  0.0  2.0   
3                 3r4/8/8/3K1k2/8/3N4/8/8 w - - 43 99  1  1.0  0.0  0.0  0.0   
4   rnbk2nr/pppp2Qp/1b6/4p1N1/1q2P3/8/PPPP1PPP/RNB...  1  2.0  1.0  2.0  2.0   
5   4r1k1/1ppb1ppp/p1np4/4r3/B1P5/4B3/P1P2PPP/1R3R...  1  2.0  0.0  1.0  1.0   
6   r2qkb1r/ppp1np1p/6p1/8/2B1Q3/8/PPPP1PPP/R1B1R1...  1  2.0  1.0  1.0  1.0   
7   r2qbrk1/1p2R1pp/2pP1np1/2P1N3/p1P2RP1/2Q4P/1B6...  1  2.0  1.0  1.0  1.0   
8   2r3k1/5qp1/r1B4R/5p1p/ppQP4/4P3/PP4PP/3R2K1 w ...  1  2.0  1.0  0.0  0.0   
9   3r1rk1/1b3pbp/p4n2/1p6/3P4/3BP3/PP3PPP/RNBQ1RK...  1  2.0  0.0  2.0  1.0   
10  r3kb1r/pp1npppp/2p2n2/q4b2/2BP4/2N2N1P/PPP2PP1...  1  2.0  1.0  2.0  2.0   
11  r2q2k1/8/p1Qn1r2/1p2p2

In [8]:
# Persist the de-duplicated frame. to_pickle does not create missing parent
# directories, so make sure the output folder exists before writing.
from pathlib import Path

OUTPUT_PATH = 'production_models/fen_df_lichess1m.pkl'
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(OUTPUT_PATH)