In [112]:
"""
idea:
    simulate and get board position after 10 moves (approx end of opening sequence)
    - train model to predict whether white or black will win
    simulate and get board position after 30? moves (material will be less equal at this point)
    - use logistic regression to approximate the relative value/strength of pieces (WE CAN COMPARE THIS BY ELO RANGES!!!)

would like to do a test, how well does the model do based on
    - arbitrary encoding of the pieces
    - logistic regression value
    - alpha zero value
    
board encoding:
    length 64 vector representing board
    - black, + white, 0 nothing

    arbitrary:
        1 pawn
        2 knight
        3 bishop
        4 rook
        5 queen
        6 king
    
    based on alphazero:
        1 pawn
        3.05 knight
        3.33 bishop
        5.63 rook
        9.5 queen
        infinity? king
"""

'\nidea:\n    simulate and get board position after 10 moves (approx end of opening sequence)\n    - train model to predict whether white or black will win\n    simulate and get board position after 30? moves (material will be less equal at this point)\n    - use logistic regression to approximate the relative value/strength of pieces (WE CAN COMPARE THIS BY ELO RANGES!!!)\n\nwould like to do a test, how well does the model do based on\n    - arbitrary encoding of the pieces\n    - logistic regression value\n    - alpha zero value\n    \nboard encoding:\n    length 64 vector representing board\n    - black, + white, 0 nothing\n\n    arbitrary:\n        1 pawn\n        2 knight\n        3 bishop\n        4 rook\n        5 queen\n        6 king\n    \n    based on alphazero:\n        1 pawn\n        3.05 knight\n        3.33 bishop\n        5.63 rook\n        9.5 queen\n        infinity? king\n'

In [1]:
import pandas as pd

df = pd.read_csv("chess_games.csv.zip", usecols=["Result", "AN", "WhiteElo", "BlackElo"], compression="zip")

In [39]:
df[df["WhiteElo"] >= 2500].count()

Result      18775
WhiteElo    18775
BlackElo    18775
AN          18775
dtype: int64

In [112]:
games = df[(df["WhiteElo"] >= 2200) & (df["BlackElo"] >= 2200) & (df['Result'] != '1/2-1/2') & ('%eval' not in df['AN'])]
white_win = games[games['Result'] == '1-0'].sample(10000)
black_win = games[games['Result'] == '0-1'].sample(10000)
sample = pd.concat([white_win, black_win])

In [119]:
games = df[(df['Result'] != '1/2-1/2') & ('%eval' not in df['AN'])]
white_win = games[games['Result'] == '1-0'].sample(10000)
black_win = games[games['Result'] == '0-1'].sample(10000)
sample = pd.concat([white_win, black_win])

In [120]:
sample.head()

Unnamed: 0,Result,WhiteElo,BlackElo,AN
1762118,1-0,2000,1734,1. e4 c6 2. d4 d5 3. e5 Bf5 4. Nf3 e6 5. c3 Ne...
900849,1-0,1878,1550,1. e4 Nc6 2. d4 e5 3. d5 Nb4 4. a3 Na6 5. Nc3 ...
4093516,1-0,1592,1810,1. g3 d5 2. Bg2 c5 3. d3 Nc6 4. Nf3 e5 5. O-O ...
3510156,1-0,1864,2163,1. e4 e6 2. d4 d5 3. e5 c5 4. Nf3 Nc6 5. a3 Qb...
565464,1-0,1824,1702,1. e4 e6 2. d4 d5 3. e5 c5 4. c3 Nc6 5. Nf3 Qb...


In [130]:
import chess.pgn
import io
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import math

# upper case for white, lower case for black

def train_values(sample, num_moves = 30):
    X = pd.DataFrame(columns= ['p', 'P', 'n', 'N', 'b', 'B', 'r', 'R', 'q', 'Q'])
    Y = []
    
    for row in sample.itertuples():
        
        pgn = io.StringIO(row.AN.split(f'{num_moves + 1}.')[0])
        game = chess.pgn.read_game(pgn)
        board = game.board()
        for move in game.mainline_moves():
            board.push(move)
        
        state = board.__str__().split()
        new_row = {'p': 0, 'P': 0, 'n': 0, 'N': 0, 'b': 0, 'B': 0, 'r': 0, 'R': 0, 'q': 0, 'Q': 0}
        if row.Result == '1-0':
            Y.append(1)
        else:
            Y.append(0)
    
        for square in state:
            if square not in {'k', 'K', '.'}:
                new_row[square] += 1
    
        X.loc[len(X)] = new_row

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    
    coefs = [math.e ** coef for coef in abs(model.coef_[0])]
    values = []
    for i in range(0, len(coefs), 2):
        values.append((coefs[i] + coefs[i+1])/2)
    values = [value / values[0] for value in values]
    return {'Pawn': values[0], 'Knight': values[1], 'Bishop': values[2], 'Rook': values[3], 'Queen': values[4]}

In [131]:
coefs = train_values(sample, 40)

Accuracy: 77.83%


In [132]:
print(coefs)

{'Pawn': 1.0, 'Knight': 1.6436299948586683, 'Bishop': 1.6887651789985794, 'Rook': 2.6595258511372615, 'Queen': 9.081474796965267}


In [30]:
initial_board = [
    4,  2,  3,  5,  6,  3,  2,  4,
    1,  1,  1,  1,  1,  1,  1,  1,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
   -1, -1, -1, -1, -1, -1, -1, -1,
   -4, -2, -3, -5, -6, -3, -2, -4,
]