In [1]:
#load packages
import sys
sys.path.append("/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/")
import boto3
import numpy as np
import scipy as sp
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse import vstack as vstack
import chess, chess.pgn
import time
import multiprocessing as mp
import os
import re
import pandas as pd
from datetime import datetime
import pickle
import gzip
import tensorflow as tf

In [2]:
# constants
BOARD_LENGTH = 768 #chess board is 8 x 8 and 12 different pieces

## Vector representation of chess board
# v = 1 x BOARD_LENGTH
#
# White = Upper Case, black = lower case
# Piece order: P, N, B, R, Q, K, p, n, b, r, q, k
# Board order:
#    Start at square a1. Move across the columns to square h1.
#    Then go up a row to square a2. Move across the columns to square h2.
#    Repeat until square h8
#    i.e. 0 - a1, 1 - b1, ..., 7 - h1, 8 - a2, ..., 63 - h8
#
# Board vector indices: 
# v[0,...,63] = P, v[64,...,127] = N, ..., v[704,...,767] = k
# v[0,...,7] = row 1; v[8,...,15] = row 2, ..., v[56,...,63] = row 8
# v[0] = col a, v[1] = col b, ..., v[7] = col h

PIECE_OFFSETS = {'P': 0, 'N': 64, 'B': 128, 'R': 192, 'Q': 256, 'K': 320,
                 'p': 384, 'n': 448, 'b': 512, 'r': 576, 'q': 640, 'k': 704}

RESULTS_DICT = {'1-0': 1,'1/2-1/2': 0,'0-1': -1}
RESULTS_LIST = [1, 0, -1]

# Neural Net

In this section, we build, train and test a feedforward neural network on a small sample games to determine an appropriate neural network architecture. We use tensorflow to train the neural network. Then we use AWS to scale up the neural network and train it using all of the data. In the parse data program, the cleaned data was split up into smaller files so that the files can be processed by different cores. We use distributed tensorflow to parallelize the training steps.

## Neural Net Built on Sample Data

We build a neural net on a sample of data in order to determine a resonable number of hidden nodes to use to
train the full model. We use a sample of data so that the training times are short enough that the model can be iterated on. AWS will then be used to train the model on the data from 2007 to 2016, roughly 10M games.

In [3]:
# load 2006 and 2007 data. train/test split. save using numpy/scipy functions
# stats about the data - Number of games
#   Shortest game, average game, longest game in terms of # of moves
#   difference in ratings, min, ave, max
#   num of white wins, draws and loses
#   white win % as a func of rating diff
# code to train nn
# bias variance curves and cost vs iters

### Train/Test Split

In this section, we load the data. Then we check that it is accurate. Then we split the data into a training (80%) dataset and a test (20%) dataset. Finally, we save the datasets.

In [4]:
# directory with data processed from pgn files
dataDir = '../data/clean'
dataFiles = os.listdir(dataDir)
dataFiles

['2007_stats.pickle',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_1_mat.pickle.gz',
 '.DS_Store',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_2_mat.pickle.gz',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_2_stats.pickle',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_3_stats.pickle',
 'xTest.npz',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_1_stats.pickle',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_0_stats.pickle',
 'yTest.npz',
 'yTrain.npz',
 'xTrain.npz',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_3_mat.pickle.gz',
 'ficsgamesdb_2006_chess2000_nomovetimes_1519260_0_mat.pickle.gz',
 '2007_mat.pickle.gz']

In [None]:
# load data into memory
boardList = []
resultList = []
statsList = []

for dataFile in dataFiles:
    if dataFile.endswith('.gz'):
        with gzip.open(os.path.join(dataDir,dataFile), 'rb') as f:
            pickledList = f.read()
            boardMat, resultMat = pickle.loads(pickledList) #matrix of board positions, matrix of game results
            boardList.append(boardMat)
            resultList.append(resultMat)    
    
    elif dataFile.endswith('.pickle'):
        stats = pd.read_pickle(os.path.join(dataDir,dataFile))
        statsList.append(stats)
        
#Data check that results vector and board matrix are in sync

s = 0
for i in range(len(boardList)):
    s = s + np.sum(boardList[i].shape[0] == resultList[i].shape[0])
print(len(boardList) - s)
#if not zero then there is a problem with the parsing of the data.
#need to look into parse data code and raw data

In [None]:
# combine list of matrices and into one matrix

boards = scipy.sparse.vstack(boardList)
results = np.concatenate(resultList)
stats = pd.concat(statsList)

del [boardList, resultList, statsList]

In [None]:
# memory usage in GiB
dataGiB = boards.nnz * boards.dtype.itemsize / (1024**3)
colGiB = boards.indices.shape[0] * boards.indices.dtype.itemsize / (1024**3)
rowGiB = boards.indptr.shape[0] * boards.indptr.dtype.itemsize / (1024**3)
print('Boards total GiB: ' + str(dataGiB + colGiB + rowGiB))

dataGiB = results.shape[0] * results.dtype.itemsize / (1024**3)
print('Results total GiB: ' + str(dataGiB))

print('Number of Board Positions: ' + str(boards.shape[0]))

In [None]:
# train test split
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(boards, results, train_size = 0.8)

del [boards, results]

In [5]:
# save train and test split
# scipy.sparse.save_npz(os.path.join(dataDir, 'xTrain.npz'), xTrain, compressed=True)
# scipy.sparse.save_npz(os.path.join(dataDir, 'xTest.npz'), xTest, compressed=True)
# np.savez_compressed(os.path.join(dataDir, 'yTrain.npz'), yTrain)
# np.savez_compressed(os.path.join(dataDir, 'yTest.npz'), yTest)
xTrain = scipy.sparse.load_npz(os.path.join(dataDir, 'xTrain.npz'))
yTrain = np.load(os.path.join(dataDir, 'yTrain.npz'))['arr_0']

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes = RESULTS_LIST)

In [7]:
yTrain

array([[ 1],
       [-1],
       [ 1],
       ..., 
       [-1],
       [ 1],
       [-1]], dtype=int8)

### Neural Net Training Code

In this section, we implement a feed forward neural network with one hidden layer. The hidden layer uses the rectified linear function, and the output layer uses the softmax function. Cross-entropy (multinomial likelihood) is used as the loss function

In [None]:
# learning parameters
MINIBATCHSIZE = 512
#MAXITER = 2000
#RELTOL = 0.00001
#MAXEPOCHS = 2
#logs_path = '../log'
#LAMBDA = 1 # strength of L2 regularization

# layer parameters
numInputNodes = BOARD_LENGTH
#numHiddenNodes = 100
#numOutputNodes = 3

In [37]:
class BoardFunction:
    
    def __init__(self, numInputNodes, numHiddenNodes, numOutputNodes):
        # layers
        self.numInputNodes = numInputNodes
        self.numHiddenNodes = numHiddenNodes
        self.numOutputNodes = numOutputNodes
        
        # weight matrices
        self.hiddenWeights = np.empty((self.numInputNodes, self.numHiddenNodes), dtype = np.float_)
        self.hiddenBiases = np.empty((1, self.numHiddenNodes), dtype = np.float_)
        self.outputWeights = np.empty((self.numHiddenNodes, self.numOutputNodes), dtype = np.float_)
        self.outputBiases = np.empty((1, self.numOutputNodes), dtype = np.float_)
        
    def initWeights(self):
        '''
        Randomly initializes the weight matrices
        '''
        
        self.hiddenWeights = np.random.normal(size = self.hiddenWeights.shape)
        self.hiddenBiases = np.random.normal(size = self.hiddenBiases.shape)
        self.outputWeights = np.random.normal(size = self.outputWeights.shape)
        self.outputBiases = np.random.normal(size = self.outputBiases.shape)
        
    def _relu(self, X):
        '''
        X - matrix
        
        returns element wise max of X and zero
        '''
        
        return(np.maximum(X,0))
    
    def _softmax(self, X):
        shiftX = X - np.amax(X, axis = 1, keepdims = True)
        exps = np.exp(shiftX)
        sums = np.sum(exps, axis = 1, keepdims = True)
        
        return(exps / sums)
    
    def predict(self, board):
        '''
        board - csr matrix: sparse row matrix of encoded board positions
    
        returns probs - numpy array: a matrix containing the probability of a win, draw or loss
        '''
        
        numBoards = board.shape[0]
        
        hiddenWeights = board.dot(self.hiddenWeights)
        hiddenBiases = np.outer(np.ones((numBoards, 1), dtype = np.float_), self.hiddenBiases)
        hiddenIn = hiddenWeights + hiddenBiases
        hiddenOut = self._relu(hiddenIn) #rectified linear element-wise max with zero
        
        outputWeights = hiddenOut.dot(self.outputWeights)
        outputBiases = np.outer(np.ones((numBoards, 1), dtype = np.float_), self.outputBiases)
        outputIn = outputWeights + outputBiases
        outputOut = self._softmax(outputIn)
        
        minProb = np.finfo(np.float64).tiny # avoid numerical issues with zero probs
        
        return(np.maximum(outputOut, minProb))
    
    def loss(self, board, result):
        '''
        board - csr matrix: sparse row matrix of encoded board positions
        result - 1d array: one hot enconding of result
        
        returns the cross entropy (multinomial log-likelihood) for the sample
        '''
        
        probs = self.predict(board)
        aveLogLikelihood = -np.sum(result * np.log(probs)) / board.shape[0]
        
        return(aveLogLikelihood)
    
    def calcGradients(self, board, result):
        '''
        board - csr matrix: sparse row matrix of encoded board positions
        result - 1d array: one hot enconding of result
        
        J = cross entropy loss function
        '''
        
        numBoards = board.shape[0]
        
        # feed forward
        hiddenWeights = board.dot(self.hiddenWeights)
        hiddenBiases = np.outer(np.ones((numBoards, 1), dtype = np.float_), self.hiddenBiases)
        hiddenIn = hiddenWeights + hiddenBiases
        hiddenOut = self._relu(hiddenIn) #rectified linear element-wise max with zero
        
        outputWeights = hiddenOut.dot(self.outputWeights)
        outputBiases = np.outer(np.ones((numBoards, 1), dtype = np.float_), self.outputBiases)
        outputIn = outputWeights + outputBiases
        outputOut = self._softmax(outputIn)
        
        # compute gradients
        d1 = outputOut - result
        d2 = d1.dot(self.outputWeights.transpose()) * np.sign(hiddenOut)
        
        # D J(outputWeights)
        DJoutW = hiddenOut.transpose().dot(d1) / numBoards
        
        # D J(outputBiases)
        DJoutB = np.sum(d1.dot(np.eye(result.shape[1])), axis = 0) / numBoards
        
        # D J(hiddenWeights)
        DJhidW = board.transpose().dot(d2) / numBoards
        
        # D J(hiddenBiases)
        DJhidB = np.sum(d2, axis = 0) / numBoards
        
        return(DJoutW, DJoutB, DJhidW, DJhidB)
        

In [22]:
m1, m2,m3,m4 = f.hiddenWeights, f.hiddenBiases, f.outputWeights, f.outputBiases

In [38]:
f = BoardFunction(BOARD_LENGTH, 3000, 3)

In [39]:
f.hiddenWeights, f.hiddenBiases, f.outputWeights, f.outputBiases = m1,m2,m3,m4

In [10]:
f.initWeights()

In [25]:
testVec = xTrain[0:10,:]

In [26]:
r = mlb.fit_transform(yTrain[0:10])

In [13]:
x = tf.placeholder(tf.float64, shape = [None, f.numInputNodes])
y = tf.placeholder(tf.float64, shape = [None, f.numOutputNodes])

# layer weights and biases
hiddenWeights = tf.Variable(f.hiddenWeights, dtype = tf.float64)
hiddenBiases = tf.Variable(f.hiddenBiases, dtype = tf.float64)
outputWeights = tf.Variable(f.outputWeights, dtype = tf.float64)
outputBiases = tf.Variable(f.outputBiases, dtype = tf.float64)

# computations
hidden = tf.nn.relu(tf.add(tf.matmul(x, hiddenWeights), hiddenBiases))
output = tf.add(tf.matmul(hidden, outputWeights), outputBiases)

# cost function
pred = tf.nn.softmax(logits = output)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y, logits = output))

# train
trainStep = tf.train.GradientDescentOptimizer(0.5).compute_gradients(cost)

In [14]:
with tf.Session() as s:
    s.run(tf.global_variables_initializer())
    b, c, d = s.run([pred, cost, trainStep], feed_dict = {x: testVec.toarray(), y: r})
    s.close()

In [40]:
f.loss(testVec,r)

82.357267404708651

In [41]:
f.predict(testVec)

array([[  1.00000000e+000,   5.49102659e-204,   5.13448403e-145],
       [  1.00000000e+000,   3.01993258e-265,   8.90155037e-082],
       [  1.00000000e+000,   1.07235943e-216,   4.32219641e-158],
       [  1.00000000e+000,   4.51320738e-091,   9.65908182e-070],
       [  9.99483101e-001,   2.69832121e-086,   5.16899005e-004],
       [  1.00000000e+000,   2.78646300e-151,   3.62220323e-024],
       [  1.00000000e+000,   1.27503933e-192,   3.91854584e-125],
       [  1.00000000e+000,   2.87236457e-062,   2.25777730e-071],
       [  4.51945766e-003,   1.11887908e-028,   9.95480542e-001],
       [  1.00000000e+000,   5.72030266e-144,   1.46704777e-070]])

In [42]:
DoutW, DoutB, DhidW, DhidB = f.calcGradients(testVec,r)

In [43]:
DhidB

array([-0.40409952, -0.16742241, -0.18327511, ...,  0.10448689,
        0.01214752, -0.67927708])

In [35]:
d[1][0] -

array([[-0.40409952, -0.16742241, -0.18327511, ...,  0.10448689,
         0.01214752, -0.67927708]])

In [44]:
print(np.sum(d[0][0] - DhidW))
print(np.sum(d[1][0] - DhidB))
print(np.sum(d[2][0] - DoutW))
print(np.sum(d[3][0] - DoutB))

4.57798709152e-14
-8.61834000975e-16
-5.98323253871e-14
-5.55111512313e-17


In [20]:
d[0][0]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        , -0.05580747,  0.        , ...,  0.        ,
         0.        , -0.11405194],
       [-0.06951448,  0.        ,  0.        , ...,  0.        ,
         0.13577456, -0.16827343],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
f.predict(testVec)

In [None]:
yTrain[0:2]

In [None]:
f.loss(testVec,r)

In [None]:
c

In [None]:
f.hiddenWeights

In [None]:
# load sample data
dataDir = '../sample/clean'
dataFiles = os.listdir(dataDir)

boardList = []
resultList = []
statsList = []
for dataFile in dataFiles:
    if dataFile.endswith('.gz'):
        with gzip.open(os.path.join(dataDir,dataFile), 'rb') as f:
            pickledList = f.read()
            boardMat, resultMat = pickle.loads(pickledList) #matrix of board positions, matrix of game results
            boardList.append(boardMat)
            resultList.append(resultMat)    
    
    elif dataFile.endswith('.pickle'):
        stats = pd.read_pickle(os.path.join(dataDir,dataFile))
        statsList.append(stats)
        
#Data check that results vector and board matrix are in sync

s = 0
for i in range(len(boardList)):
    s = s + np.sum(boardList[i].shape[0] == resultList[i].shape[0])
print(len(boardList) - s)
#if not zero then there is a problem with the parsing of the data.
#need to look into parse data code and raw data

In [None]:
boards = vstack(boardList)
results = np.concatenate(resultList)
stats = pd.concat(statsList)

In [None]:
print(boards.shape)
print(results.shape)
print(stats.Moves.sum())
print(stats.shape[0])
print(stats.Moves.sum() / stats.shape[0] / 2)

In [None]:
boards.getnnz() * 8 * 3 / 1024 / 1024

In [None]:
# train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes = [-1,0,1])

xTrain, xTest, yTrain, yTest = train_test_split(boards, results, train_size = 0.8)

In [None]:
# generator to feed batches to model
def nextBatch(totObs, batchSize):
    # initialize
    numBatches = totObs // batchSize
    tail = totObs % batchSize
    batch = 0
    
    while True:
        batch = (batch + 1) % numBatches
        if batch == 1:
            firstInd = 0
            lastInd = batchSize
        elif batch == 0:
            firstInd = lastInd
            lastInd = lastInd + batchSize + tail
        else:
            firstInd = lastInd
            lastInd = lastInd + batchSize
        
        yield firstInd, lastInd

In [None]:
# learning parameters
MINIBATCH_SIZE = 512
MAXITER = 2000
RELTOL = 0.00001
MAXEPOCHS = 2
logs_path = '../log'
#LAMBDA = 1 # strength of L2 regularization

# layer parameters
numInputNodes = BOARD_LENGTH
numHiddenNodes = 100
numOutputNodes = 3

# input and output placeholders
#x = tf.sparse_placeholder(tf.float64, shape = [None, numInputNodes])
x = tf.placeholder(tf.float64, shape = [None, numInputNodes])
y = tf.placeholder(tf.float64, shape = [None, numOutputNodes])

# layer weights and biases
hiddenWeights = tf.Variable(tf.random_normal([numInputNodes, numHiddenNodes], 0, 1, dtype = tf.float64))
hiddenBiases = tf.Variable(tf.random_normal([numHiddenNodes], 0, 1, dtype = tf.float64))
outputWeights = tf.Variable(tf.random_normal([numHiddenNodes, numOutputNodes], 0, 1, dtype = tf.float64))
outputBiases = tf.Variable(tf.random_normal([numOutputNodes], 0, 1, dtype = tf.float64))

# computations
# hidden = tf.nn.relu(tf.add(tf.sparse_tensor_dense_matmul(x, hiddenWeights), hiddenBiases))
hidden = tf.nn.relu(tf.add(tf.matmul(x, hiddenWeights), hiddenBiases))
output = tf.add(tf.matmul(hidden, outputWeights), outputBiases)

# cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y, logits = output))

# add L2 regularization
# costReg = cost + lambda_ * (tf.nn.l2_loss(hiddenWeights) + tf.nn.l2_loss(outputWeights)))

# optimization method to minimize cost function
trainStep = tf.train.AdamOptimizer().minimize(cost)


# summary statistics to save
tf.summary.scalar('Cost', cost)
#summary_op = tf.merge_all_summaries()

# train model
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batchFeeder = nextBatch(xTrain.shape[0], MINIBATCH_SIZE)
    numIter = 0
    currCost = 0
    prevCost = 0
    relImp = 1
    
    # create log writer object
    #writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())
       
    
    while numIter < MAXITER and relImp > RELTOL:
        # update number of iterations
        numIter = numIter + 1
        
        # get next batch
        firstInd, lastInd = next(batchFeeder)
        #xTrain_ = xTrain[firstInd:lastInd]
        xVal = xTrain[firstInd:lastInd].toarray()
        
        # convert sparse xTrain matrix to tensorflow sparse value
        #xTrain_ = xTrain_.tocoo()
        #xVal = tf.SparseTensorValue(
        #    indices = np.stack((xTrain_.row, xTrain_.col), axis = -1),
        #    values = xTrain_.data,
        #    dense_shape = [xTrain_.shape[0], xTrain_.shape[1]])
        #xVal = tf.sparse_reorder(xVal)
    
        # one-hot encoding for yTrain
        yVal = mlb.fit_transform(yTrain[firstInd:lastInd])
    
        prevCost = currCost
        _ , currCost = sess.run([trainStep, cost], feed_dict = {x: xVal, y: yVal})
        
        # write log
        # writer.add_summary(summary, epoch * batch_count + i)
        
        if numIter % 50 == 0:
            print('Iteration: {0} Cost: {1}'.format(numIter, currCost))
        
        if prevCost !=0:
            relImp = abs((prevCost - currCost) / prevCost)

In [None]:
prevCost

In [None]:
if __name__ == '__main__':

# Notes for Report

From Nitish Shirish Keskar, Dheevatsa Mudigere, Jorge Nocedal, Mikhail Smelyanskiy, Ping Tak Peter Tang. On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima. https://arxiv.org/abs/1609.04836 :

The stochastic gradient descent method and its variants are algorithms of choice for many Deep Learning tasks. These methods operate in a small-batch regime wherein a fraction of the training data, usually 32--512 data points, is sampled to compute an approximation to the gradient. It has been observed in practice that when using a larger batch there is a significant degradation in the quality of the model, as measured by its ability to generalize.

In [None]:
b = chess.Board(fen='8/5r1k/2R4P/3p1Pb1/2pP4/8/P4RBK/2r5 b - - 6 42')

In [None]:
b

In [None]:
np.show_config()