In [None]:
import os
import re
import random
import math
import pickle
import numpy as np
from collections import deque
from tqdm import tqdm

import tensorflow as tf
from tensorflow.train import Checkpoint
from tensorflow.keras import Model
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, LeakyReLU, ReLU, Flatten
from tensorflow.keras.layers import Conv2D, Dense, BatchNormalization, Add
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.regularizers import l2

def Round(x, n):
    """Format ``x`` as a fixed-point string with ``n`` decimal places.

    Args:
        x: Number to format (Python or NumPy scalar).
        n: Number of decimals; coerced with ``int``.

    Returns:
        The formatted string, e.g. ``Round(3.14159, 2) == '3.14'``.
    """
    # '%.*f' takes the precision as an argument, so no source text has to be
    # assembled and passed to eval() (which failed for nan/inf and for any
    # value whose repr is not a plain numeric literal).
    return '%.*f' % (int(n), x)

In [None]:
# ---------------------------------------------------------------------------
# Self-play
# ---------------------------------------------------------------------------
TRAINEPISODES = 75      # self-play games generated per outer training iteration
MCTSRUNS = 100          # MCTS simulations run before every move
MEMSIZE = 90000         # maximum number of positions kept in each memory deque
NUMTAUZERO = 10         # move number from which play becomes deterministic (tau = 0)
CPUCT = 1               # exploration constant passed to the MCTS
EPSILON = 0.0           # weight of the Dirichlet noise mixed into the root priors
ALPHA = 0.8             # concentration parameter of that Dirichlet noise

# ---------------------------------------------------------------------------
# Retraining
# ---------------------------------------------------------------------------
BATCHSIZE = 256         # positions sampled from long-term memory per training loop
EPOCHS = 1              # epochs per fit() call
REGCONST = 0.0001       # L2 regularisation factor for every kernel
LEARNINGRATE = 0.1      # SGD learning rate
MOMENTUM = 0.9          # SGD momentum
TRAININGLOOPS = 10      # fit() calls per training session

# ---------------------------------------------------------------------------
# Model architecture (edit and re-train to experiment)
# ---------------------------------------------------------------------------
HIDDENCNNLAYERS = [{'filters': 75, 'kernel_size': (4, 4)} for _ in range(6)]

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
EVALEPISODES = 30       # games played between the current and the best agent
SCORINGTHRESHOLD = 1.3  # current agent must win more than best wins * this factor

# ---------------------------------------------------------------------------
# Persistence: save cadence and paths
# ---------------------------------------------------------------------------
SAVETHRESHOLD = 1       # memory is pickled every SAVETHRESHOLD outer iterations
SAVEDIR = 'drive/My Drive/yurikov_models/'
MEMORYPATH = '{}memory.pkl'.format(SAVEDIR)

def sortedAlphaNumeric(data):
    """Return ``data`` sorted in natural order.

    Each string is split into digit and non-digit runs; digit runs compare as
    integers and the remaining runs compare case-insensitively, so
    ``'agentv2.h5'`` sorts before ``'agentv10.h5'``.
    """
    def natural_key(name):
        chunks = re.split('([0-9]+)', name)
        return [int(chunk) if chunk.isdigit() else chunk.lower() for chunk in chunks]

    return sorted(data, key=natural_key)

def getModelVersion():
    """Return the highest agent version saved in ``SAVEDIR``.

    Every file named ``agentv<N>.h5`` is considered, so the answer no longer
    depends on how many unrelated files (memory pickle, backups, ...) happen
    to sort after the checkpoints.

    Returns:
        The largest ``N`` found, or ``1`` when no versioned checkpoint exists
        (the same fallback the previous implementation used).
    """
    checkpoint = re.compile(r'^agentv(\d+)\.h5$')
    versions = []
    for filename in os.listdir(SAVEDIR):
        match = checkpoint.match(filename)
        if match:
            versions.append(int(match.group(1)))
    return max(versions, default=1)

def getModelPath(ver):
    """Build the checkpoint path of agent version ``ver`` inside ``SAVEDIR``."""
    return '{}agentv{}.h5'.format(SAVEDIR, ver)

# Decide which checkpoint this session resumes from.
# Defaults: version 0 and the path of the first checkpoint (agentv1.h5).
AGENTVER = 0
MODELPATH = SAVEDIR+'agentv1.h5'
if os.path.exists(SAVEDIR+'agentv1.h5'):
    # agentv1.h5 exists on disk: look up the saved version number and point
    # MODELPATH at that version's file instead.
    AGENTVER = getModelVersion()
    MODELPATH = getModelPath(AGENTVER)

In [None]:
class Memory(object):
    """Replay buffer with a short-term (current game) and long-term part.

    Both buffers are deques capped at ``MEMSIZE`` entries. Each entry is a
    dict with the keys ``'state'``, ``'pi'``, ``'id'``, ``'side'`` and, once
    the game has ended, ``'value'``.
    """

    def __init__(self):
        self.ltmemory = deque(maxlen=MEMSIZE)
        self.stmemory = deque(maxlen=MEMSIZE)

    def commitSTMemory(self, identities, state, pi):
        """Store ``(state, pi)`` and its equivalent positions in short-term memory.

        Args:
            identities: Callable returning an iterable of ``(state, pi)`` pairs
                for the given position.
            state: Position the move was chosen from.
            pi: Move distribution recorded for that position.
        """
        for identstate, identpi in identities(state, pi):
            self.stmemory.append(
                {
                    'state': identstate,
                    'pi': identpi,
                    'id': identstate.id,
                    'side': identstate.side
                },
            )

    def commitLTMemory(self):
        """Append every short-term entry to long-term memory."""
        self.ltmemory.extend(self.stmemory)

    def fillValuesLTMemory(self, state, value):
        """Label the finished game's positions with its outcome.

        Entries recorded for the side to move in ``state`` receive ``value``;
        entries recorded for the other side receive ``-value``.
        """
        # Operate on this instance's buffer. The previous code read the
        # notebook-global ``memory`` here, which labelled the wrong buffer
        # (or raised NameError) for any other Memory instance.
        for move in self.stmemory:
            if move['side'] == state.side:
                move['value'] = value
            else:
                move['value'] = -value

    def clearSTMemory(self):
        """Drop all short-term entries by starting a fresh deque."""
        self.stmemory = deque(maxlen=MEMSIZE)


In [None]:
# Display the checkpoint path resolved by the configuration cell.
MODELPATH

'drive/My Drive/yurikov_models/agentv1.h5'

In [None]:
class Game:
    """Connect-4 environment: owns the current State and the side to move.

    The board is a flat array of 42 cells laid out row by row (6 rows of 7),
    holding 1, -1 or 0.
    """

    def __init__(self):
        self.side = 1
        # Built-in int replaces the np.int alias, which NumPy 1.24 removed.
        self.state = State(np.zeros(42, dtype=int), 1)
        self.actionspace = np.zeros(42, dtype=int)
        self.pieces = {'1':'X', '0': '-', '-1':'O'}
        self.gridshape = (6,7)
        self.inputshape = (2,6,7)
        self.name = 'connect4'
        self.statesize = len(self.state.binary)
        self.actionsize = len(self.actionspace)

    def reset(self):
        """Start a new game on an empty board with side 1 to move; return the state."""
        self.state = State(np.zeros(42, dtype=int), 1)
        self.side = 1
        return self.state

    def step(self, action):
        """Play ``action`` (a cell index) for the side to move.

        Returns:
            ``(nextstate, value, done, info)`` where ``value`` and ``done``
            come from ``State.takeAction`` and ``info`` is always ``None``.
        """
        nextstate, value, done = self.state.takeAction(action)
        self.state = nextstate
        self.side = -self.side
        info = None
        return ((nextstate, value, done, info))

    def identities(self, state, pimcts):
        """Return the position and its left-right mirror image.

        Args:
            state: Position to augment.
            pimcts: Move distribution over the 42 cells for ``state``.

        Returns:
            ``[(state, pimcts), (mirrored_state, mirrored_pi)]``.
        """
        # Reversing every row of the 6x7 grid mirrors the board around its
        # vertical axis; flatten() returns a fresh 42-element array.
        mirroredboard = np.fliplr(np.reshape(state.board, self.gridshape)).flatten()
        mirroredpis = np.fliplr(np.reshape(pimcts, self.gridshape)).flatten()

        return [
            (state, pimcts),
            (State(mirroredboard, state.side), mirroredpis),
        ]


def _connectFourLines(rows=6, cols=7, length=4):
    """List every run of ``length`` aligned cells on a rows x cols grid (flat indices)."""
    lines = []
    # Directions as (row step, column step): right, down, down-left, down-right.
    for drow, dcol in ((0, 1), (1, 0), (1, -1), (1, 1)):
        for row in range(rows):
            for col in range(cols):
                lastrow = row + drow * (length - 1)
                lastcol = col + dcol * (length - 1)
                if 0 <= lastrow < rows and 0 <= lastcol < cols:
                    lines.append([(row + drow * k) * cols + (col + dcol * k)
                                  for k in range(length)])
    return lines


class State():
    """Immutable-style snapshot of a Connect-4 position.

    Args:
        board: NumPy array of 42 cells (6 rows of 7, row by row) holding
            1, -1 or 0.
        side: Side to move in this position (1 or -1).

    Attributes computed at construction: ``binary`` (network input planes),
    ``id`` (string key), ``allowed`` (playable cell indices), ``done``,
    ``value`` and ``score``.
    """

    # All 69 four-in-a-row index quadruples. Computed once and shared: a State
    # is created for every simulated move, so rebuilding the table per
    # instance was pure overhead.
    WINNERS = _connectFourLines()

    def __init__(self, board, side):
        self.board = board
        self.pieces = {'1':'X', '0': '-', '-1':'O'}
        self.winners = self.WINNERS
        self.side = side
        self.binary = self._binary()
        self.id = self._convertStateToId()
        self.allowed = self._allowedActions()
        self.done = self._checkForEndGame()
        self.value = self._getValue()
        self.score = self._getScore()

    def _allowedActions(self):
        """Return the empty cells that are on the last row or rest on a filled cell."""
        cells = len(self.board)
        allowed = []
        for i in range(cells):
            if self.board[i] != 0:
                continue
            # The last seven indices form the row pieces land on first; any
            # other cell is playable only when the cell at i + 7 is occupied.
            if i >= cells - 7 or self.board[i+7] != 0:
                allowed.append(i)

        return allowed

    def _binary(self):
        """Return an 84-element 0/1 array: mover's pieces, then opponent's pieces."""
        # Built-in int replaces the np.int alias, which NumPy 1.24 removed.
        currentposition = np.zeros(len(self.board), dtype=int)
        currentposition[self.board==self.side] = 1

        opponentposition = np.zeros(len(self.board), dtype=int)
        opponentposition[self.board==-self.side] = 1

        return np.append(currentposition, opponentposition)

    def _convertStateToId(self):
        """Return a string key: 0/1 digits for side 1's cells, then side -1's cells."""
        wposition = np.zeros(len(self.board), dtype=int)
        wposition[self.board==1] = 1

        bposition = np.zeros(len(self.board), dtype=int)
        bposition[self.board==-1] = 1

        return ''.join(map(str, np.append(wposition, bposition)))

    def _previousPlayerWon(self):
        """True when the side that just moved (``-self.side``) owns a full line."""
        target = 4 * -self.side
        for x, y, z, a in self.winners:
            if self.board[x] + self.board[y] + self.board[z] + self.board[a] == target:
                return True
        return False

    def _checkForEndGame(self):
        """Return 1 when the board is full or the previous mover has won, else 0."""
        if np.count_nonzero(self.board) == 42:
            return 1
        return 1 if self._previousPlayerWon() else 0

    def _getValue(self):
        """Return the value triple from the point of view of the side to move.

        If the previous player completed a line the side to move has lost:
        ``(-1, -1, 1)``. Otherwise ``(0, 0, 0)``.
        """
        if self._previousPlayerWon():
            return (-1, -1, 1)
        return (0, 0, 0)

    def _getScore(self):
        """Return the last two components of ``self.value``."""
        tmp = self.value
        return (tmp[1], tmp[2])

    def takeAction(self, action):
        """Place the mover's piece on cell ``action`` and return the result.

        Returns:
            ``(state_, value, done)``: the new State (other side to move),
            its value for that side if the game ended (else 0), and 1/0.
        """
        board_ = np.array(self.board)  # copy so this state stays untouched
        board_[action]=self.side

        state_ = State(board_, -self.side)

        value = 0
        done = 0

        if state_.done:
            value = state_.value[0]
            done = 1

        return (state_, value, done)

    def render(self):
        """Print the board row by row using X / O / - symbols."""
        for r in range(6):
            print([self.pieces[str(x)] for x in self.board[7*r : (7*r + 7)]])
        print('--------------')

    def printavs(self, pi):
        """Print the 42 values of ``pi`` as a 6x7 grid ('----' for exact zeros)."""
        printpi = []

        for i in range(42):
            if pi[i] == 0:
                printpi.append('----')
            else:
                printpi.append(Round(pi[i], 2))

        for r in range(6):
            print([x for x in printpi[7*r : (7*r + 7)]])
        print('--------------')

In [None]:
def SoftmaxCrossEntropyWithLogits(y_true, y_pred):
    """Softmax cross-entropy between MCTS targets and masked policy logits.

    Wherever the target ``y_true`` is exactly zero, the predicted logit is
    replaced by -100 before the loss is computed.

    Args:
        y_true: Target move distribution (pi from the MCTS).
        y_pred: Raw policy logits from the network.

    Returns:
        The per-sample loss tensor.
    """
    targetshape = tf.shape(y_true)
    unvisited = tf.equal(y_true, tf.zeros(shape=targetshape, dtype=tf.float32))
    maskedlogits = tf.where(unvisited, tf.fill(targetshape, -100.0), y_pred)

    return tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=maskedlogits)

In [None]:
class ACResNet(object):
    """Residual CNN with a value head and a policy head.

    Args:
        inputdims: Shape of one network input (channels first).
        outputdims: Number of policy logits.
        hiddenlayers: List of dicts with ``'filters'`` and ``'kernel_size'``.
        lr: SGD learning rate.
        regconst: L2 regularisation factor applied to every kernel.
        momentum: SGD momentum.
    """

    def __init__(self, inputdims, outputdims, hiddenlayers, lr=LEARNINGRATE, regconst=REGCONST, momentum=MOMENTUM):
        self.lr = lr
        self.regconst = regconst
        self.momentum = momentum
        self.inputdims = inputdims
        self.outputdims = outputdims
        self.hiddenlayers = hiddenlayers
        self.cpktcall = None
        self.model = self.buildModel()

    def residualLayer(self, inputblock, filters, kernelsize):
        """conv block -> conv -> batch norm -> add skip connection -> LeakyReLU."""
        outblock = self.convLayer(inputblock, filters, kernelsize)
        outblock = Conv2D(
            filters = filters,
            kernel_size = kernelsize,
            data_format = 'channels_first',
            padding = 'same',
            use_bias = False,
            activation = 'linear',
            kernel_regularizer = l2(self.regconst)
        )(outblock)

        outblock = BatchNormalization(axis=1)(outblock)
        outblock = Add()([inputblock, outblock])
        outblock = LeakyReLU()(outblock)

        return outblock

    def convLayer(self, inputblock, filters, kernelsize):
        """conv -> batch norm (channel axis 1) -> LeakyReLU."""
        outblock = Conv2D(
            filters = filters,
            kernel_size = kernelsize,
            data_format = 'channels_first',
            padding = 'same',
            use_bias = False,
            activation = 'linear',
            kernel_regularizer = l2(self.regconst)
        )(inputblock)

        outblock = BatchNormalization(axis=1)(outblock)
        outblock = LeakyReLU()(outblock)

        return outblock

    def valueHead(self, outblock):
        """1x1 conv -> dense(20) -> dense(1, tanh); output layer is named 'value_head'."""
        outblock = Conv2D(
            filters = 1,
            kernel_size = (1,1),
            data_format = 'channels_first',
            padding = 'same',
            use_bias = False,
            activation = 'linear',
            kernel_regularizer = l2(self.regconst),
        )(outblock)

        outblock = BatchNormalization(axis=1)(outblock)
        outblock = LeakyReLU()(outblock)
        outblock = Flatten()(outblock)

        outblock = Dense(
            20,
            use_bias = False,
            activation = 'linear',
            kernel_regularizer = l2(self.regconst)
        )(outblock)

        outblock = LeakyReLU()(outblock)

        outblock = Dense(
            1,
            use_bias = False,
            activation = 'tanh',
            kernel_regularizer = l2(self.regconst),
            name = 'value_head',
        )(outblock)

        return outblock

    def policyHead(self, outblock):
        """1x1 conv (2 filters) -> dense(outputdims) raw logits; layer named 'policy_head'."""
        outblock = Conv2D(
            filters = 2,
            kernel_size = (1,1),
            data_format = 'channels_first',
            padding = 'same',
            use_bias = False,
            activation = 'linear',
            kernel_regularizer = l2(self.regconst),
        )(outblock)

        outblock = BatchNormalization(axis=1)(outblock)
        outblock = LeakyReLU()(outblock)

        outblock = Flatten()(outblock)

        outblock = Dense(
            self.outputdims,
            use_bias = False,
            activation = 'linear',
            kernel_regularizer = l2(self.regconst),
            name = 'policy_head',
        )(outblock)

        return outblock

    def buildModel(self):
        """Assemble and compile the two-headed model.

        Returns:
            A compiled Keras ``Model`` with outputs ``[value_head, policy_head]``.
        """
        state = Input(shape=self.inputdims)
        block = self.convLayer(state,
                               self.hiddenlayers[0]['filters'],
                               self.hiddenlayers[0]['kernel_size'])

        # NOTE(review): this loop visits every entry, including index 0 that
        # already configured the first conv block, so N entries yield one conv
        # block plus N residual blocks. Left as is so that previously saved
        # weights keep matching the architecture.
        if len(self.hiddenlayers) > 1:
            for hidden in self.hiddenlayers:
                block = self.residualLayer(block,
                                           hidden['filters'],
                                           hidden['kernel_size'])

        vh = self.valueHead(block)
        ph = self.policyHead(block)

        model = Model(inputs=[state], outputs=[vh, ph])
        model.compile(
            loss={'value_head': 'mean_squared_error',
                  'policy_head': SoftmaxCrossEntropyWithLogits},
            # `learning_rate` is the supported keyword; the old `lr` alias is
            # deprecated and rejected by newer Keras optimizers.
            optimizer=SGD(learning_rate=self.lr, momentum=self.momentum),
            loss_weights={'value_head': 0.5, 'policy_head': 0.5}
        )

        return model

    def predict(self, state):
        """Run the model on a batch of inputs and return its raw predictions."""
        preds = self.model.predict(state)
        return preds

    def convertToModelInput(self, state):
        """Reshape ``state.binary`` to ``self.inputdims``."""
        inputmodel = state.binary
        inputmodel = np.reshape(inputmodel, self.inputdims)
        return inputmodel


In [None]:
class Node:
    """Search-tree node wrapping one game state and its outgoing edges."""

    def __init__(self, state):
        self.state = state
        self.id = state.id
        self.side = state.side
        self.edges = []

    def isLeaf(self):
        """Return True while the node has no outgoing edges."""
        return len(self.edges) == 0

class Edge:
    """Directed link between two nodes carrying the MCTS statistics of a move.

    ``P`` is the prior, ``N`` the visit count, ``W`` the accumulated value and
    ``Q`` the mean value; the last three start at zero.
    """

    def __init__(self, inpnode, outnode, prior, action):
        self.inpnode = inpnode
        self.outnode = outnode
        self.id = inpnode.id + '|' + outnode.id
        self.side = inpnode.side  # side that plays this move
        self.action = action
        self.P = prior
        self.W = self.N = self.Q = 0

class MCTS:
    """Monte-Carlo tree search over Node/Edge objects, keyed by state id."""

    def __init__(self, root, cpuct):
        self.root = root
        self.cpuct = cpuct
        # Maps state id -> Node so positions reached by different move orders
        # share a single node.
        self.tree = {}
        self.appendNode(self.root)

    def __len__(self):
        """Number of nodes currently stored in the tree."""
        return len(self.tree)

    def appendNode(self, node):
        """Register ``node`` in the tree under its state's id."""
        self.tree[node.state.id] = node

    def selectNode(self):
        """Walk from the root to a leaf, always following the edge with max Q + U.

        Returns:
            ``(leaf, value, done, breadcrums)``: the leaf node reached, the
            value and done flag returned by the last ``takeAction`` on the
            path (both 0 when the root itself is a leaf), and the list of
            edges traversed.
        """
        currentNode = self.root
        breadcrums = []
        value = 0
        done = 0

        while not currentNode.isLeaf():
            maxQU = -99999
            # Total visit count over the node's outgoing edges.
            Nb = np.sum([edge.N for edge in currentNode.edges])

            if currentNode == self.root:
                # Dirichlet noise is mixed into the priors at the root only,
                # weighted by EPSILON.
                epsilon = EPSILON
                nu = np.random.dirichlet([ALPHA] * len(currentNode.edges))
            else:
                epsilon = 0
                nu = [0] * len(currentNode.edges)

            for i, edge in enumerate(currentNode.edges):
                Q = edge.Q
                # NOTE(review): the square root covers the whole ratio
                # Nb / (1 + N); the PUCT term in the AlphaGo Zero paper is
                # sqrt(Nb) / (1 + N). Confirm this variant is intentional.
                U = self.cpuct * ((1 - epsilon) * edge.P + epsilon * nu[i]) * np.sqrt(Nb/(1+edge.N))

                # Strict '>' keeps the first edge among equal scores.
                if Q + U > maxQU:
                    maxQU = Q + U
                    simEdge = edge

            # Only value/done of the simulated move are used; the walk
            # continues from the node already stored on the edge.
            state_, value, done = currentNode.state.takeAction(simEdge.action)
            currentNode = simEdge.outnode
            breadcrums.append(simEdge)

        return (currentNode, value, done, breadcrums)

    def evalNode(self, leaf, done, valterm, valcnn, picnn):
        """Expand a non-terminal leaf and return the value to back up.

        Args:
            leaf: Node reached by ``selectNode``.
            done: Truthy when the leaf is terminal.
            valterm: Value to use when the leaf is terminal.
            valcnn: Value to use otherwise (the network's estimate).
            picnn: Prior probabilities indexed by action.

        Returns:
            ``valcnn`` for a non-terminal leaf (after adding one edge per
            allowed action), else ``valterm``.
        """
        if not done:
            value = valcnn

            for action in leaf.state.allowed:
                state_, _, _ = leaf.state.takeAction(action)
                # Reuse the node if this position is already in the tree.
                if state_.id in self.tree:
                    node_ = self.tree[state_.id]
                else:
                    node_ = Node(state_)
                    self.appendNode(node_)
                edge_ = Edge(leaf, node_, picnn[action], action)
                leaf.edges.append(edge_)

        else:
            value = valterm

        return value

    def backpropNode(self, leaf, value, breadcrums):
        """Update W, N and Q of every edge on the path with the leaf's value.

        ``value`` is added as is on edges played by the side to move at the
        leaf and negated on edges played by the other side.
        """
        for edge in reversed(breadcrums):
            if edge.side == leaf.state.side:
                switch = 1
            else:
                switch = -1

            edge.W = edge.W + (switch * value)
            edge.N = edge.N + 1
            edge.Q = edge.W / edge.N


In [None]:
class Agent:
    """Player that picks moves with MCTS guided by a value/policy network.

    Args:
        name: Label used in log output.
        resnet: Network wrapper exposing ``predict``, ``convertToModelInput``
            and ``model``.
        actionsize: Number of board cells / policy entries.
        cpuct: Exploration constant handed to the MCTS.
    """

    def __init__(self, name, resnet, actionsize, cpuct):
        self.name = name
        self.resnet = resnet
        self.actionsize = actionsize
        self.mcts = None
        self.cpuct = cpuct
        self.mctsruns = MCTSRUNS
        self.policyloss, self.valueloss, self.fullloss  = [], [], []

    def buildMCTS(self, state):
        """Start a fresh search tree rooted at ``state``."""
        root = Node(state)
        self.mcts = MCTS(root, self.cpuct)

    def changeRootMCTS(self, state):
        """Re-root the existing tree at the node already stored for ``state``."""
        self.mcts.root = self.mcts.tree[state.id]

    def getPreds(self, state):
        """Evaluate ``state`` with the network.

        Returns:
            ``(val, pi)``: the scalar value estimate and the softmax of the
            policy logits after every disallowed cell was forced to -100.
        """
        stateinput = np.array([self.resnet.convertToModelInput(state)])
        preds = self.resnet.predict(stateinput)

        val = preds[0][0][0]
        logits = np.array(preds[1][0])  # copy: the prediction stays untouched

        # Boolean mask instead of an index array, so an empty or complete
        # `allowed` list needs no special casing.
        disallowed = np.ones(logits.shape[0], dtype=bool)
        disallowed[np.asarray(state.allowed, dtype=int)] = False
        logits[disallowed] = -100

        # Subtracting the maximum leaves the softmax unchanged but keeps
        # exp() from overflowing to inf (and pi from turning into NaN).
        exps = np.exp(logits - np.max(logits))
        pi = exps / np.sum(exps)

        return (val, pi)

    def getTargetPi(self, tau):
        """Turn the root's visit counts into a move distribution.

        Args:
            tau: Temperature; each count is raised to ``1/tau``.

        Returns:
            ``(pi, values)``: normalised ``N ** (1/tau)`` per action and the
            Q value of each root edge (zero for actions without an edge).
        """
        edges = self.mcts.root.edges
        # Float storage: the previous abstract np.integer dtype is not a valid
        # concrete dtype and truncated N ** (1/tau) whenever tau != 1.
        pi = np.zeros((self.actionsize), dtype=np.float64)
        values = np.zeros((self.actionsize), dtype=np.float32)

        for edge in edges:
            pi[edge.action] = pow(edge.N, 1/tau)
            values[edge.action] = edge.Q

        total = np.sum(pi)
        if total > 0:
            pi = pi / total
        elif edges:
            # No edge has been visited yet: fall back to a uniform choice over
            # the root's moves instead of dividing by zero.
            pi[[edge.action for edge in edges]] = 1.0 / len(edges)

        return (pi, values)

    def simMCTS(self):
        """Run one simulation: select, predict, expand/evaluate, back up."""
        leaf, valterm, done, breadcrums = self.mcts.selectNode() ##selection
        valcnn, picnn = self.getPreds(leaf.state) ##prediction
        finalvalue = self.mcts.evalNode(leaf, done, valterm, valcnn, picnn) ##eval
        self.mcts.backpropNode(leaf, finalvalue, breadcrums) ##backprop

    def chooseAction(self, pi, values, tau):
        """Pick a move from ``pi``.

        With ``tau == 0`` a move with maximal probability is chosen (ties
        broken at random); otherwise the move is sampled from ``pi``.

        Returns:
            ``(action, value)`` where ``value`` is ``values[action]``.
        """
        if tau == 0:
            actions = np.argwhere(pi == max(pi))
            action = random.choice(actions)[0]
        else:
            idx = np.random.multinomial(1, pi)
            action = np.where(idx == 1)[0][0]

        value = values[action]

        return (action, value)

    def runMCTS(self, state):
        """Search from ``state`` for ``self.mctsruns`` simulations.

        The existing tree is reused when it already contains ``state``;
        otherwise a new tree is built.
        """
        if self.mcts is None or state.id not in self.mcts.tree:
            self.buildMCTS(state)
        else:
            self.changeRootMCTS(state)

        for _ in range(self.mctsruns):
            self.simMCTS()

    def train(self, ltmemory):
        """Fit the network on TRAININGLOOPS random minibatches of ``ltmemory``.

        Each memory row must provide ``'state'``, ``'pi'`` and ``'value'``.
        The last-epoch losses of every fit are appended to ``fullloss``,
        ``valueloss`` and ``policyloss``.
        """
        for _ in range(TRAININGLOOPS):
            minibatch = random.sample(ltmemory, min(BATCHSIZE, len(ltmemory)))

            states = np.array([self.resnet.convertToModelInput(row['state']) for row in minibatch])
            targetspi = np.array([row['pi'] for row in minibatch])
            targetsval = np.array([row['value'] for row in minibatch])

            fit = self.resnet.model.fit(states, {'value_head': targetsval, 'policy_head': targetspi},
                                        batch_size=32, epochs=EPOCHS, verbose=1, validation_split=0)

            # Fill the history lists declared in __init__; the fit result was
            # previously discarded and the lists stayed empty.
            for store, key in ((self.fullloss, 'loss'),
                               (self.valueloss, 'value_head_loss'),
                               (self.policyloss, 'policy_head_loss')):
                if key in fit.history:
                    store.append(fit.history[key][-1])


In [None]:
class Player:
    """Seat in a match: an agent, a display name, the side played and a tally."""

    def __init__(self, agent, name, side):
        self.agent, self.name, self.side = agent, name, side
        self.wins = self.draws = 0

def playGamesAgentvAgent(best1, best2, memory, env, episodes, numtauzero, hiddenlayers):
    """Play ``episodes`` games between two agents and record every position.

    Args:
        best1, best2: Agents to pit against each other (may be the same object).
        memory: Memory instance that receives the positions of each game.
        env: Game environment; reset at the start of every episode.
        episodes: Number of games to play.
        numtauzero: Number of opening moves sampled with tau=1; later moves
            are chosen with tau=0.
        hiddenlayers: Not used inside this function.

    Returns:
        ``(player1.wins, player2.wins)`` where player1 wraps ``best1``.
    """
    # Randomly decide which agent starts as side 1.
    sides = np.array([1, -1])
    np.random.shuffle(sides)

    # Discard any search tree left over from earlier calls.
    best1.mcts = None
    best2.mcts = None

    player1 = Player(best1, best1.name+'_1', sides[0])
    player2 = Player(best2, best2.name+'_2', sides[1])

    for e in range(episodes):
        state = env.reset()
        done = 0
        numt = 0
        #state.render()

        # Side 1 always moves first.
        player = player1 if player1.side == 1 else player2

        print('EPISODE: %s' % e)

        while not done:
            player.agent.runMCTS(state)
            # The stored target distribution always uses tau=1.
            pimcts, valsmcts = player.agent.getTargetPi(1)

            if numt < numtauzero:
                action, mctsval = player.agent.chooseAction(pimcts, valsmcts, tau=1)
            else:
                action, mctsval = player.agent.chooseAction(pimcts, valsmcts, tau=0)

            numt += 1

            #state.printavs(pimcts)
            #print('Action: '+str(action))

            memory.commitSTMemory(env.identities, state, pimcts)
            state, valenv, done, info = env.step(action)

            # Extra network evaluation of the new position, used only for the
            # log line below.
            cnnval, _ = player.agent.getPreds(state)

            print('MCTS Value: '+str(mctsval))
            print('CNN Value: '+str(-1 * cnnval))

            if done:
                # Label the game's positions with the outcome, then move them
                # to long-term memory.
                memory.fillValuesLTMemory(state, valenv)
                memory.commitLTMemory()
                memory.clearSTMemory()

            #state.render()

            # Hand the turn to the other player.
            player = player1 if player.side == player2.side else player2

        if valenv != 0:
            # env.side is the side to move in the final position, i.e. the
            # side that did not play the last move; the other player wins.
            if player1.side == env.side:
                player2.wins += 1
            elif player2.side == env.side:
                player1.wins += 1

            print(player1.name+' wins: '+str(player1.wins))
            print(player2.name+' wins: '+str(player2.wins))
        else:
            player1.draws += 1
            player2.draws += 1
            print(player1.name+' draws: '+str(player1.draws))
            print(player2.name+' draws: '+str(player2.draws))

        # Swap sides so the agents alternate who moves first.
        player1.side, player2.side = player2.side, player1.side

    return (player1.wins, player2.wins)

def _promptForMove(allowed):
    """Ask on stdin until the user enters one of the ``allowed`` cell indices."""
    while True:
        raw = input('Play here: ')
        try:
            move = int(raw)
        except ValueError:
            print('Not a number: %r' % raw)
            continue
        if move in allowed:
            return move
        print('Illegal move, choose one of %s' % allowed)


def playGamesRandomvAgent(agent, memory, env, cpuct, numtauzero, hiddenlayers, episodes=20):
    """Play interactive games between ``agent`` and a move typed on stdin.

    Despite the name, the opponent's move is read with ``input()``; the
    random choice it replaced was only present as a commented-out expression.

    Args:
        agent: Agent playing via MCTS (always picks greedily, tau=0).
        memory, cpuct, numtauzero, hiddenlayers: Accepted for call
            compatibility; not used inside this function.
        env: Game environment; reset at the start of every episode.
        episodes: Number of games to play (previously hard-coded to 20).

    Returns:
        ``(player1.wins, player2.wins)`` where player1 is the agent.
    """
    sides = np.array([1, -1])
    np.random.shuffle(sides)

    player1 = Player(agent, agent.name+'_1', sides[0])
    player2 = Player(None, 'random_agent_2', sides[1])

    for e in range(episodes):
        state = env.reset()
        done = 0

        state.render()
        print('EPISODE: %s' % e)

        # Side 1 always moves first.
        player = player1 if player1.side == 1 else player2

        while not done:
            if player1.side == player.side:
                player.agent.runMCTS(state)
                pimcts, valsmcts = player.agent.getTargetPi(1)
                action, value = player1.agent.chooseAction(pimcts, valsmcts, tau=0)

                state.printavs(pimcts)
            else:
                actions = state.allowed
                print(state.allowed)
                # Only accept a currently legal cell: an unchecked index could
                # place a floating piece, overwrite a cell or raise IndexError.
                action = _promptForMove(actions)

            print('Action: %s' % action)

            state, val, done, _ = env.step(action)
            player = player1 if player.side == player2.side else player2

            state.render()

        if val != 0:
            # env.side is the side to move in the final position, i.e. the
            # side that did not play the last move; the other player wins.
            if player1.side == env.side:
                player2.wins += 1
            elif player2.side == env.side:
                player1.wins += 1

            print(player1.name+' wins: '+str(player1.wins))
            print(player2.name+' wins: '+str(player2.wins))
        else:
            player1.draws += 1
            player2.draws += 1
            print(player1.name+' draws: '+str(player1.draws))
            print(player2.name+' draws: '+str(player2.draws))

        # Swap sides so the first move alternates between games.
        player1.side, player2.side = player2.side, player1.side

    return (player1.wins, player2.wins)

In [None]:
# Create the game environment and an empty replay memory for this session.
env = Game()
memory = Memory()

In [None]:
# Display the checkpoint path that the next cell will try to load.
MODELPATH

'drive/My Drive/yurikov_models/agentv1.h5'

In [None]:
# Build the two agents: `bestAgent` generates self-play data, `currentAgent`
# is the one being trained and evaluated against it.
currentAgent = Agent('current_agent', ACResNet(env.inputshape, env.actionsize, HIDDENCNNLAYERS), env.actionsize, CPUCT)
bestAgent = Agent('best_agent', ACResNet(env.inputshape, env.actionsize, HIDDENCNNLAYERS), env.actionsize, CPUCT)

# Resume from the versioned checkpoint if present, else from the manual backup.
if os.path.exists(MODELPATH):
    print('loading model from %s...' % MODELPATH)
    bestAgent.resnet.model = load_model(MODELPATH, custom_objects={'SoftmaxCrossEntropyWithLogits':SoftmaxCrossEntropyWithLogits})
elif os.path.exists(SAVEDIR+'backup.h5'):
    # Parentheses matter: '%' binds tighter than '+', so the unparenthesised
    # form printed '<SAVEDIR>...backup.h5'.
    print('loading model from %s...' % (SAVEDIR+'backup.h5'))
    bestAgent.resnet.model = load_model(SAVEDIR+'backup.h5', custom_objects={'SoftmaxCrossEntropyWithLogits':SoftmaxCrossEntropyWithLogits})

if os.path.exists(MEMORYPATH):
    print('loading memories from %s...' % MEMORYPATH)
    # SECURITY: unpickling executes arbitrary code; only load a memory file
    # that this notebook wrote itself.
    with open(MEMORYPATH, 'rb') as memoryfile:
        memory = pickle.load(memoryfile)

# Both agents start from identical weights.
currentAgent.resnet.model.set_weights(bestAgent.resnet.model.get_weights())

loading model from drive/My Drive/yurikov_models/...backup.h5


In [None]:
# Main training loop; runs until the kernel is interrupted.
s = 0

while True:
    # 1. Generate self-play games with the best agent.
    _, _ = playGamesAgentvAgent(bestAgent, bestAgent, memory, env, TRAINEPISODES, NUMTAUZERO, HIDDENCNNLAYERS)

    # 2. Train and evaluate only once long-term memory is full.
    if len(memory.ltmemory) >= MEMSIZE:
        currentAgent.train(memory.ltmemory)
        currentwins, bestwins = playGamesAgentvAgent(currentAgent, bestAgent, memory, env, EVALEPISODES, NUMTAUZERO, HIDDENCNNLAYERS)

        # 3. Promote the trained network when it beats the best agent by the
        #    required margin, and save it as a new version.
        if currentwins > (bestwins * SCORINGTHRESHOLD):
            print('replacing bestAgent weights with currentAgent weights...')
            bestAgent.resnet.model.set_weights(currentAgent.resnet.model.get_weights())

            AGENTVER += 1
            MODELPATH = getModelPath(AGENTVER)

            print('saving model to %s...' % SAVEDIR)
            bestAgent.resnet.model.save(MODELPATH)
    else:
        print('memory size: %d' % len(memory.ltmemory))

    s += 1

    if s % SAVETHRESHOLD == 0:
        print('saving memories to %s' % SAVEDIR)
        # Write to a temporary file and swap it in afterwards: the file handle
        # is closed deterministically and an interrupted run cannot leave a
        # truncated memory.pkl behind.
        tmppath = MEMORYPATH + '.tmp'
        with open(tmppath, 'wb') as memoryfile:
            pickle.dump(memory, memoryfile)
        os.replace(tmppath, MEMORYPATH)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
CNN Value: 0.06526549905538559
MCTS Value: 0.8700585
CNN Value: 0.04920274391770363
MCTS Value: -0.914261
CNN Value: 0.08106132596731186
MCTS Value: 1.0
CNN Value: 0.03273453563451767
best_agent_1 wins: 17
best_agent_2 wins: 13
EPISODE: 32
MCTS Value: -0.0039229314
CNN Value: 0.010462791658937931
MCTS Value: 0.0031863658
CNN Value: 0.01927163265645504
MCTS Value: 0.006598748
CNN Value: 0.009808325208723545
MCTS Value: 0.00852629
CNN Value: 0.04635563865303993
MCTS Value: 0.020632623
CNN Value: 0.06309391558170319
MCTS Value: 0.006033786
CNN Value: 0.04060554876923561
MCTS Value: 0.008032954
CNN Value: 0.0284112561494112
MCTS Value: 0.012903788
CNN Value: 0.03133230283856392
MCTS Value: -0.0024577617
CNN Value: 0.010335913859307766
MCTS Value: 0.024720212
CNN Value: 0.03725111111998558
MCTS Value: -0.024717124
CNN Value: -0.00098980322945863
MCTS Value: 0.03147
CNN Value: 0.04731098935008049
MCTS Value: -0.024819516
CNN Va

In [None]:
# Play interactive games against the best agent (opponent moves are typed in).
playGamesRandomvAgent(bestAgent, memory, env,  CPUCT, NUMTAUZERO, HIDDENCNNLAYERS)

In [None]:
# Inspect the 8th most recent long-term memory entry: board, then move
# distribution.
memory.ltmemory[-8]['state'].render()
# NOTE(review): this tuple is not the last expression of the cell, so its
# value is never displayed.
memory.ltmemory[-8]['side'], memory.ltmemory[-8]['value']
memory.ltmemory[-8]['state'].printavs(memory.ltmemory[-8]['pi'])

In [None]:
# Probe a freshly initialised (untrained) network on a stored position.
# It is bound to its own name: rebinding `bestAgent` here replaced the
# trained agent, so a later model save wrote random weights.
probeAgent = Agent('best_agent', ACResNet(env.inputshape, env.actionsize, HIDDENCNNLAYERS), env.actionsize, CPUCT)

probeAgent.runMCTS(memory.ltmemory[-8]['state'])
pi, val = probeAgent.getTargetPi(1)
tup = probeAgent.chooseAction(pi, val, tau=0)

memory.ltmemory[-8]['state'].printavs(pi)
tup

In [None]:
# Manually save the best agent's network as the fallback checkpoint.
bestAgent.resnet.model.save(SAVEDIR+'backup.h5')

In [None]:
# Number of positions currently held in long-term memory.
len(memory.ltmemory)

In [None]:
# Scratch: start over with a fresh environment and its initial state.
env = Game()
state = env.reset()

In [None]:
# NOTE(review): `state_` is not assigned at notebook scope in any visible
# cell, so this leftover expression presumably raises NameError on a fresh
# kernel — confirm and delete.
state_, 

In [None]:
'''
env = Game()
memory = Memory()
state = env.reset()

bestAgent = Agent('best_agent', ACResNet(env.inputshape, env.actionsize, HIDDENCNNLAYERS), env.actionsize, CPUCT)

state, _, _, _ = env.step(38)
state, _, _, _ = env.step(35)
state, _, _, _ = env.step(40)
state, _, _, _ = env.step(41)
state, _, _, _ = env.step(39)
#state, _, _, _ = env.step(31)
state.render() #action seq: 35, 38, 40, 39, 28, 37, 30

bestAgent.runMCTS(state)
pi = bestAgent.getTargetPi(1)
action = bestAgent.chooseAction(pi, tau=0)

state.printavs(pi)
action

for strn, node in bestAgent.mcts.tree.items():
    if node.state.done:
        print('True')
        node.state.render()

'''

In [None]:

# Quoted sample output of printavs() and the chosen action, kept for reference.
# A stray third ''' after this literal left the cell with an unterminated
# string; it has been removed.
'''
['----', '----', '----', '----', '----', '----', '----']
['----', '----', '----', '----', '----', '----', '----']
['----', '----', '----', '----', '----', '----', '----']
['----', '----', '----', '----', '0.14', '----', '----']
['0.14', '----', '0.18', '0.14', '----', '0.12', '0.12']
['----', '0.14', '----', '----', '----', '----', '----']
--------------

30
'''

In [None]:
'''
ltm = memory.ltmemory
curpos = ltm[44]
state = curpos['state']
pi = curpos['pi']
side = curpos['side']

state.render(), state.printavs(pi), side, state.value'''

In [None]:
'''
env = Game()
memory = Memory()

currentAgent = Agent('current_agent', ACResNet(env.inputshape, env.actionsize, HIDDENCNNLAYERS), env.actionsize, CPUCT)
bestAgent = Agent('best_agent', ACResNet(env.inputshape, env.actionsize, HIDDENCNNLAYERS), env.actionsize, CPUCT)

_, _ = playGamesAgentvAgent(bestAgent, bestAgent, memory, env, 2, NUMTAUZERO, HIDDENCNNLAYERS) 
'''

In [None]:
#memory.ltmemory

In [None]:
#memory.ltmemory[-1]['state'].printavs(memory.ltmemory[-1]['pi']), memory.ltmemory[-2]['state'].printavs(memory.ltmemory[-2]['pi']) 

In [None]:
'''
env = Game()
memory = Memory()

bestAgent = Agent('best_agent', ACResNet(env.inputshape, env.actionsize, HIDDENCNNLAYERS), env.actionsize, CPUCT)

if os.path.exists(MODELPATH):
    print('loading model from %s...' % MODELPATH)
    bestAgent.resnet.model = load_model(MODELPATH, custom_objects={'SoftmaxCrossEntropyWithLogits':SoftmaxCrossEntropyWithLogits})

if os.path.exists(MEMORYPATH):
    print('loading memories from %s...' % MEMORYPATH)
    memory = pickle.load(open(MEMORYPATH, 'rb'))'''