In [None]:
!pip install cython
%load_ext cython
!ps

Collecting cython
[?25l  Downloading https://files.pythonhosted.org/packages/64/3f/cac281f3f019b825bbc03fa8cb7eb03d9c355f4aa9eef978279a4966cb21/Cython-0.29-cp36-cp36m-manylinux1_x86_64.whl (2.1MB)
[K    100% |████████████████████████████████| 2.1MB 9.0MB/s 
[?25hInstalling collected packages: cython
Successfully installed cython-0.29
    PID TTY          TIME CMD
      1 ?        00:00:00 run.sh
     11 ?        00:00:00 node
     27 ?        00:00:00 node
     55 ?        00:00:01 jupyter-noteboo
     62 ?        00:00:02 python3
     84 ?        00:00:00 python3
     97 ?        00:00:00 ps


In [None]:
import random
import numpy as np
from collections import deque

In [None]:
import tensorflow as tf
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Conv2D, Dense, MaxPool2D, Input, BatchNormalization, Activation
from tensorflow.python.keras.layers import add, concatenate, Flatten, Lambda
from tensorflow.python.keras.regularizers import l2

# ENV

In [None]:
import itertools
import numpy as np

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks, padding the final chunk.

    Recipe from the itertools section of the Python documentation:
    https://docs.python.org/3/library/itertools.html

    Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    :param iterable: the items to split into chunks
    :param n: number of items per chunk
    :param fillvalue: value used to pad the last chunk when it is short
    :return: an iterator of ``n``-tuples
    """
    # The *same* iterator is handed to zip_longest n times, so every output
    # tuple consumes n consecutive items from it.
    shared = iter(iterable)
    return itertools.zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)

class DisjointSets:
    """Union-find (disjoint-set forest) stored in a single int32 numpy array.

    ``sets[i] < 0`` marks ``i`` as the representative of its set, and
    ``-sets[i]`` is the number of elements in that set. Any other value is
    the index of ``i``'s parent.
    """

    def __init__(self, size):
        """Create ``size`` singleton sets (every entry starts at -1)."""
        self.sets = np.full(size, -1, dtype=np.int32)

    def reset(self):
        """Split everything back into singleton sets, keeping the size."""
        self.sets = np.full(len(self.sets), -1, dtype=np.int32)

    def union_group(self, elems: 'groupable iterable'):
        """Merge all of ``elems`` into one set.

        Works in rounds: adjacent elements are united pairwise, then the
        round is repeated on every other element until one is left. An
        empty or single-element sequence is a no-op (the empty case used to
        recurse without end).

        :param elems: a sequence supporting ``len`` and slicing
        """
        while len(elems) > 1:
            # An unpaired last element is simply skipped in this round; it
            # is picked up by a later round through elems[::2].
            for left, right in zip(elems[::2], elems[1::2]):
                self.union(left, right)
            elems = elems[::2]

    def union(self, a, b):
        """Merge the sets containing ``a`` and ``b`` (union by size).

        Does nothing when either argument is ``None`` or when both are
        already in the same set.
        """
        if a is None or b is None:
            return

        repA = self.find(a)
        repB = self.find(b)

        if repA == repB:
            return

        # Sizes are stored negated, so "<=" means repA's set is the larger
        # (or equal) one and absorbs repB's set.
        if self.sets[repA] <= self.sets[repB]:
            self.sets[repA] += self.sets[repB]
            self.sets[repB] = repA
        else:
            self.sets[repB] += self.sets[repA]
            self.sets[repA] = repB

    def find(self, elem):
        """Return the representative of ``elem``'s set, compressing the path."""
        root = elem
        while self.sets[root] >= 0:
            root = self.sets[root]

        # Second pass: point every node on the walked path straight at root.
        while self.sets[elem] >= 0:
            parent = self.sets[elem]
            self.sets[elem] = root
            elem = parent

        return root

    def __setitem__(self, key, val):
        self.sets[key] = val

    def __len__(self):
        return len(self.sets)

    def copy(self):
        """Return an independent copy of this structure."""
        ret = DisjointSets(len(self.sets))
        ret.sets = np.copy(self.sets)
        return ret

class HexEnv:
    """State and rules of a game of Hex on a 2D board.

    Player one wins by connecting the top and bottom edges, player two by
    connecting the left and right edges. Moves are integers indexing the
    flattened board in row-major order; the extra index ``rows * cols``
    (also accepted as ``-1``) is the swap move.
    """
    LEGAL = 1
    ILLEGAL = 0

    # Positions of the four virtual edge nodes, counted from the end of the
    # disjoint-set array.
    TOP_OFFSET = -1
    LEFT_OFFSET = -2
    BOTTOM_OFFSET = -3
    RIGHT_OFFSET = -4

    NONE = 0
    PLAYER_ONE = 1
    PLAYER_TWO = 2

    def __init__(self, shape=(11, 11), swap_rule=True):
        """Creates a hex environment of specified shape.
        Holds 4 main parts:
            - The board represented by a 2D numpy array.
            - The legal actions, represented by a flattened numpy
                array with one extra item for the swap rule
            - The actions taken, represented by a numpy array (plus string notation)
            - A disjoint-sets structure for quick determination of whether
                a game is done. Has 4 extra items, one for each edge

        :param shape: (rows, cols) of the board
        :param swap_rule: when False the swap move is never made legal
        """
        # Since actions records swapping, parity can determine turn order
        self.shape = shape
        action_shape = shape[0] * shape[1] + 1
        self.resigned = False

        self.done = HexEnv.NONE
        self.swap_rule = swap_rule

        self.board = np.zeros(shape, dtype=np.int32)

        self.actions = np.array([], dtype=np.int8)
        self.action_notation = ""

        self.legal_actions = np.ones(action_shape, dtype=np.int32)
        self.legal_actions[-1] = HexEnv.ILLEGAL  # swap rule not immediately legal

        # The last four entries of the disjoint sets are the edge nodes.
        self.sets = DisjointSets(shape[0] * shape[1] + 4)

        self.TOP = len(self.sets) + HexEnv.TOP_OFFSET
        self.LEFT = len(self.sets) + HexEnv.LEFT_OFFSET
        self.BOTTOM = len(self.sets) + HexEnv.BOTTOM_OFFSET
        self.RIGHT = len(self.sets) + HexEnv.RIGHT_OFFSET

    def move(self, move):
        """Play ``move`` for the side whose turn it is.

        :param move: a board index, the swap index (``rows * cols`` or
            ``-1``), a notation string such as ``"A1"`` / ``"SWAP"``, or
            ``None`` to resign
        :return: ``HexEnv.LEGAL`` when the move was played,
            ``HexEnv.ILLEGAL`` when it was rejected (state is unchanged),
            and ``None`` after a resignation
        """
        if isinstance(move, str):
            return self.move(self.uci_to_int(move))

        if move is None:
            # Resignation. An even number of recorded actions means player
            # one is to move, so player one resigns and player two wins.
            if len(self.actions) % 2 == 0:
                self.done = HexEnv.PLAYER_TWO
            else:
                self.done = HexEnv.PLAYER_ONE
            self.resigned = True
            return

        try:
            rejected = (self.legal_actions[move] == HexEnv.ILLEGAL) or (self.done != HexEnv.NONE)
        except Exception:
            # Only print values that cannot fail again (the old handler
            # re-evaluated the failing index), then re-raise.
            print(move)
            print(self.action_notation)
            raise

        if rejected:
            print('Illegal Move:', self.generate_move_notation(move))
            print(self.board)
            return HexEnv.ILLEGAL

        if move == self.shape[0] * self.shape[1] or move == -1:
            # Swap rule invoked: player one's opening stone is removed and
            # player two gets a stone on the transposed cell.
            assert len(self.actions) == 1
            row, col = self.get_move_position(self.actions[0])

            assert self.board[row][col] == HexEnv.PLAYER_ONE
            self.board[row][col] = HexEnv.NONE
            self.board[col][row] = HexEnv.PLAYER_TWO

            # Order matters when row == col: the cell must end up ILLEGAL.
            self.legal_actions[self.get_action_position(row, col)] = HexEnv.LEGAL
            self.legal_actions[self.get_action_position(col, row)] = HexEnv.ILLEGAL

            # Only one stone is on the board, so rebuild the sets from scratch.
            self.sets.reset()
            neighbors = self.get_neighbors(self.get_action_position(col, row), self.board[col][row])
            self.sets.union_group(neighbors)

        else:
            # Parity of the action count decides whose stone this is.
            row, col = self.get_move_position(move)
            self.board[row][col] = (len(self.actions) % 2) + 1

            self.legal_actions[move] = HexEnv.ILLEGAL

            neighbors = self.get_neighbors(move, self.board[row][col])
            self.sets.union_group(neighbors)

        # A player has won once both of their edge nodes share a set.
        if self.sets.find(self.TOP) == self.sets.find(self.BOTTOM):
            self.done = HexEnv.PLAYER_ONE

        if self.sets.find(self.LEFT) == self.sets.find(self.RIGHT):
            self.done = HexEnv.PLAYER_TWO

        self.actions = np.append(self.actions, move)
        self.action_notation += " " + self.generate_move_notation(move)
        self.action_notation = self.action_notation.strip()

        # Swapping is only available as the reply to the opening move, and
        # only when the swap rule is enabled for this environment.
        swap_open = self.swap_rule and len(self.actions) == 1
        self.legal_actions[-1] = HexEnv.LEGAL if swap_open else HexEnv.ILLEGAL
        return HexEnv.LEGAL

    def get_neighbors(self, move_num, player):
        """List the disjoint-set indices that a stone at ``move_num`` joins.

        Scans the cell itself plus its six hex neighbours and keeps those
        holding ``player``'s stones. For each kept cell lying on one of
        ``player``'s own edges, the matching edge node is appended too.

        :return: list of set indices (may contain repeated edge nodes)
        """
        neighbors = []
        row, col = self.get_move_position(move_num)

        for i in [-1, 0, 1]:
            for j in [-1, 0, 1]:
                # (-1, -1) and (1, 1) are not adjacent on a hex grid.
                if i * j == 1:
                    continue

                curr_row, curr_col = row + i, col + j

                # Skip coordinates off the board (no negative-index wrapping).
                if curr_row < 0 or curr_col < 0:
                    continue
                if curr_row >= self.shape[0] or curr_col >= self.shape[1]:
                    continue

                if self.board[curr_row][curr_col] != player:
                    continue

                neighbors.append(self.get_action_position(curr_row, curr_col))

                if player == HexEnv.PLAYER_ONE:
                    if curr_row == 0:
                        neighbors.append(self.TOP)
                    if curr_row == self.shape[0] - 1:
                        neighbors.append(self.BOTTOM)

                if player == HexEnv.PLAYER_TWO:
                    if curr_col == 0:
                        neighbors.append(self.LEFT)
                    if curr_col == self.shape[1] - 1:
                        neighbors.append(self.RIGHT)

        return neighbors

    def invoke_swap_rule(self):
        """Activate swap rule on board. Only the second player can invoke swap rule.

        It is really just a transpose along the main diagonal + a value change.

        Currently a placeholder: the swap is carried out inside ``move``."""
        pass

    def get_move_position(self, move_idx):
        """Convert a flat board index to ``(row, col)``. Does not handle swap."""
        row, col = int(move_idx / self.shape[1]), move_idx % self.shape[1]
        return row, col

    def get_action_position(self, row, col):
        """Convert ``(row, col)`` to the flat, row-major board index."""
        return row * self.shape[1] + col

    def get_legal_moves(self):
        """Return the incrementally maintained legality array (not a copy)."""
        return self.legal_actions

    def get_legal_moves_expensive(self):
        """Recompute the legality array from the board instead of the cache."""
        # The played spaces happen to be the exact illegal spaces
        # excluding swap rule.
        legal_mat = np.copy(self.board).flatten()
        legal_mat[legal_mat > 1] = 1
        legal_mat = 1 - legal_mat

        # add swap legality (1 if legal)
        swap_open = self.swap_rule and len(self.actions) == 1
        legal_mat = np.append(legal_mat, 1 if swap_open else 0)

        return legal_mat

    def calc_output_prob(self, policy_output):
        """Mask a raw policy vector with the legal moves and normalize it.

        Returns:
            Normalized list of possible moves with swap being listed last

            Note - If the predicted output for legal values sum to zero, it will assume
                    all legal values have the same probability
            Note - An array of NaN will be returned if there are no legal moves
        """
        legal_mat = self.get_legal_moves()

        # zero out if illegal
        confidence_mat = np.zeros(policy_output.shape)
        np.putmask(confidence_mat, legal_mat == 1, policy_output)

        # normalize probs
        total = np.sum(confidence_mat)
        if total != 0:
            return confidence_mat / total
        else:
            return legal_mat / np.sum(legal_mat)

    def generate_model_inputs(self, mat=None, channels_format='channels_last'):
        """Encode a board as three float32 planes for the network.

        The planes are: player one's stones, player two's stones, and a
        constant plane that is 0 when an even number of actions has been
        played and 1 otherwise.

        :param mat: board to encode; defaults to the current board
        :param channels_format: 'channels_first' or 'channels_last'
        :return: the stacked planes along the requested axis
        """
        if mat is None:
            mat = self.board
        formats = {
            'channels_first': 0,
            'channels_last': -1,
        }

        rows, cols = self.shape

        player_one = np.zeros((rows, cols), dtype=np.float32)
        np.putmask(player_one, mat == 1, 1)

        player_two = np.zeros((rows, cols), dtype=np.float32)
        np.putmask(player_two, mat == 2, 1)

        if len(self.actions) % 2 == 0:
            player_turn = np.zeros(self.shape, dtype=np.float32)
        else:
            player_turn = np.ones(self.shape, dtype=np.float32)

        return np.stack(
            (player_one, player_two, player_turn),
            axis=formats[channels_format])

    def generate_move_notation(self, move_num):
        """Return "SWAP" or a letter-number string (row letter from 'A', 1-based column)."""
        if move_num == self.shape[0] * self.shape[1] or move_num == -1:
            return "SWAP"
        else:
            row, col = self.get_move_position(move_num)
            return "".join([chr(ord('A') + row), str(1 + col)])

    def generate_key(self):
        """Return the space-separated notation of all actions played so far."""
        return self.action_notation

    def generate_actions(self, key_string):
        """Parse a key produced by ``generate_key`` back into move indices.

        An empty (or whitespace-only) key yields an empty list.
        """
        return [self.uci_to_int(a) for a in key_string.split()]

    def uci_to_int(self, uci):
        """Convert one notation token ("SWAP" or e.g. "B3") to a move index."""
        uci = uci.strip()
        if uci == "SWAP":
            return self.shape[0] * self.shape[1]
        else:
            row, col = uci[0], uci[1:]
            return self.get_action_position(ord(row) - ord('A'), int(col) - 1)

    def copy(self):
        """Return an independent copy of the whole game state.

        The copy is built without running ``__init__`` so no throw-away
        default-sized board is allocated; every field is taken from ``self``.
        """
        env = HexEnv.__new__(HexEnv)
        env.shape = self.shape
        env.resigned = self.resigned
        env.done = self.done
        env.swap_rule = self.swap_rule

        env.board = np.copy(self.board)
        env.actions = np.copy(self.actions)
        env.action_notation = self.action_notation

        # Must come from self: copying the new env's own array would mark
        # already-occupied cells as legal again.
        env.legal_actions = np.copy(self.legal_actions)

        env.sets = self.sets.copy()

        env.TOP = self.TOP
        env.LEFT = self.LEFT
        env.BOTTOM = self.BOTTOM
        env.RIGHT = self.RIGHT
        return env

# Blocks

In [None]:
class conv_block:
    """Callable bundle of Conv2D -> BatchNormalization -> Activation.

    The constructor only records the settings; new Keras layers are created
    every time the instance is called on a tensor.

    Args:
        filters, kernel_size, strides, padding, use_bias,
        kernel_regularizer - forwarded to Conv2D
        activation - name passed to the trailing Activation layer
    """

    def __init__(self, filters, kernel_size, strides=(1, 1), activation='relu',
                 padding='same', use_bias=False, kernel_regularizer=l2(1e-4)):
        # Convolution settings
        self.filters = filters
        self.kernel_size = kernel_size
        self.strides = strides
        self.padding = padding
        self.use_bias = use_bias
        self.kernel_regularizer = kernel_regularizer
        # Applied after batch normalization
        self.activation = activation

    def __call__(self, inp):
        """Apply the three layers to ``inp`` and return the activated tensor."""
        conv_out = Conv2D(
            self.filters,
            self.kernel_size,
            strides=self.strides,
            padding=self.padding,
            use_bias=self.use_bias,
            kernel_regularizer=self.kernel_regularizer)(inp)
        # Normalize over the last axis
        normed = BatchNormalization(axis=-1)(conv_out)
        return Activation(self.activation)(normed)


class ModuleA:
    """Residual inception block with three 32-filter towers.

    The towers (1x1; 1x1 -> 3x3; 1x1 -> 3x3 -> 3x3) are concatenated,
    projected by a linear 1x1 conv_block with 256 filters, added to the
    block input, and passed through ReLU. Because of that add, the input is
    expected to already carry 256 channels.
    """

    def __call__(self, inp):
        def tower(kernels):
            # Chain one 32-filter ReLU conv_block per kernel, starting at inp.
            out = inp
            for kernel in kernels:
                out = conv_block(32, kernel, activation='relu',
                                 padding='same')(out)
            return out

        with tf.variable_scope('Tower_1'):
            t_1 = tower([(1, 1)])

        with tf.variable_scope('Tower_2'):
            t_2 = tower([(1, 1), (3, 3)])

        with tf.variable_scope('Tower_3'):
            t_3 = tower([(1, 1), (3, 3), (3, 3)])

        merged = concatenate(inputs=[t_1, t_2, t_3], axis=-1)
        projected = conv_block(256, (1, 1), activation='linear', padding='same')(merged)
        residual = add([projected, inp])
        return Activation('relu')(residual)


class ModuleB:
    """Residual inception block with two 128-filter towers.

    The towers (1x1; 1x1 -> 1x7 -> 7x1) are concatenated, projected by a
    linear 1x1 conv_block with 1024 filters, added to the block input, and
    passed through ReLU. Because of that add, the input is expected to
    already carry 1024 channels.
    """

    def __call__(self, inp):
        def tower(kernels):
            # Chain one 128-filter ReLU conv_block per kernel, starting at inp.
            out = inp
            for kernel in kernels:
                out = conv_block(128, kernel, activation='relu',
                                 padding='same')(out)
            return out

        with tf.variable_scope('Tower_1'):
            t_1 = tower([(1, 1)])

        with tf.variable_scope('Tower_2'):
            t_2 = tower([(1, 1), (1, 7), (7, 1)])

        merged = concatenate(inputs=[t_1, t_2], axis=-1)
        projected = conv_block(1024, (1, 1), activation='linear', padding='same')(merged)
        residual = add([projected, inp])
        return Activation('relu')(residual)


class ModuleC:
    """Residual inception block with two 192-filter towers.

    The towers (1x1; 1x1 -> 1x3 -> 3x1) are concatenated, projected by a
    linear 1x1 conv_block with 1920 filters, added to the block input, and
    passed through ReLU. Because of that add, the input is expected to
    already carry 1920 channels.
    """

    def __call__(self, inp):
        def tower(kernels):
            # Chain one 192-filter ReLU conv_block per kernel, starting at inp.
            out = inp
            for kernel in kernels:
                out = conv_block(192, kernel, activation='relu',
                                 padding='same')(out)
            return out

        with tf.variable_scope('Tower_1'):
            t_1 = tower([(1, 1)])

        with tf.variable_scope('Tower_2'):
            t_2 = tower([(1, 1), (1, 3), (3, 1)])

        merged = concatenate(inputs=[t_1, t_2], axis=-1)
        projected = conv_block(1920, (1, 1), activation='linear', padding='same')(merged)

        residual = add([projected, inp])
        return Activation('relu')(residual)

In [None]:
class ReductionA:
    """Mimics the reduction block of the Inception-ResNet architecture.

    Three towers are applied to the input and concatenated on the last
    axis: a 3x3 max-pool, a single 3x3 convolution (384 filters), and a
    1x1 -> 3x3 -> 3x3 stack (256, 256, 384 filters).

    Args:
        padding - setting for the padding of the final layers of the branches
        strides - setting for the strides of the final layers of the branches"""

    def __init__(self, padding='same', strides=(1, 1)):
        self.padding = padding
        self.strides = strides

    def __call__(self, inp):
        def relu_conv(filters, kernel, tensor, final=False):
            # The final layer of a branch uses the configured strides and
            # padding; inner layers keep the default stride with 'same'.
            if final:
                layer = Conv2D(
                    filters, kernel, strides=self.strides, activation='relu',
                    padding=self.padding, kernel_regularizer=l2(1e-4))
            else:
                layer = Conv2D(
                    filters, kernel, activation='relu', padding='same',
                    kernel_regularizer=l2(1e-4))
            return layer(tensor)

        with tf.variable_scope('Tower_1'):
            pooled = MaxPool2D((3, 3), strides=self.strides,
                               padding=self.padding)(inp)

        with tf.variable_scope('Tower_2'):
            single = relu_conv(384, (3, 3), inp, final=True)

        with tf.variable_scope('Tower_3'):
            stacked = relu_conv(256, (1, 1), inp)
            stacked = relu_conv(256, (3, 3), stacked)
            stacked = relu_conv(384, (3, 3), stacked, final=True)

        return concatenate(inputs=[pooled, single, stacked], axis=-1)


class ReductionB:
    """Second reduction block: a max-pool tower plus three convolution towers.

    The towers are concatenated on the last axis: a 3x3 max-pool;
    1x1(256) -> 3x3(384); 1x1(256) -> 3x3(256); 1x1(256) -> 3x3(256) -> 3x3(256).

    Args:
        padding - setting for the padding of the final layers of the branches
        strides - setting for the strides of the final layers of the branches"""

    def __init__(self, padding='same', strides=(1, 1)):
        self.padding = padding
        self.strides = strides

    def __call__(self, inp):
        def inner(filters, kernel, tensor):
            # Inner layer of a branch: default stride, 'same' padding.
            return Conv2D(filters, kernel, activation='relu', padding='same',
                          kernel_regularizer=l2(1e-4))(tensor)

        def closing(filters, tensor):
            # Last 3x3 layer of a branch: configured strides and padding.
            return Conv2D(filters, (3, 3), strides=self.strides,
                          activation='relu', padding=self.padding,
                          kernel_regularizer=l2(1e-4))(tensor)

        with tf.variable_scope('Tower_1'):
            t_1 = MaxPool2D((3, 3), strides=self.strides,
                            padding=self.padding)(inp)

        with tf.variable_scope('Tower_2'):
            t_2 = closing(384, inner(256, (1, 1), inp))

        with tf.variable_scope('Tower_3'):
            t_3 = closing(256, inner(256, (1, 1), inp))

        with tf.variable_scope('Tower_4'):
            t_4 = inner(256, (1, 1), inp)
            t_4 = inner(256, (3, 3), t_4)
            t_4 = closing(256, t_4)

        return concatenate(inputs=[t_1, t_2, t_3, t_4], axis=-1)

# Model

In [None]:
def build_model(input_shape=(8, 8, 119), output_actions=8 * 8 * 73):
    """Assemble the two-headed (policy, value) network.

    Layout: a three-tower stem, ModuleA x1, ReductionA, ModuleB x2,
    ReductionB, ModuleC x1, then a value head (tanh scalar) and a policy
    head (ReLU vector). The tensor shape after every stage and the shapes
    of both heads are printed while building.

    :param input_shape: shape of a single input sample
    :param output_actions: width of the policy head's output layer
    :return: Keras ``Model`` whose outputs are ``[policy, value]``
    """
    int_shape = tf.keras.backend.int_shape

    def relu_block(filters, kernel, tensor):
        # conv_block with the settings shared by the stem and head convolutions
        return conv_block(filters, kernel, activation='relu', padding='same')(tensor)

    def run_modules(module_cls, scope_stem, label_stem, repeats, tensor):
        # Apply `repeats` fresh module instances, each in its own scope
        for i in range(repeats):
            with tf.variable_scope(f'{scope_stem}_{i}'):
                tensor = module_cls()(tensor)
                print(f'{label_stem}_{i}:', int_shape(tensor))
        return tensor

    def run_reduction(reduction_cls, scope_name, label, tensor):
        # Apply one reduction block inside its own scope
        with tf.variable_scope(scope_name):
            tensor = reduction_cls()(tensor)
            print(label, int_shape(tensor))
        return tensor

    inp = Input(shape=input_shape, name='inp')

    # Stem: three parallel towers concatenated on the last axis
    with tf.variable_scope('Stem'):
        with tf.variable_scope('Tower_7x7'):
            t_1 = relu_block(64, (7, 1), inp)
            t_1 = relu_block(64, (1, 7), t_1)
            t_1 = relu_block(84, (3, 3), t_1)

        with tf.variable_scope('Tower_5x5'):
            t_2 = relu_block(64, (3, 3), inp)
            t_2 = relu_block(86, (3, 3), t_2)

        with tf.variable_scope('Tower_3x3'):
            t_3 = relu_block(86, (3, 3), inp)

        x = concatenate(inputs=[t_1, t_2, t_3], axis=-1)

    # Body: inception modules separated by reduction blocks
    x = run_modules(ModuleA, 'InceptionA', 'ModA', 1, x)
    x = run_reduction(ReductionA, 'ReductionA', 'ReductA:', x)
    x = run_modules(ModuleB, 'InceptionB', 'ModB', 2, x)
    x = run_reduction(ReductionB, 'ReductionB', 'ReductB:', x)
    x = run_modules(ModuleC, 'InceptionC', 'ModC', 1, x)

    with tf.variable_scope('Value_Head'):
        val = relu_block(1, (1, 1), x)
        val = Flatten()(val)
        val = Dense(256, activation='relu', kernel_regularizer=l2(1e-4))(val)
        val = Dense(1, activation='tanh', kernel_regularizer=l2(1e-4), name='val_out')(val)

    with tf.variable_scope('Policy_Head'):
        pol = relu_block(2, (1, 1), x)
        pol = Flatten()(pol)
        pol = Dense(output_actions, activation='relu', kernel_regularizer=l2(1e-4), name='pol_out')(pol)

    print('val:', int_shape(val))
    print('pol:', int_shape(pol))

    return Model(inputs=inp, outputs=[pol, val])

In [None]:
from multiprocessing import Pipe, connection
import threading

class DQN:
    """Owns the Keras model and hands out pipes for batched predictions.

    :param input_shape: forwarded to ``input_fn`` as the model input shape
    :param output_shape: forwarded to ``input_fn`` as the number of policy outputs
    :param input_fn: callable ``(input_shape, output_shape) -> model``
    """

    def __init__(self, input_shape=(11, 11, 3), output_shape=11*11+1, input_fn=build_model):
        self.model = input_fn(input_shape, output_shape)
        # Created lazily by the first get_pipes() call
        self.api = None
        # Remembered so the prediction thread can run inside the same graph
        self.graph = tf.get_default_graph()

    def get_pipes(self, num = 1):
        """
        Creates a list of pipes on which observations of the game state will be listened for. Whenever
        an observation comes in, returns policy and value network predictions on that pipe.
        The prediction worker is started the first time this is called.
        :param int num: number of pipes to create
        :return list(Connection): the caller-side connections of the pipes that were created
        """
        if self.api is None:
            self.api = DQN_Interface(self)
            self.api.start()

        created = []
        for _ in range(num):
            created.append(self.api.create_pipe())
        return created

    def load(self, weight_path):
        """Load weights from ``weight_path`` and prepare the model for predicting."""
        self.model.load_weights(weight_path)
        self.model._make_predict_function()
        self.graph = tf.get_default_graph()

    def save(self, weight_path, h5=False):
        """Save the weights to ``weight_path``; pass ``h5=True`` to force the HDF5 format."""
        extra = {'save_format': 'h5'} if h5 else {}
        self.model.save_weights(weight_path, **extra)

class DQN_Interface:
    """Background prediction service in front of a DQN's model.

    Callers obtain one end of a pipe via ``create_pipe``, send an encoded
    position through it, and receive ``(policy, value)`` back. A daemon
    thread gathers whatever is waiting on all pipes and evaluates it as a
    single batch.
    """

    def __init__(self, dqn):
        """:param dqn: object exposing ``model`` and ``graph`` attributes"""
        self.dqn = dqn
        self.pipes = []

    def start(self):
        """Launch the prediction worker as a daemon thread."""
        # `Thread` was never imported by name in this file; only the
        # `threading` module is, so reference it through the module.
        worker = threading.Thread(
            target=self._predict_batch_worker,
            args=[self.dqn.graph],
            name="prediction_worker",
            daemon=True)
        worker.start()

    def create_pipe(self):
        """Create a pipe; keep the worker's end and return the caller's end."""
        a, b = Pipe()
        self.pipes.append(a)
        return b

    def _predict_batch_worker(self, graph):
        """Serve prediction requests forever.

        :param graph: object whose ``as_default()`` context the model is
            evaluated in
        """
        while True:
            # Snapshot the list so pipes added by other threads during the
            # wait do not affect this iteration.
            ready = connection.wait(list(self.pipes), timeout=0.001)
            if not ready:
                continue
            data, result_pipes = [], []
            for pipe in ready:
                # Drain every pending request; result_pipes[i] is where the
                # answer for data[i] has to go.
                while pipe.poll():
                    data.append(pipe.recv())
                    result_pipes.append(pipe)

            data = np.asarray(data, dtype=np.float32)
            with graph.as_default():
                policy_ary, value_ary = self.dqn.model.predict_on_batch(data)
            for pipe, p, v in zip(result_pipes, policy_ary, value_ary):
                # .item() turns the single-element value into a Python
                # float without relying on implicit array-to-scalar casts.
                pipe.send((p, np.asarray(v).item()))

# Monte-Carlo Tree Search

In [None]:
from collections import defaultdict
from threading import Lock
from concurrent.futures import ThreadPoolExecutor

# Size of the thread pool that MCTS.explore_move uses to run its
# simulations concurrently.
n_search_threads = 64
# n_search_threads = 1

class MCTS:
    """A Monte-Carlo search tree guided by a policy/value network.

    Uses a dictionary to speed up search time (and have few cache misses).
    The key is a representation of the list of actions taken
    (``env.generate_key()``); the value holds the per-action statistics of
    that position.
    """

    class StateStats:
        """Search statistics of one position, one slot per action."""

        def __init__(self, num_actions=11 * 11 + 1):
            self.N_sum = 0  # total visits through this position
            self.P = None   # prior policy, filled in when the position is expanded
            self.W = np.zeros(num_actions, dtype=np.float32)  # summed values
            self.Q = np.zeros(num_actions, dtype=np.float32)  # mean values (W / N)
            self.N = np.zeros(num_actions, dtype=np.float32)  # visit counts

    def __init__(self, pipes=None, explore_policy='stochastic', num_actions=11 * 11 + 1, can_stop=True, resign_val=-0.8):
        """
        :param pipes: list of connections used to request network predictions
        :param explore_policy: 'stochastic' samples the move from the
            temperature-adjusted policy; anything else takes the argmax
        :param num_actions: size of the action space (including swap)
        :param can_stop: whether pick_move may actually resign
        :param resign_val: search value at or below which resigning is
            considered; None disables it
        """
        self.pipes = pipes
        self.num_actions = num_actions
        self.tree = self._new_tree()
        self.moves = []
        self.explore_policy = explore_policy
        self.best_move = None

        self.mutexes = defaultdict(Lock)

        self.cpuct = 1.5
        self.alpha = 0.03
        self.noise_eps = 0.25

        self.tau_decay_rate = 0.99
        self.virtual_loss = 3

        self.num_futures_for_position = 200

        self.resign_threshold = resign_val
        self.min_resign_turn = 8

        self.can_stop = can_stop

    def _new_tree(self):
        """Build an empty tree whose entries are sized to this action space."""
        return defaultdict(lambda: MCTS.StateStats(self.num_actions))

    def reset(self):
        """Discard all search statistics."""
        self.tree = self._new_tree()

    def is_leaf(self, key):
        """Return True when ``key`` has not been expanded yet."""
        return key not in self.tree

    def pick_move(self, env):
        """Search from ``env`` and choose a move.

        :return: ``(action, result, attempted_resign)``. ``action`` is None
            when the search resigns (only possible with ``can_stop``);
            ``result`` is the best value seen by the simulations;
            ``attempted_resign`` tells whether the resign condition was met.
        :raises ValueError: if the chosen action is illegal in ``env``
        """
        self.reset()

        inp_planes = env.generate_model_inputs()
        result, _ = self.explore_move(env)
        policy = self.calc_policy(env)

        if self.explore_policy == 'stochastic':
            temp = self.apply_temp(policy, len(env.actions))
            action = np.random.choice(self.num_actions, p=temp)
            # Re-sample until a legal move comes up.
            while env.get_legal_moves()[action] == 0:
                action = np.random.choice(self.num_actions, p=temp)
        else:
            action = np.argmax(policy)

        wants_resign = self.resign_threshold is not None \
            and result <= self.resign_threshold \
            and len(env.actions) > self.min_resign_turn

        if wants_resign and self.can_stop:
            return None, result, True

        # Record the position and search policy as a training sample.
        self.moves.append([inp_planes, policy])

        if env.get_legal_moves()[action] == 0:
            print('Action:', env.generate_move_notation(action))
            print('Policy:', policy)
            print('After temp:', self.apply_temp(policy, len(env.actions)))
            raise ValueError('Illegal move selected.')

        return action, result, wants_resign

    def calc_policy(self, env):
        """Return the visit counts of ``env``'s position, normalized to sum to 1."""
        key = env.generate_key()
        stats = self.tree[key]

        policy = np.copy(stats.N)
        policy /= np.sum(policy)
        return policy

    def apply_temp(self, policy, turn):
        """Sharpen ``policy`` with a temperature that decays with ``turn``.

        Once the temperature falls below 0.1 the result is a one-hot vector
        on the most visited action.
        """
        tau = np.power(self.tau_decay_rate, turn + 1)
        if tau < 0.1:
            # Just pick the maximum
            action_idx = np.argmax(policy)
            pol = np.zeros(len(policy))
            pol[action_idx] = 1.0
            return pol

        # Weighted by amount times an action is taken
        ret = np.power(policy, 1 / tau)
        ret /= np.sum(ret)
        return ret

    def explore_move(self, env):
        """Run ``num_futures_for_position`` simulations from ``env``.

        Every simulation works on its own copy of ``env``; the pool size is
        the module-level ``n_search_threads``.

        :return: ``(max of all simulation values, value of the first one)``
        """
        futures = []
        with ThreadPoolExecutor(max_workers=n_search_threads) as ex:
            for _ in range(self.num_futures_for_position):
                futures.append(
                    ex.submit(
                        self.explore_move_helper,
                        env=env.copy(),
                        is_root=True))

        results = [f.result() for f in futures]
        return np.max(results), results[0]

    def explore_move_helper(self, env, is_root=False):
        """Run one simulation from ``env`` down to a leaf or a finished game.

        :param env: a private copy of the game state; it is mutated
        :param is_root: adds exploration noise to the priors when True
        :return: the value backed up through the move chosen at this
            position (the leaf's network value when the position is
            expanded here, -1 when the game is already over)
        """
        # The game is over, which is scored as a loss for the side to move.
        if env.done:
            return -1

        key = env.generate_key()

        with self.mutexes[key]:
            if key not in self.tree:
                # EXPAND & EVAL: ask the prediction worker for the priors
                # and the value of this new position.
                inp = env.generate_model_inputs()

                pipe = self.pipes.pop()
                try:
                    pipe.send(inp)
                    pol, val = pipe.recv()
                finally:
                    # Always hand the pipe back, even if the exchange failed.
                    self.pipes.append(pipe)

                self.tree[key].P = pol
                return val

            # SELECT Q+U
            move_idx, q_plus_u = self.maximize_q_plus_u(env, self.tree[key], is_root)
            # Discourage other threads from picking the same move meanwhile.
            self.add_virtual_loss(self.tree[key], move_idx)

        if env.move(move_idx) == HexEnv.ILLEGAL:
            print('Move Idx:', move_idx)
            print('Q+U:', q_plus_u)
            print('legal:', env.get_legal_moves())
            print('Notation:', env.action_notation)
            print(env.board)
            print('Swap Legal:', env.legal_actions[-1] == 1)
            print(env.sets.sets)
            print('Winner:', env.done)

        # The child's value is from the opponent's point of view.
        next_value = -self.explore_move_helper(env)

        # BACK-PROP
        with self.mutexes[key]:
            self.back_prop(self.tree[key], move_idx, next_value)

        return next_value

    def maximize_q_plus_u(self, env, stats, is_root):
        """Pick the legal action with the highest Q + U score.

        :return: ``(move index, the full Q + U array with illegal moves at -inf)``
        """
        if is_root:
            # Mix Dirichlet noise into the priors at the root
            noise = np.random.dirichlet([self.alpha] * len(stats.N))
            priors = (1 - self.noise_eps) * stats.P + self.noise_eps * noise
        else:
            priors = stats.P

        U = self.cpuct * priors * np.sqrt(stats.N_sum + 1) / (1 + stats.N)

        # Illegal moves can never win the argmax
        q_plus_u = stats.Q + U
        np.putmask(q_plus_u, env.get_legal_moves() == 0, -np.inf)
        move_idx = np.argmax(q_plus_u)
        return move_idx, q_plus_u

    def add_virtual_loss(self, stats, move_idx):
        """Temporarily count ``virtual_loss`` lost visits for ``move_idx``."""
        stats.N_sum += self.virtual_loss
        stats.N[move_idx] += self.virtual_loss
        stats.W[move_idx] += -self.virtual_loss
        stats.Q[move_idx] = stats.W[move_idx] / stats.N[move_idx]

    def back_prop(self, stats, move_idx, explore_value):
        """Undo the virtual loss and record one real visit worth ``explore_value``."""
        stats.N_sum += -self.virtual_loss + 1
        stats.N[move_idx] += -self.virtual_loss + 1
        stats.W[move_idx] += self.virtual_loss + explore_value
        stats.Q[move_idx] = stats.W[move_idx] / stats.N[move_idx]

    def finish_game(self, z):
        """Append the game outcome ``z`` to every recorded move sample."""
        for move in self.moves:
            move.append(z)

# Useful Methods

## Self-Play Meta File

In [None]:
import glob
import os
import re

import tensorflow as tf


def create_dict_from_meta(meta_path='./self_play_data/meta.txt'):
    """Creates a dictionary from the given metadata file.
    This function will create the meta file (and its parent directory) if it
    is not present, in which case the returned dictionary is empty.

    Each line of the file is expected to look like ``<file name> <count>``.

    :param meta_path: The (desired) path to the meta file.
    :return: Dictionary with file names as keys and the number of positions as
        values. Values are kept as the digit strings read from the file.
    """
    parent = os.path.dirname(meta_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # 'a+' creates a missing file (which 'r+' does not) and never truncates an
    # existing one. Append mode starts at end-of-file, so rewind before reading.
    with open(meta_path, 'a+') as f:
        f.seek(0)
        content = f.read()

    # One match per line: everything before the last run of spaces is the file
    # name, the trailing digits are the position count.
    prog = re.compile(r'(.+) +(\d+)', flags=re.M)
    return dict(prog.findall(content))


def write_to_meta(meta: 'opened file', record_name: str, num_positions: int) -> None:
    """Appends one ``<record_name> <num_positions>`` line to the meta file.

    :param meta: An already opened, writable file (passed in so callers can
        write several entries without reopening it)
    :param record_name: The name of the record the entry describes
    :param num_positions: number of positions in that record file"""
    meta.writelines((record_name, " ", str(num_positions), "\n"))


def count_num_positions(record_name: str) -> int:
    """Counts how many records the given TFRecord file contains by iterating
    over it once.

    :param record_name: Path of the record file to be analyzed
    :return: number of positions in the given record file"""
    return sum(1 for _ in tf.python_io.tf_record_iterator(record_name))


def refresh_meta(self_play_dir: str = './self_play_data/', complete_refresh=False) -> dict:
    """Builds a dictionary of all record files and their number of positions,
    and brings ``meta.txt`` in ``self_play_dir`` up to date.

    Without ``complete_refresh`` the existing meta file is read and only the
    record files missing from it are counted and appended. With
    ``complete_refresh`` the existing metadata is ignored and the meta file is
    rewritten from scratch based on the tfrecord files in the directory.

    :param self_play_dir: The directory where self play records are held
    :param complete_refresh: Discard and rewrite the existing meta file
    :return: A dictionary whose keys are record paths (as produced by
        ``glob``) and whose values are the number of positions contained.
        Counts read from the meta file are digit strings; counts computed
        during this call are ints.
    """

    # Gather all recorded data (sorted so the meta file order is deterministic)
    record_pattern = os.path.join(self_play_dir, '*.tfrecords')
    recorded = sorted(glob.glob(record_pattern))

    # Resolved up front: both branches below need it when writing.
    meta_path = os.path.join(self_play_dir, 'meta.txt')

    if complete_refresh:
        table = {}
        mode = 'w'   # truncate: old entries must not survive a full refresh
    else:
        # Gather metadata about recorded data
        table = create_dict_from_meta(meta_path)
        mode = 'a+'  # keep existing entries, append the missing ones

    # write data to the file
    with open(meta_path, mode) as meta:
        for record in recorded:
            if record not in table:
                num = count_num_positions(record)
                table[record] = num
                write_to_meta(meta, record, num)

    return table

## Record Model Data

In [None]:
import os


# Directory under which model weights are stored.
weights_path = './model_weights'
# Location of the current best model; read by load_best_model_weight and
# written by save_as_best_model.
best_weights_path = './model_weights/best.ckpt'

def load_best_model_weight(model):
    """Load the best model's weights into ``model``.

    :param model: Object exposing ``load(path)``; it is called with
        ``best_weights_path``.
    """
    model.load(best_weights_path)

def save_as_best_model(model):
    """Store ``model`` as the new best model.

    :param model: Object exposing ``save(path)``; it is called with
        ``best_weights_path``.
    """
    model.save(best_weights_path)

def get_newest_subdir(par_dir):
    """Return the name of the most recently modified sub-directory of ``par_dir``.

    :param par_dir: Directory whose immediate sub-directories are inspected.
    :return: The bare name (not the full path) of the newest sub-directory.
    :raises ValueError: If ``par_dir`` contains no sub-directory.
    """
    def full_path(name):
        # os.listdir returns bare names; they must be joined with par_dir,
        # otherwise isdir/getmtime resolve them against the working directory.
        return os.path.join(par_dir, name)

    all_subdirs = [d for d in os.listdir(par_dir) if os.path.isdir(full_path(d))]
    return max(all_subdirs, key=lambda d: os.path.getmtime(full_path(d)))

def get_newest_file(directory):
    """Return the name of the regular file in ``directory`` with the latest ctime.

    :param directory: Directory whose immediate entries are inspected;
        sub-directories are ignored.
    :return: The bare name (not the full path) of the newest file.
    :raises IndexError: If ``directory`` contains no regular file.
    """
    def full_path(name):
        # os.listdir returns bare names; they must be joined with directory,
        # otherwise isfile/getctime resolve them against the working directory.
        return os.path.join(directory, name)

    files = [f for f in os.listdir(directory) if os.path.isfile(full_path(f))]
    files.sort(key=lambda f: os.path.getctime(full_path(f)), reverse=True)
    return files[0]

def write_game_data_to_file(path, data):
    """Serialize game positions to a TFRecord file, log it in the meta file
    and upload everything to Google Drive.

    :param path: Destination path of the TFRecord file.
    :param data: Iterable of ``(input, policy, winner)`` triples; ``input``
        must offer ``flatten()``, ``policy`` is written as a float list and
        ``winner`` as a single int64.

    Uses the module-level ``meta_path`` for the meta file and finishes by
    calling ``upload_to_drive()``.
    """
    def _floats(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    def _int64(number):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[number]))

    num = 0
    with tf.python_io.TFRecordWriter(path) as writer:
        # numpy arr, numpy arr, int
        for num, (inp, pol, win) in enumerate(data, start=1):
            features = {
                'input': _floats(inp.flatten()),
                'policy': _floats(pol),
                'winner': _int64(win)
            }
            record = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(record.SerializeToString())

    with open(meta_path, 'a+') as meta:
        write_to_meta(meta, path, num)
        print(f'Path: {path}\nPositions: {num}')

    print('Done Writing')
    upload_to_drive()

## Cloud File Transfer

In [None]:
!pip install -U -q PyDrive
import os
import zipfile

import collections
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Local folder path, base name of its zip archive (without ".zip") and the
# Google Drive file id the archive is uploaded to.
FolderMeta = collections.namedtuple('Folder', 'name zipname id')

def get_folders(top_level_dir: str = None) -> 'List of FolderMeta':
    """
    Creates a list of folder metadata including the self-play data, model
    weights and logs. Only folders that actually exist are returned.
    :param top_level_dir: The directory to search for data. Defaults to the
        working directory at call time.
    :return: A list of Folder metadata
    """
    if top_level_dir is None:
        # Resolved per call; a default of os.getcwd() in the signature would be
        # frozen at definition time and go stale after a later chdir.
        top_level_dir = os.getcwd()

    # (sub-directory, zip base name, Google Drive file id)
    # NOTE(review): download_from_drive fetches self-play data from a different
    # Drive id than the one listed here - confirm which one is current.
    known_folders = (
        ('self_play_data', 'self-play', '1-sAoxeqKqP56p3oBIEPEP3CvJw3EqAnn'),
        ('model_weights', 'model-weights', '1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev'),
        ('logs', 'logs', '1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC'),
    )

    folders = []
    for subdir, zipname, drive_id in known_folders:
        path = os.path.join(top_level_dir, subdir)
        if os.path.isdir(path):
            folders.append(FolderMeta(path, zipname, drive_id))

    print('Folders:', folders)
    return folders


def zipfolder(foldername: str, target_dir: str) -> None:
    """
    Zips a folder
    https://colab.research.google.com/drive/1P2AmVHPmDccstO0BiGu2uGAG0vTx444b#scrollTo=w89m9hpwvGeK&forceEdit=true&offline=true&sandboxMode=true
    :param foldername: The name of the zipped file (not including the .zip)
    :param target_dir: The name of the folder to be zipped; files are stored
        under their path relative to this folder
    """
    # The context manager guarantees the archive is finalized and closed,
    # even if adding a file fails.
    with zipfile.ZipFile(foldername + '.zip', 'w', zipfile.ZIP_DEFLATED) as zipobj:
        for base, dirs, files in os.walk(target_dir):
            for file in files:
                fn = os.path.join(base, file)
                # relpath (rather than slicing off len(target_dir) + 1
                # characters) stays correct when target_dir ends with a slash.
                zipobj.write(fn, os.path.relpath(fn, target_dir))


def unzipfolder(zipped_file: str = 'model-weights.zip', target_dir: str = './model_weights') -> None:
    """
    Unzips a zipped file into a folder
    :param zipped_file: The zipped file including .zip
    :param target_dir: The target folder where the contents of the zip file are deposited
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zipped_file, 'r') as zip_ref:
        zip_ref.extractall(target_dir)


def upload_to_drive(files: 'List of FolderMeta' = None) -> None:
    """Zips the given folders and uploads the archives to Google Drive.

    :param files: A list of folders to be uploaded to Google Drive; when
        omitted, the folders found by ``get_folders(top_level_dir='.')``
        are used.
    """
    folders = get_folders(top_level_dir='.') if files is None else files

    # Every folder is archived first; authentication and the uploads follow.
    for folder in folders:
        zipfolder(folder.zipname, folder.name)  # .zip, folder_name
        print(f'Zipping {folder}')

    drive = authenticate_drive()

    for folder in folders:
        print(f'Sent {folder}')
        upload_file(drive, folder.zipname, folder.id)


def upload_file(drive, src_file_name: str, src_file_id: str = None) -> None:
    """Uploads a specific zip archive to Google Drive.

    :param drive: The GoogleDrive where the file will be uploaded to
    :param src_file_name: Name of the zipped file to upload, without the
        ".zip" extension (it is appended here)
    :param src_file_id: The Google Drive id of the file to overwrite; when
        omitted a new Drive file is created
    """
    if src_file_id is not None:
        # Update existing file
        remote = drive.CreateFile({'id': src_file_id})
    else:
        # Create new file
        remote = drive.CreateFile()

    remote.SetContentFile(src_file_name + ".zip")
    remote.Upload()


def authenticate_drive() -> GoogleDrive:
    """Authenticates the Colab user and builds a PyDrive client.

    :return: A GoogleDrive object using the application default credentials
    """
    # Authenticate first, then hand the default credentials to PyDrive.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    return GoogleDrive(gauth)
  
def download_from_drive() -> None:
    """
    Downloads the self-play, model-weights and logs archives from Google
    Drive and unpacks them into their local folders.
    Change in the future to accept a list of folder metadata
    """
    drive = authenticate_drive()

    # (Google Drive file id, local archive name), fetched in this order.
    # NOTE(review): the self-play id here differs from the one get_folders
    # uploads to - confirm which one is current.
    downloads = (
        ('1YXmKHZg9fJdR3SdbB0EeqHL9u4ZokEfv', 'self-play.zip'),
        ('1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev', 'model-weights.zip'),
        ('1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC', 'logs.zip'),
    )
    for drive_id, archive in downloads:
        drive.CreateFile({'id': drive_id}).GetContentFile(archive)

    # (local archive name, extraction folder); runs once all downloads are in.
    extractions = (
        ('model-weights.zip', './model_weights'),
        ('self-play.zip', './self_play_data'),
        ('logs.zip', './logs'),
    )
    for archive, folder in extractions:
        unzipfolder(archive, folder)


# Self-Play

In [None]:
from tensorflow.python.keras.utils import multi_gpu_model
from collections import deque
import glob
import os
from multiprocessing import Process, Manager
from concurrent.futures import ProcessPoolExecutor
from threading import Thread
from time import time
from datetime import datetime
from collections import namedtuple
from google.colab import files


# Size of the worker process pool and number of pipe sets shared with it.
n_max_processes = 12
# The play-data buffer is flushed to a new record file every this many games.
n_game_per_file = 50
# Folder holding the self-play record files and their meta file.
self_play_dir = './self_play_data'
meta_path = './self_play_data/meta.txt'
# Upper bound on stored files, consulted by SelfPlayWorker.remove_play_data.
max_file_num = 200

# Result of one self-play game. `namedtuple` is imported in this cell, so the
# definition does not depend on an `import collections` made in another cell.
GameData = namedtuple('GameData', ['env', 'data', 'n_bad_resigns', 'total_resigns', 'resign_vals'])

class SelfPlayWorker:
    """Plays self-play games in worker processes and records them to disk.

    Relies on the module-level settings ``n_max_processes``,
    ``n_game_per_file``, ``self_play_dir`` and ``max_file_num`` and on
    ``n_search_threads``, ``DQN`` and ``self_play`` defined elsewhere in the
    notebook.
    """

    # Games played with resignation disabled (can_stop=False) to measure how
    # often a resignation would have been wrong.
    N_CALIBRATION_GAMES = 50
    # Tolerated share of bad resigns; also the quantile of the recorded
    # resign values that becomes the new resign threshold.
    BAD_RESIGN_RATE = 0.05

    def __init__(self, num_files=10):
        """
        :param num_files: Number of record files to produce; the total number
            of games is ``n_game_per_file * num_files``.
        """
        self.current_model = self.load_model()
        self.m = Manager()
        # One set of pipes per worker process, shared through a manager list.
        self.cur_pipes = self.m.list([self.current_model.get_pipes(n_search_threads) for _ in range(n_max_processes)])
        self.buffer = []
        self.num_files = num_files

    def start(self):
        """Run the calibration games, derive the resign threshold and then
        play the remaining games, flushing the buffer to disk as it fills."""
        self.buffer = []

        # Get sample for calculating resign threshold
        resign_vals = []
        resign_counts = {'bad': 0, 'total': 0}

        def record_resigns(gd):
            resign_counts['bad'] += gd.n_bad_resigns
            resign_counts['total'] += gd.total_resigns
            resign_vals.extend(gd.resign_vals)

        self._play_games(0, self.N_CALIBRATION_GAMES, on_game=record_resigns,
                         can_stop=False)

        n_bad_resigns = resign_counts['bad']
        total_resigns = resign_counts['total']

        resign_val = None
        if total_resigns != 0 and n_bad_resigns / total_resigns >= self.BAD_RESIGN_RATE:
            # too many bad resigns
            # XXX: Replace with quickselect
            resign_vals.sort()
            resign_val = resign_vals[int(self.BAD_RESIGN_RATE * len(resign_vals))]
            print('resign_val:', resign_val)
            print('resign_vals:\n', resign_vals)

        total_games = n_game_per_file * self.num_files
        remaining = max(total_games - self.N_CALIBRATION_GAMES, 0)
        self._play_games(self.N_CALIBRATION_GAMES, remaining,
                         can_stop=True, resign_val=resign_val)

        if len(self.buffer) > 0:
            self.flush_buffer()

    def _play_games(self, first_idx, n_games, on_game=None, **play_kwargs):
        """Play ``n_games`` games, keeping up to ``2 * n_max_processes`` queued.

        Exactly ``n_games`` games are submitted, so the executor has nothing
        left to finish (and throw away) when the last result is collected.

        :param first_idx: Index reported for the first game.
        :param n_games: Number of games to play.
        :param on_game: Optional callback receiving each finished GameData.
        :param play_kwargs: Extra keyword arguments forwarded to ``self_play``.
        """
        if n_games <= 0:
            return

        futures = deque()
        with ProcessPoolExecutor(max_workers=n_max_processes) as ex:
            n_submitted = min(2 * n_max_processes, n_games)
            for _ in range(n_submitted):
                futures.append(ex.submit(self_play, cur=self.cur_pipes, **play_kwargs))

            for game_idx in range(first_idx, first_idx + n_games):
                # Measures how long we waited for this result, not how long
                # the game itself took.
                start_time = time()
                gd = futures.popleft().result()
                print(f"game {game_idx:<5d} time={time() - start_time:5.1f}s "
                    f"halfmoves={len(gd.env.actions):3d} winner={gd.env.done} "
                    f"{'by resign ' if gd.env.resigned else '          '}")

                self.buffer += gd.data
                if on_game is not None:
                    on_game(gd)

                # Flush after every n_game_per_file-th game.
                if (game_idx + 1) % n_game_per_file == 0:
                    self.flush_buffer()

                if n_submitted < n_games:
                    futures.append(ex.submit(self_play, cur=self.cur_pipes, **play_kwargs))  # Keep it going
                    n_submitted += 1

    def load_model(self):
        """Create the model and load the best weights; if none exist yet the
        freshly created model is saved as the best model.

        :return: The model instance.
        """
        model = DQN()

        # Create dir
        os.makedirs(os.path.dirname(best_weights_path), exist_ok=True)
        # The trailing '*' also matches files that merely start with the best
        # weights path.
        # NOTE(review): presumably model.save may write "<path>.index"-style
        # checkpoint files instead of a file named exactly best.ckpt - confirm.
        res = glob.glob(best_weights_path + '*')
        if len(res) == 0:
            save_as_best_model(model)
        else:
            load_best_model_weight(model)
        return model

    def flush_buffer(self):
        """
        Flush the play data buffer and write the data to the appropriate location
        """
        game_id = datetime.now().strftime("%Y%m%d-%H%M%S")
        path = os.path.join(self_play_dir, game_id + '.tfrecords')
        os.makedirs(os.path.dirname(path), exist_ok=True)

        # The writer thread keeps the old list; the worker starts a fresh one.
        thread = Thread(target=write_game_data_to_file, args=(path, self.buffer))
        thread.start()
        self.buffer = []

    def remove_play_data(self):
        """Delete the oldest record files so that at most ``max_file_num`` remain.

        Record files are named by timestamp, so sorting their paths puts the
        oldest first. Only ``*.tfrecords`` files are considered, which keeps
        meta.txt from being deleted; entries for removed files stay in
        meta.txt.
        """
        records = sorted(glob.glob(os.path.join(self_play_dir, '*.tfrecords')))

        n_excess = len(records) - max_file_num
        if n_excess <= 0:
            return
        for path in records[:n_excess]:
            os.remove(path)


def self_play(cur, can_stop, resign_val=None):
    """Play one self-play game and collect its training data.

    :param cur: Shared list of pipe sets; one is borrowed for the duration of
        the game and always handed back, even if the game raises.
    :param can_stop: Forwarded to both MCTS players. When False, resign
        attempts by the eventual winner are reported as bad resigns.
    :param resign_val: Optional resign threshold forwarded to both MCTS
        players; when None their own default is used.
    :return: ``GameData(env, data, n_bad_resigns, total_resigns, resign_vals)``
    :raises ValueError: If the game ends with an outcome that is neither
        ``HexEnv.PLAYER_ONE`` nor ``HexEnv.PLAYER_TWO``.
    """
    pipes = cur.pop()
    try:
        env = HexEnv()

        mcts_kwargs = {'pipes': pipes, 'can_stop': can_stop}
        if resign_val is not None:
            mcts_kwargs['resign_val'] = resign_val
        one = MCTS(**mcts_kwargs)
        two = MCTS(**mcts_kwargs)

        # Index 0 belongs to player one, index 1 to player two.
        players = (one, two)
        resign_attempts = [0, 0]
        resign_values = ([], [])

        while env.done == 0:
            side = len(env.actions) % 2  # player one moves on even half-moves
            action, position_value, attempted_resign = players[side].pick_move(env)
            if attempted_resign:
                resign_attempts[side] += 1
                resign_values[side].append(position_value)
            env.move(action)

        if env.done == HexEnv.PLAYER_ONE:
            result = 1
        elif env.done == HexEnv.PLAYER_TWO:
            result = -1
        else:
            raise ValueError(f'Unexpected game outcome: {env.done!r}')

        one.finish_game(result)
        two.finish_game(-result)

        # Interleave both players' records back into playing order.
        data = []
        for i in range(len(one.moves)):
            data.append(one.moves[i])
            if i < len(two.moves):
                data.append(two.moves[i])

        # With resignation disabled, a resign attempt by the player who went
        # on to win was a mistake.
        resign_vals = []
        n_bad_resigns = 0
        if not can_stop:
            if resign_attempts[0] != 0 and env.done == HexEnv.PLAYER_ONE:
                n_bad_resigns = resign_attempts[0]
                resign_vals = resign_values[0]
            elif resign_attempts[1] != 0 and env.done == HexEnv.PLAYER_TWO:
                n_bad_resigns = resign_attempts[1]
                resign_vals = resign_values[1]

        return GameData(env, data, n_bad_resigns, sum(resign_attempts), resign_vals)
    finally:
        cur.append(pipes)

# Optimize

In [None]:
import os
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
from time import sleep
from random import shuffle
import numpy as np

from tensorflow.python import keras
from tensorflow.python.keras.optimizers import Adam, RMSprop
from tensorflow.python.keras.callbacks import TensorBoard

# NOTE(review): these two module-level values are not referenced in the visible
# code; Optimizer.process_dataset takes parameters of the same names.
num_workers, worker_index = 1, 0
# Cap on the number of games used for training; Optimizer.__init__ uses it to
# limit how many of the newest record files are read.
max_num_games = 500000
# NOTE(review): not referenced in the visible code - confirm whether still needed.
iteration = 0

# def filename_to_datetime(filename):
#     return datetime.strptime(filename, "%Y%m%d-%H%M%S.tfrecords")

class Optimizer:
    """Trains the network on the most recent self-play records.

    Relies on ``build_model``, ``refresh_meta``, ``self_play_dir``,
    ``n_game_per_file``, ``max_num_games`` and ``best_weights_path`` defined
    elsewhere in the notebook.
    """

    # Mini-batch size used for the dataset, the step count and TensorBoard.
    BATCH_SIZE = 64

    def __init__(self, initial_epoch=5, num_epochs=2):
        """
        :param initial_epoch: Epoch number training resumes from.
        :param num_epochs: Number of additional epochs to train.
        """
        self.initial_epoch = initial_epoch
        self.model = build_model((11, 11, 3), 11 * 11 + 1)

        opt = tf.train.AdamOptimizer(learning_rate=0.001)
        self.model.compile(optimizer=opt, loss=['categorical_crossentropy', 'mean_squared_error'])

        # Sort files by newest to oldest. Only record files are considered, so
        # meta.txt (present or not) and stray files cannot break the
        # timestamp parsing.
        files = [name for name in os.listdir(self_play_dir) if name.endswith('.tfrecords')]
        files = sorted(files, key=lambda x: datetime.strptime(x, "%Y%m%d-%H%M%S.tfrecords"), reverse=True)
        files = [os.path.join(self_play_dir, i) for i in files]
        print(files)

        # Grab enough of the most recent files
        if len(files) * n_game_per_file > max_num_games:
            length = int(np.ceil(max_num_games / n_game_per_file))
            files = files[:length]

        self.num_positions = self.calc_train_length(self_play_dir, files)
        # Must be set before process_dataset, which reads it.
        self.num_epochs = num_epochs

        self.dataset = tf.data.TFRecordDataset(files)
        self.dataset = self.process_dataset(self.dataset)

    def calc_train_length(self, direc, files):
        """Sum the number of positions stored in ``files``.

        :param direc: Directory whose meta file is refreshed and consulted.
        :param files: Record file paths; each must be a key of the meta table.
        :return: Total number of positions.
        """
        data_count = refresh_meta(direc, complete_refresh=False)

        print(data_count)
        # Counts may be digit strings (read from meta) or ints (just counted).
        return sum(int(data_count[i]) for i in files)

    def process_dataset(self, dataset, worker_index=0, num_workers=1):
        """Parse, repeat, shuffle and batch the raw record dataset.

        :param dataset: Dataset of serialized examples.
        :param worker_index: Unused (kept for a disabled sharding step).
        :param num_workers: Unused (kept for a disabled sharding step).
        :return: Dataset of ``(input, {'pol_out': ..., 'val_out': ...})`` batches.
        """
        def _parse(ex):
            keys_to_features = {
                'input': tf.FixedLenFeature((363), tf.float32),
                'policy': tf.FixedLenFeature((122), tf.float32),
                'winner': tf.FixedLenFeature((1), tf.int64, default_value=0)
            }
            parsed = tf.parse_single_example(ex, keys_to_features)

            inp = tf.cast(parsed['input'], tf.float32)
            inp = tf.reshape(inp, (11, 11, 3))
            pol = tf.cast(parsed['policy'], tf.float32)
            win = tf.cast(parsed['winner'], tf.float32)
            return inp, {'pol_out': pol, 'val_out': win}

        dataset = dataset.map(_parse, num_parallel_calls=1)
        dataset = dataset.repeat(self.num_epochs)
        dataset = dataset.shuffle(buffer_size=10000)
        dataset = dataset.batch(self.BATCH_SIZE)
        return dataset

    def start(self):
        """
        Load the most recent model weights, train for ``num_epochs`` epochs
        and save the result as the next generation model.
        """
        self.load_model()
        self.training()
        self.save_current_model()

    def training(self):
        """Fit the model on the prepared dataset, logging to ``./logs``."""
        steps = int(np.floor(self.num_positions / self.BATCH_SIZE))
        cb = TensorBoard(log_dir="./logs", batch_size=self.BATCH_SIZE, histogram_freq=0)

        self.model.fit(self.dataset.make_one_shot_iterator(), epochs=self.initial_epoch + self.num_epochs, steps_per_epoch=steps, callbacks=[cb], initial_epoch=self.initial_epoch)

    def save_current_model(self):
        """
        Saves the current model as the next generation model to the appropriate directory
        """
        model_id = datetime.now().strftime("%Y%m%d-%H%M%S") + '.ckpt'

        weights_dir = './model_weights/all_models'
        os.makedirs(weights_dir, exist_ok=True)

        weight_path = os.path.join(weights_dir, model_id)  # ./model_weights/all_models/<timestamp>.ckpt
        self.model.save_weights(weight_path)

    def load_model(self):
        """Load the newest weights from ``./model_weights/all_models/``; fall
        back to the best model's weights when none have been saved yet."""
        weights_dir = './model_weights/all_models/'
        os.makedirs(weights_dir, exist_ok=True)

        # Get most recent competitor. Filtering on '.ckpt.index' matches the
        # format parsed below and the suffix stripped when loading; filtering
        # on '.ckpt' would never match those names, so saved models were
        # ignored.
        files = os.listdir(weights_dir)
        files = [i for i in files if i.endswith('.ckpt.index')]
        files = sorted(files, key=lambda x: datetime.strptime(x, "%Y%m%d-%H%M%S.ckpt.index"), reverse=True)
        files = [os.path.join(weights_dir, i) for i in files]

        if len(files) == 0:
            self.model.load_weights(best_weights_path)
        else:
            # files is of form *.ckpt.index; drop '.index' to get the prefix
            self.model.load_weights(files[0][:-len('.index')])




# Evaluate

In [None]:
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Manager
from datetime import datetime

class Evaluater:
    """Pits the newest trained model against the current best model.

    Relies on ``DQN``, ``play_game``, ``save_as_best_model``,
    ``best_weights_path``, ``n_search_threads`` and ``n_max_processes``
    defined elsewhere in the notebook.
    """

    # Number of evaluation games submitted.
    N_GAMES = 400
    # Share of games the challenger has to win to replace the best model.
    WIN_THRESHOLD = 0.55

    def __init__(self):
        self.m = Manager()
        self.current_model = self.load_best_model()
        self.cur_pipes = self.m.list([
            self.current_model.get_pipes(n_search_threads)
            for _ in range(n_max_processes)
        ])

        self.challenger_model, self.challenger_name = self.load_challenger()
        self.challenger_pipes = self.m.list([
            self.challenger_model.get_pipes(n_search_threads)
            for _ in range(n_max_processes)
        ])

    def start(self):
        """Evaluate the challenger and promote it if it proves better."""
        challenger_is_better = self.evaluate_model()
        if challenger_is_better:
            # replace_best also stores the challenger as the best model.
            self.replace_best()

    def evaluate_model(self):
        """Play the evaluation games, alternating who moves first.

        Stops as soon as the outcome is decided either way.

        :return: True if the challenger reached the win threshold.
        """
        n_games = self.N_GAMES
        futures = []
        with ProcessPoolExecutor(max_workers=n_max_processes) as ex:
            for game_idx in range(n_games):
                fut = ex.submit(
                    play_game,
                    best_pipes=self.cur_pipes,
                    challenger_pipes=self.challenger_pipes,
                    current_white=(game_idx % 2 == 0))
                futures.append(fut)

            results = []
            for fut in as_completed(futures):
                # winner = if challenger win -> 1, lose -> 0
                winner, env, current_white = fut.result()
                results.append(winner)
                win_rate = sum(results) / len(results)
                game_idx = len(results)

                print(
                    f"game {game_idx:5}: winner={winner:.1f} as {'black' if current_white else 'white'} "
                    f"{'by resign ' if env.resigned else '          '}"
                    f"win_rate={win_rate*100:5.1f}% "
                    f"{env.action_notation}")

                if len(results) - sum(results) >= n_games * (1 - self.WIN_THRESHOLD):
                    print(f"lose count reach {results.count(0)} so give up challenge")
                    self._cancel_pending(futures)
                    return False

                if sum(results) >= n_games * self.WIN_THRESHOLD:
                    print(f"win count reach {results.count(1)} so change best model")
                    self._cancel_pending(futures)
                    return True

        win_rate = sum(results) / len(results)
        print(f"winning rate {win_rate*100:.1f}%")
        return win_rate >= self.WIN_THRESHOLD

    @staticmethod
    def _cancel_pending(futures):
        """Cancel games that have not started yet.

        Leaving the executor's ``with`` block waits for every future that is
        not cancelled, so without this an early decision would still play
        out all remaining games.
        """
        for fut in futures:
            fut.cancel()

    def replace_best(self):
        """Store the challenger among the champions and as the best model."""
        # add to champions
        champion_dir = './model_weights/champions/'
        os.makedirs(champion_dir, exist_ok=True)
        print('Replacing best')
        print('Challenger name', self.challenger_name)
        new_champion_path = os.path.join(champion_dir, self.challenger_name)
        self.challenger_model.save(new_champion_path)
        save_as_best_model(self.challenger_model)

    def load_best_model(self):
        """Create a model and load the best weights into it.

        :return: The loaded model.
        """
        print('Loading Best Model:', best_weights_path)
        model = DQN()
        model.load(best_weights_path)
        return model

    def load_challenger(self):
        """Load the most recently saved model from ``./model_weights/all_models/``.

        :return: ``(model, name)`` where ``name`` is the checkpoint file name
            without the ``.index`` suffix.
        :raises FileNotFoundError: If no checkpoint has been saved yet.
        """
        weights_dir = './model_weights/all_models/'
        os.makedirs(weights_dir, exist_ok=True)

        # Get most recent competitor
        files = os.listdir(weights_dir)
        files = [i for i in files if i.endswith('.ckpt.index')]
        files = sorted(files, key=lambda x: datetime.strptime(x, "%Y%m%d-%H%M%S.ckpt.index"), reverse=True)
        print(files)
        if not files:
            raise FileNotFoundError(f'No challenger checkpoint found in {weights_dir}')

        # Drop '.index' to get the checkpoint prefix.
        filename = files[0][:-len('.index')]
        challenger_path = os.path.join(weights_dir, filename)

        print('Loading Competitor Model:', challenger_path)
        print('Competitor Name:', filename)
        model = DQN()
        model.load(challenger_path)
        return model, filename


def play_game(best_pipes, challenger_pipes, current_white: bool):
    """Play one evaluation game between the best model and the challenger.

    :param best_pipes: Shared list of pipe sets for the current best model.
    :param challenger_pipes: Shared list of pipe sets for the challenger.
    :param current_white: When True the current best model moves first,
        otherwise the challenger does.
    :return: ``(result, env, current_white)`` where ``result`` is 1 if the
        challenger won and 0 if it lost.
    :raises ValueError: If the game ends with an outcome that is neither
        ``HexEnv.PLAYER_ONE`` nor ``HexEnv.PLAYER_TWO``.

    The borrowed pipe sets are always handed back, even if the game raises.
    """
    cur_pipes = best_pipes.pop()
    chlng_pipes = None
    try:
        chlng_pipes = challenger_pipes.pop()
        env = HexEnv()

        current_best = MCTS(pipes=cur_pipes)
        challenger = MCTS(pipes=chlng_pipes)

        if current_white:
            one, two = current_best, challenger
        else:
            one, two = challenger, current_best

        while env.done == 0:
            if len(env.actions) % 2 == 0:
                action, _, _ = one.pick_move(env)
            else:
                action, _, _ = two.pick_move(env)
            env.move(action)

        if env.done not in (HexEnv.PLAYER_ONE, HexEnv.PLAYER_TWO):
            raise ValueError(f'Unexpected game outcome: {env.done!r}')

        # The caller counts results as challenger wins, so score the game from
        # the challenger's side rather than from player one's.
        # NOTE(review): assumes PLAYER_ONE denotes the side that moved first,
        # as the turn logic above does - confirm against HexEnv's swap move.
        challenger_side = HexEnv.PLAYER_TWO if current_white else HexEnv.PLAYER_ONE
        result = 1 if env.done == challenger_side else 0

        return result, env, current_white
    finally:
        best_pipes.append(cur_pipes)
        if chlng_pipes is not None:
            challenger_pipes.append(chlng_pipes)

# Run

In [None]:
!ls

sample_data


In [None]:
download_from_drive()
!ls self_play_data/

ApiRequestError: ignored

In [None]:
!echo "Logs:"
!ls logs

In [None]:
!ls model_weights/all_models/

In [None]:
# upload_to_drive()

## Self-Play

In [None]:
s = SelfPlayWorker(num_files=4)

ModA_0: (None, 11, 11, 256)
ReductA: (None, 11, 11, 1024)
ModB_0: (None, 11, 11, 1024)
ModB_1: (None, 11, 11, 1024)
ReductB: (None, 11, 11, 1920)
ModC_0: (None, 11, 11, 1920)
val: (None, 1)
pol: (None, 122)


In [None]:
s.start() # Creates 10 records need. 25

game 0     time=1322.2s halfmoves= 85 winner=1           
game 1     time= 62.8s halfmoves= 97 winner=1           
game 2     time=  0.0s halfmoves= 83 winner=1           
game 3     time=  0.0s halfmoves= 93 winner=1           
game 4     time=  0.0s halfmoves=105 winner=1           
game 5     time=  0.0s halfmoves= 59 winner=1           
game 6     time=  0.0s halfmoves= 93 winner=1           
game 7     time=  0.0s halfmoves= 99 winner=1           
game 8     time=  0.0s halfmoves= 89 winner=1           
game 9     time=  0.0s halfmoves= 81 winner=1           
game 10    time=  0.0s halfmoves= 69 winner=1           
game 11    time=  0.0s halfmoves= 86 winner=2           
game 12    time=  0.0s halfmoves= 68 winner=2           
game 13    time=320.0s halfmoves= 96 winner=2           
game 14    time=  0.0s halfmoves= 80 winner=2           
game 15    time= 55.3s halfmoves=107 winner=1           
game 16    time=  0.0s halfmoves= 93 winner=1           
game 17    time= 64.7s halfmov

In [None]:
!ls
upload_to_drive()

datalab  logs.zip	model-weights.zip  self-play.zip
logs	 model_weights	self_play_data
Folders: [Folder(name='./self_play_data', zipname='self-play', id='1YXmKHZg9fJdR3SdbB0EeqHL9u4ZokEfv'), Folder(name='./model_weights', zipname='model-weights', id='1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev'), Folder(name='./logs', zipname='logs', id='1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC')]
Zipping Folder(name='./self_play_data', zipname='self-play', id='1YXmKHZg9fJdR3SdbB0EeqHL9u4ZokEfv')
Zipping Folder(name='./model_weights', zipname='model-weights', id='1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev')
Zipping Folder(name='./logs', zipname='logs', id='1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC')
Sent Folder(name='./self_play_data', zipname='self-play', id='1YXmKHZg9fJdR3SdbB0EeqHL9u4ZokEfv')
Sent Folder(name='./model_weights', zipname='model-weights', id='1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev')
Sent Folder(name='./logs', zipname='logs', id='1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC')


In [None]:
!rm self-play.zip

In [None]:
!ls self_play_data/
!cat self_play_data/meta.txt

20180805-234728.tfrecords  20180807-172751.tfrecords  20180809-174810.tfrecords
20180806-004243.tfrecords  20180807-175808.tfrecords  20180809-181803.tfrecords
20180806-011231.tfrecords  20180807-182631.tfrecords  20180809-184812.tfrecords
20180806-014138.tfrecords  20180807-190304.tfrecords  20180809-191758.tfrecords
20180806-034722.tfrecords  20180807-193105.tfrecords  20180809-194657.tfrecords
20180806-041625.tfrecords  20180807-195949.tfrecords  20180809-201731.tfrecords
20180806-160823.tfrecords  20180807-202909.tfrecords  20180809-204602.tfrecords
20180806-164126.tfrecords  20180807-205913.tfrecords  20180809-211620.tfrecords
20180806-173039.tfrecords  20180808-175859.tfrecords  20180809-214324.tfrecords
20180806-180038.tfrecords  20180808-182857.tfrecords  20180809-215146.tfrecords
20180806-183029.tfrecords  20180808-185850.tfrecords  meta.txt
20180807-165658.tfrecords  20180809-171648.tfrecords
./self_play_data/20180805-234728.tfrecords 4743
./self_play_data/20180806

## Optimize

In [None]:
!ls logs/

In [None]:
if not os.path.isfile('./logs/epoch.txt'):
    with open('./logs/epoch.txt', 'w+') as f:
        f.write(str(8))
        
!ls logs/
!cat logs/epoch.txt

In [None]:
with open('./logs/epoch.txt', 'r') as f:
    epoch = int(f.read())
print(epoch)

In [None]:
o = Optimizer(initial_epoch=epoch, num_epochs=1) # Starts at 9 and ends at 9
print(o.initial_epoch)
print(o.num_epochs)

In [None]:
o.start()
with open('./logs/epoch.txt', 'w+') as f:
    f.write(str(epoch + 1))

In [None]:
!ls model_weights/all_models/
!ls logs/

In [None]:
upload_to_drive()

## Evaluate

In [None]:
!ls model_weights/all_models/

20180810-180218.ckpt.data-00000-of-00001
20180810-180218.ckpt.index
20180811-015139.ckpt.data-00000-of-00001
20180811-015139.ckpt.index
20180817-163509.ckpt.data-00000-of-00001
20180817-163509.ckpt.index
20180821-162154.ckpt.data-00000-of-00001
20180821-162154.ckpt.index
20180823-164234.ckpt.data-00000-of-00001
20180823-164234.ckpt.index
20180825-223203.ckpt.data-00000-of-00001
20180825-223203.ckpt.index
20180830-014116.ckpt.data-00000-of-00001
20180830-014116.ckpt.index
20180902-155106.ckpt.data-00000-of-00001
20180902-155106.ckpt.index
20180906-020604.ckpt.data-00000-of-00001
20180906-020604.ckpt.index
20180909-001728.ckpt.data-00000-of-00001
20180909-001728.ckpt.index
20180914-033459.ckpt.data-00000-of-00001
20180914-033459.ckpt.index
20180916-163547.ckpt.data-00000-of-00001
20180916-163547.ckpt.index
20180919-033518.ckpt.data-00000-of-00001
20180919-033518.ckpt.index
20180922-173042.ckpt.data-00000-of-00001
20180922-173042.ckpt.index
20180924-184258.ckpt.data-00000-of-00001
2018092

In [None]:
e = Evaluater()

Loading Best Model: ./model_weights/best.ckpt
ModA_0: (None, 11, 11, 256)
ReductA: (None, 11, 11, 1024)
ModB_0: (None, 11, 11, 1024)
ModB_1: (None, 11, 11, 1024)
ReductB: (None, 11, 11, 1920)
ModC_0: (None, 11, 11, 1920)
val: (None, 1)
pol: (None, 122)
['20181011-020732.ckpt.index', '20181008-190600.ckpt.index', '20181008-010135.ckpt.index', '20181006-035243.ckpt.index', '20181002-223906.ckpt.index', '20180930-173411.ckpt.index', '20180927-224436.ckpt.index', '20180924-184258.ckpt.index', '20180922-173042.ckpt.index', '20180919-033518.ckpt.index', '20180916-163547.ckpt.index', '20180914-033459.ckpt.index', '20180909-001728.ckpt.index', '20180906-020604.ckpt.index', '20180902-155106.ckpt.index', '20180830-014116.ckpt.index', '20180825-223203.ckpt.index', '20180823-164234.ckpt.index', '20180821-162154.ckpt.index', '20180817-163509.ckpt.index', '20180811-015139.ckpt.index', '20180810-180218.ckpt.index']
Loading Competitor Model: ./model_weights/all_models/20181011-020732.ckpt
Competitor N

In [None]:
# Run the evaluation games. Each output line reports the game number, the
# winner value and colour, whether the game ended by resignation, the
# win_rate, and the sequence of moves played.
e.start()

game     1: winner=1.0 as black by resign win_rate=100.0% K2 G6 E4 E7 J6 B4 H3 C2 G9 F10 E1 A3 I4 C11 J9 K11 J8 E2 C1 I8 D7 C10 C3 B7 C6 J10 E9 C7 G5 H7 D4 G7 F6 C4 H10 J11 A5 H2 H8 K8 A11
game     2: winner=1.0 as white           win_rate=100.0% K2 G8 E5 D7 H5 A2 I5 B1 G3 F10 F6 H10 C1 A10 C2 E1 C3 K8 B8 B4 E6 K6 F11 B2 H4 A6 A8 A5 C4 A3 F3 A9 K3 D3 D6 C9 D1 A1 J4 B7 I4 D9 B3 G10 G4 K11 B6 I6 J2 K9 F9 J9 G2 H2 H7 B10 F1 D4 A7 I11 F4 C10 E4 J3 I10 J10 C6
game     3: winner=1.0 as black           win_rate=100.0% K1 G7 E3 E7 J6 B4 H1 C4 H2 I2 H6 A6 I5 A7 C7 E6 K7 G2 A2 A4 A3 B5 B9 C1 K6 G1 C5 K4 G3 I1 F6 D9 I7 D10 C6 E11 I8 H11 C9 H10 F9 G5 B10 F8 E10 G4 B8 A1 G6 F11 J2 K5 J9 I4 C2 H8 I3 K8 E5 H9 H7 E1 F3 D8 K10 G10 I11 G11 F4 K2 C10 F2 J3 H5 J7 C3 D3 B7 K11 F10 J1 B2 K9 B11 G8 J11 A11 G9 C8 E4 B1 E9 D11 D7 C11
game     4: winner=1.0 as black           win_rate=100.0% K1 G5 E5 E7 J6 B4 H3 C1 G9 E3 E2 D2 G4 D8 A3 G10 J8 H1 F6 H10 I2 H5 I3 F9 B11 A6 B1 E11 D10 C11 K6 A10 G7 K9 A5 D9 J7 C5 

In [None]:
# Left disabled on purpose: uncomment to call download_from_drive() first.
# NOTE(review): the helper is defined elsewhere -- confirm what it restores
# before enabling it.
# download_from_drive()
# Replace the best model; the output below prints the challenger's name.
e.replace_best()

Replacing best
Challenger name 20181011-020732.ckpt


In [None]:
# Zip and send the folders listed in the output below
# (./self_play_data, ./model_weights, ./logs).
# NOTE(review): destination is presumably Google Drive, going by the helper's
# name and the folder ids -- confirm against upload_to_drive's definition.
upload_to_drive()

Folders: [Folder(name='./self_play_data', zipname='self-play', id='1YXmKHZg9fJdR3SdbB0EeqHL9u4ZokEfv'), Folder(name='./model_weights', zipname='model-weights', id='1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev'), Folder(name='./logs', zipname='logs', id='1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC')]
Zipping Folder(name='./self_play_data', zipname='self-play', id='1YXmKHZg9fJdR3SdbB0EeqHL9u4ZokEfv')
Zipping Folder(name='./model_weights', zipname='model-weights', id='1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev')
Zipping Folder(name='./logs', zipname='logs', id='1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC')
Sent Folder(name='./self_play_data', zipname='self-play', id='1YXmKHZg9fJdR3SdbB0EeqHL9u4ZokEfv')
Sent Folder(name='./model_weights', zipname='model-weights', id='1y4HHoswgCRppRK47mXuyeJJRis_OP8Ev')
Sent Folder(name='./logs', zipname='logs', id='1tlEcm9TOLK0rN5YVpqHDh9hOVVlWLZFC')


In [None]:
# Show the top-level contents of model_weights/.
!ls model_weights/

all_models		       best.ckpt.index	checkpoint
best.ckpt.data-00000-of-00001  champions


In [None]:
# List the best.ckpt files, then the contents of the champions directory.
!ls model_weights/best*
!ls model_weights/champions/

model_weights/best.ckpt.data-00000-of-00001  model_weights/best.ckpt.index
20180810-180218.ckpt.data-00000-of-00001
20180810-180218.ckpt.index
20180811-015139.ckpt.data-00000-of-00001
20180811-015139.ckpt.index
20180830-014116.ckpt.data-00000-of-00001
20180830-014116.ckpt.index
20180902-155106.ckpt.data-00000-of-00001
20180902-155106.ckpt.index
20180906-020604.ckpt.data-00000-of-00001
20180906-020604.ckpt.index
20180924-184258.ckpt.data-00000-of-00001
20180924-184258.ckpt.index
20181011-020732.ckpt.data-00000-of-00001
20181011-020732.ckpt.index
checkpoint


In [None]:
# Byte-compare the 20181011-020732 checkpoint data file in all_models with
# the best.ckpt data file.
# NOTE(review): the output below reports that the two files differ even
# though replace_best() ran above -- confirm whether that is expected
# (e.g. the checkpoint is re-saved rather than copied byte-for-byte).
!cmp model_weights/all_models/20181011-020732.ckpt.data-00000-of-00001 model_weights/best.ckpt.data-00000-of-00001

model_weights/all_models/20181011-020732.ckpt.data-00000-of-00001 model_weights/best.ckpt.data-00000-of-00001 differ: byte 66442, line 1
