In [5]:
import os
import time
import math
import random
import pickle
import numpy as np
import tensorflow as tf
import multiprocess as mp
from libs.TicTacToe import TicTacToe
from libs.model import create_az_model
from libs.MCTS import MCTS
from libs.Trainer import Trainer

In [6]:
# Number of CPU cores reported for this machine; used later to size the worker pools
# and to decide how many self-play jobs are launched per batch.
n_cores = mp.cpu_count()
print(f"[Number of cores]: {n_cores}")

[Number of cores]: 16


## Tic-Tac-Toe engine

In [7]:
# Flat 9-cell board. The mask displayed below is 1 exactly where this array is 0.
# NOTE(review): 1 / -1 presumably mark the two players' pieces — confirm in libs.TicTacToe.
state = np.array([0., 0., -1., -1., 1., -1., 1., 1., 0.])
TicTacToe().get_available_actions(state)

array([1., 1., 0., 0., 0., 0., 0., 0., 1.])

In [8]:
# Play one game by sampling uniformly among the boards returned by next_moves(),
# printing the 3x3 board after every move and the winner at the end.
ttt = TicTacToe()

while ttt.status == "Ongoing":
    valid_moves = ttt.next_moves()

    # Guard BEFORE sampling: random.choice raises IndexError on an empty
    # sequence, so a check placed after the move could never be reached.
    if len(valid_moves) == 0:
        break

    new_board = random.choice(valid_moves)
    ttt.play(new_board)
    print(np.reshape(ttt.board, (3, 3)), "\n")

print("[Winner]:", ttt.winner)

[[0. 0. 0.]
 [1. 0. 0.]
 [0. 0. 0.]] 

[[ 0.  0.  0.]
 [ 1.  0. -1.]
 [ 0.  0.  0.]] 

[[ 0.  0.  0.]
 [ 1.  0. -1.]
 [ 0.  0.  1.]] 

[[ 0.  0. -1.]
 [ 1.  0. -1.]
 [ 0.  0.  1.]] 

[[ 0.  1. -1.]
 [ 1.  0. -1.]
 [ 0.  0.  1.]] 

[[ 0.  1. -1.]
 [ 1.  0. -1.]
 [-1.  0.  1.]] 

[[ 0.  1. -1.]
 [ 1.  1. -1.]
 [-1.  0.  1.]] 

[[-1.  1. -1.]
 [ 1.  1. -1.]
 [-1.  0.  1.]] 

[[-1.  1. -1.]
 [ 1.  1. -1.]
 [-1.  1.  1.]] 

[Winner]: 1


## Create policy and value model

In [9]:
# Instantiate the network through the project's create_az_model factory.
model = create_az_model()

In [10]:
# Print the layer-by-layer description of the model (names, output shapes, parameter counts).
model.summary()

Model: "tictactoe_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 9)]          0           []                               
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 9)           0           ['input_1[0][0]']                
 da)                                                                                              
                                                                                                  
 tf.cast (TFOpLambda)           (None, 9)            0           ['tf.__operators__.add[0][0]']   
                                                                                                  
 tf.one_hot (TFOpLambda)        (None, 9, 3)         0           ['tf.cast[0][0]']  

In [11]:
# Draw the model graph with tensor shapes.
# Requires pydot and graphviz; without them Keras only prints an installation hint.
tf.keras.utils.plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


# Monte Carlo Tree Search

In [12]:
# Search object bound to a TicTacToe instance and configured with n_simulations=100.
mcts = MCTS(game=TicTacToe(), n_simulations=100)

In [13]:
# Same mid-game position that was used in the engine section (three empty cells: 0, 1 and 8).
state = np.array([0., 0., -1., -1., 1., -1., 1., 1., 0.])

In [14]:
# Run the search from `state` for player 1 using the model created above; the result is kept as `root`.
root = mcts.run(model=model, state=state, player=1)



In [15]:
# Show the root node's repr: state, player, value, visit count and the prior of each child.
root

State:
[ 0.  0. -1. -1.  1. -1.  1.  1.  0.]
Player:1
Value:0.96
Leaf:False
Visits:100
Children:
 -0: prior=0.3333333333333333
 -1: prior=0.3333333333333333
 -8: prior=0.3333333333333333

# Generating the first dataset

In [16]:
# Speed test on 100 games, single process, with model=None passed to the Trainer.
# The elapsed wall-clock time is what motivates the parallel generation further down.
start_time = time.time()
trainer = Trainer(game = TicTacToe, mcts = MCTS, model=None)
dataset = trainer.create_dataset(number_of_games=100, temperature=1)
print("Running time: %s seconds" % (time.time() - start_time))

Running time: 23.375245571136475 seconds


In [17]:
def parallel_fn(n):
    """Worker job: build one batch of games with a Trainer that has model=None.

    Args:
        n: Job index supplied by ``pool.map``; it is not used in the body.

    Returns:
        Whatever ``Trainer.create_dataset`` returns for 250 games at temperature 1.
    """
    # Imports are done inside the function so each worker process resolves them itself.
    # NOTE(review): presumably required because the pool uses the "spawn" start method — confirm.
    from libs.Trainer import Trainer
    from libs.TicTacToe import TicTacToe
    from libs.MCTS import MCTS

    N_GAMES_PER_JOB = 250
    return Trainer(game=TicTacToe, mcts=MCTS, model=None).create_dataset(
        number_of_games=N_GAMES_PER_JOB,
        temperature=1,
    )

In [18]:
# Generate the first dataset in parallel: one parallel_fn job per core.
start_time = time.time()
ctx = mp.get_context("spawn")
# The context manager terminates the worker processes on exit. map() blocks until
# every job has returned, so all results are collected before the pool is torn down.
with ctx.Pool(n_cores) as pool:
    dataset = pool.map(parallel_fn, range(n_cores))
print("Running time: %s seconds" % (time.time() - start_time))

Running time: 120.05786895751953 seconds


In [20]:
# Must match the value hard-coded inside parallel_fn: it is only used here to
# build a file name that encodes the total number of games.
N_GAMES_PER_JOB = 250
filename = f"{N_GAMES_PER_JOB * n_cores}_tictactoe_temperature_1.pickle"

# pool.map returned one batch per job; concatenate them in job order.
flatten_dataset = []
for batch in dataset:
    flatten_dataset.extend(batch)

with open(filename, "wb") as fp:
    pickle.dump(flatten_dataset, fp, protocol=pickle.HIGHEST_PROTOCOL)

# 1st training

In [21]:
# Trainer wrapping a newly created model; it is fitted on the pickled dataset below.
trainer = Trainer(game = TicTacToe, mcts = MCTS, model=create_az_model())

In [22]:
# Reload the flattened dataset from `filename` (set in the cell that wrote the pickle).
# pickle.load executes arbitrary code from the file: only use it on files this notebook produced.
with open(filename, 'rb') as handle:
    dataset = pickle.load(handle)

In [23]:
# Fit the model on the first dataset.
# NOTE(review): the recorded log stops at epoch 213 of 1000 — presumably early stopping
# inside Trainer.train or a manual interrupt; confirm.
fit_history = trainer.train(dataset, n_epochs=1000, batch_size=30, learning_rate=1e-2)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000


Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000


Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000


Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/1000
Epoch 153/1000
Epoch 154/1000


Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000
Epoch 162/1000
Epoch 163/1000
Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000
Epoch 178/1000
Epoch 179/1000
Epoch 180/1000
Epoch 181/1000
Epoch 182/1000
Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000


Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000


In [24]:
# Sanity check on the mid-game position used earlier. The model returns a pair:
# a (1, 9) array over the board cells and a (1, 1) scalar array.
trainer.model.predict(np.array([[0., 0., -1., -1., 1., -1., 1., 1., 0.]]))



(array([[7.3029539e-03, 9.9688314e-02, 4.8711382e-09, 3.8707190e-06,
         1.1328524e-06, 6.5809280e-07, 7.0552865e-08, 1.1762501e-06,
         8.9300179e-01]], dtype=float32),
 array([[0.99997973]], dtype=float32))

In [25]:
# Second sanity check on a different position; the scalar output is negative for this one.
trainer.model.predict(np.array([[0., 0., 0., 0., -1., 1., -1., 0., 0.]]))



(array([[7.0015445e-02, 8.6133733e-02, 5.7866484e-01, 9.6366681e-02,
         1.1903725e-04, 3.7989702e-07, 3.1434181e-06, 7.8452431e-02,
         9.0244174e-02]], dtype=float32),
 array([[-0.910473]], dtype=float32))

In [26]:
# Persist the model after the first training; this path is the starting point of the self-play phase.
trainer.save_model('saved_model/alpha_zero_model_first_training_validation')



INFO:tensorflow:Assets written to: saved_model/alpha_zero_model_first_training_validation\assets


INFO:tensorflow:Assets written to: saved_model/alpha_zero_model_first_training_validation\assets


# Generating the self-play dataset (2nd)

In [27]:
# Reload the model saved after the first training, so this section does not depend on `trainer` still being in memory.
loaded_model = tf.keras.models.load_model("saved_model/alpha_zero_model_first_training_validation/")

In [28]:
# speed test on 10 games
start_time = time.time()
trainer = Trainer(game = TicTacToe, mcts = MCTS, model=loaded_model)
dataset = trainer.create_dataset(number_of_games=10, temperature=1)
print("Running time: %s seconds" % (time.time() - start_time))





















KeyboardInterrupt: 

In [None]:
# 1175 s -> ~30 min for 100 games
# 25 * 0.5 h ~ 12 h

In [29]:
# Self-play schedule: number of generate-then-train rounds, and games per worker job.
# Note: parallel_fn below hard-codes its own local N_GAMES_PER_JOB = 25 and does not
# read this global, so the two values must be kept in sync by hand.
N_ROUNDS = 20
N_GAMES_PER_JOB = 25

In [30]:
def parallel_fn(job_n, n):
    """Worker job for self-play: load the current model from disk and generate games.

    This definition replaces the earlier one-argument ``parallel_fn`` in the kernel.

    Args:
        job_n: Index of the job within the round; not used in the body.
        n: 1-based round number. The temperature passed to ``create_dataset``
            is ``1.0 / n``, so it decreases as the rounds advance.

    Returns:
        Whatever ``Trainer.create_dataset`` returns for 25 games.
    """
    # Imports are done inside the function so each worker process resolves them itself.
    from libs.Trainer import Trainer
    from libs.TicTacToe import TicTacToe
    from libs.MCTS import MCTS
    import tensorflow as tf

    N_GAMES_PER_JOB = 25

    # Every worker reads the model that the training loop saved most recently.
    worker = Trainer(
        game=TicTacToe,
        mcts=MCTS,
        model=tf.keras.models.load_model("saved_model/tmp_model"),
    )
    return worker.create_dataset(
        number_of_games=N_GAMES_PER_JOB,
        temperature=1.0 / n,
    )

In [31]:
# Resume support, part 1: reuse previously stored self-play samples when the file exists.
filename_data = "4000_tictactoe_temperature_1_2nd_round.pickle"
if not os.path.isfile(filename_data):
    self_play_dataset = []
    print("Starting from empty self-play data")
else:
    with open(filename_data, 'rb') as handle:
        self_play_dataset = pickle.load(handle)
    print(f"Reusing {filename_data}")

# Resume support, part 2: prefer the in-progress model directory, otherwise fall back
# to the model saved after the first training.
model_tmp_filename = "saved_model/tmp_model"
first_training_path = "saved_model/alpha_zero_model_first_training_validation/"
start_path = model_tmp_filename if os.path.isdir(model_tmp_filename) else first_training_path
model = tf.keras.models.load_model(start_path)
print(f"Starting with {start_path}")

# Make sure the directory the workers load from exists before the first round starts.
model.save(model_tmp_filename)

Starting from empty self-play data
Starting with saved_model/alpha_zero_model_first_training_validation/




INFO:tensorflow:Assets written to: saved_model/tmp_model\assets


INFO:tensorflow:Assets written to: saved_model/tmp_model\assets


In [None]:

# Self-play / training loop. Each round:
#   1. every core generates a batch of games with the model currently on disk,
#   2. the new games are appended to the cumulative self-play dataset,
#   3. the model is trained on the whole dataset and saved for the next round,
#   4. the cumulative dataset is checkpointed to disk.
# The start method does not change between rounds, so the context is created once.
ctx = mp.get_context("spawn")

for n in range(N_ROUNDS):

    start_time = time.time()

    # One (job index, 1-based round number) pair per core.
    args = list(zip(range(n_cores), [n + 1] * n_cores))

    # A fresh pool per round, closed by the context manager. Without this, every
    # round would leave n_cores worker processes behind. starmap() blocks until
    # all jobs are done, so terminating on exit loses nothing.
    with ctx.Pool(n_cores) as pool:
        round_dataset = pool.starmap(parallel_fn, args)
    print(f"Running time {n+1}-th round: {time.time() - start_time} seconds")

    flatten_round_dataset = []
    for batch in round_dataset:
        flatten_round_dataset += batch
    self_play_dataset += flatten_round_dataset

    trainer = Trainer(game = TicTacToe, mcts = MCTS, model=model)
    fit_history = trainer.train(self_play_dataset, n_epochs=50, batch_size=50, learning_rate=1e-2)

    # Workers of the next round load the model from this directory.
    trainer.model.save("saved_model/tmp_model")

    # NOTE(review): the resume cell looks for `filename_data`, which is a different
    # file name from the one written here, so a restarted run will not pick this
    # checkpoint up — confirm which name is intended.
    with open("selfplay_data.pickle", "wb") as fp:
        pickle.dump(self_play_dataset, fp, protocol=pickle.HIGHEST_PROTOCOL)
    