In [None]:
import sys
sys.path.append("../src/")

In [2]:
from plugin_write_and_run import *

In [3]:
from utilities import *
from config import *
from game import *
from shared_storage import *
from networks import *
from mcts import *
from self_play import *
from training import *

In [4]:
def muzero(config: MuZeroConfig, nb_eval_episodes: int = 50):
    """
    Train MuZero by alternating self-play data generation and network training.

    MuZero training is split into two independent parts: network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from the training to the self-play, and the finished games from the self-play
    to the training.
    In contrast to the original MuZero algorithm, this version doesn't work with
    multiple threads; therefore training and self-play are done alternately.

    Args:
        config: Supplies the `new_network()`, `uniform_network()` and
            `new_optimizer()` factories used to build the shared storage, and
            the `nb_training_loop`, `nb_episodes` and `nb_epochs` settings that
            drive the loop below.
        nb_eval_episodes: Value forwarded as the third argument of `run_eval`
            after every training loop (presumably the number of evaluation
            episodes -- confirm against `run_eval`). Defaults to 50, the value
            that used to be hard-coded here.

    Returns:
        Whatever `storage.latest_network()` yields once all loops have run.
    """
    storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        # Self-play first so the replay buffer has data before training on it.
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        # Evaluation runs after the train score is printed, as in the original
        # ordering, so any output produced by run_eval itself stays in place.
        score_eval = run_eval(config, storage, nb_eval_episodes)
        print("Eval score:", score_eval)

        # Cumulative totals: every finished loop adds one batch of episodes/epochs.
        loops_done = loop + 1
        print(f"MuZero played {config.nb_episodes * loops_done} "
              f"episodes and trained for {config.nb_epochs * loops_done} epochs.\n")

    return storage.latest_network()

In [5]:
# Build the run configuration consumed by muzero() below.
# NOTE(review): make_atari_config comes from one of the star-imports above
# (presumably `config`) -- confirm which module and which hyper-parameters it sets.
config = make_atari_config()

In [None]:
# Run the alternating self-play / training loop defined in muzero() and keep
# the network it returns. This runs config.nb_training_loop iterations and
# prints a train and eval score for each one.
last_nn = muzero(config)

Automatic pdb calling has been turned ON
Training loop 0
Train score: 18.9
Eval score: 9.42
MuZero played 20 episodes and trained for 20 epochs.

Training loop 1
Train score: 16.4
Eval score: 9.2
MuZero played 40 episodes and trained for 40 epochs.

Training loop 2
Train score: 22.75
Eval score: 9.22
MuZero played 60 episodes and trained for 60 epochs.

Training loop 3
Train score: 22.75
Eval score: 9.62
MuZero played 80 episodes and trained for 80 epochs.

Training loop 4
Train score: 20.15
Eval score: 9.1
MuZero played 100 episodes and trained for 100 epochs.

Training loop 5
Train score: 23.9
Eval score: 9.28
MuZero played 120 episodes and trained for 120 epochs.

Training loop 6
Train score: 25.2
Eval score: 9.32
MuZero played 140 episodes and trained for 140 epochs.

Training loop 7
Train score: 22.6
Eval score: 9.32
MuZero played 160 episodes and trained for 160 epochs.

Training loop 8
Train score: 22.5
Eval score: 9.24
MuZero played 180 episodes and trained for 180 epochs.

Tra