# Training Napoleon AI Brumaire

In [1]:
import torch
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from tqdm.notebook import tqdm

from brumaire.model import BrumaireHParams, BrumaireTrickModel
from brumaire.controller import BrumaireController
from brumaire.agent import RandomAgent, BrumaireAgent
from brumaire.session import Game
from brumaire.exp import ExperienceDB

## Hyperparameters

### Model constructions

In [2]:
# Sizes copied into h_params.decl_l{1,2,3}_node for the declaration model
# (presumably the node counts of its three hidden layers — confirm in brumaire.model).
DECL_L1_NODE, DECL_L2_NODE, DECL_L3_NODE = 4000, 2000, 1000

# Sizes copied into h_params.trick_l{1,2,3}_node for the trick model
# (presumably the node counts of its three hidden layers — confirm in brumaire.model).
TRICK_L1_NODE, TRICK_L2_NODE, TRICK_L3_NODE = 4000, 2000, 1000

### Training Parameters

In [3]:
# Optimisation settings copied into h_params for each model.
# NOTE(review): "ITA" presumably denotes the learning rate (eta) and CLIP_GRAD a
# gradient-clipping threshold — confirm against BrumaireHParams.
DECL_ITA, DECL_CLIP_GRAD = 0.01, 10.0
TRICK_ITA, TRICK_CLIP_GRAD = 0.01, 10.0

# Stored in h_params.gamma and passed to ExperienceDB.import_from_record
# (presumably the reward discount factor — confirm).
GAMMA = 0.97

# End points of the linear schedule applied to agent.epsilon in the training loop.
EPSILON_BEGINNING, EPSILON_LAST = 1.0, 0.3

### Training Plan

In [4]:
# Number of iterations of the play-then-train loop.
EPISODE_NUM = 500

# Value passed as `epoch=` to controller.train_decl / controller.train_trick each episode.
TRAIN_EPOCH = 10

# Each episode plays TRAIN_SIZE + TEST_SIZE boards, which gen_batch then splits
# into a training record and a test record of these sizes.
TRAIN_SIZE, TEST_SIZE = 400, 200

# Every this many episodes the experience DBs and the target model are recreated.
RENEW_TARGET_INTERVALS = 100

### Evaluations

In [5]:
# EVAL_BOARD_NUM: number of boards played in each evaluation game.
# EVAL_INTERVALS: an evaluation game is played every this many episodes.
EVAL_BOARD_NUM, EVAL_INTERVALS = 10000, 10

Name this execution for logging.

In [6]:
# Unique, filesystem-friendly run label: "trial-" followed by the current local
# timestamp with microseconds; it names the ./runs/<RUN_NAME> directory.
RUN_NAME = "trial-" + datetime.now().strftime("%Y-%m-%dT%H-%M-%S.%f")

Record the hyperparameters in the run log so that they can be viewed in TensorBoard.

In [7]:
# TensorBoard writer for this run; events are written under ./runs/<RUN_NAME>.
writer = SummaryWriter(f"./runs/{RUN_NAME}")

# Copy the notebook-level constants onto a fresh hyperparameter object,
# field by field and in a fixed order, then record them through the writer.
h_params = BrumaireHParams()
_h_param_values = {
    "decl_l1_node": DECL_L1_NODE,
    "decl_l2_node": DECL_L2_NODE,
    "decl_l3_node": DECL_L3_NODE,
    "decl_ita": DECL_ITA,
    "decl_clip_grad": DECL_CLIP_GRAD,
    "trick_l1_node": TRICK_L1_NODE,
    "trick_l2_node": TRICK_L2_NODE,
    "trick_l3_node": TRICK_L3_NODE,
    "trick_ita": TRICK_ITA,
    "trick_clip_grad": TRICK_CLIP_GRAD,
    "gamma": GAMMA,
}
for _field, _value in _h_param_values.items():
    setattr(h_params, _field, _value)
h_params.write_summary(writer)

Use a CUDA device if available.

In [8]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Set up an agent and its model

In [9]:
# Controller built from the hyperparameters, the compute device and the
# TensorBoard writer; later cells use its trick_model, train_decl/train_trick,
# trick_global_step and save().
controller = BrumaireController(h_params, device, writer)
# The learning agent starts at the initial epsilon; the training loop
# overwrites `agent.epsilon` at the start of every episode.
agent = BrumaireAgent(controller, epsilon=EPSILON_BEGINNING)
# One RandomAgent instance, reused for every opponent entry in AGENTS.
opponent_agent = RandomAgent()

In [10]:
# Five players per game: the learning agent at index 0, followed by four
# references to the same random opponent instance.
AGENTS = [agent] + [opponent_agent] * 4

In [11]:
# Separate experience stores for the training records and the test records.
train_db, test_db = ExperienceDB(), ExperienceDB()

# `target` is a second trick model that starts as a copy of the controller's
# current trick-model weights; it is handed to import_from_record in the loop.
target = BrumaireTrickModel(h_params, device)
target.load_state_dict(controller.trick_model.state_dict())

<All keys matched successfully>

## Play and train the model

In [12]:
# Every game is played out over this many tricks.
TRICK_NUM = 10


def play_game(board_num, **game_kwargs):
    """Play one batched game from the Napoleon declaration to the final result.

    Shared by the training and the evaluation branches of the loop below so
    that both always run exactly the same sequence of phases.

    Args:
        board_num: Number of boards passed to `Game`.
        **game_kwargs: Extra keyword arguments forwarded to `Game` unchanged
            (the evaluation branch passes `log_enabled=True`).

    Returns:
        The `Game` instance after `check_result()` has been called.
    """
    game = Game(board_num, AGENTS, **game_kwargs)
    game.decide_napoleon()
    game.discard_additional_cards()
    for trick_idx in range(TRICK_NUM):
        game.trick(trick_idx)
    game.check_result()
    return game


for ep in tqdm(range(EPISODE_NUM)):
    # Linear schedule from EPSILON_BEGINNING towards EPSILON_LAST. Because `ep`
    # stops at EPISODE_NUM - 1, the final value approaches but never exactly
    # reaches EPSILON_LAST.
    agent.epsilon = EPSILON_BEGINNING + (EPSILON_LAST - EPSILON_BEGINNING) * ep / EPISODE_NUM
    writer.add_scalar("epsilon", agent.epsilon, controller.trick_global_step)

    # Play enough boards to fill one training batch and one test batch.
    game = play_game(TRAIN_SIZE + TEST_SIZE)

    # The leading 0 is presumably the index of the learning agent in AGENTS — TODO confirm.
    train_record, test_record = game.recorder.gen_batch(TRAIN_SIZE, TEST_SIZE)
    train_db.import_from_record(0, train_record, target, GAMMA, device)
    test_db.import_from_record(0, test_record, target, GAMMA, device)

    controller.train_decl(train_db, test_db, TRAIN_SIZE, TEST_SIZE, epoch=TRAIN_EPOCH)
    controller.train_trick(train_db, test_db, TRAIN_SIZE, TEST_SIZE, epoch=TRAIN_EPOCH)

    if (ep + 1) % EVAL_INTERVALS == 0:
        # Evaluate with epsilon forced to 0 (presumably no random exploration),
        # and restore the scheduled value even if the evaluation raises.
        epsilon = agent.epsilon
        agent.epsilon = 0
        try:
            game = play_game(EVAL_BOARD_NUM, log_enabled=True)
            game.recorder.write_eval_result(0, writer, controller.trick_global_step)
        finally:
            agent.epsilon = epsilon

    if (ep + 1) % RENEW_TARGET_INTERVALS == 0:
        # Start over with empty experience DBs and a fresh target model that
        # copies the controller's current trick-model weights.
        train_db = ExperienceDB()
        test_db = ExperienceDB()
        target = BrumaireTrickModel(h_params, device)
        target.load_state_dict(controller.trick_model.state_dict())

  0%|          | 0/500 [00:00<?, ?it/s]

## Save the model

In [13]:
# Push any TensorBoard events that are still buffered to disk first, so the
# logs are complete even if saving fails or the kernel is shut down afterwards.
writer.flush()

# Save the controller into the same directory that holds this run's logs.
controller.save(f"./runs/{RUN_NAME}")