# TichuAgents: Training the Agents

This notebook is used for training Agents using Reinforcement Learning to play Tichu.

# Setup

In [None]:
# Execute after restarting runtime
!git clone https://github.com/alxwdm/tichuagent

In [None]:
# Execute when content on github changed
%cd /content/tichuagent
!git pull
%cd /content/

In [1]:
import sys
import numpy as np
sys.path.append('/content/tichuagent')
# import Environment
from env.env import Env
# import all Agents
from agents.heuristic.greedy import greedyAgent
from agents.ddpg.ddpg_agent import DDPGAgent
# import utility functions
from utils import play_dumb_game, play_greedy_game

# Shared utils

In [None]:
def battle(agents, n_games=1000):
    """ Lets trained agents play games against each other. """
    pass # TODO

# DDPG Agent

Deep Deterministic Policy Gradient is an off-policy RL approach that combines both value- and policy-based learning. The DDPG actor learns how to act (i.e. policy-based), and a critic learns how to estimate the current situation (i.e. value-based). 

In [33]:
import sys
import numpy as np
sys.path.append('/content/tichuagent')
from itertools import compress
# import Environment
from env.env import Env
from env.cards import Cards
from env.deck import Deck
# import all Agents
from agents.heuristic.greedy import greedyAgent
from agents.ddpg.ddpg_agent import DDPGAgent
# import utility functions
from utils import play_dumb_game, play_greedy_game

def _vec_to_cards(vec):
    """ Turns a vector representation into a Cards instance. """
    all_cards = Deck().all_cards
    return Cards(list(compress(all_cards, vec)))

def ddpg(n_episodes=100, episode_offset=0, checkpoint_path=None,
         eps_start = 0.1, eps_decay = 0.995):
    """ Trains a DDPG Agent on Tichu. """
    # initialize environment and agent
    env = Env()
    state_size, action_size = env.info()
    heuristic_agent = greedyAgent()
    agent = DDPGAgent(state_size=state_size, action_size=action_size,
                  random_seed=0, heuristic_agent=heuristic_agent)
    all_scores = []
    eps = eps_start
    # reload checkpoint from previous training if available
    if checkpoint_path:
        agent.load_checkpoint(filepath=checkpoint_path)
    # train for n_episodes
    for i_episode in range(episode_offset, n_episodes + episode_offset):
        state, reward, done, active_player = env.reset()
        action_buffer = [None, None, None, None]
        scores = [0, 0, 0, 0]
        nstep = 0
        init_cnt = 0
        invalid_init = 0
        # make a valid initial move from heuristic agent (first steps)
        # each player must make an initial move before learning,
        # because of reward-design (reward is valid every 4 steps)
        print("\n")
        for i in range(4):
            last_active = active_player
            action_buffer[active_player] = agent.act(state[active_player],
                                                         1) # 1: heuristic move
            state, reward, done, active_player = env.step(active_player,
                                                  action_buffer[active_player])
            print(last_active, active_player, np.shape(action_buffer[last_active]))
        # train one episode
        while True:
            prev_state = state
            # regular learning routine after initialization:
            # learn from previous step, then take next step
            # vice-versa not possible (because of state/reward validity)
            try:
                assert np.shape(action_buffer[active_player]) == (56,), \
                    "nstep {}, Shape of action buffer must be (56,1) but it is {}".format(
                        nstep, np.shape(action_buffer[active_player]))
            except AssertionError:
                print('assertion failed, showing last actions')
                for i in range(4):
                    if action_buffer[i] is not None:
                        cards = _vec_to_cards(action_buffer[i])
                        cards.show()
                    else:
                        print('Player {} NoneType action_buffer!'.format(i))
            agent.step(prev_state[active_player],
                       action_buffer[active_player],
                       reward[active_player],
                       state[active_player],
                       done, nstep)
            # add rewards to scores list
            scores[active_player] += reward[active_player]
            # take an action in the environment
            prev_state[active_player] = state[active_player]
            action_buffer[active_player] = agent.act(state[active_player], eps)
            state, reward, done, active_player = env.step(active_player,
                                                  action_buffer[active_player])
            nstep += 1
            # all agents take a step when game is finished
            if done:
                for i in range(4):
                    assert np.shape(action_buffer[active_player]) == (56,), \
                    "Shape of action buffer must be (56,1) but it is {}".format(
                        np.shape(action_buffer[active_player]))
                    agent.step(prev_state[i], action_buffer[i],
                               reward[i], state[i], done, nstep)
                break
        # print episode info
        print('\rEpisode: {} \t Steps: {} \t Avg score: {}'.format(i_episode,
                                                    nstep, np.mean(scores)),
              end='')
        if i_episode > 0 and i_episode%10 == 0:
            print('\nEpisode: {} \t Steps: {} \t Avg score: {}'.format(i_episode,
                                                    nstep, np.mean(scores)))
        # take average rewards of all agents
        all_scores.append(np.mean(scores))
        eps = eps_decay * eps # decrease epsilon
    # save checkpoints
    fpath = 'checkpoint_' + str(i_episode)
    agent.save_checkpoint(filename=fpath)
    return all_scores

In [34]:
all_scores = ddpg(eps_decay=1)



3 0 (56,)
0 1 (56,)
1 2 (56,)
2 3 (56,)
Episode: 0 	 Steps: 752 	 Avg score: -1683.75

2 3 (56,)
3 0 (56,)
0 1 (56,)
1 2 (56,)
Episode: 1 	 Steps: 755 	 Avg score: -1690.0

1 2 (56,)
2 3 (56,)
3 0 (56,)
0 1 (56,)
Episode: 2 	 Steps: 833 	 Avg score: -1911.25

1 2 (56,)
2 3 (56,)
3 0 (56,)
0 1 (56,)
Episode: 3 	 Steps: 1015 	 Avg score: -2302.5

0 1 (56,)
1 2 (56,)
2 3 (56,)
3 0 (56,)
Episode: 4 	 Steps: 779 	 Avg score: -1746.25

1 2 (56,)
2 3 (56,)
3 0 (56,)
0 1 (56,)
Episode: 5 	 Steps: 550 	 Avg score: -1270.0

1 2 (56,)
2 3 (56,)
3 0 (56,)
0 1 (56,)
Episode: 6 	 Steps: 521 	 Avg score: -1158.75

1 3 (56,)
3 0 (56,)
0 1 (56,)
1 2 (56,)
assertion failed, showing last actions
┍┄┄┄┑┍┄┄┄┑┍┄┄┄┑┍┄┄┄┑┍┄┄┄┑
┆ 7   ┆┆ 7   ┆┆ Q   ┆┆ Q   ┆┆ Q   ┆
┆  ♠  ┆┆  ♢  ┆┆  ♠  ┆┆  ♡  ┆┆  ♢  ┆
┆   7 ┆┆   7 ┆┆   Q ┆┆   Q ┆┆   Q ┆
┖┄┄┄┚┖┄┄┄┚┖┄┄┄┚┖┄┄┄┚┖┄┄┄┚
  PASS
Player 2 NoneType action_buffer!
┍┄┄┄┑┍┄┄┄┑┍┄┄┄┑┍┄┄┄┑┍┄┄┄┑
┆ Ph  ┆┆ 3   ┆┆ 3   ┆┆ 9   ┆┆ 9   ┆
┆ oe  ┆┆  ♠  ┆┆  ♡  ┆┆  ♠  ┆┆  ♢  ┆
┆ nix ┆┆   3 ┆

ValueError: ignored

# Debugging Area

In [None]:
# nothing here