# TichuAgents: Training the Agents

This notebook trains agents to play Tichu using reinforcement learning.

# Setup

In [None]:
# Execute after restarting runtime
!git clone https://github.com/alxwdm/tichuagent

In [None]:
# Execute when the content on GitHub has changed
%cd /content/tichuagent
!git pull
%cd /content/

In [1]:
import sys
import numpy as np
sys.path.append('/content/tichuagent')
# import Environment
from env.env import Env
# import all Agents
from agents.heuristic.greedy import greedyAgent
from agents.ddpg.ddpg_agent import DDPGAgent
# import utility functions
from utils import play_dumb_game, play_greedy_game

# Shared utils

In [None]:
def battle(agents, n_games=1000):
    """Placeholder for letting trained agents play games against each other.

    Not implemented yet: both ``agents`` and ``n_games`` are ignored and
    ``None`` is returned.
    """
    # TODO: implement the evaluation games
    return None

# DDPG Agent

Deep Deterministic Policy Gradient (DDPG) is an off-policy RL approach that combines value-based and policy-based learning. The actor learns how to act (the policy-based part), and the critic learns to estimate the value of the actor's actions in the current state (the value-based part).

In [1]:
def ddpg(n_episodes=100, episode_offset=0, checkpoint_path=None,
         eps_start=0.3, eps_decay=0.995, max_steps=1000):
    """ Trains a DDPG Agent on Tichu.

    A single DDPGAgent instance acts for all four players. Each player
    first makes one initial move; afterwards the agent learns from the
    active player's previous move before that player takes the next one.

    Args:
        n_episodes: Number of episodes to train.
        episode_offset: Index of the first episode; only affects the
            episode numbers that are printed and the checkpoint filename.
        checkpoint_path: If truthy, passed to ``agent.load_checkpoint``
            before training starts.
        eps_start: Epsilon handed to ``agent.act`` in the first episode.
        eps_decay: Factor epsilon is multiplied with after every episode.
        max_steps: Maximum number of environment steps per episode
            (not counting the initial moves).

    Returns:
        A list with one entry per episode: the mean of the four players'
        accumulated rewards.

    Raises:
        EnvironmentError: If not every player has made an initial move
            after more than 10 environment steps.
    """
    # initialize environment and agent
    env = Env(train_mode=True)
    state_size, action_size = env.info()
    heuristic_agent = greedyAgent()
    agent = DDPGAgent(state_size=state_size, action_size=action_size,
                      random_seed=0, heuristic_agent=heuristic_agent)
    all_scores = []
    eps = eps_start
    # reload checkpoint from previous training if available
    if checkpoint_path:
        agent.load_checkpoint(filepath=checkpoint_path)
    # stays None when no episode is run, so no checkpoint is written then
    last_episode = None
    # train for n_episodes
    for i_episode in range(episode_offset, n_episodes + episode_offset):
        last_episode = i_episode
        state, reward, done, active_player = env.reset()
        # per player: the last action taken and the state it was taken in
        # NOTE(review): prev_state keeps references to the env's state
        # entries - confirm that the env does not mutate them in place.
        action_buffer = [None, None, None, None]
        prev_state = [None, None, None, None]
        scores = [0, 0, 0, 0]
        nstep = 0
        # make a valid initial move from heuristic agent (first steps)
        # each player must make an initial move before learning,
        # because of reward-design (reward is valid every 4 steps)
        idle_cnt = 0
        while any(elem is None for elem in action_buffer):
            prev_state[active_player] = state[active_player]
            action_buffer[active_player] = agent.act(state[active_player], 1)
            state, reward, done, active_player = env.step(
                active_player, action_buffer[active_player])
            idle_cnt += 1
            if idle_cnt > 10:
                raise EnvironmentError("Something went wrong.")
        # train one episode
        while nstep < max_steps:
            # regular learning routine after initialization:
            # learn from previous step, then take next step
            # vice-versa not possible (because of state/reward validity)
            # The transition goes from the state in which this player last
            # acted to the state the player sees now; the two must differ.
            agent.step(prev_state[active_player],
                       action_buffer[active_player],
                       reward[active_player],
                       state[active_player],
                       done, nstep)
            # add rewards to scores list
            scores[active_player] += reward[active_player]
            # remember the state the upcoming action is taken in
            prev_state[active_player] = state[active_player]
            # take an action in the environment
            action_buffer[active_player] = agent.act(state[active_player], eps)
            state, reward, done, active_player = env.step(
                active_player, action_buffer[active_player])
            nstep += 1
            # all agents take a step when game is finished
            # NOTE(review): these terminal rewards are not added to
            # scores - confirm whether that is intended.
            if done:
                for i in range(4):
                    agent.step(prev_state[i], action_buffer[i],
                               reward[i], state[i], done, nstep)
                break
        # take average rewards of all agents
        mean_score = np.mean(scores)
        # print episode info
        print('\rEpisode: {} \t Steps: {} \t Avg score: {}'.format(i_episode,
                                                    nstep, mean_score),
              end='')
        if i_episode > 0 and i_episode%10 == 0:
            print('')
        all_scores.append(mean_score)
        eps = eps_decay * eps # decrease epsilon
    # save checkpoints (only if at least one episode was trained)
    if last_episode is not None:
        fpath = 'checkpoint_' + str(last_episode)
        agent.save_checkpoint(filename=fpath)
    return all_scores

In [None]:
# Train with the default settings. eps_decay=1 keeps epsilon at its start
# value, because epsilon is multiplied by eps_decay after every episode.
all_scores = ddpg(eps_decay=1)

# Debugging Area

In [None]:
# for debugging, use this function to nicely print action vector as cards:
# example: _vec_to_cards(action).show()
def _vec_to_cards(vec):
    """ Turns a vector representation into a Cards instance.

    Keeps those entries of ``Deck().all_cards`` whose corresponding
    element in ``vec`` is truthy and wraps them in a ``Cards`` object.

    NOTE(review): Deck and Cards are not imported in the visible cells of
    this notebook; import them from the project before calling this.
    """
    # imported here because the notebook's import cell does not provide it
    from itertools import compress
    all_cards = Deck().all_cards
    return Cards(list(compress(all_cards, vec)))