# README

* You should single step each cell to see what is going on.

* The results are saved in: my_log_dir='experiments/results_gin_rummy_dqn/'.

    This is temporary storage for your Colab session: it will be deleted when you leave Colab.

* The model name is: my_model_name='model_gin_rummy_dqnagent'

* You can rerun the "Train DQNAgent" cell to continue training the model: if a saved model already exists in my_log_dir, it is loaded and trained further.

# Optionally save work in your google drive to keep it after you logout

If you wish to save work in your google drive:
* change my_working_directory to something of your choice
* make sure the condition in the cell below is True (set it to False to skip this step)

In [None]:
if True:
    # Switch the notebook's current directory to the Google Drive folder so that
    # files written with relative paths end up there.
    # NOTE(review): the path is relative, so this presumably assumes Drive is
    # already mounted under the current directory and the folder exists - confirm.
    my_working_directory = 'drive/MyDrive/rlcard-colab/'
    %cd {my_working_directory}

/content/drive/MyDrive/rlcard-colab



# Definitions
* Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'next_legal_actions', 'done'])
* Trajectory is a list of transitions made by an agent

# Agent

The Agent has these functions:
* step(state) -> action_id
* eval_step(state) -> action_id, probabilities
* feed(trajectory)
* predict(state) -> q_values

If the agent is not trainable:
* feed(trajectory) does nothing
* predict(state) returns None

# Environment

The Environment holds information about:
* game
* agents
* state_shape
* action_shape
* timestep (incremented every time any agent takes a step)

The Environment has these functions:
* set_agents(agents)
* run(is_training) -> Trajectories, Payoffs

The Environment determines:
* how the state of the game is represented
* what is the state_shape
* what is the action_shape



# Install RLCard

In [None]:
pip install rlcard



In [None]:
pip install rlcard[torch]



# Imports

In [None]:
import numpy as np
import torch
import torch.nn as nn
import os
from copy import deepcopy

from datetime import datetime
from pytz import timezone

In [None]:
import rlcard
from rlcard.agents.random_agent import RandomAgent
from rlcard.models.gin_rummy_rule_models import GinRummyNoviceRuleAgent

from rlcard.agents.dqn_agent import DQNAgent
from rlcard.agents.dqn_agent import Estimator
from rlcard.agents.dqn_agent import EstimatorNetwork
from rlcard.agents.dqn_agent import Memory

from rlcard.utils import reorganize
from rlcard.utils import get_device
from rlcard.utils import set_seed
from rlcard.utils import Logger
from rlcard.utils import tournament
from rlcard.utils import plot_curve

from rlcard.envs.env import Env
from rlcard.envs.gin_rummy import GinRummyEnv

from rlcard.games.gin_rummy.game import GinRummyGame
from rlcard.games.gin_rummy.player import GinRummyPlayer
from rlcard.games.gin_rummy.utils.move import ScoreNorthMove, ScoreSouthMove, DrawCardMove, PickupDiscardMove, KnockMove, GinMove, DeclareDeadHandMove
from rlcard.games.gin_rummy.utils.action_event import KnockAction, GinAction

# Gin Rummy Scoring function

The get_gin_rummy_environment function uses this to set the payoffs for the Gin Rummy game.

A player gets 1 point for going out (knock or gin); otherwise the player gets 0 points (dead hand or loss).

The DQNAgent needs to learn how to form and keep melds so that he can go out.

The DQNAgent should pickup the discard when it would form a meld.
Otherwise, he should draw a card (although an expert player might break this rule sometimes).

In [None]:
def get_payoff_gin_rummy(player: GinRummyPlayer, game: GinRummyGame) -> float:
    ''' Get the payoff of player:
            a) 1.0 if player gins
            b) 1.0 if player knocks
            c) 0.0 otherwise
        The goal is to have the agent learn how to knock and gin.

    Args:
        player: the player whose payoff is wanted (only player.player_id is read)
        game: the game to score (only game.round.going_out_player_id and
            game.round.going_out_action are read)

    Returns:
        payoff (float): 1.0 or 0.0 for player (higher is better)
    '''
    game_round = game.round
    # Only the player who ended the round can earn the point.
    if game_round.going_out_player_id != player.player_id:
        return 0.0
    # Knocking and going gin are rewarded identically; anything else earns nothing.
    went_out = isinstance(game_round.going_out_action, (KnockAction, GinAction))
    return 1.0 if went_out else 0.0

# Get Gin Rummy Environment function

Note that we patch the Gin Rummy game that is in the environment.

In [None]:
def get_gin_rummy_environment(seed) -> GinRummyEnv:
    ''' Build the Gin Rummy environment used in this notebook.

    The game held by the environment is patched in place: its scorer uses
    get_payoff_gin_rummy and its is_always_knock setting is switched on.
    The state shape, action shape and number of actions are printed.

    Args:
        seed: value passed to rlcard.make as config['seed']

    Returns:
        the patched Gin Rummy environment
    '''
    env = rlcard.make('gin-rummy', config={'seed': seed})

    # Patch the game that lives inside the environment.
    game = env.game
    game.judge.scorer.get_payoff = get_payoff_gin_rummy  # wch: Note this
    game.settings.is_always_knock = True  # wch: Note this

    # Gather everything first, then report it.
    first_player_state_shape = env.state_shape[0]  # state_shape for player_id = 0
    shape_of_actions = env.action_shape
    action_count = game.get_num_actions()
    print(f'state_shape={first_player_state_shape}')
    print(f'action_shape={shape_of_actions}')
    print(f'num_actions={action_count}')
    return env

# Get DQNAgent function

The "Train DQNAgent" cell uses this to get the DQNAgent.

The critical hyper parameters for the DQNAgent are:

* replay_memory_size = 200000 rather than 20K
* replay_memory_init_size = 1000 rather than 100
* batch_size = 128 rather than 32
* train_every = 100 rather than 1
* mlp_layers = [128, 128, 128] rather than [64, 64]
* learning_rate = 0.00005 (you might want to modify this)

The DQNAgent has instance variables and three components:

* Instance variables:
    1. update_target_estimator_every
    2. train_every
    3. discount_factor
    4. epsilon_start
    5. epsilon_end
    6. epsilon_decay_steps

* Estimator:
    1. learning_rate
    2. device

* Estimator Network:
    1. state_shape
    2. mlp_layers
    3. num_actions

* Replay Memory:
    1. replay_memory_size
    2. replay_memory_init_size
    3. batch_size





In [None]:
def get_dqn_agent(env, model_name: str, log_dir: str):
    ''' Return the DQNAgent to train: the saved one if it exists, else a new one.

    Args:
        env: environment; read only when a new agent is built
            (env.game.get_num_actions() and env.state_shape[0])
        model_name (str): base name of the saved agent file ('.pth' is appended)
        log_dir (str): directory that holds the saved agent file

    Returns:
        the loaded or newly built DQNAgent
    '''
    device = get_device()
    agent_path = os.path.join(log_dir, f'{model_name}.pth')

    if not os.path.exists(agent_path):
        # No saved agent yet: build one from scratch.
        train_id = 1 # reset for first train iteration

        # Hyper parameters (the trailing comments show the smaller alternatives).
        replay_memory_size = 200000 # 20000
        replay_memory_init_size = 1000 # 100
        update_target_estimator_every = 1000
        discount_factor = 0.99
        epsilon_start = 1.0
        epsilon_end = 0.1
        epsilon_decay_steps = 20000
        batch_size = 128 # 32
        train_every = 100 # 1
        mlp_layers = [128, 128, 128] # [64, 64]
        learning_rate = 0.00005 # 0.00005

        num_actions = env.game.get_num_actions()
        state_shape = env.state_shape[0] # state_shape for player_id = 0

        # The arguments are passed by position, so their order matters.
        dqn_agent = DQNAgent(
            replay_memory_size,
            replay_memory_init_size,
            update_target_estimator_every,
            discount_factor,
            epsilon_start,
            epsilon_end,
            epsilon_decay_steps,
            batch_size,
            num_actions,
            state_shape,
            train_every,
            mlp_layers,
            learning_rate,
            device=device)
    else:
        # Continue from the agent saved by an earlier training session.
        train_id = 2 # increment for each train iteration
        # NOTE(review): torch.load unpickles a whole agent object, so only load
        # files you trust; newer PyTorch releases may require weights_only=False
        # for this kind of file - confirm against the installed version.
        dqn_agent = torch.load(agent_path, map_location=device)
        dqn_agent.set_device(device)
        # Optional overrides for a resumed agent (enable as needed):
        #dqn_agent.train_every = 100  # wch: note this
        #dqn_agent.batch_size = 128  # wch: note this
        #dqn_agent.memory.batch_size = dqn_agent.batch_size  # wch: note this
        #dqn_agent.replay_memory_init_size = 1000  # wch: note this
        #dqn_agent.q_estimator.optimizer.param_groups[0]['lr'] = 0.000005
        #dqn_agent.target_estimator.optimizer.param_groups[0]['lr'] = 0.000005
        print(f'train_id={train_id}; agent={dqn_agent}')
        print(f'train_steps={dqn_agent.train_t} time_steps={dqn_agent.total_t}')

    print(f'log_dir={log_dir}')
    print(f'model_name={model_name}')
    print(dqn_agent.q_estimator.qnet)
    return dqn_agent

# Game Statistics class

The Trainer class's train method uses this to accumulate statistics for each completed game.

The Trainer class's train method also uses this to show the accumulated statistics when the training session ends.


In [None]:
class GameStatistics(object):  # interface
    ''' No-op statistics collector; subclasses override both hooks. '''

    def update_statistics(self, game):
        ''' Record the statistics of one completed game (does nothing here). '''
        pass

    def show_statistics(self):
        ''' Print the accumulated statistics (does nothing here). '''
        pass


class GinRummyGameStatistics(GameStatistics):
    ''' Accumulates Gin Rummy counters over completed games.

    "north" is the player whose player_id is 0.
    '''

    def __init__(self):
        super().__init__()
        self.game_count = 0
        self.north_draw_card_move_count = 0
        self.north_pickup_discard_move_count = 0
        self.knock_count = 0  # knocks made by player 0 only
        self.gin_count = 0  # gins made by player 0 only
        self.declare_dead_hand_count = 0  # dead hands declared by either player
        self.moves_per_game = 0  # running total of moves; averaged when shown

    def update_statistics(self, game):
        ''' Add the moves of one completed game to the counters.

        Args:
            game: the completed game; only game.round.move_sheet is read
        '''
        move_sheet = game.round.move_sheet
        self.game_count += 1
        if len(move_sheet) > 2:
            self.north_draw_card_move_count += sum(
                1 for move in move_sheet
                if isinstance(move, DrawCardMove) and move.player.player_id == 0)
            self.north_pickup_discard_move_count += sum(
                1 for move in move_sheet
                if isinstance(move, PickupDiscardMove) and move.player.player_id == 0)
            # NOTE(review): presumably the last two entries of the move sheet are
            # the two score moves, which makes the third from last the going-out
            # move - confirm against the game implementation.
            going_out_move = move_sheet[-3]
            if isinstance(going_out_move, KnockMove):
                if going_out_move.player.player_id == 0:
                    self.knock_count += 1
            elif isinstance(going_out_move, GinMove):
                if going_out_move.player.player_id == 0:
                    self.gin_count += 1
            elif isinstance(going_out_move, DeclareDeadHandMove):
                self.declare_dead_hand_count += 1
        self.moves_per_game += len(move_sheet)

    def _per_game(self, total):
        ''' Average total over the recorded games; 0.0 when none were recorded. '''
        return total / self.game_count if self.game_count else 0.0

    def show_statistics(self):
        ''' Print the accumulated counters and per-game averages.

        Safe to call before any game has been recorded: the averages are then
        reported as 0.0 instead of raising ZeroDivisionError.
        '''
        print(f'game_count={self.game_count}')
        print(f'north_draw_card_moves_per_game={self._per_game(self.north_draw_card_move_count)}')
        print(f'north_pickup_discard_moves_per_game={self._per_game(self.north_pickup_discard_move_count)}')
        print(f'knock_count={self.knock_count}')
        print(f'gin_count={self.gin_count}')
        print(f'declare_dead_hand_count={self.declare_dead_hand_count}')
        print(f'average_moves_per_game={self._per_game(self.moves_per_game)}')

# Trainer class

This class is initialized with an environment.

You must set the agents for the environment before you call the train method.

The first agent must be the agent to be trained.

=====================================

In this notebook, the first agent is the DQNAgent and the second agent is the GinRummyNoviceRuleAgent.

Training the DQNAgent against the RandomAgent is easy, but the trained DQNAgent learns a lot of bad habits. The play of the trained DQNAgent against the RandomAgent results in long games where both players frequently pick up the opponent's discard. The DQNAgent learns how to form melds (so that the game doesn't end as a dead hand), but wastes time picking up the opponent's discard.

=====================================

In [None]:
class Trainer(object):
    ''' Runs training episodes on an environment and logs the progress.

    The agents of the environment must be set before train() is called, and
    the agent to be trained must be the first one (index 0).
    '''

    def __init__(self, log_dir: str, model_name: str, env: Env):
        ''' Remember where to log/save and which environment to train on.

        Args:
            log_dir (str): directory given to the Logger and used for the saved agent
            model_name (str): base name of the saved agent file ('.pth' is appended)
            env (Env): environment that supplies the games and holds the agents
        '''
        self.env = env
        self.log_dir = log_dir
        self.model_name = model_name
        self.algorithm = 'dqn'
        # No-op collector by default; assign a game-specific one if wanted.
        self.game_statistics = GameStatistics()

    def train(self, multiplier: int, seed):
        ''' Play 1000 * multiplier training games, evaluating every 20 * multiplier.

        When the games are done, the game statistics are shown, the learning
        curve is plotted and the first agent is saved.

        Args:
            multiplier (int): scales the number of episodes and the evaluation interval
            seed: value handed to set_seed
        '''
        with Logger(self.log_dir) as logger:
            num_eval_games = 300 # 2000
            evaluate_every = 20 * multiplier  # 100
            num_episodes = 1000 * multiplier  # 5000
            # Seed numpy, torch, random
            set_seed(seed)
            for episode in range(num_episodes):
                # Play one game, then reorganize its data into
                # state, action, reward, next_state, done transitions.
                raw_trajectories, payoffs = self.env.run(is_training=True)
                self.feed(reorganize(raw_trajectories, payoffs))

                # Keep game statistics for the game that was just played.
                self.game_statistics.update_statistics(game=self.env.game)

                # Periodically measure performance against the environment's agents.
                if episode % evaluate_every == 0:
                    self.evaluate_performance(
                        episode=episode,
                        evaluate_every=evaluate_every,
                        num_eval_games=num_eval_games,
                        logger=logger)

                # Intermediate checkpoint for very long runs.
                is_checkpoint = episode > 0 and episode % 100000 == 0
                if is_checkpoint:
                    self.save_model(agent=self.env.agents[0])

        # Wrap-up happens after the logger context has been left.
        self.game_statistics.show_statistics()
        self.plot_learning_curve(logger=logger)
        self.save_model(agent=self.env.agents[0])

    def feed(self, trajectories):
        ''' Feed every transition of the first player's trajectory to the first agent.

        Only trajectories[0] is used: the agent being trained is assumed to
        play the first position.
        '''
        for transition in trajectories[0]:
            self.env.agents[0].feed(transition)

    def evaluate_performance(self, episode: int, evaluate_every: int, num_eval_games: int, logger):
        ''' Run an evaluation tournament and log the first agent's result.

        Args:
            episode (int): index of the current training episode (logged)
            evaluate_every (int): accepted but not used by this method
            num_eval_games (int): number of tournament games to play
            logger: object providing log() and log_performance()
        '''
        logger.log("\n----------------------------------------")
        logger.log(f"\nEpisode: {episode}")
        # Read the timestep before the tournament runs, so the logged value is
        # the same one the call's argument order would pick up.
        # NOTE(review): the tournament presumably advances env.timestep - confirm.
        timestep_at_evaluation = self.env.timestep
        first_agent_result = tournament(self.env, num_eval_games)[0]
        logger.log_performance(timestep_at_evaluation, first_agent_result)

    def plot_learning_curve(self, logger):
        ''' Plot the learning curve from the logger's csv file into its figure file. '''
        plot_curve(logger.csv_path, logger.fig_path, self.algorithm)

    def save_model(self, agent):
        ''' Save the whole agent object (not just its network) under log_dir. '''
        destination = os.path.join(self.log_dir, f'{self.model_name}.pth')
        torch.save(agent, destination)
        print('Model saved in', destination)

# Get current time function

In [None]:
def get_current_time():
    ''' Return the current time in the America/Chicago zone, e.g. '10:07:22 AM'. '''
    chicago = timezone('America/Chicago')
    now = datetime.now(chicago)
    # 12-hour clock with an AM/PM suffix.
    return now.strftime('%I:%M:%S %p')

# Constants

In [None]:
# Directory given to the Logger; the saved agent file is also read from and written to it.
my_log_dir='experiments/results_gin_rummy_dqn/'
# Base name of the saved agent file ('.pth' is appended when loading and saving).
my_model_name='model_gin_rummy_dqnagent'
# Seed passed to the environment config and to set_seed.
# NOTE(review): None presumably leaves the run unseeded - confirm.
my_seed = None
# Scales a training run: 1000 * multiplier episodes, evaluated every 20 * multiplier episodes.
my_multiplier = 500

# Train DQNAgent

In [None]:
%%time
# Train the DQNAgent against the GinRummyNoviceRuleAgent and report start/end times.
print(f"Start: {get_current_time()}")
# show the GPU status when CUDA is available
if torch.cuda.is_available():
    !nvidia-smi
# define gin rummy environment
my_gin_rummy_env = get_gin_rummy_environment(seed=my_seed)
# define learning agent (loads the saved agent if one exists, else builds a new one)
my_learning_agent = get_dqn_agent(env=my_gin_rummy_env, model_name=my_model_name, log_dir=my_log_dir)
# define opponent agent
my_opponent_agent = GinRummyNoviceRuleAgent()
# set agents for environment (the learning agent must come first)
my_gin_rummy_env.set_agents([my_learning_agent, my_opponent_agent])
# train dqn_agent
my_trainer = Trainer(log_dir=my_log_dir, model_name=my_model_name, env=my_gin_rummy_env)
# replace the default no-op statistics collector with the Gin Rummy one
my_trainer.game_statistics = GinRummyGameStatistics()
my_trainer.train(multiplier=my_multiplier, seed=my_seed)
print(f"End: {get_current_time()}")

Start: 10:07:22 AM
state_shape=[5, 52]
action_shape=[None, None]
num_actions=110
--> Running on the CPU
log_dir=experiments/results_gin_rummy_dqn/
model_name=model_gin_rummy_dqnagent
EstimatorNetwork(
  (fc_layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): BatchNorm1d(260, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=260, out_features=128, bias=True)
    (3): Tanh()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): Tanh()
    (6): Linear(in_features=128, out_features=128, bias=True)
    (7): Tanh()
    (8): Linear(in_features=128, out_features=110, bias=True)
  )
)

----------------------------------------

Episode: 0

----------------------------------------
  timestep     |  55
  reward       |  0.03
----------------------------------------


  state_batch, action_batch, reward_batch, next_state_batch, legal_actions_batch, done_batch = self.memory.sample()


INFO - Step 1000, rl-loss: 0.36836180090904236
INFO - Copied model parameters to target network.
INFO - Step 101000, rl-loss: 0.08030931651592255
INFO - Copied model parameters to target network.
INFO - Step 201000, rl-loss: 0.030089031904935837
INFO - Copied model parameters to target network.
INFO - Step 259500, rl-loss: 0.024272767826914787
----------------------------------------

Episode: 10000

----------------------------------------
  timestep     |  544043
  reward       |  0.03
----------------------------------------
INFO - Step 301000, rl-loss: 0.03186260908842087
INFO - Copied model parameters to target network.
INFO - Step 401000, rl-loss: 0.02609328180551529
INFO - Copied model parameters to target network.
INFO - Step 501000, rl-loss: 0.014850076287984848
INFO - Copied model parameters to target network.
INFO - Step 515300, rl-loss: 0.01892142929136753
----------------------------------------

Episode: 20000

----------------------------------------
  timestep     |  10

In [None]:
print("Done")

# play_gin_rummy_tournament function

In [None]:
def play_gin_rummy_tournament_with_agents(north_agent, south_agent, num_games: int):
    ''' Play a tournament between two agents and print both payoffs.

    A fresh, unseeded Gin Rummy environment is created and its game is
    patched to use get_payoff_gin_rummy with is_always_knock switched on.

    Args:
        north_agent: agent placed first in the environment
        south_agent: agent placed second in the environment
        num_games (int): number of games passed to tournament()
    '''
    env = rlcard.make('gin-rummy', config={'seed': None})
    game = env.game
    game.judge.scorer.get_payoff = get_payoff_gin_rummy  # wch: Note this
    game.settings.is_always_knock = True  # wch: Note this
    env.set_agents([north_agent, south_agent])
    results = tournament(env, num_games)
    print(f"payoffs=[{results[0]}, {results[1]}]")

In [None]:
def play_gin_rummy_tournament(log_dir, model_name, num_games):
    ''' Load the saved DQNAgent and play it in preliminary and final tournaments.

    Five preliminary rounds of 100 games each are played for three pairings
    (DQN vs novice rule agent, DQN vs random agent, novice rule agent vs
    random agent), followed by one final DQN vs novice rule agent tournament.

    Args:
        log_dir: directory that holds the saved agent file
        model_name: base name of the saved agent file ('.pth' is appended)
        num_games: number of games in the final tournament
    '''
    agent_path = os.path.join(log_dir, f'{model_name}.pth')
    device = get_device()
    dqn_agent = torch.load(agent_path, map_location=device)
    dqn_agent.set_device(device)
    print(f"gin_rummy_dqn_agent={dqn_agent}")

    novice_agent = GinRummyNoviceRuleAgent()
    random_agent = RandomAgent(num_actions=dqn_agent.num_actions)

    # (north, south) pairings, played in this order in every preliminary round.
    pairings = (
        (dqn_agent, novice_agent),
        (dqn_agent, random_agent),
        (novice_agent, random_agent),
    )
    preliminary_games = 100
    for round_number in range(1, 6):
        print(f"----- preliminary tournament {round_number} {preliminary_games} games ------")
        for north_agent, south_agent in pairings:
            play_gin_rummy_tournament_with_agents(north_agent, south_agent, num_games=preliminary_games)

    print(f"----- final tournament gin_rummy_dqn_agent vs ginRummyNoviceRuleAgent {num_games} games  ------")
    play_gin_rummy_tournament_with_agents(dqn_agent, novice_agent, num_games=num_games)

# Gin Rummy Tournaments

There are 5 preliminary tournaments of 100 games per tournament:

*   gin_rummy_dqn_agent vs ginRummyNoviceRuleAgent
*   gin_rummy_dqn_agent vs randomAgent
*   ginRummyNoviceRuleAgent vs randomAgent

There is one final tournament of num_games games (2000 in the call below):

*   gin_rummy_dqn_agent vs ginRummyNoviceRuleAgent

In [None]:
play_gin_rummy_tournament(log_dir=my_log_dir, model_name=my_model_name, num_games=2000)

# Print local variables

In [None]:
# List the notebook's variables one type at a time, with a header line per type.
for my_type in ['str', 'int', 'float', 'list', 'dict', 'NoneType']:
    print(f'---------------------------------')
    print(f'{my_type}')
    # %whos restricted to the current type name
    %whos {my_type}
    print()

In [None]:
# Same listing as a single %whos call for several types at once.
# NOTE(review): 'float' is not included here, unlike the per-type loop - confirm this is intended.
%whos str int list dict NoneType