<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Coding-Battleship" data-toc-modified-id="Coding-Battleship-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Coding Battleship</a></span></li><li><span><a href="#Callback-and-Plotting" data-toc-modified-id="Callback-and-Plotting-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Callback and Plotting</a></span></li><li><span><a href="#Playing-with-One-Ship-on-a-5x5-board" data-toc-modified-id="Playing-with-One-Ship-on-a-5x5-board-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Playing with One Ship on a 5x5 board</a></span></li><li><span><a href="#Playing-with-One-Ship-on-a-Bigger-Board" data-toc-modified-id="Playing-with-One-Ship-on-a-Bigger-Board-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Playing with One Ship on a Bigger Board</a></span></li><li><span><a href="#Visualizing-How-the-Agent-Plays" data-toc-modified-id="Visualizing-How-the-Agent-Plays-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Visualizing How the Agent Plays</a></span></li><li><span><a href="#Optimizing-The-Algorithm-Parameters-with-Hyperopt" data-toc-modified-id="Optimizing-The-Algorithm-Parameters-with-Hyperopt-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Optimizing The Algorithm Parameters with Hyperopt</a></span></li><li><span><a href="#Links" data-toc-modified-id="Links-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Links</a></span></li><li><span><a href="#Reward-scheme" data-toc-modified-id="Reward-scheme-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Reward scheme</a></span></li><li><span><a href="#Skeleton-Battleship-Environmnt" data-toc-modified-id="Skeleton-Battleship-Environmnt-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Skeleton Battleship Environmnt</a></span></li></ul></div>

### Coding Battleship

In [1]:
import gym
from gym import spaces
import numpy as np
from BattleshipStats import Battleship
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.results_plotter import load_results, ts2xy
import matplotlib.pyplot as plt
import os
from stable_baselines3 import DQN, A2C
from stable_baselines3.ppo import PPO 
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import os
from stable_baselines3.common.results_plotter import load_results, ts2xy
import matplotlib.pyplot as plt
import torch as th
from torch import nn

In [26]:
class BattleshipEnv(gym.Env):
    
    """Custom Environment that follows gym interface"""
    """see https://github.com/openai/gym/blob/master/gym/core.py"""
    
    metadata = {'render.modes': ['human']} 


    def __init__(self, battleship, rewards):
        
        super(BattleshipEnv, self).__init__()
        self.rewards = rewards
        self.battleship = battleship
        # board size
        self.dim = battleship.dim 
        # cell state encoding (empty, hit, miss)
        self.cell = {'E': 0, 'X': 1, 'O': -1} 
        # boards, actions, rewards
        self.board = self.cell['E']*np.ones((self.dim, self.dim), dtype='int')
        # set enemy board
        self.enemyBoard = self.battleship.randomBoardFlatIndeces()
        print(self.enemyBoard)
        self.hits = set()
        self.misses = set()

        # legal (empty) cells available for moves
        self.legal_actions = list(range(self.dim*self.dim)) 
        
        # Define action and observation space
        # They must be gym.spaces objects
        # In our case the action space is discrete: index of action
        self.action_space = spaces.Discrete(self.dim * self.dim)
        # The observation will be the state or configuration of the board
        self.observation_space = spaces.Box(low=-1, high=1,shape=(self.dim, self.dim), 
                                            dtype="float32")

    def step(self, action):       
        #print(action)
        temp = -1

        # assign a penalty for each illegal action used instead of a legal one
        if (action in self.misses or action in self.hits):

            reward = self.rewards[0]
            temp = action
            action = np.random.choice(self.legal_actions)
            return self.board, reward, False , {}
        # set new state after performing action (scoring board is updated)
        self.set_state(action)

        # new state on scoring board - this includes last action
        next_state = self.board

        # game completed?
        done = bool(len(self.hits) == np.sum(self.battleship.ships))
        # reward for a hit
        if action in self.enemyBoard and temp < 0: 
            reward = (self.rewards[1] if done else self.rewards[2])
        elif temp < 0:
            reward = self.rewards[3]
      
        reward = float(reward)
        info = {}
        

        return next_state, reward, done, info


    
    def reset(self):
        # Reset the state of the environment to an initial state
        """
        Important: the observation must be a numpy array
        :return: (np.array) 
        """
        
        self.board = self.cell['E']*np.ones((self.dim, self.dim), dtype='int')
        self.hits = set()
        self.misses = set()
        
        self.legal_actions = list(range(self.dim*self.dim)) 
               
        # generate a random board again if it was set randomly before

        self.enemyBoard = self.battleship.randomBoardFlatIndeces()

        return self.board
    
    # Render the environment to the screen
    # board (i,j)
    ## ------------>j
    ## | (0,0) | (0,1) | (0,2) | |
    ## | (1,0) | (1,1) | (1,2) | |
    ##                           v i
    def render(self, mode='human'):
        for i in range(self.dim):
            print("-"*(4*self.dim+2))
            for j in range(self.dim):
                current_state_value = self.board[i,j]
                current_state = list(self.cell.keys())[list(self.cell.values()).index(current_state_value)]
                current_state = (current_state if current_state!='E' else ' ')
                print(" | ", end="")
                print(current_state, end='')
            print(' |')
        print("-"*(4*self.dim+2))
        
    
    # set board configuration and state value after player action
    def set_state(self, action):
        if action in self.enemyBoard:
            self.hits.add(action)
            
            self.board[int(np.floor(action/self.dim)), action % self.dim] = self.cell["X"]
            
        else:
            self.misses.add(action)
            self.board[int(np.floor(action/self.dim)), action % self.dim] = self.cell["O"]
#         print(action)
#         print(int(np.floor(action/self.dim)), action % self.dim)
        self.legal_actions.remove(action)
            
            


In [27]:
dim = 10
ships = [2,3,3,4,5]
rewards = [-10,25,5,-1]

battleship = Battleship(dim, ships, True)

env = BattleshipEnv(battleship,rewards)

check_env(env, warn=True)

{64, 32, 2, 40, 74, 43, 12, 44, 42, 16, 50, 53, 54, 22, 26, 60, 63}


### Callback and Plotting

In [12]:
## This callback function is legacy and needs to be replaced with object oriented functions
## to work with all policies. See next callback function

def callback(_locals, _globals):
    """
    Callback called at each step (for DQN an others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward
    # Print stats every step_interval calls
    if (n_steps + 1) % step_interval == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            # NOTE: when done is True, timesteps are counted and reported to the log_dir
            mean_reward = np.mean(y[-episode_interval:]) # mean reward over previous episode_interval episodes
            mean_moves = np.mean(np.diff(x[-episode_interval:])) # mean moves over previous episode_interval episodes
            print(x[-1], 'timesteps') # closest to step_interval step number
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f} - Last mean moves per episode: {:.2f}".format(best_mean_reward, 
                                                                                           mean_reward, mean_moves))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    # Returning False will stop training early
    return True

In [13]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq: (int)
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contains the file created by the ``Monitor`` wrapper.
    :param verbose: (int)
    """
    def __init__(self, check_freq: int, episode_interval: int, log_dir: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.episode_interval = episode_interval
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model.pkl')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Evaluate policy training performance
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                # NOTE: when done is True, timesteps are counted and reported to the log_dir
                mean_reward = np.mean(y[-self.episode_interval:]) # mean reward over previous episode_interval episodes
                mean_moves = np.mean(np.diff(x[-self.episode_interval:])) # mean moves over previous 100 episodes
                if self.verbose > 0:
                    print(x[-1], 'timesteps') # closest to step_interval step number
                    print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f} - Last mean moves per episode: {:.2f}".format(self.best_mean_reward, 
                                                                                                   mean_reward, mean_moves))

                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    # Example for saving best model
                    if self.verbose > 0:
                        print("Saving new best model")
                    self.model.save(self.save_path)

        return True

In [14]:
def moving_average(values, window):
    """
    Smooth values by doing a moving average
    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


def plot_results(log_folder, window = 100, title='Learning Curve'):
    """
    plot the results

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=window)
    y_moves = moving_average(np.diff(x), window = window) 
    # Truncate x
    x = x[len(x) - len(y):]
    x_moves = x[len(x) - len(y_moves):]

    title = 'Smoothed Learning Curve of Rewards (every ' + str(window) +' steps)'
    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title)
    plt.show()

    title = 'Smoothed Learning Curve of Moves (every ' + str(window) +' steps)'
    fig = plt.figure(title)
    plt.plot(x_moves, y_moves)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Moves')
    plt.title(title)
    plt.show()

In [15]:

# class CustomCNN(BaseFeaturesExtractor):
#     """
#     :param observation_space: (gym.Space)
#     :param features_dim: (int) Number of features extracted.
#         This corresponds to the number of unit for the last layer.
#     """

#     def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
#         super(CustomCNN, self).__init__(observation_space, features_dim)
#         # We assume CxHxW images (channels first)
#         # Re-ordering will be done by pre-preprocessing or wrapper
#         self.layer1 = nn.Linear()
#         print(self.layer1)
        
#         self.cnn = nn.Sequential(
#             nn.Linear(10,200),
#             nn.Sigmoid(),
#             nn.Linear(200,400),
#             nn.Sigmoid(),
#             #nn.Conv2d(1, 1, kernel_size=40, stride=4, padding=1),
#             #nn.ReLU(),
#             nn.Flatten(),
#         )
#         #print(th.as_tensor(observation_space.sample()[None]).float())
#         # Compute shape by doing one forward pass
#         with th.no_grad():
#             n_flatten = self.cnn(
#                 th.as_tensor(observation_space.sample().reshape(1,dim,dim)[None]).float()
#             ).shape[1]
#         print(n_flatten)
#         self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

#     def forward(self, observations: th.Tensor) -> th.Tensor:
#         return self.linear(self.cnn(observations.reshape(1,1,dim,dim)))



### Playing with One Ship {3} : 5x5 board

In [2]:
dim = 5
ships = [3]
rewards = [-10, 25, 5, -0.1]
lr = 0.0001
num_timesteps = 500000000 # this is number of moves and not number of episodes
log_dir = "./gym_full/"
os.makedirs(log_dir, exist_ok=True)
policy_kwargs = dict(
                     activation_fn=th.nn.Tanh, 
                     net_arch=[200,dict(pi= [300,200,100], vf= [300,200,100])]
)


battleship = Battleship(dim, ships, True)

env = BattleshipEnv(battleship, rewards)
env = Monitor(
    env, 
    filename=log_dir, 
    allow_early_resets=True
)
env = DummyVecEnv([lambda: env])

best_mean_reward, n_steps, step_interval, episode_interval = -np.inf, 0, 10000, 10000

model = A2C(
    'MlpPolicy', 
    env,
    learning_rate=lr,
    policy_kwargs=policy_kwargs,
    verbose=0, 
    gamma=1
).learn(
    total_timesteps=num_timesteps, 
    callback=callback
)

NameError: name 'Battleship' is not defined

In [16]:
model_best = A2C.load('./gym_full/best_model.pkl')

In [18]:
num_timesteps = 495000000 # this is number of moves and not number of episodes
model_best.learn(
    total_timesteps=num_timesteps, 
    callback=callback
)

AttributeError: 'NoneType' object has no attribute 'reset'

### Playing with Two Ships {3,3} : 6x6 board

In [42]:
dim = 6
ships = [3,3]
rewards = [-45, 6, 6, -2]
lr = 0.00007
num_timesteps = 50000000 # this is number of moves and not number of episodes
log_dir = "./gym_full_6_new/"
os.makedirs(log_dir, exist_ok=True)
policy_kwargs = dict(
                     activation_fn=th.nn.Tanh, 
#                     net_arch=[100,120,100]

)


battleship = Battleship(dim, ships, True)

env = BattleshipEnv(battleship, rewards)
env = Monitor(
    env, 
    filename=log_dir, 
    allow_early_resets=True
)
env = DummyVecEnv([lambda: env])

best_mean_reward, n_steps, step_interval, episode_interval = -np.inf, 0, 10000, 10000

model = A2C(
    'MlpPolicy', 
    env,
    learning_rate=lr,
    policy_kwargs=policy_kwargs,
    verbose=0, 
    gamma=1
).learn(
    total_timesteps=num_timesteps, 
    callback=callback
)

{5, 11, 13, 14, 15, 17}
9959 timesteps
Best mean reward: -inf - Last mean reward per episode: -2157.36 - Last mean moves per episode: 79.11
Saving new best model
19984 timesteps
Best mean reward: -2157.36 - Last mean reward per episode: -2286.07 - Last mean moves per episode: 81.95
29982 timesteps
Best mean reward: -2157.36 - Last mean reward per episode: -2269.04 - Last mean moves per episode: 81.50
39870 timesteps
Best mean reward: -2157.36 - Last mean reward per episode: -2373.16 - Last mean moves per episode: 83.97
49952 timesteps
Best mean reward: -2157.36 - Last mean reward per episode: -2374.91 - Last mean moves per episode: 83.98
59968 timesteps
Best mean reward: -2157.36 - Last mean reward per episode: -2367.04 - Last mean moves per episode: 83.77
69958 timesteps
Best mean reward: -2157.36 - Last mean reward per episode: -2369.57 - Last mean moves per episode: 83.80
79956 timesteps
Best mean reward: -2157.36 - Last mean reward per episode: -2400.36 - Last mean moves per episod

629966 timesteps
Best mean reward: -1924.16 - Last mean reward per episode: -1920.86 - Last mean moves per episode: 72.63
Saving new best model
639984 timesteps
Best mean reward: -1920.86 - Last mean reward per episode: -1913.54 - Last mean moves per episode: 72.44
Saving new best model
649982 timesteps
Best mean reward: -1913.54 - Last mean reward per episode: -1906.20 - Last mean moves per episode: 72.24
Saving new best model
659939 timesteps
Best mean reward: -1906.20 - Last mean reward per episode: -1899.60 - Last mean moves per episode: 72.07
Saving new best model
669911 timesteps
Best mean reward: -1899.60 - Last mean reward per episode: -1892.20 - Last mean moves per episode: 71.87
Saving new best model
679981 timesteps
Best mean reward: -1892.20 - Last mean reward per episode: -1888.48 - Last mean moves per episode: 71.77
Saving new best model
689965 timesteps
Best mean reward: -1888.48 - Last mean reward per episode: -1879.32 - Last mean moves per episode: 71.54
Saving new bes

1209922 timesteps
Best mean reward: -1508.00 - Last mean reward per episode: -1505.04 - Last mean moves per episode: 61.24
Saving new best model
1219993 timesteps
Best mean reward: -1505.04 - Last mean reward per episode: -1505.46 - Last mean moves per episode: 61.23
1229989 timesteps
Best mean reward: -1505.04 - Last mean reward per episode: -1503.13 - Last mean moves per episode: 61.14
Saving new best model
1239993 timesteps
Best mean reward: -1503.13 - Last mean reward per episode: -1497.86 - Last mean moves per episode: 60.99
Saving new best model
1249901 timesteps
Best mean reward: -1497.86 - Last mean reward per episode: -1499.30 - Last mean moves per episode: 61.00
1259966 timesteps
Best mean reward: -1497.86 - Last mean reward per episode: -1497.05 - Last mean moves per episode: 60.93
Saving new best model
1269984 timesteps
Best mean reward: -1497.05 - Last mean reward per episode: -1494.08 - Last mean moves per episode: 60.84
Saving new best model
1279947 timesteps
Best mean r

1809895 timesteps
Best mean reward: -1407.06 - Last mean reward per episode: -1402.62 - Last mean moves per episode: 58.06
Saving new best model
1819937 timesteps
Best mean reward: -1402.62 - Last mean reward per episode: -1402.35 - Last mean moves per episode: 58.03
Saving new best model
1829986 timesteps
Best mean reward: -1402.35 - Last mean reward per episode: -1398.75 - Last mean moves per episode: 57.94
Saving new best model
1839991 timesteps
Best mean reward: -1398.75 - Last mean reward per episode: -1397.27 - Last mean moves per episode: 57.91
Saving new best model
1849984 timesteps
Best mean reward: -1397.27 - Last mean reward per episode: -1396.43 - Last mean moves per episode: 57.90
Saving new best model
1859958 timesteps
Best mean reward: -1396.43 - Last mean reward per episode: -1393.03 - Last mean moves per episode: 57.81
Saving new best model
1869921 timesteps
Best mean reward: -1393.03 - Last mean reward per episode: -1394.55 - Last mean moves per episode: 57.84
1879987

2429987 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1356.24 - Last mean moves per episode: 56.66
2439979 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1354.27 - Last mean moves per episode: 56.61
2449962 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1353.95 - Last mean moves per episode: 56.60
2459975 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1351.68 - Last mean moves per episode: 56.55
2469983 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1351.63 - Last mean moves per episode: 56.53
2479963 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1353.26 - Last mean moves per episode: 56.55
2489999 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1355.18 - Last mean moves per episode: 56.61
2499973 timesteps
Best mean reward: -1351.42 - Last mean reward per episode: -1352.27 - Last mean moves per episode: 56.53
2509930 timestep

3039996 timesteps
Best mean reward: -1279.25 - Last mean reward per episode: -1276.84 - Last mean moves per episode: 54.55
Saving new best model
3049953 timesteps
Best mean reward: -1276.84 - Last mean reward per episode: -1277.87 - Last mean moves per episode: 54.58
3059996 timesteps
Best mean reward: -1276.84 - Last mean reward per episode: -1276.77 - Last mean moves per episode: 54.54
Saving new best model
3069976 timesteps
Best mean reward: -1276.77 - Last mean reward per episode: -1268.79 - Last mean moves per episode: 54.33
Saving new best model
3079956 timesteps
Best mean reward: -1268.79 - Last mean reward per episode: -1267.14 - Last mean moves per episode: 54.29
Saving new best model
3089974 timesteps
Best mean reward: -1267.14 - Last mean reward per episode: -1266.16 - Last mean moves per episode: 54.26
Saving new best model
3099940 timesteps
Best mean reward: -1266.16 - Last mean reward per episode: -1261.66 - Last mean moves per episode: 54.13
Saving new best model
3109985

3639964 timesteps
Best mean reward: -1188.96 - Last mean reward per episode: -1186.86 - Last mean moves per episode: 52.30
Saving new best model
3649976 timesteps
Best mean reward: -1186.86 - Last mean reward per episode: -1187.37 - Last mean moves per episode: 52.30
3659975 timesteps
Best mean reward: -1186.86 - Last mean reward per episode: -1187.56 - Last mean moves per episode: 52.30
3669987 timesteps
Best mean reward: -1186.86 - Last mean reward per episode: -1186.14 - Last mean moves per episode: 52.26
Saving new best model
3679968 timesteps
Best mean reward: -1186.14 - Last mean reward per episode: -1186.64 - Last mean moves per episode: 52.28
3689950 timesteps
Best mean reward: -1186.14 - Last mean reward per episode: -1187.38 - Last mean moves per episode: 52.29
3699915 timesteps
Best mean reward: -1186.14 - Last mean reward per episode: -1185.59 - Last mean moves per episode: 52.24
Saving new best model
3709972 timesteps
Best mean reward: -1185.59 - Last mean reward per episo

4259956 timesteps
Best mean reward: -1142.08 - Last mean reward per episode: -1139.46 - Last mean moves per episode: 51.07
Saving new best model
4269976 timesteps
Best mean reward: -1139.46 - Last mean reward per episode: -1141.94 - Last mean moves per episode: 51.14
4279996 timesteps
Best mean reward: -1139.46 - Last mean reward per episode: -1142.87 - Last mean moves per episode: 51.14
4289971 timesteps
Best mean reward: -1139.46 - Last mean reward per episode: -1140.62 - Last mean moves per episode: 51.07
4299974 timesteps
Best mean reward: -1139.46 - Last mean reward per episode: -1138.58 - Last mean moves per episode: 51.01
Saving new best model
4309985 timesteps
Best mean reward: -1138.58 - Last mean reward per episode: -1137.23 - Last mean moves per episode: 50.98
Saving new best model
4319950 timesteps
Best mean reward: -1137.23 - Last mean reward per episode: -1135.12 - Last mean moves per episode: 50.92
Saving new best model
4329969 timesteps
Best mean reward: -1135.12 - Last

4859981 timesteps
Best mean reward: -1047.98 - Last mean reward per episode: -1048.00 - Last mean moves per episode: 48.67
4869969 timesteps
Best mean reward: -1047.98 - Last mean reward per episode: -1049.87 - Last mean moves per episode: 48.73
4879959 timesteps
Best mean reward: -1047.98 - Last mean reward per episode: -1047.73 - Last mean moves per episode: 48.68
Saving new best model
4889970 timesteps
Best mean reward: -1047.73 - Last mean reward per episode: -1047.21 - Last mean moves per episode: 48.66
Saving new best model
4899974 timesteps
Best mean reward: -1047.21 - Last mean reward per episode: -1049.87 - Last mean moves per episode: 48.75
4909988 timesteps
Best mean reward: -1047.21 - Last mean reward per episode: -1048.19 - Last mean moves per episode: 48.70
4919991 timesteps
Best mean reward: -1047.21 - Last mean reward per episode: -1045.32 - Last mean moves per episode: 48.63
Saving new best model
4929972 timesteps
Best mean reward: -1045.32 - Last mean reward per episo

5459975 timesteps
Best mean reward: -956.36 - Last mean reward per episode: -955.74 - Last mean moves per episode: 46.09
Saving new best model
5469991 timesteps
Best mean reward: -955.74 - Last mean reward per episode: -954.94 - Last mean moves per episode: 46.08
Saving new best model
5479964 timesteps
Best mean reward: -954.94 - Last mean reward per episode: -954.29 - Last mean moves per episode: 46.06
Saving new best model
5489994 timesteps
Best mean reward: -954.29 - Last mean reward per episode: -950.21 - Last mean moves per episode: 45.95
Saving new best model
5499970 timesteps
Best mean reward: -950.21 - Last mean reward per episode: -948.99 - Last mean moves per episode: 45.92
Saving new best model
5509995 timesteps
Best mean reward: -948.99 - Last mean reward per episode: -947.85 - Last mean moves per episode: 45.90
Saving new best model
5519975 timesteps
Best mean reward: -947.85 - Last mean reward per episode: -945.14 - Last mean moves per episode: 45.83
Saving new best model

6069972 timesteps
Best mean reward: -887.49 - Last mean reward per episode: -888.03 - Last mean moves per episode: 44.33
6079982 timesteps
Best mean reward: -887.49 - Last mean reward per episode: -885.03 - Last mean moves per episode: 44.25
Saving new best model
6089995 timesteps
Best mean reward: -885.03 - Last mean reward per episode: -881.80 - Last mean moves per episode: 44.16
Saving new best model
6099981 timesteps
Best mean reward: -881.80 - Last mean reward per episode: -879.26 - Last mean moves per episode: 44.10
Saving new best model
6109960 timesteps
Best mean reward: -879.26 - Last mean reward per episode: -880.09 - Last mean moves per episode: 44.12
6119989 timesteps
Best mean reward: -879.26 - Last mean reward per episode: -878.57 - Last mean moves per episode: 44.09
Saving new best model
6129991 timesteps
Best mean reward: -878.57 - Last mean reward per episode: -877.12 - Last mean moves per episode: 44.04
Saving new best model
6139962 timesteps
Best mean reward: -877.12

6679962 timesteps
Best mean reward: -812.51 - Last mean reward per episode: -813.43 - Last mean moves per episode: 42.28
6689900 timesteps
Best mean reward: -812.51 - Last mean reward per episode: -812.99 - Last mean moves per episode: 42.26
6699957 timesteps
Best mean reward: -812.51 - Last mean reward per episode: -812.37 - Last mean moves per episode: 42.26
Saving new best model
6709986 timesteps
Best mean reward: -812.37 - Last mean reward per episode: -809.43 - Last mean moves per episode: 42.17
Saving new best model
6719984 timesteps
Best mean reward: -809.43 - Last mean reward per episode: -807.87 - Last mean moves per episode: 42.12
Saving new best model
6729983 timesteps
Best mean reward: -807.87 - Last mean reward per episode: -805.74 - Last mean moves per episode: 42.07
Saving new best model
6739979 timesteps
Best mean reward: -805.74 - Last mean reward per episode: -803.86 - Last mean moves per episode: 42.01
Saving new best model
6749957 timesteps
Best mean reward: -803.86

7279998 timesteps
Best mean reward: -724.09 - Last mean reward per episode: -722.46 - Last mean moves per episode: 39.63
Saving new best model
7289975 timesteps
Best mean reward: -722.46 - Last mean reward per episode: -721.35 - Last mean moves per episode: 39.60
Saving new best model
7299993 timesteps
Best mean reward: -721.35 - Last mean reward per episode: -718.04 - Last mean moves per episode: 39.53
Saving new best model
7309971 timesteps
Best mean reward: -718.04 - Last mean reward per episode: -715.56 - Last mean moves per episode: 39.46
Saving new best model
7319973 timesteps
Best mean reward: -715.56 - Last mean reward per episode: -711.83 - Last mean moves per episode: 39.36
Saving new best model
7329978 timesteps
Best mean reward: -711.83 - Last mean reward per episode: -711.08 - Last mean moves per episode: 39.36
Saving new best model
7339975 timesteps
Best mean reward: -711.08 - Last mean reward per episode: -711.80 - Last mean moves per episode: 39.37
7350000 timesteps
Bes

7889993 timesteps
Best mean reward: -652.97 - Last mean reward per episode: -651.85 - Last mean moves per episode: 37.56
Saving new best model
7899972 timesteps
Best mean reward: -651.85 - Last mean reward per episode: -651.15 - Last mean moves per episode: 37.54
Saving new best model
7909995 timesteps
Best mean reward: -651.15 - Last mean reward per episode: -650.37 - Last mean moves per episode: 37.51
Saving new best model
7919987 timesteps
Best mean reward: -650.37 - Last mean reward per episode: -647.67 - Last mean moves per episode: 37.42
Saving new best model
7929995 timesteps
Best mean reward: -647.67 - Last mean reward per episode: -645.42 - Last mean moves per episode: 37.35
Saving new best model
7939988 timesteps
Best mean reward: -645.42 - Last mean reward per episode: -643.32 - Last mean moves per episode: 37.29
Saving new best model
7949978 timesteps
Best mean reward: -643.32 - Last mean reward per episode: -642.76 - Last mean moves per episode: 37.24
Saving new best model

8479961 timesteps
Best mean reward: -583.74 - Last mean reward per episode: -580.39 - Last mean moves per episode: 35.24
Saving new best model
8489987 timesteps
Best mean reward: -580.39 - Last mean reward per episode: -576.42 - Last mean moves per episode: 35.13
Saving new best model
8499986 timesteps
Best mean reward: -576.42 - Last mean reward per episode: -573.97 - Last mean moves per episode: 35.05
Saving new best model
8509988 timesteps
Best mean reward: -573.97 - Last mean reward per episode: -575.31 - Last mean moves per episode: 35.09
8519970 timesteps
Best mean reward: -573.97 - Last mean reward per episode: -573.99 - Last mean moves per episode: 35.05
8529995 timesteps
Best mean reward: -573.97 - Last mean reward per episode: -573.40 - Last mean moves per episode: 35.03
Saving new best model
8539994 timesteps
Best mean reward: -573.40 - Last mean reward per episode: -570.29 - Last mean moves per episode: 34.94
Saving new best model
8549992 timesteps
Best mean reward: -570.29

9090000 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -518.80 - Last mean moves per episode: 33.29
9099988 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -517.53 - Last mean moves per episode: 33.24
9109966 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -518.12 - Last mean moves per episode: 33.25
9119984 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -517.86 - Last mean moves per episode: 33.23
9129982 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -517.78 - Last mean moves per episode: 33.22
9139973 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -517.86 - Last mean moves per episode: 33.22
9149940 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -517.80 - Last mean moves per episode: 33.21
9159999 timesteps
Best mean reward: -517.44 - Last mean reward per episode: -516.14 - Last mean moves per episode: 33.15
Saving new best model
9169926 ti

9699995 timesteps
Best mean reward: -471.92 - Last mean reward per episode: -472.93 - Last mean moves per episode: 31.78
9709979 timesteps
Best mean reward: -471.92 - Last mean reward per episode: -471.78 - Last mean moves per episode: 31.74
Saving new best model
9719997 timesteps
Best mean reward: -471.78 - Last mean reward per episode: -471.84 - Last mean moves per episode: 31.74
9729997 timesteps
Best mean reward: -471.78 - Last mean reward per episode: -471.11 - Last mean moves per episode: 31.72
Saving new best model
9739987 timesteps
Best mean reward: -471.11 - Last mean reward per episode: -466.96 - Last mean moves per episode: 31.60
Saving new best model
9749950 timesteps
Best mean reward: -466.96 - Last mean reward per episode: -466.67 - Last mean moves per episode: 31.59
Saving new best model
9759989 timesteps
Best mean reward: -466.67 - Last mean reward per episode: -465.18 - Last mean moves per episode: 31.55
Saving new best model
9769969 timesteps
Best mean reward: -465.18

10319977 timesteps
Best mean reward: -436.17 - Last mean reward per episode: -435.54 - Last mean moves per episode: 30.59
Saving new best model
10329988 timesteps
Best mean reward: -435.54 - Last mean reward per episode: -433.49 - Last mean moves per episode: 30.53
Saving new best model
10339994 timesteps
Best mean reward: -433.49 - Last mean reward per episode: -433.22 - Last mean moves per episode: 30.51
Saving new best model
10350000 timesteps
Best mean reward: -433.22 - Last mean reward per episode: -433.49 - Last mean moves per episode: 30.53
10359985 timesteps
Best mean reward: -433.22 - Last mean reward per episode: -432.12 - Last mean moves per episode: 30.49
Saving new best model
10369994 timesteps
Best mean reward: -432.12 - Last mean reward per episode: -431.65 - Last mean moves per episode: 30.49
Saving new best model
10379988 timesteps
Best mean reward: -431.65 - Last mean reward per episode: -431.01 - Last mean moves per episode: 30.47
Saving new best model
10389976 times

10939987 timesteps
Best mean reward: -394.50 - Last mean reward per episode: -394.28 - Last mean moves per episode: 29.23
Saving new best model
10949989 timesteps
Best mean reward: -394.28 - Last mean reward per episode: -395.96 - Last mean moves per episode: 29.27
10959981 timesteps
Best mean reward: -394.28 - Last mean reward per episode: -394.01 - Last mean moves per episode: 29.20
Saving new best model
10969992 timesteps
Best mean reward: -394.01 - Last mean reward per episode: -394.52 - Last mean moves per episode: 29.22
10979975 timesteps
Best mean reward: -394.01 - Last mean reward per episode: -394.65 - Last mean moves per episode: 29.21
10989969 timesteps
Best mean reward: -394.01 - Last mean reward per episode: -392.93 - Last mean moves per episode: 29.16
Saving new best model
10999991 timesteps
Best mean reward: -392.93 - Last mean reward per episode: -389.60 - Last mean moves per episode: 29.05
Saving new best model
11009982 timesteps
Best mean reward: -389.60 - Last mean r

11579972 timesteps
Best mean reward: -362.31 - Last mean reward per episode: -361.43 - Last mean moves per episode: 28.09
Saving new best model
11589993 timesteps
Best mean reward: -361.43 - Last mean reward per episode: -361.61 - Last mean moves per episode: 28.09
11599968 timesteps
Best mean reward: -361.43 - Last mean reward per episode: -361.92 - Last mean moves per episode: 28.10
11609975 timesteps
Best mean reward: -361.43 - Last mean reward per episode: -360.71 - Last mean moves per episode: 28.09
Saving new best model
11619968 timesteps
Best mean reward: -360.71 - Last mean reward per episode: -359.68 - Last mean moves per episode: 28.06
Saving new best model
11629979 timesteps
Best mean reward: -359.68 - Last mean reward per episode: -359.45 - Last mean moves per episode: 28.05
Saving new best model
11639979 timesteps
Best mean reward: -359.45 - Last mean reward per episode: -357.31 - Last mean moves per episode: 28.00
Saving new best model
11649992 timesteps
Best mean reward:

12189995 timesteps
Best mean reward: -332.84 - Last mean reward per episode: -331.80 - Last mean moves per episode: 27.03
Saving new best model
12199984 timesteps
Best mean reward: -331.80 - Last mean reward per episode: -328.41 - Last mean moves per episode: 26.93
Saving new best model
12209987 timesteps
Best mean reward: -328.41 - Last mean reward per episode: -324.78 - Last mean moves per episode: 26.84
Saving new best model
12219995 timesteps
Best mean reward: -324.78 - Last mean reward per episode: -324.01 - Last mean moves per episode: 26.80
Saving new best model
12229957 timesteps
Best mean reward: -324.01 - Last mean reward per episode: -325.55 - Last mean moves per episode: 26.83
12239996 timesteps
Best mean reward: -324.01 - Last mean reward per episode: -326.56 - Last mean moves per episode: 26.86
12249992 timesteps
Best mean reward: -324.01 - Last mean reward per episode: -327.08 - Last mean moves per episode: 26.87
12259981 timesteps
Best mean reward: -324.01 - Last mean r

12819981 timesteps
Best mean reward: -293.99 - Last mean reward per episode: -291.87 - Last mean moves per episode: 25.69
Saving new best model
12829992 timesteps
Best mean reward: -291.87 - Last mean reward per episode: -293.30 - Last mean moves per episode: 25.71
12839984 timesteps
Best mean reward: -291.87 - Last mean reward per episode: -294.67 - Last mean moves per episode: 25.73
12849985 timesteps
Best mean reward: -291.87 - Last mean reward per episode: -295.67 - Last mean moves per episode: 25.74
12859998 timesteps
Best mean reward: -291.87 - Last mean reward per episode: -296.53 - Last mean moves per episode: 25.74
12869975 timesteps
Best mean reward: -291.87 - Last mean reward per episode: -297.49 - Last mean moves per episode: 25.75
12879989 timesteps
Best mean reward: -291.87 - Last mean reward per episode: -296.41 - Last mean moves per episode: 25.73
12889983 timesteps
Best mean reward: -291.87 - Last mean reward per episode: -294.52 - Last mean moves per episode: 25.68
12

13459995 timesteps
Best mean reward: -270.32 - Last mean reward per episode: -272.15 - Last mean moves per episode: 25.00
13469962 timesteps
Best mean reward: -270.32 - Last mean reward per episode: -273.97 - Last mean moves per episode: 25.03
13479953 timesteps
Best mean reward: -270.32 - Last mean reward per episode: -274.04 - Last mean moves per episode: 25.03
13489992 timesteps
Best mean reward: -270.32 - Last mean reward per episode: -273.68 - Last mean moves per episode: 25.02
13499980 timesteps
Best mean reward: -270.32 - Last mean reward per episode: -273.22 - Last mean moves per episode: 25.01
13509998 timesteps
Best mean reward: -270.32 - Last mean reward per episode: -270.28 - Last mean moves per episode: 24.95
Saving new best model
13519998 timesteps
Best mean reward: -270.28 - Last mean reward per episode: -270.87 - Last mean moves per episode: 24.97
13529971 timesteps
Best mean reward: -270.28 - Last mean reward per episode: -272.27 - Last mean moves per episode: 25.02
13

14099997 timesteps
Best mean reward: -251.77 - Last mean reward per episode: -251.35 - Last mean moves per episode: 24.24
Saving new best model
14109997 timesteps
Best mean reward: -251.35 - Last mean reward per episode: -249.02 - Last mean moves per episode: 24.18
Saving new best model
14119981 timesteps
Best mean reward: -249.02 - Last mean reward per episode: -249.02 - Last mean moves per episode: 24.17
Saving new best model
14130000 timesteps
Best mean reward: -249.02 - Last mean reward per episode: -252.55 - Last mean moves per episode: 24.25
14139985 timesteps
Best mean reward: -249.02 - Last mean reward per episode: -249.11 - Last mean moves per episode: 24.16
14149965 timesteps
Best mean reward: -249.02 - Last mean reward per episode: -247.94 - Last mean moves per episode: 24.14
Saving new best model
14159996 timesteps
Best mean reward: -247.94 - Last mean reward per episode: -245.51 - Last mean moves per episode: 24.09
Saving new best model
14169984 timesteps
Best mean reward:

14739984 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -238.99 - Last mean moves per episode: 23.69
14749993 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -241.99 - Last mean moves per episode: 23.76
14759995 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -242.84 - Last mean moves per episode: 23.78
14769994 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -239.00 - Last mean moves per episode: 23.69
14779962 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -239.91 - Last mean moves per episode: 23.72
14789986 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -239.45 - Last mean moves per episode: 23.69
14799979 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -238.29 - Last mean moves per episode: 23.64
14809997 timesteps
Best mean reward: -232.68 - Last mean reward per episode: -232.63 - Last mean moves per episode: 23.52
Saving new best model
14

15389999 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -212.06 - Last mean moves per episode: 22.85
15399991 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -211.18 - Last mean moves per episode: 22.82
15409993 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -212.14 - Last mean moves per episode: 22.83
15419984 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -211.80 - Last mean moves per episode: 22.82
15429980 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -212.07 - Last mean moves per episode: 22.83
15439987 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -211.11 - Last mean moves per episode: 22.80
15449978 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -212.72 - Last mean moves per episode: 22.84
15459952 timesteps
Best mean reward: -210.80 - Last mean reward per episode: -213.40 - Last mean moves per episode: 22.86
15469993 timesteps
Best 

16049984 timesteps
Best mean reward: -198.66 - Last mean reward per episode: -202.89 - Last mean moves per episode: 22.41
16059992 timesteps
Best mean reward: -198.66 - Last mean reward per episode: -200.77 - Last mean moves per episode: 22.34
16069993 timesteps
Best mean reward: -198.66 - Last mean reward per episode: -195.97 - Last mean moves per episode: 22.23
Saving new best model
16079985 timesteps
Best mean reward: -195.97 - Last mean reward per episode: -195.43 - Last mean moves per episode: 22.21
Saving new best model
16089997 timesteps
Best mean reward: -195.43 - Last mean reward per episode: -196.30 - Last mean moves per episode: 22.23
16100000 timesteps
Best mean reward: -195.43 - Last mean reward per episode: -195.71 - Last mean moves per episode: 22.19
16109994 timesteps
Best mean reward: -195.43 - Last mean reward per episode: -199.85 - Last mean moves per episode: 22.30
16119992 timesteps
Best mean reward: -195.43 - Last mean reward per episode: -199.48 - Last mean moves

16699993 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -178.37 - Last mean moves per episode: 21.58
16709983 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -177.57 - Last mean moves per episode: 21.54
16719987 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -179.02 - Last mean moves per episode: 21.56
16729988 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -180.32 - Last mean moves per episode: 21.58
16739989 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -182.46 - Last mean moves per episode: 21.65
16749999 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -180.95 - Last mean moves per episode: 21.60
16759963 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -181.27 - Last mean moves per episode: 21.62
16769982 timesteps
Best mean reward: -168.86 - Last mean reward per episode: -179.97 - Last mean moves per episode: 21.58
16779999 timesteps
Best 

17359991 timesteps
Best mean reward: -154.91 - Last mean reward per episode: -157.58 - Last mean moves per episode: 20.73
17369998 timesteps
Best mean reward: -154.91 - Last mean reward per episode: -157.70 - Last mean moves per episode: 20.74
17379989 timesteps
Best mean reward: -154.91 - Last mean reward per episode: -158.21 - Last mean moves per episode: 20.76
17389997 timesteps
Best mean reward: -154.91 - Last mean reward per episode: -163.12 - Last mean moves per episode: 20.89
17399987 timesteps
Best mean reward: -154.91 - Last mean reward per episode: -162.79 - Last mean moves per episode: 20.89
17409996 timesteps
Best mean reward: -154.91 - Last mean reward per episode: -163.86 - Last mean moves per episode: 20.92


KeyboardInterrupt: 

### Playing with Three Ships {2,3,3} : 7x7 board

In [None]:
plot_results(log_dir, 1000)

### Playing with One Ship on a Bigger Board

In [None]:
dim = 10
ships = [2,3,3,4,5]
rewards = [-10, 6, 6, -0.5]
lr = 0.00005
num_timesteps = 50000000 # this is number of moves and not number of episodes
log_dir = "./gym_full_6/"
os.makedirs(log_dir, exist_ok=True)
policy_kwargs = dict(
                     activation_fn=th.nn.Tanh, 
#                     net_arch=[100,120,100]

)


battleship = Battleship(dim, ships, True)

env = BattleshipEnv(battleship, rewards)
env = Monitor(
    env, 
    filename=log_dir, 
    allow_early_resets=True
)
env = DummyVecEnv([lambda: env])

best_mean_reward, n_steps, step_interval, episode_interval = -np.inf, 0, 10000, 10000

model = A2C(
    'MlpPolicy', 
    env,
    learning_rate=lr,
    policy_kwargs=policy_kwargs,
    verbose=0, 
    gamma=1
).learn(
    total_timesteps=num_timesteps, 
    callback=callback
)

In [None]:
# save manually 
#model.save(log_dir + 'best_model_cruiser_10x10.pkl')

In [None]:
plot_results(log_dir,1000)

## 10x10 Board

In [None]:
dim = 10
ships = [2,3,3,4,5]
rewards = [-150, 100, 10, -0.75]
lr = 0.000125
num_timesteps = 500000000 # this is number of moves and not number of episodes
log_dir = "./gym_full_10/"
os.makedirs(log_dir, exist_ok=True)
policy_kwargs = dict(
                     activation_fn=th.nn.Tanh, 
                     net_arch=[200,dict(pi= [300,200,100], vf= [300,200,100])]
)


battleship = Battleship(dim, ships, True)

env = BattleshipEnv(battleship, rewards)
env = Monitor(
    env, 
    filename=log_dir, 
    allow_early_resets=True
)
env = DummyVecEnv([lambda: env])

best_mean_reward, n_steps, step_interval, episode_interval = -np.inf, 0, 10000, 10000

model = A2C(
    'MlpPolicy', 
    env,
    learning_rate=lr,
    policy_kwargs=policy_kwargs,
    verbose=0, 
    gamma=1
).learn(
    total_timesteps=num_timesteps, 
    callback=callback
)

### Visualizing How the Agent Plays

In [None]:
#model_best = A2C.load('./gym/best_model_cruiser_5x5.pkl')
#model_best = A2C.load('./gym/best_model_cruiser_6x6.pkl')
model_best = A2C.load('./gym/best_model_cruiser_7x7.pkl')
#model_best = A2C.load('./gym/best_model_cruiser_10x10.pkl')

In [None]:
# brew install ffmpeg
# brew install gifsicle
# Shift + Command + 5 for recording. This saves .mov file
# right-click on mov file, get info for video size to use here below
# ffmpeg -i in.mov -s 448x790 -pix_fmt rgb24 -r 10 -f gif - | gifsicle --optimize=3 --delay=3 > out.gif

from IPython.display import clear_output
import time

ships = {}
ships['cruiser'] = 3

grid_size=7
enemy_board = 0*np.ones((grid_size, grid_size), dtype='int')
#enemy_board[3,5] = 1
#enemy_board[4,5] = 1
#enemy_board[5,5] = 1
env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)
# give me time to setup recording
time.sleep(5)
for ep in range(10):
    obs = env.reset()
    ## 2 empty boards
    done = False
    nmoves = 0
    print('episode no.', ep, '# moves:', nmoves)
    env.render()
    env.render()
    time.sleep(5)
    clear_output(wait=True)        
    while not done:
        action, obs = model_best.predict(obs, deterministic=True)
        obs, _, done , _ = env.step(action)
        nmoves += 1
        print('episode no.', ep, '# moves:', nmoves)
        env.render()
        board_rendering(grid_size, env.enemy_board)
        time.sleep(np.random.uniform(1,3))
        clear_output(wait=True)        
        

### Optimizing The Algorithm Parameters with Hyperopt

In [None]:
## To optimize a RL model, see https://github.com/araffin/rl-baselines-zoo/tree/master/hyperparams or
## in general https://github.com/araffin/rl-baselines-zoo. This package uses optuna optimization
## but it works for the trained agents there. You can modify this package to include your case
## or just use the yml file to see what parameters to tune

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import DQN, PPO2, A2C, ACKTR
from stable_baselines3.bench import Monitor

# Agent hyperparameter optimization
def objective(space):
    

    env_copies = space['env_copies']    
    num_timesteps = space['num_timesteps']
    gamma = space['gamma']
    n_steps = space['n_steps']
    vf_coef = space['vf_coef']
    ent_coef = space['ent_coef']
    max_grad_norm = space['max_grad_norm']
    learning_rate = space['learning_rate']
    alpha = space['alpha']
    epsilon = space['epsilon']
    lr_schedule = space['lr_schedule']
    
    print('space:', space)
    
    # ships
    ships = {}
    ships['cruiser'] = 3

    grid_size = 7

    # Instantiate the env
    env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)

    env = DummyVecEnv([lambda: env]*env_copies)
      
    model = A2C('MlpPolicy', env, verbose=0, 
                 gamma=gamma,
                 n_steps=n_steps,
                 ent_coef=ent_coef,
                 learning_rate=learning_rate,
                 vf_coef=vf_coef,
                 max_grad_norm=max_grad_norm,
                 alpha=alpha,
                 epsilon=epsilon,
                 lr_schedule=lr_schedule
               ).learn(total_timesteps=num_timesteps)
        
    rewards_mean = []
    moves_mean = []
    n_episodes = 100
    for ep in range(n_episodes):
        reward_env = []
        moves_env = []
        for env_i in env.envs:
            obs = env_i.reset()
            done = False
            rewards_sum = 0
            moves = 0
            while not done:
                action, obs = model.predict(obs, deterministic=True)
                obs, reward, done , _ = env_i.step(action)
                rewards_sum += reward # total reward for this episode
                moves += 1
            reward_env.append(rewards_sum)
            moves_env.append(moves)
        rewards_mean.append(np.min(reward_env)) # avg environment reward 
        moves_mean.append(np.mean(moves_env)) # avg environment reward 
    rewards_mean = np.mean(rewards_mean)
    moves_mean = np.mean(moves_mean)

    print('reward', rewards_mean, 'moves', moves_mean)
    
    # hyperopt will minimize objective, number of moves in this case
    return{'loss': moves_mean, 'status': STATUS_OK }

In [None]:
space = {
    'env_copies': hp.choice('env_copies', [10]),
    'num_timesteps': hp.choice('num_timesteps', [1000000]), #np.arange(1000000, 1000001, 1000000, dtype=int)
    'gamma': hp.choice('gamma', [0.99, 0.95, 0.9]),
    'n_steps': hp.choice('n_steps', [5, 1, 10]),
    'vf_coef': hp.choice('vf_coef', [0.25, 0.1, 0.5]),
    'ent_coef': hp.choice('ent_coef', [0.01, 0.1]), 
    'learning_rate': hp.choice('learning_rate', [0.0007]),
    'max_grad_norm': hp.choice('max_grad_norm', [0.5, 0.2, 0.7]), 
    'alpha': hp.choice('lam', [0.99, 0.95, 0.9]), 
    'epsilon': hp.choice('epsilon', [1e-5, 1e-3, 1e-4]), 
    'lr_schedule': hp.choice('lr_schedule', ['constant', 'linear'])
}


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30, 
            trials=trials, verbose=1)

In [None]:
#%debug

In [None]:
param_dist = space_eval(space, best)
param_dist

### Reward scheme

For any action: 
$$r=-1,$$ 
but if an action is illegal (moving to a non-empty cell), a random action is drawn from the action space. 

This action is penalized assigning:

$$r=-2*S.$$

where $S$ is the grid side length.

If an action results into a hit:
$$
r = S.
$$
If all ship cells are hit (game is completed)
$$R = S*S.$$

### Skeleton Battleship Environmnt

In [None]:
class BattleshipEnv(gym.Env):
    
    """Custom Environment that follows gym interface"""
    """see https://github.com/openai/gym/blob/master/gym/core.py"""
    
    metadata = {'render.modes': ['human']} 

    def __init__(self, enemy_board, ship_locs, grid_size, ships):
        
        super(BattleshipEnv, self).__init__()
        
        # Define action and observation space
        # They must be gym.spaces objects
        # In our case the action space is discrete: index of action
        self.action_space = spaces.Discrete(self.grid_size * self.grid_size)
        # The observation will be the state or configuration of the board
        self.observation_space = spaces.Box(low=-1, high=1,shape=(self.grid_size, self.grid_size), 
                                            dtype=np.int)
        
        pass
            
    # an action will be an index of action_space either from epsilon-greedy
    # or from model prediction
    def step(self, action):
            
        """
        Rewards for action and sets next state
        Also, checks if game is completed (done)
        :return: next_state, reward, done, info
        """
        
        pass
    
    def reset(self):
        """
        Resets the state of the environment to an initial state
        :return: (np.array) state
        """
        
        pass
    
    def render(self, mode='human'):
        """
        Human readable state. In this case the scoring board
        """
        
        pass