## Playing with Fishing Gym

In [1]:
pwd

'/Users/MLapeyrolerie/git/open_ai_fishing'

In [2]:
import gym_fishing
import gym
import math
import random
from math import floor
import bokeh.plotting
import numpy as np
import bokeh.io
from itertools import count
import bokeh.plotting
from bokeh.models import Span

# Configure Bokeh so that later bokeh.io.show(...) calls render inline in the notebook.
bokeh.io.output_notebook()

I define a function that finds the first time step at which a trajectory drops below a given threshold. I use a threshold of 100 in the plots shown below.

In [3]:
def find_collapse_point(trajectory, thresh):
    """Return the first index at which ``trajectory`` is strictly below ``thresh``.

    Parameters
    ----------
    trajectory : array-like
        Sequence of values (e.g. population per season). Lists and tuples
        are accepted as well as NumPy arrays.
    thresh : number
        Threshold; an entry counts as collapsed when it is ``< thresh``.

    Returns
    -------
    int or None
        Index of the first entry below the threshold, or ``None`` when no
        entry is below it (including an empty trajectory).
    """
    # np.asarray lets plain Python sequences take part in the elementwise comparison.
    indices = np.where(np.asarray(trajectory) < thresh)[0]
    # Test for "no crossing" explicitly rather than catching whatever
    # np.min raises on an empty array with a bare ``except``.
    if indices.size == 0:
        return None
    return int(indices.min())

Now I make plots of 100 trajectories. Here I choose a random harvest quota at each season. The red line shows the average time to collapse.

In [4]:
n_samples = 100
# Set up plots
plot = bokeh.plotting.figure(plot_width=600,
                             plot_height=400,
                             x_axis_label='time',
                             y_axis_label='N')


env = gym.make('fishing-v0')
collapse_points = []
harvests = []
for sample in range(n_samples):
    # Start every sampled run from a fresh environment state.
    env.random_reset()
    sample_traj = [env.fish_population]
    for season in range(100):
        # Random quota: an integer drawn from [0, fish_population), scaled by 2 / 1e5
        # before being passed to the environment.
        # NOTE(review): the scaling presumably maps a fish count onto the env's
        # action units -- confirm against gym_fishing.
        env.step(random.randrange(env.fish_population) * 2 / 1e5)
        sample_traj.append(env.fish_population)
    # Record env.reward as it stands after the final season of this run.
    harvests.append(env.reward)
    sample_traj = np.array(sample_traj)
    collapse_point = find_collapse_point(sample_traj, 100)
    if collapse_point is not None:
        collapse_points.append(collapse_point)
    plot.line(range(len(sample_traj)), sample_traj, line_width=0.5, alpha=0.5)

# Mark the mean collapse time with a red vertical line. Skip it when no run
# collapsed: np.mean([]) is nan, which is not a usable Span location.
if collapse_points:
    vline = Span(location=np.mean(collapse_points), dimension='height', line_color='red', line_width=1)
    plot.renderers.extend([vline])
bokeh.io.show(plot)
env.close()

In [5]:
# Mean over the sampled runs of env.reward as recorded at the end of each run (random-quota policy).
np.mean(harvests)

1.6103689000000005

Here I choose the quota $\text{MSY} = \frac{K r}{4}$ at each season. The red line shows the average time to collapse.

In [6]:
n_samples = 100
# Set up plots
plot = bokeh.plotting.figure(plot_width=600,
                             plot_height=400,
                             x_axis_label='time',
                             y_axis_label='N')


env = gym.make('fishing-v0')
collapse_points = []
harvests = []
for sample in range(n_samples):
    # Start every sampled run from a fresh environment state.
    env.random_reset()
    sample_traj = [env.fish_population]
    for season in range(100):
        # Deterministic quota: half of the current (integer-truncated) population,
        # scaled by 2 / 1e5 before being passed to the environment.
        # NOTE(review): the scaling presumably maps a fish count onto the env's
        # action units -- confirm against gym_fishing.
        env.step(floor(int(env.fish_population) / 2) / 1e5 * 2)
        sample_traj.append(env.fish_population)
    # Record env.reward as it stands after the final season of this run.
    harvests.append(env.reward)
    sample_traj = np.array(sample_traj)
    collapse_point = find_collapse_point(sample_traj, 100)
    if collapse_point is not None:
        collapse_points.append(collapse_point)
    plot.line(range(len(sample_traj)), sample_traj, line_width=0.5, alpha=0.5)

# Mark the mean collapse time with a red vertical line. The marker is only
# drawn when at least one run collapsed: np.mean([]) is nan, which is not a
# usable Span location.
if collapse_points:
    vline = Span(location=np.mean(collapse_points), dimension='height', line_color='red', line_width=1)
    plot.renderers.extend([vline])
bokeh.io.show(plot)
env.close()

In [7]:
# Mean over the sampled runs of env.reward as recorded at the end of each run (half-population quota policy).
np.mean(harvests)

4.6151004

Next, I attempt to implement deep Q-learning to choose the fishing quota each season. This borrows heavily from the PyTorch reinforcement Q-learning tutorial (https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html).

In [8]:
import gym
import math
import random
import numpy as np
from collections import namedtuple
from itertools import count
from copy import deepcopy
import torch
import torch.nn as nn
from torch.nn.utils.clip_grad import clip_grad_norm_
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


# Environment instance used by the DQN training cells below.
env = gym.make('fishing-v0')


# if gpu is to be used
# Select CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Setting up a named tuple and ReplayMemory class to access past transitions.

In [9]:
# One stored experience: the state, the action taken, the resulting state and the reward.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory(object):
    """Fixed-capacity store of ``Transition`` records that overwrites the oldest entry when full."""

    def __init__(self, capacity):
        # ``position`` is the slot the next pushed transition will be written to.
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        slot = self.position
        still_filling = len(self.memory) < self.capacity
        if still_filling:
            # Grow the list by one placeholder so that ``slot`` is a valid index.
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        # Wrap around so the oldest entry is the next one to be replaced.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn at random without replacement."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

Building a neural network with one hidden layer.

In [10]:
class DQN(nn.Module):
    """Fully connected Q-network with a single hidden layer.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : int
        Width of the hidden layer.
    num_quotas : int
        Number of outputs, one Q-value per available quota (action).
    """

    def __init__(self, input_size, hidden_size, num_quotas):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # The output layer is sized by num_quotas. It was previously sized by
        # hidden_size, which silently ignored num_quotas and only worked
        # when the two happened to be equal.
        self.fc2 = nn.Linear(hidden_size, num_quotas)

    def forward(self, N):
        """Return the Q-values for input ``N``; the last dimension has size ``num_quotas``."""
        hidden = F.leaky_relu(self.fc1(N))
        return self.fc2(hidden)

Now on to training:

In [11]:
# Hyperparameters read by select_action, optimize_model and the training loop.
BATCH_SIZE = 100  # transitions sampled from replay memory per optimisation step
GAMMA = 0.5  # discount factor applied to the next-state value
EPS_START = 0.9  # exploration probability at steps_done == 0
EPS_END = 0.05  # value the exploration probability decays towards
EPS_DECAY = 1e3  # decay scale (in action-selection steps) of the exponential epsilon schedule
TARGET_UPDATE = 1e1  # episodes between target-network syncs (also the progress-print interval)
N_ACTIONS = len(env.action_space)  # number of entries in the environment's action space

# Two networks of identical shape: policy_net is trained, target_net has gradients
# disabled and starts as an exact copy of policy_net.
policy_net = DQN(1, N_ACTIONS, N_ACTIONS).to(device).requires_grad_(requires_grad=True)
target_net = DQN(1, N_ACTIONS, N_ACTIONS).to(device).requires_grad_(requires_grad=False)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only policy_net's parameters are handed to the optimizer.
optimizer = optim.SGD(policy_net.parameters(), lr=0.00001)
memory = ReplayMemory(10000)

# Global counter of select_action calls; drives the epsilon decay.
steps_done = 0

def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    The exploration probability decays exponentially in the global
    ``steps_done`` counter from ``EPS_START`` towards ``EPS_END``; the counter
    is incremented on every call. The result is a 1x1 float tensor on
    ``device`` holding the chosen entry of ``env.action_space``.
    """
    global steps_done
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if draw <= eps_threshold:
        # Explore: uniformly random entry of the action space.
        chosen = env.action_space[random.randrange(N_ACTIONS)]
    else:
        # Exploit: the network outputs are index-aligned with env.action_space,
        # so the position of the largest Q-value selects the action.
        with torch.no_grad():
            max_index = policy_net(state).max(0)[1]
            chosen = env.action_space[max_index]
    return torch.tensor([[chosen]], device=device, dtype=torch.float)

In [12]:
# Deep-copied snapshot of policy_net's parameters at this point (before the training
# cell runs) -- presumably kept for comparison with the values displayed after training.
x = deepcopy(list(policy_net.parameters()))
x

[Parameter containing:
 tensor([[ 0.0126],
         [ 0.4402],
         [-0.3569],
         [ 0.2646]], requires_grad=True), Parameter containing:
 tensor([ 0.6456, -0.4984,  0.8930,  0.7037], requires_grad=True), Parameter containing:
 tensor([[ 0.1127,  0.2409,  0.0662,  0.1244],
         [ 0.3299, -0.0093,  0.1296, -0.3195],
         [ 0.4085, -0.1469, -0.0527, -0.3274],
         [ 0.3671, -0.3976, -0.2513, -0.2793]], requires_grad=True), Parameter containing:
 tensor([-0.4312, -0.1399,  0.2193,  0.0203], requires_grad=True)]

In [13]:
loss_history = []
def optimize_model():
    """Run one double-DQN gradient step on ``policy_net`` using a replay batch.

    Does nothing until ``memory`` holds at least ``BATCH_SIZE`` transitions.
    Otherwise it samples a batch, builds the targets
    ``reward + GAMMA * Q_target(s', argmax_a Q_policy(s', a))`` (0 for terminal
    next states), rescales them, takes one clipped SGD step on the MSE loss
    and appends the detached loss to the global ``loss_history``.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # A next_state of None marks a terminal transition; its value stays 0.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next = [s for s in batch.next_state if s is not None]

    state_batch = torch.cat(batch.state).unsqueeze(1)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a): the model computes Q(s_t) for every action, then we
    # select the column of the action that was actually taken. Stored actions
    # are converted to column indices by integer division by 1000; `//` keeps
    # the index an integer tensor (true division would yield floats, which
    # gather() rejects).
    # NOTE(review): assumes action values are non-negative multiples of 1000 -- confirm.
    action_indices = action_batch.type(torch.long) // 1000
    state_action_values = policy_net(state_batch).gather(1, action_indices)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # Guard against a batch made only of terminal transitions (torch.cat([]) raises).
    if non_final_next:
        non_final_next_states = torch.cat(non_final_next).unsqueeze(1)
        # Double Q implementation: policy_net chooses the action, target_net
        # evaluates it. Neither result needs gradients.
        with torch.no_grad():
            _, max_indices = policy_net(non_final_next_states).max(1, keepdim=True)
            next_state_values[non_final_mask] = \
                target_net(non_final_next_states).gather(1, max_indices).squeeze(1)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    # Normalizing Q values: divide by half the batch maximum and shift by -1,
    # so the largest target in the batch maps to 1.
    # NOTE(review): a batch maximum of exactly 0 divides by zero here.
    max_Q_prime = torch.max(expected_state_action_values) / 2
    expected_state_action_values = expected_state_action_values / max_Q_prime - 1

    loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    # Store a detached copy so the history does not keep autograd references alive.
    loss_history.append(loss.detach())
    optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(policy_net.parameters(), 1)
    optimizer.step()

In [14]:
num_episodes = int(1e2)
steps_done = 0  # restart the epsilon schedule used by select_action
episode_durations = []
loss_history = []
traj_history = []
for i_episode in range(num_episodes):
    # Progress marker every TARGET_UPDATE episodes.
    if i_episode % TARGET_UPDATE == 0: print(str(i_episode) + " ", end='')
    # Initialize the environment and state
    env.random_reset()
    state = torch.tensor([env.fish_population], device=device, dtype=torch.float)
    traj_history.append(state)
    for t in count():
        # Select and perform an action
        action = select_action(state)
        _, reward, done, _ = env.step(action)
        # torch.tensor (not the legacy torch.Tensor constructor, which rejects
        # non-CPU devices) so the reward lives on the same device as the networks.
        reward = torch.tensor([reward], device=device, dtype=torch.float)

        # Observe new state
        # NOTE(review): the harvest is subtracted from fish_population after
        # env.step() has already run -- confirm the env has not applied it yet.
        if not done:
            next_state = torch.tensor([max(0, env.fish_population - env.harvest)],
                                      device=device, dtype=torch.float)
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state (None is recorded at the end of an episode)
        state = next_state
        traj_history.append(state)

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')

0 10 20 30 40 50 60 70 80 90 Complete


In [15]:
# Count how often each (integer-truncated) population value was visited during
# training; the None entries recorded at episode ends are skipped.
index, counts = np.unique([int(x[0]) for x in traj_history if x is not None], return_counts=True)
# Axis labels describe what is plotted: population value against visit count.
plot = bokeh.plotting.figure(plot_width=600,
                             plot_height=400,
                             x_axis_label='population',
                             y_axis_label='visit count')
plot.circle(index, counts, line_width=0.5, alpha=1)

bokeh.io.show(plot)

In [16]:
# Display policy_net's parameters as they stand after the training cell.
list(policy_net.parameters())

[Parameter containing:
 tensor([[-8.9922e-05],
         [ 4.0367e-01],
         [-3.5689e-01],
         [ 2.4638e-01]], requires_grad=True), Parameter containing:
 tensor([ 0.6456, -0.4984,  0.8930,  0.7037], requires_grad=True), Parameter containing:
 tensor([[ 0.1120,  0.1650,  0.0669,  0.0785],
         [ 0.3299, -0.0093,  0.1296, -0.3195],
         [ 0.4085, -0.1469, -0.0527, -0.3274],
         [ 0.3671, -0.3976, -0.2513, -0.2793]], requires_grad=True), Parameter containing:
 tensor([-0.4312, -0.1399,  0.2193,  0.0203], requires_grad=True)]

In [17]:
# Print the gradient currently stored on each policy_net parameter
# (i.e. whatever the most recent backward pass left in .grad).
for param in policy_net.parameters():
    print(param.grad)

tensor([[0.0022],
        [0.3255],
        [0.0013],
        [0.1548]])
tensor([2.3320e-08, 3.4273e-06, 1.3922e-08, 1.6298e-06])
tensor([[-1.6387e-06,  7.9618e-01, -7.0390e-03,  4.8597e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]])
tensor([2.0771e-05, 0.0000e+00, 0.0000e+00, 0.0000e+00])


In [18]:
# Persist policy_net's weights (state dict only) to the current working directory.
torch.save(policy_net.state_dict(), './model_random_reset.pth')

In [19]:
# Rebuild a network of the same shape and load the saved weights into it.
model = DQN(1, N_ACTIONS, N_ACTIONS).to(device)
# map_location remaps the stored tensors onto the current device, so the
# checkpoint loads regardless of the device it was saved from.
model.load_state_dict(torch.load("model_random_reset.pth", map_location=device))
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

DQN(
  (fc1): Linear(in_features=1, out_features=4, bias=True)
  (fc2): Linear(in_features=4, out_features=4, bias=True)
)

In [20]:
def take_Q_step(initial_state):
    """Return the greedy action for a scalar population ``initial_state``.

    Evaluates the loaded checkpoint network ``model`` on the state and returns
    the entry of ``env.action_space`` at the position of the largest Q-value.
    """
    # Build the input on the same device as the network.
    state = torch.tensor([initial_state], device=device, dtype=torch.float)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        max_index = model(state).max(0)[1]
    return env.action_space[max_index]

In [21]:
n_samples = 100
# Set up plots
plot = bokeh.plotting.figure(plot_width=600,
                             plot_height=400,
                             x_axis_label='time',
                             y_axis_label='N')


env = gym.make('fishing-v0')
collapse_points = []
harvests = []
for sample in range(n_samples):
    # Note: this evaluation uses env.reset(), whereas the training loop and the
    # earlier simulation cells start from env.random_reset().
    env.reset()
    sample_traj = [env.fish_population]
    for season in range(100):
        # Greedy quota chosen by the learned Q-network for the current population.
        env.step(take_Q_step(env.fish_population))
        sample_traj.append(env.fish_population)
    # Record env.reward as it stands after the final season of this run.
    harvests.append(env.reward)
    sample_traj = np.array(sample_traj)
    plot.line(range(len(sample_traj)), sample_traj, line_width=0.5, alpha=0.5)


bokeh.io.show(plot)
env.close()

In [22]:
# Mean over the sampled runs of env.reward as recorded at the end of each run (learned Q policy).
np.mean(harvests)

0.0

In [23]:
# Plot the action chosen by take_Q_step as a function of the population.
plot = bokeh.plotting.figure(plot_width=600,
                               plot_height=400,
                               x_axis_label='population',
                               y_axis_label='harvest selection')

# np.vectorize applies the scalar take_Q_step to every grid point in turn.
v_take_Q_step = np.vectorize(take_Q_step)
# Grid of populations 0, 1, ..., 1e5 (10**5 + 1 evenly spaced points).
x = np.linspace(0, 1e5, 10**5 + 1)
plot.line(x, v_take_Q_step(x), line_width=0.5, alpha=1)

bokeh.io.show(plot)

In [24]:
# Training loss recorded by optimize_model, one point per optimisation step.
plot = bokeh.plotting.figure(plot_width=600,
                             plot_height=400,
                             x_axis_label='step',
                             y_axis_label='loss')
# .cpu() makes the conversion work when the losses live on a GPU;
# it is a no-op for CPU tensors.
np_loss_history = np.array([loss.detach().cpu().numpy() for loss in loss_history])
plot.line(range(len(np_loss_history)), np_loss_history, line_width=0.5, alpha=1)

bokeh.io.show(plot)