## Playing with Fishing Gym

In [157]:
import gym_fishing
import gym
import math
import random
from math import floor
import bokeh.plotting
import numpy as np
import bokeh.io
from itertools import count
import bokeh.plotting
from bokeh.models import Span

bokeh.io.output_notebook()

I write a function to find the first point where the trajectory crosses the threshold. I used 100 in the plots shown below.

In [158]:
def find_collapse_point(trajectory, thresh):
    indices = np.where(trajectory < thresh)[0]
    try:
        return np.min(indices)
    except:
        return None

Now I make plots of 100 trajectories. Here I choose a random harvest quota at each season. The red line shows the average time to collapse.

In [159]:
n_samples = 100
# Set up plots
plot = bokeh.plotting.figure(plot_width=600,
                               plot_height=400,
                               x_axis_label='time',
                               y_axis_label='N')


env = gym.make('fishing-v0')
collapse_points = []
harvests = []
# 
for sample in range(n_samples):
    
    env.reset()
    sample_traj = [env.fish_population]
    for season in range(100):
        env.step(random.randrange(env.fish_population))
        sample_traj.append(env.fish_population)
    harvests.append(env.reward)
    sample_traj = np.array(sample_traj)
    collapse_point = find_collapse_point(sample_traj, 100)
    if collapse_point != None:
        collapse_points.append(collapse_point)
    plot.line(range(len(sample_traj)), sample_traj, line_width=0.5, alpha=0.5)

vline = Span(location=np.mean(collapse_points), dimension='height', line_color='red', line_width=1)
plot.renderers.extend([vline])
bokeh.io.show(plot)
env.close()

In [161]:
np.mean(harvests)

2.2291856000000005

Here I choose $ \text{MSY} = \frac{K r}{4}$ quota at each season. The red line shows the average time to collapse.

In [162]:
n_samples = 100
# Set up plots
plot = bokeh.plotting.figure(plot_width=600,
                               plot_height=400,
                               x_axis_label='time',
                               y_axis_label='N')


env = gym.make('fishing-v0')
collapse_points = []
harvests = []
for sample in range(n_samples):
    env.random_reset()
    sample_traj = [env.fish_population]
    for season in range(100):
        env.step(floor(int(env.fish_population) / 2))
        sample_traj.append(env.fish_population)
    harvests.append(env.reward)
    sample_traj = np.array(sample_traj)
    collapse_point = find_collapse_point(sample_traj, 100)
    if collapse_point != None:
        collapse_points.append(collapse_point)
    plot.line(range(len(sample_traj)), sample_traj, line_width=0.5, alpha=0.5)

    
#vline = Span(location=np.mean(collapse_points), dimension='height', line_color='red', line_width=1)
#plot.renderers.extend([vline])
bokeh.io.show(plot)
env.close()

In [163]:
np.mean(harvests)

4.6341395

Now attempting to implement deep Q-learning to choose the fishing quota at each season. Heavily borrowing from the Q-learning tutorial from pytorch (https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html).

In [221]:
import gym
import math
import random
import numpy as np
from collections import namedtuple
from itertools import count
from copy import deepcopy
import torch
import torch.nn as nn
from torch.nn.utils.clip_grad import clip_grad_norm_
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


env = gym.make('fishing-v0')


# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Setting up a named tuple and ReplayMemory class to access past transitions.

In [239]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

Building neural network with 1 hidden layer

In [240]:
class DQN(nn.Module):
    def __init__(self, input_size, hidden_size, num_quotas):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
    def forward(self, N):
        out = self.fc1(N)
        out = F.leaky_relu(out)
        out = self.fc2(out)
        
        return out

Now onto training,

In [241]:
BATCH_SIZE = 50
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1e10
TARGET_UPDATE = 100
N_ACTIONS = len(env.action_space)

policy_net = DQN(1, N_ACTIONS, N_ACTIONS).to(device).requires_grad_(requires_grad=True)
target_net = DQN(1, N_ACTIONS, N_ACTIONS).to(device).requires_grad_(requires_grad=False)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(1000)

steps_done = 0

def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # So here I find the index of the max Q(S,A)
            # I map the indices of my output to the indices of 
            # action space, so I use this index to access the 
            # corresponding action
            max_index = policy_net(state).max(0)[1]
            return torch.tensor([[env.action_space[max_index]]], device=device, dtype=torch.float)
    # Returning a random action
    else:
        return torch.tensor([[env.action_space[random.randrange(N_ACTIONS)]]], device=device, dtype=torch.float)

In [242]:
x = deepcopy(list(policy_net.parameters()))
x

[Parameter containing:
 tensor([[-0.0930],
         [ 0.6562],
         [-0.5066],
         [ 0.3932],
         [ 0.0682],
         [ 0.9310],
         [ 0.5486],
         [ 0.7013],
         [ 0.4740],
         [-0.3277],
         [-0.7465],
         [ 0.9775],
         [-0.3063],
         [-0.3029],
         [ 0.4251],
         [ 0.8612],
         [-0.5282],
         [ 0.6949],
         [-0.9108],
         [-0.1205],
         [-0.9449],
         [-0.3213],
         [ 0.9373],
         [-0.9747],
         [-0.9598],
         [ 0.5187],
         [ 0.3478],
         [-0.1632],
         [-0.9478],
         [ 0.2789],
         [ 0.2691],
         [ 0.6469],
         [-0.6877],
         [-0.9199],
         [ 0.9009],
         [-0.3337],
         [ 0.0907],
         [-0.9593],
         [ 0.5737],
         [-0.5637],
         [-0.4300],
         [-0.6002],
         [-0.8402],
         [-0.8375],
         [-0.8799],
         [-0.3267],
         [-0.2112],
         [-0.6258],
         [-0.5543

In [270]:
loss_history = []
def optimize_model():
    global loss_history
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))
    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    # THIS DOES NOT MATTER NOW BUT SHOULD EVENTUALLY EDIT THIS
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    non_final_next_states = non_final_next_states.unsqueeze(1)
    state_batch = torch.cat(batch.state)
    state_batch = state_batch.unsqueeze(1)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch.type(torch.long)/ 1000)
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    
    ## Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    
    # import pdb; pdb.set_trace()
    loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    loss_history.append(loss)
    import pdb; pdb.set_trace()
    optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(policy_net.parameters(), 1)
    optimizer.step()

In [None]:
num_episodes = int(5e2)
steps_done = 0
episode_durations = []
loss_history = []
for i_episode in range(num_episodes):
    if i_episode % TARGET_UPDATE == 0: print(str(i_episode) + " ", end='')
    # Initialize the environment and state
    env.random_reset()
    state = torch.tensor([env.fish_population], device=device, dtype=torch.float)
    for t in count():
        # Select and perform an action
        action = select_action(state)
        _, reward, done, _ = env.step(action)
        reward = torch.Tensor([reward], device=device)

        # Observe new state
        if not done:
            next_state = torch.tensor([max(0, env.fish_population - env.harvest) ], \
                                      device=device, dtype=torch.float)
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the target network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')

0 > <ipython-input-270-dd697a995b4a>(38)optimize_model()
-> optimizer.zero_grad()


(Pdb)  loss


tensor(5.6818e+12, grad_fn=<MseLossBackward>)


(Pdb)  expected_state_action_values


tensor([1.8312e+01, 1.9069e+01, 1.8590e+01, 1.8685e+01, 1.8590e+01, 1.8222e+01,
        1.8410e+01, 1.8222e+01, 1.8590e+01, 1.6855e+07, 1.8312e+01, 1.8222e+01,
        1.8590e+01, 1.8312e+01, 1.9718e+01, 2.0070e+01, 1.8410e+01, 1.8242e+01,
        1.8590e+01, 1.9069e+01, 1.8685e+01, 1.8410e+01, 1.9718e+01, 1.9069e+01,
        1.8242e+01, 1.9718e+01, 1.8685e+01, 1.8590e+01, 1.9069e+01, 1.9497e+01,
        1.9718e+01, 1.8685e+01, 1.8222e+01, 1.9718e+01, 1.8242e+01, 2.0070e+01,
        1.8590e+01, 1.8410e+01, 1.8590e+01, 1.9069e+01, 1.9718e+01, 1.9069e+01,
        1.8410e+01, 1.8222e+01, 1.8242e+01, 1.8222e+01, 1.8590e+01, 2.0070e+01,
        1.8590e+01, 1.8410e+01])


(Pdb)  state_action_values


tensor([[   18.5919],
        [   18.8178],
        [   18.5834],
        [   18.0173],
        [   18.1493],
        [   18.2911],
        [   18.5244],
        [   18.1607],
        [   18.8318],
        [   19.2906],
        [   18.2690],
        [   18.3060],
        [   18.5737],
        [   18.8805],
        [   18.6484],
        [   18.8805],
        [   18.5919],
        [   18.9254],
        [   18.4696],
        [   18.5873],
        [   18.5873],
        [   19.2906],
        [   18.6770],
        [   18.3445],
        [   19.2906],
        [   18.7600],
        [   19.2906],
        [   18.7299],
        [   19.2906],
        [   18.8805],
        [   18.4475],
        [   18.4216],
        [   18.4891],
        [-2785.8250],
        [   18.1267],
        [   18.7299],
        [   18.3932],
        [   18.2280],
        [   18.3408],
        [   18.5673],
        [   18.6983],
        [   18.3445],
        [   18.4097],
        [   18.6983],
        [   18.1493],
        [ 

(Pdb)  c


> <ipython-input-270-dd697a995b4a>(38)optimize_model()
-> optimizer.zero_grad()


(Pdb)  loss


tensor(1.1486e+12, grad_fn=<MseLossBackward>)


(Pdb)   state_action_values


tensor([[1.8105e+01],
        [1.8429e+01],
        [1.8433e+01],
        [1.9148e+01],
        [1.9401e+01],
        [1.8633e+01],
        [1.8445e+01],
        [1.8674e+01],
        [1.8363e+01],
        [1.8536e+01],
        [1.8353e+01],
        [1.8335e+01],
        [1.8597e+01],
        [1.8592e+01],
        [1.8065e+01],
        [1.9401e+01],
        [1.8679e+01],
        [1.8816e+01],
        [1.9025e+01],
        [1.8340e+01],
        [1.8380e+01],
        [1.9401e+01],
        [1.8880e+01],
        [1.8584e+01],
        [1.8297e+01],
        [1.8363e+01],
        [1.8741e+01],
        [1.9401e+01],
        [1.8742e+01],
        [1.8418e+01],
        [1.8297e+01],
        [1.8603e+01],
        [1.9025e+01],
        [1.8364e+01],
        [6.3951e+07],
        [1.8962e+01],
        [1.8633e+01],
        [1.8455e+01],
        [1.8763e+01],
        [1.8395e+01],
        [1.8674e+01],
        [1.8847e+01],
        [1.9018e+01],
        [1.8972e+01],
        [1.9401e+01],
        [1

(Pdb)  expected_state_values


*** NameError: name 'expected_state_values' is not defined


(Pdb)  expected_state_action_values


tensor([1.8685e+01, 1.8222e+01, 1.8242e+01, 2.0070e+01, 1.8312e+01, 1.9718e+01,
        1.9069e+01, 2.0070e+01, 1.9069e+01, 1.9718e+01, 1.8222e+01, 1.8673e+00,
        1.8222e+01, 1.8222e+01, 1.8242e+01, 1.9069e+01, 1.8590e+01, 1.9718e+01,
        1.8685e+01, 1.8590e+01, 1.8590e+01, 1.8312e+01, 1.8590e+01, 1.9497e+01,
        1.9069e+01, 1.9497e+01, 1.8242e+01, 1.8685e+01, 1.9718e+01, 1.9497e+01,
        1.9718e+01, 1.9497e+01, 2.0070e+01, 1.8685e+01, 5.6373e+07, 1.8312e+01,
        1.8590e+01, 1.8312e+01, 1.9497e+01, 1.9497e+01, 1.8410e+01, 1.8410e+01,
        1.9718e+01, 1.9069e+01, 1.8685e+01, 1.8410e+01, 1.8410e+01, 1.8222e+01,
        1.8685e+01, 2.0070e+01])


(Pdb)  loss


tensor(1.1486e+12, grad_fn=<MseLossBackward>)


(Pdb)  F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1))


tensor(1.1486e+12, grad_fn=<MseLossBackward>)


(Pdb)  c


> <ipython-input-270-dd697a995b4a>(38)optimize_model()
-> optimizer.zero_grad()


(Pdb)  state_action_values


tensor([[1.8353e+01],
        [1.8556e+01],
        [5.7033e+07],
        [1.8668e+01],
        [1.8285e+01],
        [1.8674e+01],
        [1.8893e+01],
        [1.8297e+01],
        [1.8517e+01],
        [1.8353e+01],
        [1.8816e+01],
        [1.9401e+01],
        [1.8074e+01],
        [1.8592e+01],
        [1.8495e+01],
        [1.8687e+01],
        [1.9148e+01],
        [1.8444e+01],
        [1.8577e+01],
        [1.8962e+01],
        [1.9401e+01],
        [1.8659e+01],
        [1.8655e+01],
        [1.8847e+01],
        [1.8987e+01],
        [1.2684e+07],
        [1.8074e+01],
        [3.2959e+07],
        [1.8498e+01],
        [1.8429e+01],
        [1.8844e+01],
        [1.8335e+01],
        [1.8508e+01],
        [1.8972e+01],
        [1.8656e+01],
        [1.8687e+01],
        [1.9401e+01],
        [1.8105e+01],
        [1.8363e+01],
        [1.8556e+01],
        [1.8679e+01],
        [1.8892e+01],
        [1.8353e+01],
        [1.8340e+01],
        [1.8715e+01],
        [1

(Pdb)  state_action_values.max()


tensor(57033216., grad_fn=<MaxBackward1>)


In [264]:
list(policy_net.parameters())

[Parameter containing:
 tensor([[-53.3329],
         [-52.9591],
         [-89.7820],
         [-55.4775],
         [-51.7886],
         [-53.2434],
         [-53.1901],
         [-52.4957],
         [-53.2546],
         [-53.5243],
         [-33.7993],
         [-54.4935],
         [-47.6832],
         [-50.9580],
         [-54.1645],
         [-48.5306],
         [-55.3063],
         [-51.0323],
         [-53.7722],
         [-62.2392],
         [-35.6059],
         [-48.6666],
         [-57.1618],
         [-38.0843],
         [-52.4527],
         [-35.5606],
         [-51.4244],
         [-54.0109],
         [-51.3915],
         [-52.4883],
         [-52.5651],
         [-50.0359],
         [-56.2314],
         [-54.6958],
         [-52.9198],
         [-52.0415],
         [-35.6785],
         [-54.4159],
         [-50.8646],
         [-53.7871],
         [-53.5030],
         [-53.3622],
         [-48.3190],
         [-33.0229],
         [-57.0577],
         [-53.8211],
         [-

In [265]:
torch.save(policy_net.state_dict(), './model_random_reset.pth')

In [266]:
model = DQN(1, N_ACTIONS, N_ACTIONS).to(device)
model.load_state_dict(torch.load("model_random_reset.pth"))
model.eval()

DQN(
  (fc1): Linear(in_features=1, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
)

In [267]:
def take_Q_step(initial_state):
    max_index = model(torch.tensor([initial_state]).type(torch.float)).max(0)[1]
    return env.action_space[max_index]

In [268]:
model(torch.tensor([5e4]).type(torch.float)).max(0)

torch.return_types.max(
values=tensor(37007244., grad_fn=<MaxBackward0>),
indices=tensor(2))

In [260]:
n_samples = 100
# Set up plots
plot = bokeh.plotting.figure(plot_width=600,
                               plot_height=400,
                               x_axis_label='time',
                               y_axis_label='N')


env = gym.make('fishing-v0')
collapse_points = []
harvests = []
# 
for sample in range(n_samples):
    env.random_reset()
    sample_traj = [env.fish_population]
    for season in range(100):
        env.step(take_Q_step(env.fish_population))
        sample_traj.append(env.fish_population)
    harvests.append(env.reward)
    all(isinstance(x, int) for x in sample_traj)
    sample_traj = np.array(sample_traj)
    plot.line(range(len(sample_traj)), sample_traj, line_width=0.5, alpha=0.5)


bokeh.io.show(plot)
env.close()

In [261]:
np.mean(harvests)

8.497134999999997

In [262]:
plot = bokeh.plotting.figure(plot_width=600,
                               plot_height=400,
                               x_axis_label='population',
                               y_axis_label='harvest selection')

v_take_Q_step = np.vectorize(take_Q_step)
x = np.linspace(0, 1e5, 10**5 + 1)
plot.line(x, v_take_Q_step(x), line_width=0.5, alpha=1)

bokeh.io.show(plot)

In [269]:
plot = bokeh.plotting.figure(plot_width=600,
                               plot_height=400,
                               x_axis_label='step',
                               y_axis_label='loss')
np_loss_history = np.array(list(map(lambda x: x.detach().numpy(), loss_history)))
plot.line(range(len(np_loss_history[::10000])), np_loss_history[::10000], line_width=0.5, alpha=1)

bokeh.io.show(plot)