In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from environment import Environment

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
import copy

In [12]:
# Width of the action part of the Q-network input: the first layer below is
# built with 5040 + d input features and get_action appends a single scalar
# action to the state vector. NOTE(review): presumably "d" stands for the
# action dimensionality - confirm.
d = 1

In [14]:
# Q-network: maps a concatenated (state, action) vector to one scalar Q-value
# through two hidden layers of 64 ReLU units.
# NOTE(review): the RuntimeError recorded at the end of this notebook reports a
# 1x3401 input hitting this 5041-wide first layer - confirm that 5040 really is
# the environment's state size.
Q_nn = nn.Sequential()

# Modules are created and registered in the same order as before so that the
# random weight initialisation consumes the RNG identically.
for layer_name, layer in (
    ('layer1', nn.Linear(5040 + d, 64)),
    ('relu1', nn.ReLU()),
    ('layer2', nn.Linear(64, 64)),
    ('relu2', nn.ReLU()),
    ('layer3', nn.Linear(64, 1)),
):
    Q_nn.add_module(layer_name, layer)

In [16]:
def get_action(state, epsilon=0, network=None):
    """
    Sample an action with an epsilon-greedy policy over a fixed action grid.

    With probability `epsilon` a grid action is picked uniformly at random;
    otherwise the grid action with the highest Q(state, action) is returned.

    Parameters
    ----------
    state : 1-D array-like
        Current state vector; it is concatenated with each candidate action
        to form the network input.
    epsilon : float, default 0
        Exploration probability.
    network : callable, optional
        Q-function mapping a [n, state_size + 1] float tensor to [n, 1]
        Q-values. Defaults to the notebook-level `Q_nn`.

    Returns
    -------
    numpy.float64
        The chosen action value, one of the 200 evenly spaced points in [0, 1].
    """
    actions = np.linspace(0, 1.0, 200)

    # Flip the exploration coin first: a random action does not need the
    # Q-values, so the forward pass can be skipped entirely when exploring.
    should_exp = np.random.binomial(n=1, p=epsilon)
    if should_exp:
        return actions[np.random.choice(len(actions))]

    if network is None:
        network = Q_nn

    # Build one [n_actions, state_size + 1] batch instead of calling the
    # network once per candidate action.
    state_row = torch.tensor(np.asarray(state), dtype=torch.float32).reshape(1, -1)
    action_col = torch.tensor(actions, dtype=torch.float32).reshape(-1, 1)
    s_a = torch.cat([state_row.expand(len(actions), -1), action_col], dim=1)

    # Action selection never needs gradients.
    with torch.no_grad():
        q_values = network(s_a).reshape(-1).numpy()

    # np.argmax returns the first maximum, i.e. the smallest action on ties.
    return actions[np.argmax(q_values)]

### Q-learning via gradient descent

We shall now train our agent's Q-function by minimizing the TD loss:
$$ L = \frac{1}{N} \sum_i \left( Q_{\theta}(s_i, a_i) - \left[ r(s_i, a_i) + \gamma \cdot \max_{a'} Q_{-}(s'_i, a') \right] \right)^2 $$


Where
* $s, a, r, s'$ are current state, action, reward and next state respectively
* $\gamma$ is the discount factor (the `gamma` argument of `compute_td_loss` below, 0.99 by default).

The tricky part is with  $Q_{-}(s',a')$. From an engineering standpoint, it's the same as $Q_{\theta}$ - the output of your neural network policy. However, when doing gradient descent, __we won't propagate gradients through it__ to make training more stable (see lectures).

To do so, we shall use the `x.detach()` method, which tells autograd to treat `x` as a constant during backpropagation.

In [None]:
def compute_td_loss(states, actions, rewards, next_states, is_done, gamma=0.99, check_shapes=False,
                    network=None):
    """
    Compute the mean squared TD error for a batch of transitions.

    The Q-function takes a concatenated (state, action) vector and returns a
    single scalar, so:
      * Q(s, a) for the taken actions is one forward pass on [states, actions];
      * V(s') = max_a' Q(s', a') is approximated by evaluating every next
        state against the same 200-point action grid on [0, 1] that
        get_action chooses from.

    Parameters
    ----------
    states : array-like, shape [batch_size, state_size]
    actions : array-like, shape [batch_size]
        Continuous action values that were taken (not grid indices).
    rewards : array-like, shape [batch_size]
    next_states : array-like, shape [batch_size, state_size]
    is_done : array-like of bool, shape [batch_size]
        True where the transition ended the episode; the target is then just
        the reward.
    gamma : float, default 0.99
        Discount factor.
    check_shapes : bool, default False
        If True, assert the ranks of the intermediate tensors.
    network : callable, optional
        Q-function to evaluate. Defaults to the notebook-level `Q_nn`.

    Returns
    -------
    torch.Tensor
        Scalar loss; gradients flow only through Q(s, a), not the target.
    """
    if network is None:
        network = Q_nn

    # np.asarray first: building a tensor from a list of ndarrays is slow.
    states = torch.tensor(np.asarray(states), dtype=torch.float32)            # [batch_size, state_size]
    # Actions are continuous values, so keep them float; a column is needed for concatenation.
    actions = torch.tensor(np.asarray(actions), dtype=torch.float32).reshape(-1, 1)   # [batch_size, 1]
    rewards = torch.tensor(np.asarray(rewards), dtype=torch.float32).reshape(-1)      # [batch_size]
    next_states = torch.tensor(np.asarray(next_states), dtype=torch.float32)  # [batch_size, state_size]
    # torch.where requires a boolean condition.
    is_done = torch.tensor(np.asarray(is_done), dtype=torch.bool).reshape(-1)         # [batch_size]

    # Q-values of the actions that were actually taken (kept in the autograd graph).
    predicted_qvalues_for_actions = network(torch.cat([states, actions], dim=1)).squeeze(-1)  # [batch_size]

    # Pair every next state with every candidate action.
    candidate_actions = torch.tensor(np.linspace(0, 1.0, 200), dtype=torch.float32)
    n_candidates = candidate_actions.shape[0]
    batch_size = next_states.shape[0]
    next_s_a = torch.cat(
        [
            next_states.unsqueeze(1).expand(-1, n_candidates, -1),
            candidate_actions.reshape(1, -1, 1).expand(batch_size, -1, -1),
        ],
        dim=-1,
    )                                                                         # [batch_size, n_candidates, state_size + 1]

    # The target is treated as a constant, so no graph is needed for it.
    with torch.no_grad():
        predicted_next_qvalues = network(next_s_a).squeeze(-1)                # [batch_size, n_candidates]

    # V*(next_states): best Q-value over the candidate actions.
    next_state_values = torch.max(predicted_next_qvalues, dim=-1)[0]
    assert next_state_values.dtype == torch.float32

    # "Target q-values": the bracketed term of the TD loss.
    target_qvalues_for_actions = rewards + gamma * next_state_values

    # At the last state use Q(s, a) = r(s, a), since s' does not exist.
    target_qvalues_for_actions = torch.where(
        is_done, rewards, target_qvalues_for_actions)

    # Mean squared error loss to minimize.
    loss = torch.mean((predicted_qvalues_for_actions -
                       target_qvalues_for_actions.detach()) ** 2)

    if check_shapes:
        assert predicted_next_qvalues.data.dim(
        ) == 2, "make sure you predicted q-values for all actions in next state"
        assert next_state_values.data.dim(
        ) == 1, "make sure you computed V(s') as maximum over just the actions axis and not all axes"
        assert target_qvalues_for_actions.data.dim(
        ) == 1, "there's something wrong with target q-values, they must be a vector"

    return loss

### Playing the game

In [18]:
# Adam optimizer over all Q_nn parameters (learning rate 1e-4); generate_session
# uses it to take one gradient step per environment step when train=True.
opt = torch.optim.Adam(Q_nn.parameters(), lr=1e-4)

In [20]:
def generate_session(env, t_max=3, epsilon=0, train=False):
    """
    Play one episode with the approximate Q-learning agent, optionally training it.

    Parameters
    ----------
    env : object
        Environment to play in. As used here it must provide `Reset()`, whose
        return value is taken as the initial state, and `Step(action)`, whose
        return value is unpacked as (next_state, reward, terminated, truncated).
    t_max : int, default 3
        Maximum number of steps in the episode.
    epsilon : float, default 0
        Exploration probability forwarded to get_action.
    train : bool, default False
        If True, take one optimizer step on the TD loss after every
        environment step.

    Returns
    -------
    Sum of the rewards collected during the episode.
    """
    total_reward = 0
    # Use the environment that was passed in rather than a notebook global.
    s = env.Reset()

    for t in range(t_max):
        a = get_action(s, epsilon=epsilon)
        next_s, r, terminated, truncated = env.Step(a)

        if train:
            opt.zero_grad()
            # Single-transition batch; only `terminated` (not `truncated`)
            # marks the transition as final for the TD target.
            loss = compute_td_loss([s], [a], [r], [next_s], [terminated])
            loss.backward()
            opt.step()

        total_reward += r
        s = next_s
        if terminated or truncated:
            break

    return total_reward

In [23]:
# Training loop: each epoch plays 100 training sessions, reports their mean
# reward, then decays epsilon by 1%.
for i in range(1000):
    session_rewards = [
        generate_session(env, epsilon=epsilon, train=True)
        for _ in range(100)
    ]
    # The mean is needed for both the report and the stopping test.
    mean_reward = np.mean(session_rewards)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(i, mean_reward, epsilon))

    epsilon *= 0.99
    assert epsilon >= 1e-4, "Make sure epsilon is always nonzero during training"

    # Stop early once the agent is good enough.
    if mean_reward > 300:
        print("You Win!")
        break

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x3401 and 5041x64)