In [1]:
import torch
from collections import deque
import pandas as pd

import numpy as np

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import gym
from gym import spaces

In [2]:
!pip install wandb -qqq
import wandb
wandb.login()
#the command below will generate an error if you haven't been addded to the wandb team yet
#you must fill out the form posted on ed discsussion and get added to the team
#until you are added, replace the command below by
#run=wandb.init()

  and should_run_async(code)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.8/252.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

  return LooseVersion(v) >= LooseVersion(check)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## DQN (Deep Q Network)

In previous Labs, we have learned to use Pytorch to build deep learning models. In this lab, we will apply deep learning as function approximations in reinforcement learning.

Reference: DQN https://arxiv.org/abs/1312.5602

In tabular Q-learning, we maintain a table of state-action pairs $(s,a)$ and save one action value for each entry $Q(s,a),\forall (s,a)$. At each time step $t$, we are in state $s_t$, then we choose action based on $\epsilon-$greedy strategy. With prob $\epsilon$, choose action uniformly random; with prob $1-\epsilon$, choose action based on $$a_t = \arg\max_a Q(s_t,a)$$

We then get the instant reward $r_t$, update the Q-table using the following rule

$$Q(s_t,a_t) \leftarrow (1-\alpha)Q(s_t,a_t) + \alpha (r_t + \max_a \gamma Q(s_{t+1},a))$$

where $\alpha \in (0,1)$ is learning rate. The algorithm is shown to converge in tabular cases. However, in cases where we cannot keep a table for state and action, we need function approximation. Consider using neural network with parameter $\theta$, the network takes as input state $s$ and action $a$. (*there are alternative parameterizations here*). Let $Q_\theta(s,a)$ be the output of the network, to estimate the optimal action value function in state $s$ and take action $a$ (and follow optimal policy thereafter).

$$Q_\theta(s,a) \approx Q^\ast(s,a)$$

### Bellman optimality equation

We will use Bellman optimality equation to find $\theta$ such that the above approximation holds better. Recall that for optimal Q function $Q^\ast(s,a)$ the following holds for all $(s,a)$

$$Q^\ast(s_t,a_t) = \mathbb{E}\big[r_t + \gamma \max_a Q^\ast(s_{t+1},a)\big]$$

where the expectation is wrt the random reward $r_t$ and transition to the next state $s_{t+1}$. A natural objective to consider is

$$\min_\theta\  (Q_\theta(s_t,a_t) -\mathbb{E}\big[r_t + \gamma  \max_a  Q_{\hat \theta}(s_{t+1},a)\big])^2$$
at the current or previous $\hat \theta$.

### Building the DQN model

The first step is to build a neural network with parameters $\theta$ that predicts $Q_\theta(s,a)$ for any $(s,a)$. You can either build a network that

* (in case of small number $K$ of discrete actions) takes as input a  representation of state $s$ and outputs a $K$-dimensional vector giving scores $Q(s,a), a=1,\ldots, K$ for all actions

or

* takes as input a concatenated representation of state and action $(s,a)$ and output one dimensional score $Q_\theta(s,a)$,

Below we have provided a skeleton code (incomplete) for defining and training the Q-function. **You need to fill in the DNN model definition and loss function definition**. Refer to regression lab (lab 2) for help.

In [3]:
# define neural net Q_\theta(s,a) as a class

class Qfunction(object):

    def __init__(self, obssize, actsize, lr, device, loss='huber', opt='adam'):
        """
        obssize: dimension of state space
        actsize: dimension of action space
        sess: sess to execute this Qfunction
        optimizer:
        """
        # DEFINE THE MODEL
        self.model = torch.nn.Sequential(
                    #TODO
                    #input layer
                    torch.nn.Linear(obssize, 512),
                    torch.nn.ReLU(),
                    torch.nn.Linear(512, 256),
                    torch.nn.ReLU(),
                    torch.nn.Linear(256, 128),
                    torch.nn.ReLU(),
                    torch.nn.Linear(128, actsize)
                ).to(device)

        # DEFINE THE OPTIMIZER
        if opt == 'adam':
          self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        elif opt == 'rmsprop':
          self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=lr, alpha=0.95, eps=0.01)
        elif opt == 'adamw':
          self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr, amsgrad=True)

        # RECORD HYPER-PARAMS
        self.obssize = obssize
        self.actsize = actsize
        self.device = device
        self.loss = loss

    def _to_one_hot(self, y, num_classes):
        """
        convert an integer vector y into one-hot representation
        """
        scatter_dim = len(y.size())
        y_tensor = y.view(*y.size(), -1)
        zeros = torch.zeros(*y.size(), num_classes, dtype=y.dtype, device=self.device)
        return zeros.scatter(scatter_dim, y_tensor, 1)


    def compute_Qvalues(self, states, actions):
        """
        input: list of numsamples state-action pairs
        output: List of Q values for each input (s,a). The output will have size [numsamples, 1]
        """
        #Below is example code when neural network is set to take as input state and output Q-value for all actions.
        #This will be different for neural network that takes as input a state-action pair

        try:
          states = torch.FloatTensor(states).to(self.device)
        except TypeError:
          pass
        q_preds = self.model(states)
        action_onehot = self._to_one_hot(actions.to(self.device), actsize)
        q_preds_selected = torch.sum(q_preds * action_onehot, axis=-1)

        return q_preds_selected

    def compute_maxQvalues(self, states):
        """
        input: a list of numsamples states
        output: max_a Q(s,a) values for every input state s in states. The output will have size numsamples
        """
        #Below is example code when neural network is set to take as input state and output Q-value for all actions.
        #if the neural takes as input a state-action pair, then the code will need to loop over all actions to compute all values

        states = torch.FloatTensor(states).to(self.device)
        Qvalues = self.model(states)
        q_preds_greedy = Qvalues.max(1).values

        return q_preds_greedy

    def compute_argmaxQ(self, state):
        """
        input: one state s
        output: arg max_a Q(self.model(states).cpu().data.numpy()s,a) values for the input state s. The output will have size 1
        """
        #Below is example code when neural network is set to take as input state and output Q-value for all actions.
        #if the neural takes as input a state-action pair, then the code will need to loop over all actions to compute all values

        state = torch.FloatTensor(state).to(self.device)
        Qvalue = self.model(state)
        greedy_action = Qvalue.argmax().item()

        return greedy_action

    def take_action(self, state, possible_actions, epsilon):
        if np.random.random() < epsilon:
          return np.random.choice(possible_actions)
        else:
          return self.compute_argmaxQ(state)


    def train(self, states, actions, targets, verbose=False):
        """
        states: numpy array as input to compute loss (s)
        actions: numpy array as input to compute loss (a)
        targets: numpy array as input to compute loss (Q targets)
        """
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).long().to(device)
        targets = torch.from_numpy(targets).float().to(device)

        # COMPUTE Q PREDICTIONS for all state-action pairs
        q_preds_selected = self.compute_Qvalues(states, actions)

        # LOSS
        if verbose: print(q_preds_selected.shape, targets.shape)
        if self.loss == 'mse':
          loss = torch.mean((q_preds_selected - targets)**2)
        elif self.loss == 'huber':
          loss_func = torch.nn.SmoothL1Loss()
          loss = loss_func(q_preds_selected, targets)

        # BACKWARD PASS
        self.optimizer.zero_grad()
        loss.backward()

        # UPDATE
        self.optimizer.step()

        return loss.detach().cpu().data.numpy()

In [2]:

BASE_PATH = '/Users/amanchopra/Documents/School/MS/Fall 2023/RL/Final Project/Data/Processed'
data = pd.read_csv(f"{BASE_PATH}/transition_model_90_min_history_rewards.csv", index_col=0)

In [3]:
data.head()

Unnamed: 0,BG_t0,BG_t1,BG_t2,BG_t3,BG_t4,BG_t5,BG_t6,BG_t7,BG_t8,BG_t9,...,IOB_t28,IOB_t29,IOB_t30,IOB_t31,IOB_t32,IOB_t33,IOB_t34,IOB_t35,simple_reward,magni_reward
0,127.0,121.0,113.0,99.0,95.0,97.0,101.0,107.0,111.0,118.0,...,0.491872,0.502033,0.510965,0.531246,0.543538,0.555829,0.568121,0.580412,0,-10.211281
1,132.0,127.0,121.0,113.0,99.0,95.0,97.0,101.0,107.0,111.0,...,0.471303,0.491872,0.502033,0.510965,0.531246,0.543538,0.555829,0.568121,0,-12.579462
2,130.0,132.0,127.0,121.0,113.0,99.0,95.0,97.0,101.0,107.0,...,0.469787,0.471303,0.491872,0.502033,0.510965,0.531246,0.543538,0.555829,0,-12.139707
3,120.0,130.0,132.0,127.0,121.0,113.0,99.0,95.0,97.0,101.0,...,0.458479,0.469787,0.471303,0.491872,0.502033,0.510965,0.531246,0.543538,0,-12.800759
4,106.0,120.0,130.0,132.0,127.0,121.0,113.0,99.0,95.0,97.0,...,0.444016,0.458479,0.469787,0.471303,0.491872,0.502033,0.510965,0.531246,0,-14.602987


In [5]:
data['simple_reward'].value_counts()

simple_reward
 0     23812
 1     18604
-10     3301
Name: count, dtype: int64

In [6]:
data['magni_reward'].describe()

count    45717.000000
mean        -8.245843
std         11.819608
min        -84.319007
25%         -8.024595
50%         -5.669213
75%         -1.246025
max         -0.000047
Name: magni_reward, dtype: float64

At this point you can skip ahead to implementing the basic Q-learning that at every step $t$ in the environment
* given state $s_t$, computes greedy actions from Q-values (using compute_argmaxQ function above) and uses $\epsilon$-greedy select an action $a_t$,
* makes observation of reward $r_t$ and next state $s_{t+1}$
* using compute_maxQvalues() function, computes target
  $$r_t + \gamma \max_a Q_\theta(s_{t+1},a)$$
and then retrains the Q-function using train() function above (with numsamples=1)

However, for improved performance you may want to consider ideas like batch training (numsamples>1 is the batch size) with experience replay buffer and target-networks.

**Replay Buffer**

Maintain a buffer $R$ to store trainsition tuples $(s_t,a_t,r_t,s_{t+1})$, when we minimize the Bellman error. When optimizing the Bellman error loss, we sample batches from the replay buffer and compute gradients for update on these batches. In particular, in each update, we sample $N$ tuples from buffer $(s_i,a_i,r_i,s_{i}') \sim R$ and then compute
targets

$$d_i=r_i + \max_a \gamma Q_{\theta}(s_i^\prime,a)$$
for all $i$. Use the above training function train() with input as list $(s_i, a_i, d_i)_{i=1}^N$  to update parameters using backprop.

**Target Network**

Maintain a target network in addition to the original pricipal network. The target network is just a copy of the original network but the parameters are not updated by gradients. The target network $\theta^-$ is copied from the principal network every $\tau$ time steps. Target network is used to compute the targets for update

$$d_i =  r_t + \gamma \max_a Q_{\theta^{-}}(s_{i}^\prime,a)$$

the targets are used in the loss function to update the principal network parameters. This slowly updated target network ensures that the targets come from a relatively stationary distribution and hence stabilize learning.

Hence several critical parts of the complete pseudocode for DQN is as follows:

**Initialization.**
principal network $Q_\theta(s,a)$, target network $Q_{\theta^{-}}(s,a)$. Replay buffer $R = \{\}$ (empty).

**At each time step $t.$**
The agent executes action using $\epsilon-$greedy based on the principal network $Q_\theta(s,a)$. To update $\theta$: sample $N$ tuples $(s_i,a_i,r_i,s_i^\prime) \sim R$, compute empirical loss

$$\frac{1}{N} \sum_{i=1}^N (Q_\theta(s_i,a_i) - (r_i + \gamma \max_a Q_{\theta^{-}}(s_i^\prime,a))^2$$

and update parameter $\theta$ using backprop (just take one gradient step).

**Update target network.**
Every $\tau$ time steps, update target network by copying $\theta_{\text{target}} \leftarrow \theta$.

**Bellman target.**
Above, we have defined the target values as being computed from a target net with parameter $\theta^-$
$$r_i + \gamma \max_a Q_{\theta^{-}}(s_i^\prime,a)$$
It is worth thinking about what happens if we are at the end of an episode, that is, what if $s_i^\prime$ here is a terminal state. In this case, should the Bellman error be defined exactly the same as above? Do we need some modifications? Think carefully about this as this will greatly impact the algorithmic performance.

### Implementation of replay buffer

In [7]:
# Implement replay buffer
import random
class ReplayBuffer(object):

    def __init__(self, maxlength):
        """
        maxlength: max number of tuples to store in the buffer
        if there are more tuples than maxlength, pop out the oldest tuples
        """
        self.buffer = deque()
        self.number = 0
        self.maxlength = maxlength

    def append(self, experience):
        """
        this function implements appending new experience tuple
        experience: a tuple of the form (s,a,r,s^\prime)
        """
        self.buffer.append(experience)
        self.number += 1

    def pop(self):
        """
        pop out the oldest tuples if self.number > self.maxlength
        """
        while self.number > self.maxlength:
            self.buffer.popleft()
            self.number -= 1

    def sample(self, batchsize):
        """
        this function samples 'batchsize' experience tuples
        batchsize: size of the minibatch to be sampled
        return: a list of tuples of form (s,a,r,s^\prime)
        """
        minibatch = random.sample(self.buffer,batchsize)
        return minibatch


### Code snippet for copying target network
You may use th following to update target network i.e. to copy from principal network to target network. We need to use tensorflow scope to distinguish the computational graphs of target and principal networks. The following function builds a tensorflow operation that does the copying $\theta^- \leftarrow \theta$

In [8]:
def run_target_update(Qprincipal, Qtarget):
    for v,v_ in zip(Qprincipal.model.parameters(), Qtarget.model.parameters()):
        v_.data.copy_(v.data)

## Main code for DQN
Now that we have all the ingredients for DQN, we can write the main procedure to train DQN on a given environment. The implementation is straightforward if you follow the pseudocode pdf.

In [51]:
class CustomEnv(gym.Env):
    def __init__(self, dataframe, reward='simple_reward'):
        super(CustomEnv, self).__init__()
        self.df = dataframe
        
        feature_dim = int((max([int(col[4:]) for col in dataframe.columns if 'BG' in col]) + 1) / 2)

        self.s_cols = [f'BG_t{i}' for i in range(feature_dim)] + [f'HR_t{i}' for i in range(feature_dim)] + [f'IOB_t{i}' for i in range(feature_dim)]
        self.a_col = 'InsulinDelivered'
        self.s_prime_cols = [f'BG_t{i}' for i in range(feature_dim, 2*feature_dim)] + [f'HR_t{i}' for i in range(feature_dim, 2*feature_dim)] + [f'IOB_t{i}' for i in range(feature_dim, 2*feature_dim)]
        self.reward_col = reward

        actions = dataframe[self.a_col]
        self.action_space = spaces.Box(low=np.array([actions.min()]), high=np.array([actions.max()]))

        bg_min = min([data[f'BG_t{i}'].min() for i in range(feature_dim*2)])
        bg_max = max([data[f'BG_t{i}'].max() for i in range(feature_dim*2)])
        hr_min = min([data[f'HR_t{i}'].min() for i in range(feature_dim*2)])
        hr_max = max([data[f'HR_t{i}'].max() for i in range(feature_dim*2)])
        iob_min = min([data[f'IOB_t{i}'].min() for i in range(feature_dim*2)])
        iob_max = max([data[f'IOB_t{i}'].max() for i in range(feature_dim*2)])
        
        self.observation_space = spaces.Box(low=np.array([bg_min]*feature_dim + [hr_min]*feature_dim + [iob_min]*feature_dim), high=np.array([bg_max]*feature_dim + [hr_max]*feature_dim + [iob_max]*feature_dim))
        self.current_state_ind = None

        
    def reset(self, seed=None):
        if seed:
            sample = self.df.sample(1, random_state=seed)
            s_ind = sample.index[0]
            s = sample[self.s_cols].values.flatten()
        else:
            s = self.df.sample(1)
            s_ind = sample.index[0]
            s = sample[self.s_cols].values.flatten()
        self.current_state_ind = s_ind
        return s

    def step(self, action):
        # get the next state and reward based on the current state and action
        next_state = self.iloc[self.current_state_ind]
        if next_state[self.a_col] != action:
            raise Exception(f"Cannot take action (InsulinDelivered={action}) as this action doesn't exist for the current state at index {self.current_state_ind}!")
        next_state  = next_state[self.current_state_ind][self.s_prime_cols].values
        reward = next_state[self.current_state_ind][self.reward_col]
        done = self.is_done()
        info = {}
        self.current_state_ind = 

        return next_state, reward, done, info
    
    def does_action_exist(self, action):
        if action in self.df[self.a_col]:
            return True
        return False
    
    def get_closest_action(self, action):
        ind = np.random.choice(np.where((self.df[self.a_col] - action).abs() == (self.df[self.a_col] - action).abs().min())[0])
        return data.iloc[ind][self.a_col]

    def is_done(self):
        return False

In [76]:
env = CustomEnv(data)
observation = env.reset()
print(observation.shape)
action = env.action_space.sample()
next_observation, reward, done, info = env.step(action)

(54,)


ValueError: ('Lengths must match to compare', (45717,), (1,))

In [13]:
#%%wandb
#remove above line if you do not want to see inline plots from wandb

# hyper-parameters
lr = 1e-3  # learning rate for gradient update
batchsize = 64  # batchsize for buffer sampling
maxlength = 1000  # max number of tuples held by buffer
envname = "CartPole-v0"  # environment name
tau = 100  # time steps for target update
episodes = 300  # number of episodes to run
initialsize = 500  # initial time steps before start training
epsilon = .2  # constant for exploration
gamma = .99  # discount
q_update_num_steps = 1 # time steps for q network update
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
verbose = True
opt = 'adamw'
loss = 'huber'

run = wandb.init(
    # set the wandb project where this run will be logged
    project="rl-cartpole",

    # track hyperparameters and run metadata
    config={
    "method": "dqn",
    "lr": lr,
    "buffer_batchsize": batchsize,
    "buffer_maxlength": maxlength,
    "tau": tau,
    "episodes": episodes,
    "initial_size": initialsize,
    "epsilon": epsilon,
    "gamma": gamma,
    "q_update_num_steps": q_update_num_steps,
    "device": device,
    "optimizer": opt,
    "loss": loss
    }
)

# initialize environment
# env = gym.make(envname)
env = CustomEnv(data)
obssize = env.observation_space.low.size
actsize = env.action_space.n

# initialize Q-function networks (princpal and target)
Qprincipal = Qfunction(obssize, actsize, lr, device, opt=opt, loss=loss)
Qtarget = Qfunction(obssize, actsize, lr, device, opt=opt, loss=loss)

# initialization of graph and buffer
buffer = ReplayBuffer(maxlength)

# main iteration
rrecord = []
totalstep = 0

for episode in range(episodes):

    obs = env.reset()
    done = False
    rsum = 0
    eps_actions = {i: 0 for i in range(env.action_space.n)}

    while not done:

        #greedy choice below. Use epsilon greedy for exploration
        action = Qprincipal.take_action(np.expand_dims(obs,0), [i for i in range(env.action_space.n)], epsilon)
        eps_actions[action] += 1

        newobs, r, done, _ = env.step(action)
        done_ = 1 if done else 0
        e = (obs, action, r, done_, newobs)

        #IF NOT USING BUFFER:
        #use single sample (obs, action, r, done_, newobs) with Qtarget to compute target and train Qprincipal

        # ELSE IF USING REPLAY BUFFER
        # append experiences e to buffer

        buffer.append(e)
        buffer.pop()

        #every few episodes (decide the frequency) sample a minibatch from buffer
        #compute targets in batch using Qtarget and train  Qprincipal
        if totalstep % q_update_num_steps == 0 and buffer.number >= batchsize:
          samples = buffer.sample(batchsize)
          states = [e[0] for e in samples]
          actions = [e[1] for e in samples]
          rewards = [e[2] for e in samples]
          dones = np.array([e[3] for e in samples], dtype=bool)
          next_states = [e[4] for e in samples]

          non_ternimal_next_states = [next_state for next_state, done in zip(next_states, dones) if not done]
          non_ternimal_next_states_values = Qtarget.compute_maxQvalues(non_ternimal_next_states)

          next_states_values = np.zeros(batchsize)
          next_states_values[~dones] = non_ternimal_next_states_values.cpu().data.numpy()

          #next_states_values = [0 if dones[i] else next(non_ternimal_next_states_values) for i in range(batchsize)]

          targets = [rewards[i] if dones[i] else rewards[i] + gamma * next_states_values[i] for i in range(batchsize)]
          Qprincipal.train(np.array(states), np.array(actions), np.array(targets), verbose=False)


        #UPDATE target network
        #every tau steps update copy the principal network to the target network
        if totalstep % tau == 0:
            run_target_update(Qprincipal, Qtarget)

        # update
        totalstep += 1
        rsum += r
        obs = newobs

    rrecord.append(rsum)




    # printing functions for debugging purposes. Feel free to add more
    if verbose and episode % 10 == 0:
       print('buffersize {}'.format(buffer.number))
       print('episode {} ave training returns {}'.format(episode, np.mean(rrecord[-10:])))

    #printing moving averages for smoothed visualization.
    fixedWindow=100
    movingAverage=0
    if len(rrecord) >= fixedWindow:
        movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])

    wandb.log({ "training reward" : rsum, "train reward moving average" : movingAverage})


VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded\r'), FloatProgress(value=0.11213168424881775, max=1.…

buffersize 35
episode 0 ave training returns 0.0


  states = torch.FloatTensor(states).to(self.device)


buffersize 385
episode 10 ave training returns 0.0
buffersize 735
episode 20 ave training returns 0.0
buffersize 1000
episode 30 ave training returns 0.0
buffersize 1000
episode 40 ave training returns 0.0
buffersize 1000
episode 50 ave training returns 0.0
buffersize 1000
episode 60 ave training returns 0.0
buffersize 1000
episode 70 ave training returns 0.0
buffersize 1000
episode 80 ave training returns 0.0
buffersize 1000
episode 90 ave training returns 0.0
buffersize 1000
episode 100 ave training returns 0.0
buffersize 1000
episode 110 ave training returns 0.0
buffersize 1000
episode 120 ave training returns 0.0
buffersize 1000
episode 130 ave training returns 0.0
buffersize 1000
episode 140 ave training returns 0.0
buffersize 1000
episode 150 ave training returns 0.0
buffersize 1000
episode 160 ave training returns 0.0
buffersize 1000
episode 170 ave training returns 0.0
buffersize 1000
episode 180 ave training returns 0.0
buffersize 1000
episode 190 ave training returns 0.0
buff

Finally, we evaluate the performance of the trained agent. We will evaluate the performance of the greedy policy wrt learned Q-function. The evaluation will be run 10 times, each for eval_epsiodes and print out the average performance across these episodes. Please **do not** change the code below.

In [16]:
### DO NOT CHANGE
def evaluate(Q, env, episodes):
    # main iteration
    score = 0.0
    for episode in range(episodes):

        obs = env.reset()
        done = False
        rsum = 0

        while not done:
            # always greedy
            action = Q.compute_argmaxQ(np.expand_dims(obs,0))


            # mdp stepping forward
            newobs, r, done, _ = env.step(action)

            # update data
            rsum += r
            obs = newobs


        wandb.log({"eval reward" : rsum})
        score = score + rsum
    score = score/episodes

    return score

In [18]:
# DO NOT CHANGE CODE HERE
# after training, we will evaluate the performance of the agent
# on a target environment
# env_test = gym.make(envname)
env_test = CustomEnv(data)
eval_episodes = 1000
score = evaluate(Qprincipal, env_test, eval_episodes)
wandb.run.summary["score"]=score

print("eval performance of DQN agent: {}".format(score))

  and should_run_async(code)


eval performance of DQN agent: 0.0


In [19]:
run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train reward moving average,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval reward,0.0
score,0.0
train reward moving average,0.0
training reward,0.0
