# Step 1. Loading Environment

Before running this notebook, please make sure you have completed all the installation steps in the [preparation document](https://github.com/ZeratuuLL/Reinforcement-Learning/blob/master/Navigation/Preparation.md).

Running the following cells will initialize the environment. A window should appear where you can see what the agent sees.

In [None]:
from unityagents import UnityEnvironment
import numpy as np

In [None]:
# Launch the Unity environment from the Banana.app build. The path is relative,
# so change file_name if your build is stored somewhere else.
env = UnityEnvironment(file_name="Banana.app")

In [None]:
# get the default brain
# brain_name is the key used to index every env.reset()/env.step() result below;
# brain is the matching entry of env.brains (queried later for the action space size).
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Reset the environment in training mode and keep the default brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the quantities the rest of the notebook relies on:
# the action space size, one sample state, and the state's length.
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Report what was found, in the same order as before.
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)

# Step 2. Watching a random agent

In [None]:
# Play one episode with uniformly random actions and report the outcome.
env_info = env.reset(train_mode=False)[brain_name]  # fresh episode
state = env_info.vector_observations[0]             # first observation
score, step = 0, 0                                  # running reward and step counter
done = False
while not done:                                     # loop until the episode ends
    step += 1
    action = np.random.randint(action_size)         # pick a random action index
    env_info = env.step(action)[brain_name]         # advance the environment one step
    next_state = env_info.vector_observations[0]    # observation after the step
    reward = env_info.rewards[0]                    # reward for this step
    done = env_info.local_done[0]                   # episode-finished flag
    score += reward                                 # accumulate the score
    state = next_state                              # carry the state forward

print("Score: {}, Total step is {}".format(score,step))

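When finished, you can close the environment. The call below is left commented out because the same environment is reused in Step 4; only run it if you do not intend to continue.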

In [None]:
#env.close()

# Step 3. Set up a trainable agent

In this step you load the code needed to build an agent: specifically, the networks and the basic set-up. Go to the training Jupyter notebook and copy the first two blocks of its Step 2. Here I use a PPO agent as an example.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Discrete_Actor(nn.Module):
    """Actor (Policy) Model for a discrete action space.

    Three ReLU hidden layers followed by a linear layer and a log-softmax,
    so the network outputs one log-probability per action.
    """

    def __init__(self, state_size, action_size, fc_units=[256,256,256]):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Number of discrete actions (size of the output)
            fc_units (sequence of int): Widths of the three hidden layers;
                only the first three entries are read
        """
        super(Discrete_Actor, self).__init__()
        self.fc1=nn.Linear(state_size,fc_units[0])
        self.fc2=nn.Linear(fc_units[0],fc_units[1])
        self.fc3=nn.Linear(fc_units[1],fc_units[2])
        self.fc4=nn.Linear(fc_units[2],action_size)
        # Normalise explicitly over the last (action) dimension. Without `dim`
        # PyTorch falls back to a deprecated implicit choice that emits a
        # warning and picks dim 0 for 1-D and 3-D inputs.
        self.output = nn.LogSoftmax(dim=-1)

    def forward(self, state):
        """Map a state tensor (last dimension = state_size) to action log-probabilities."""
        x=F.relu(self.fc1(state))
        x=F.relu(self.fc2(x))
        x=F.relu(self.fc3(x))
        x=self.output(self.fc4(x))
        return x
    
class Critic(nn.Module):
    '''State-value network: it scores a state and never sees the action.'''
    def __init__(self, state_size, hidden=[256,256,256]):
        super(Critic, self).__init__()
        width1, width2, width3 = hidden[0], hidden[1], hidden[2]
        # Linear layers first (same creation order as always, so seeded
        # initialisation and state_dict keys stay the same), then batch norms.
        self.fc1 = nn.Linear(state_size, width1)
        self.fc2 = nn.Linear(width1, width2)
        self.fc3 = nn.Linear(width2, width3)
        self.fc4 = nn.Linear(width3, 1)
        self.bn1 = nn.BatchNorm1d(width1)
        self.bn2 = nn.BatchNorm1d(width2)
        self.bn3 = nn.BatchNorm1d(width3)

    def forward(self, state):
        '''Return one value estimate per input row.'''
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        features = state
        # Each stage is linear -> ReLU -> batch norm, applied in sequence.
        for linear, norm in stages:
            features = norm(F.relu(linear(features)))
        return self.fc4(features)

In [None]:
import numpy as np
import random
from collections import namedtuple, deque

import torch
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, TensorDataset, DataLoader

Batch_Size = 128        # minibatch size used by the DataLoader in Agent.learn
GAMMA = 0.99            # discount factor for reward
TAU = 0.95              # discount factor for advantage
Beta = 0                # coefficient of the KL-divergence term in the actor loss
LR1 = 5e-4              # learning rate of the actor optimizer
LR2 = 5e-4              # learning rate of the critic optimizer
Eps = 0.2               # tolerated deviation of the probability ratio from 1 (clipping range)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Agent():
    """Interacts with and learns from the environment.

    Holds a discrete-action actor and a state-value critic, one Adam optimizer
    for each, and performs clipped-ratio (PPO-style) updates in ``learn``.
    """

    def __init__(self, state_size, action_size, lr1=LR1, lr2=LR2, gamma=GAMMA, tau=TAU, beta=Beta, eps=Eps, method='MC', learning_time=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of discrete actions
            lr1/lr2 (float): learning rate for actor/critic
            gamma (float): decay rate for future rewards (stored only; not read inside this class)
            tau (float): decay rate for advantage function (stored only; not read inside this class)
            beta (float): weight of the KL term in the actor loss
            eps (float): half-width of the clipping interval around 1 for the probability ratio
            method (str): 'MC' or 'TD'; any other value falls back to 'MC' (stored only; not read inside this class)
            learning_time (int): number of passes over the data per call to ``learn``
        """
        self.state_size = state_size
        self.action_size = action_size
        self.lr1 = lr1
        self.lr2 = lr2
        self.gamma = gamma
        self.tau = tau
        self.beta = beta
        self.batch_size = Batch_Size
        self.eps = eps

        self.actor = Discrete_Actor(state_size, action_size).to(device)
        self.critic = Critic(state_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.lr1)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.lr2)
        # The critic contains batch-norm layers, so it is kept in eval mode
        # except while ``learn`` is running.
        self.critic.eval()
        if method in ['MC', 'TD']:
            self.method = method
        else:
            print('Only support MC or TD method. Input not supported. Use MC by default')
            self.method = 'MC'

        self.learn_time = learning_time # How many updates for each episode

    def act(self, state):
        """
        Returns the log-probabilities of all actions for the given state(s)
        as per current policy. Nothing is stored; this is a plain forward pass
        of the actor.

        Inputs:
        ==========================
        state(float tensor): the current state(s); must already be on the
            actor's device
        """
        log_probs = self.actor(state)
        return log_probs

    def learn(self, states, actions, log_probs, advantages, returns):
        """Update actor and critic from one batch of collected experience.

        All inputs are tensors with the same first dimension; they are detached
        before use. ``actions`` must be 2-D because it is used as the index of
        ``gather(1, ...)``. ``advantages`` and ``returns`` are presumably (N, 1)
        so they line up with the gathered ratio and the critic output; confirm
        against the training notebook.
        """

        mydata = TensorDataset(states.detach(), actions.detach(), log_probs.detach(), advantages.detach(), returns.detach())
        loader = DataLoader(mydata, batch_size = self.batch_size, shuffle = True)
        self.critic.train()

        try:
            for _ in range(self.learn_time):
                for sampled_states, sampled_actions, sampled_log_probs, sampled_advantages, sampled_returns in loader:
                    sampled_actions = sampled_actions.long()
                    new_log_probs = self.act(sampled_states)
                    # Probability ratio new/old for the actions that were actually taken.
                    ratio = (new_log_probs - sampled_log_probs).exp().gather(1, sampled_actions)
                    # This is minus KL(new || old); subtracting beta * KL below
                    # therefore adds a penalty for moving away from the old policy.
                    KL = -new_log_probs.exp()*(new_log_probs - sampled_log_probs)
                    KL = torch.sum(KL, dim=1, keepdim=True)

                    # Clipped surrogate objective (negated because the optimizer minimizes).
                    Actor_Loss = -torch.min(input=ratio*sampled_advantages, other=torch.clamp(ratio, 1-self.eps, 1+self.eps)*sampled_advantages).mean()
                    Actor_Loss -= self.beta*KL.mean()
                    self.actor_optimizer.zero_grad()
                    Actor_Loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 5)
                    self.actor_optimizer.step()

                    # BatchNorm1d in training mode cannot compute batch statistics
                    # from a single sample and raises ValueError. A final minibatch
                    # of size 1 therefore skips the critic update instead of
                    # aborting the whole learning pass.
                    if sampled_states.size(0) < 2:
                        continue

                    estimated_values = self.critic(sampled_states)
                    Critic_Loss = 0.5*(estimated_values - sampled_returns).pow(2).mean()
                    self.critic_optimizer.zero_grad()
                    Critic_Loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 5)
                    self.critic_optimizer.step()
        finally:
            # Always return the critic to eval mode, even if an update failed.
            self.critic.eval()

In the following cell you create the agent, just like the first line of code in Step 3 of the training Jupyter notebook.

In [None]:
# Build the agent with the state/action sizes read from the environment in Step 1.
# method and learning_time are only stored / used by Agent.learn, so they do not
# affect the playback below.
agent = Agent(state_size=state_size, action_size=action_size, method='MC', learning_time=3)

# Step 4. Load the trained agent and watch!

Download the trained weights, save them in the same directory as this .ipynb file, and you can start watching!

In [None]:
# Be sure to load the weights that match the actor defined above.
# map_location moves tensors that were saved on another device (e.g. a GPU)
# onto the device used in this notebook, so the checkpoint also loads on a
# CPU-only machine.
agent.actor.load_state_dict(torch.load('PPO_TD_checkpoint.pth', map_location=device))

In [None]:
# Play one episode with the trained actor, always taking its most probable action.
env_info = env.reset(train_mode=False)[brain_name] # reset the environment
state = env_info.vector_observations[0]            # get the current state
score = 0                                          # initialize the score
while True:
    # Turn the observation into a (1, state_size) float tensor on the model's device.
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    # Inference only: no_grad avoids building an autograd graph on every step.
    with torch.no_grad():
        action_values = agent.act(state)           # log-probabilities of each action
    action = np.argmax(action_values.cpu().numpy())# select the most probable action
    env_info = env.step(action)[brain_name]        # send the action to the environment
    next_state = env_info.vector_observations[0]   # get the next state
    reward = env_info.rewards[0]                   # get the reward
    done = env_info.local_done[0]                  # see if episode has finished
    score += reward                                # update the score
    state = next_state                             # roll over the state to next time step
    if done:                                       # exit loop if episode finished
        break

print("Score: {}".format(score))

When you are done watching, you can close the environment.

In [None]:
# Close the Unity environment now that the demonstration is finished.
env.close()