In [1]:
import numpy as np
from itertools import count
from collections import deque


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agent import Agent

In [3]:
# Start the Unity environment from the local Banana build, with graphics
# disabled via no_graphics=True.
env = UnityEnvironment(
    file_name="Banana_Linux/Banana.x86_64",
    no_graphics=True,
)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [4]:
# Use the first brain exposed by the environment. Its name is the key used to
# index the results of env.reset() / env.step() in the cells below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [5]:
# Reset once and read the problem dimensions off a real observation.
env_info = env.reset(train_mode=True)[brain_name]
state = env_info.vector_observations[0]  # entry 0 — presumably the single agent; confirm
state_size = len(state)
action_size = brain.vector_action_space_size

In [6]:
# Training hyperparameters.
LR = 5e-4  # learning rate handed to the Adam optimizer
BATCH_SIZE = 64  # NOTE(review): not referenced in the visible cells — confirm it is still needed
GAMMA = 0.99  # discount factor used when accumulating returns in finish_episode()

In [7]:
class Policy(nn.Module):
    """Two-hidden-layer MLP that maps a state to action probabilities.

    Besides the layers, each instance carries two per-episode buffers,
    ``saved_log_probs`` and ``rewards``, which callers append to while an
    episode is running.

    Args:
        state_size: Number of input features per observation.
        action_size: Number of discrete actions (width of the output layer).
        seed: Value passed to ``torch.manual_seed`` before the layers are
            built, making weight initialisation (and later torch sampling)
            reproducible. Pass ``None`` to leave the global RNG untouched.
        h1_size: Width of the first hidden layer.
        h2_size: Width of the second hidden layer.
        lr: Not used by the module itself; kept so existing call sites that
            pass it continue to work.
    """

    def __init__(self, state_size, action_size, seed = 123, h1_size = 64, h2_size = 64, lr=5e-4):
        super(Policy, self).__init__()
        # The seed used to be accepted and then ignored; honour it so that two
        # policies built with the same seed start from identical weights.
        if seed is not None:
            torch.manual_seed(seed)

        self.affine1 = nn.Linear(state_size, h1_size)
        self.dropout = nn.Dropout(p=0.6)
        self.affine2 = nn.Linear(h1_size, h2_size)
        self.affine3 = nn.Linear(h2_size, action_size)

        # Filled during an episode, consumed and cleared by the training step.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has ``state_size`` entries
                (either a batch ``(N, state_size)`` or a single state).

        Returns:
            Tensor of the same leading shape with ``action_size`` entries in
            the last dimension, summing to 1 along that dimension.
        """
        x = self.affine1(x)
        x = self.dropout(x)  # only active in train() mode
        x = F.relu(x)
        x = F.relu(self.affine2(x))
        action_scores = self.affine3(x)
        # dim=-1 equals dim=1 for batched input and also accepts a 1-D state.
        return F.softmax(action_scores, dim=-1)


# Smallest float32 increment; added to the std in finish_episode() so the
# normalisation never divides by zero.
eps = float(np.finfo(np.float32).eps)

# Build the network and resume from the best checkpoint saved by a previous run.
policy = Policy(state_size, action_size)
policy.load_state_dict(torch.load('checkpoint_re.pth'))
policy.train()  # keep dropout active while training

optimizer = optim.Adam(params=policy.parameters(), lr=LR)

In [8]:
def select_action(state):
    """Sample an action for ``state`` from the current policy.

    The state (a NumPy array) is converted to a float tensor with a leading
    batch dimension, run through the global ``policy``, and an action is drawn
    from the resulting categorical distribution. The log-probability of the
    drawn action is appended to ``policy.saved_log_probs`` for the later
    gradient step.

    Returns:
        The sampled action as a plain Python number.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    distribution = Categorical(policy(batch))
    chosen = distribution.sample()
    policy.saved_log_probs.append(distribution.log_prob(chosen))
    return chosen.item()


In [9]:
def finish_episode():
    """Run one policy-gradient update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    discounted return for every step (discount ``GAMMA``), normalises the
    returns, and minimises ``sum(-log_prob * return)`` with the global
    ``optimizer``. Both buffers are emptied before returning.

    Edge cases:
        * If nothing was recorded, the buffers are cleared and no update is
          made (the loss would otherwise be built from an empty list).
        * For a single-step episode the returns are left unnormalised: the
          sample standard deviation of one value is NaN, and a NaN loss would
          corrupt the weights on ``optimizer.step()``.
    """
    rewards = policy.rewards
    log_probs = policy.saved_log_probs

    if not rewards or not log_probs:
        del rewards[:]
        del log_probs[:]
        return

    # Accumulate returns back-to-front, appending and reversing once at the
    # end instead of inserting at index 0 on every step.
    discounted = []
    running = 0
    for r in reversed(rewards):
        running = r + GAMMA * running
        discounted.append(running)
    discounted.reverse()

    returns = torch.tensor(discounted)
    if returns.numel() > 1:
        # eps keeps the division finite when every return is identical.
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = [-log_prob * R for log_prob, R in zip(log_probs, returns)]

    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

    del rewards[:]
    del log_probs[:]


In [None]:
def main(n_episodes=1000, max_t=3000, window=100, target_score=13):
    """Train the global ``policy`` on the Unity environment.

    Each episode resets the environment, acts with ``select_action`` until the
    environment reports done (or the step cap is hit), records every reward on
    ``policy.rewards`` and then calls ``finish_episode`` for one update.

    Every ``window`` episodes the mean score over the last ``window`` episodes
    is printed and checkpoints are written:
        * ``checkpoint_<episode><avg>.pth`` always,
        * ``checkpoint_final<episode><avg>.pth`` when the mean reaches
          ``target_score``,
        * ``checkpoint_re.pth`` when the mean beats the best mean so far.

    Args:
        n_episodes: Number of episodes to run.
        max_t: Step cap per episode; the loop runs at most ``max_t - 1`` steps.
        window: Size of the rolling score window and the reporting interval.
        target_score: Mean score at which a "final" checkpoint is written.
    """
    scores_window = deque(maxlen=window)
    # Start below every possible average so that the first report always
    # counts as a new best, even if that average is -1 or lower.
    max_score = float('-inf')
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        ep_reward = 0
        for _ in range(1, max_t):  # Don't infinite loop while learning
            action = select_action(state)
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        scores_window.append(ep_reward)
        finish_episode()
        print("Episode Reward {} : {:.2f}".format(i_episode, ep_reward))
        if i_episode % window == 0:
            cur = np.mean(scores_window)

            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, cur))

            torch.save(policy.state_dict(), 'checkpoint_' + str(i_episode) + str(cur) + '.pth')

            if cur >= target_score:
                torch.save(policy.state_dict(), 'checkpoint_final' + str(i_episode) + str(cur) + '.pth')

            if cur > max_score:
                max_score = cur
                # This is the file the setup cell loads when resuming.
                torch.save(policy.state_dict(), 'checkpoint_re.pth')


# Start training only when this code runs as the main module (which includes
# executing the cell in a notebook), not when it is imported.
if __name__ == '__main__':
    main()

Episode Reward 1 : 7.00
Episode Reward 2 : 3.00
Episode Reward 3 : 8.00
Episode Reward 4 : 5.00
Episode Reward 5 : 5.00
Episode Reward 6 : 3.00
Episode Reward 7 : 0.00
Episode Reward 8 : 3.00
Episode Reward 9 : 0.00
Episode Reward 10 : 3.00
Episode Reward 11 : 2.00
Episode Reward 12 : 5.00
Episode Reward 13 : 5.00
Episode Reward 14 : 1.00
Episode Reward 15 : 8.00
Episode Reward 16 : 3.00
Episode Reward 17 : 4.00
Episode Reward 18 : 2.00
Episode Reward 19 : 5.00
Episode Reward 20 : 3.00
Episode Reward 21 : 5.00
Episode Reward 22 : 9.00
Episode Reward 23 : 6.00
Episode Reward 24 : 9.00
Episode Reward 25 : 2.00
Episode Reward 26 : 5.00
Episode Reward 27 : 2.00
Episode Reward 28 : 8.00
Episode Reward 29 : 1.00
Episode Reward 30 : 3.00
Episode Reward 31 : 4.00
Episode Reward 32 : 7.00
Episode Reward 33 : 8.00
Episode Reward 34 : 6.00
Episode Reward 35 : 4.00
Episode Reward 36 : 2.00
Episode Reward 37 : 6.00
Episode Reward 38 : 0.00
Episode Reward 39 : 2.00
Episode Reward 40 : 2.00
Episode R