# Continuous Control

---

You are welcome to use this coding environment to train your agent for the project.  Follow the instructions below to get started!

### 1. Start the Environment

In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

This notebook uses the OpenAI Gym `Pendulum-v0` environment rather than a Unity ML-Agents environment, so there are no **_brains_** to look up: the cell below simply creates the environment that we will control from Python.

In [2]:
# Create the Gym environment the agent will be trained and evaluated on.
env = gym.make('Pendulum-v0')

  result = entry_point.load(False)


### 2. Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [3]:
# Display the action space and the observation space as the cell output.
env.action_space, env.observation_space

(Box(1,), Box(3,))

In [4]:
# Length of the action and state vectors: the first entry of each space's
# shape, taken in the same order as the names on the left-hand side.
action_size, state_size = (
    space.shape[0] for space in (env.action_space, env.observation_space)
)

In [5]:
# Display the two sizes that are later passed to the Agent constructor.
action_size, state_size

(1, 3)

In [6]:
# Display the per-component lower and upper bounds of the observation vector.
env.observation_space.low, env.observation_space.high

(array([-1., -1., -8.], dtype=float32), array([1., 1., 8.], dtype=float32))

In [7]:
# Display the lower and upper bounds of the action vector.
env.action_space.low, env.action_space.high

(array([-2.], dtype=float32), array([2.], dtype=float32))

In [8]:
# Display the reward range advertised by the environment object.
env.reward_range

(-inf, inf)

### 3. Instantiate DDPG Agent

In [9]:
# --- Hyperparameters (forwarded to env.seed() and Agent(...) in later cells) ---
RANDOM_SEED = 10       # seed used for the environment and the agent
BUFFER_SIZE = 10 ** 5  # replay buffer size
BATCH_SIZE = 128       # minibatch size
GAMMA = 0.999          # discount factor
LR_D = 0.0001          # learning rate handed to Agent as lr_d
LR_G = 0.001           # learning rate handed to Agent as lr_g
H_SIZE = 400           # handed to Agent as h_size (presumably the hidden-layer width -- confirm in agent.py)

In [10]:
# Seed every random number generator this notebook imports, not only the
# environment, so that repeated runs are as repeatable as possible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Kept as the last expression so the cell still displays what env.seed() returns.
env.seed(RANDOM_SEED)

[10]

In [11]:
from agent import Agent

# Collect the constructor arguments first so the full configuration can be
# inspected on its own, then build the agent from it.
agent_kwargs = dict(
    s_size=state_size,
    a_size=action_size,
    h_size=H_SIZE,
    random_seed=RANDOM_SEED,
    buffer_size=BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    lr_d=LR_D,
    lr_g=LR_G,
)
agent = Agent(**agent_kwargs)

### 4. Train the Agent with DDPG

In [13]:
# --- Training-run settings ---
n_episodes = 10000  # upper bound on the number of training episodes
R_goal = 0  # TOCHECK: training stops once the running average score reaches this value
# NOTE(review): Gym documents Pendulum rewards as non-positive, so an average
# of 0 looks unreachable -- confirm the intended target.
Rs_deque = deque([], 100)  # total rewards of the most recent 100 episodes
list_episodes = list()  # one row of per-episode statistics, filled by the training loop

In [None]:
# Train for at most n_episodes episodes, stopping early once the running
# average of the episode returns held in Rs_deque reaches R_goal.

# Factor applied to the agent's raw action before it is sent to the
# environment. Taken from the environment's action bound instead of a
# hard-coded 2.
# NOTE(review): assumes agent.act() returns a NumPy array in [-1, 1] -- confirm in agent.py.
action_scale = env.action_space.high

for i_episode in range(n_episodes):

    s = env.reset()  # s: current state
    R = 0            # R: total reward collected in this episode
    list_steps = []  # per-step [gloss, dloss, reward, reward_in] returned by learning

    while True:
        # The agent receives a batch of one state. Its output is flattened and
        # rescaled out-of-place, so the array returned by act() is not mutated.
        a = np.reshape(agent.act(np.reshape(s, [1, -1])), [-1]) * action_scale
        s2, r, done, _ = env.step(a)

        # Store the transition with a flat state vector.
        agent.memory.add(np.reshape(s, [-1]), a, r, s2, done)

        R += r  # update the episode score
        s = s2  # roll over the state to the next time step

        # One learning call per environment step; keep its diagnostics.
        gloss, dloss, reward, reward_in = agent.start_learn()
        list_steps.append([gloss, dloss, reward, reward_in])

        # exit loop if episode finished
        if done:
            break

    Rs_deque.append(R)  # not a list but deque

    # Compute the episode statistics once and reuse them for logging and plotting.
    avg_score = np.mean(Rs_deque)
    step_means = np.mean(list_steps, axis=0)  # order: gloss, dloss, reward, reward_in

    print('\rEpisode {}\tTotal Average Score: {:.2f}'.format(i_episode+1, avg_score),
          'Gloss:{:.4f}'.format(step_means[0]),
          'Dloss:{:.4f}'.format(step_means[1]),
          'reward:{:.4f}'.format(step_means[2]),
          'reward_in:{:.4f}'.format(step_means[3]))

    # Plotting
    list_episodes.append([avg_score,
                          step_means[0],   # gloss
                          step_means[1],   # dloss
                          step_means[2],   # rewards mean
                          step_means[3]])  # reward_in mean

    if avg_score >= R_goal:
        break

# Save the weights whenever the loop ends, whether R_goal was reached or
# n_episodes ran out, so the files loaded by the evaluation cell always exist
# after a completed training run.
torch.save(agent.g.state_dict(), 'g-pendulum.pth')
torch.save(agent.d.state_dict(), 'd-pendulum.pth')

Episode 1	Total Average Score: -1215.13 Gloss:287.6008 Dloss:-0.0483 reward:-2.2470 reward_in:0.0000
Episode 2	Total Average Score: -1520.60 Gloss:894.4889 Dloss:-0.1282 reward:-6.9919 reward_in:0.0001
Episode 3	Total Average Score: -1341.53 Gloss:898.5991 Dloss:-0.1159 reward:-7.0353 reward_in:0.0001
Episode 4	Total Average Score: -1233.84 Gloss:813.7974 Dloss:-0.0955 reward:-6.3909 reward_in:0.0001
Episode 5	Total Average Score: -1314.84 Gloss:806.3221 Dloss:-0.0675 reward:-6.3560 reward_in:0.0001


### 5. Plot the result

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Stack the per-episode records into an array: one row per episode, columns in the
# order appended by the training loop (average score, gloss, dloss, reward, reward_in).
arr = np.array(list_episodes)

In [None]:
# Plot the running average score against the episode number.
# arr holds one row per episode, so the score series is the first COLUMN,
# arr[:, 0]. arr[0] would be the five statistics of the first episode only.
fig, ax = plt.subplots()

ax.plot(np.arange(1, len(arr) + 1), arr[:, 0])
ax.set_ylabel('total_average_scores')
ax.set_xlabel('Episode #')
plt.show()

### 6. Watch the agent running with saved weights

In [14]:
# Load the saved weights into Pytorch model
agent.g.load_state_dict(torch.load('g-pendulum.pth', map_location='cpu'))
agent.d.load_state_dict(torch.load('d-pendulum.pth', map_location='cpu'))

# env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
# S = env_info.vector_observations                  # get the current states (S) (for each agent)
s = env.reset() # s: state # get the current state (s) (for an agent)

#Rs = np.zeros(num_agents)                          # initialize the total scores (Rs) (for each agent)
R = 0 # R:total reward # initialize the total score (R) (for an agent)

while True:
    #A = agent.act(S)                        # select actions (A) from loaded model agents
    a = agent.act(np.reshape(s, [1, -1])) # a: [-1, +1]    # select action (a) from loaded model agent
    
    # env_info = env.step(A)[brain_name]           # send all actions (A) to tne environment (env)
    # S2 = env_info.vector_observations         # get next states (S2) (for each agent)
    # rewards = env_info.rewards                         # get rewards (for each agent)
    # dones = env_info.local_done                        # see if the episode is done/finished (terminal)
    s2, r, done, _ = env.step(np.reshape(a, [-1]))
    
    # Rs += env_info.rewards                         # update the total scores (Rs) (for each agent)
    # S = S2                               # roll over current states (S) to next states (S2)
    R += r # update the total score (R) (for an agent)
    s = s2 # roll over current state (s) to next state (s2)
    
    #if np.any(dones):                                  # exit loop if episode is done/finished
    if done: # exit loop if episode is done/finished (terminal)
        break
        
print('Average of total scores: {}'.format(R))

FileNotFoundError: [Errno 2] No such file or directory: 'g-pendulum.pth'

When finished, you can close the environment.

In [6]:
# Close the Gym environment once training and evaluation are finished.
env.close()