# Continuous Control

---

You are welcome to use this coding environment to train your agent for the project.  Follow the instructions below to get started!

### 1. Start the Environment

In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
env = gym.make('Pendulum-v0')

random_seed = 10
env.seed(random_seed)

# from agent import Agent
# agent = Agent(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0], 
#               random_seed=random_seed)

  result = entry_point.load(False)


[10]

### 2. Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [3]:
env.action_space, env.observation_space

(Box(1,), Box(3,))

In [4]:
action_size = env.action_space.shape[0]
state_size = env.observation_space.shape[0]

In [5]:
action_size, state_size

(1, 3)

In [6]:
env.observation_space.low, env.observation_space.high

(array([-1., -1., -8.], dtype=float32), array([1., 1., 8.], dtype=float32))

In [7]:
env.action_space.low, env.action_space.high

(array([-2.], dtype=float32), array([2.], dtype=float32))

### 3. Instantiate DDPG Agent

In [8]:
from agent import Agent

agent = Agent(s_size=state_size, a_size=action_size, random_seed=random_seed)

### 4. Train the Agent with DDPG

In [13]:
from collections import deque
import numpy as np
import time
import torch

n_episodes=1000
# n_steps=1000
# n_learn=10
# learn_every=20
R_goal=-100 # TOCHECK
Rs_deque = deque(maxlen=100)
meanRs = [] # plot
Rs = [] # plot

In [None]:
for i_episode in range(n_episodes):
    
    # # Reset Env
    # env_info = env.reset(train_mode=train_mode)[brain_name]       # reset the environment    
    # S = env_info.vector_observations                   # get the current state (for each agent)
    s = env.reset() # s: state [-1:1, -1:1, -8:8]
    s_ = np.zeros_like(s) # s_: predicted state [-1:1, -1:1, -8:8]
    #print(s.shape)

    #Rs = np.zeros(num_agents)                            # initialize the score (for each agent)
    R = 0 # R: total reward

    #for i_step in range(n_steps):
    while True:
        s = np.reshape(s, [1, -1])
        s_ = np.reshape(s_, [1, -1])
        a = agent.act(s, s_) # a=[-2, 2]
        a *= 2
        #print(a.shape)
        
        # Exploration noise
        # noise = np.random.normal(loc=0, scale=0.1, size=[num_agents, action_size])
        # A += noise
        # A = np.clip(a=A, a_min=-1, a_max=1)

        # env_info = env.step(A)[brain_name]            # send all actions to the environment
        # S2 = env_info.vector_observations          # get next state (for each agent)
        # rewards = env_info.rewards                          # get reward (for each agent)
        # dones = env_info.local_done                         # see if episode finished
        #s2: next_state, r: reward, done/terminal, env: environment
        s2, r, done, _ = env.step(np.reshape(a, [-1]))
        #print(s2.shape, r, done)

        #s2_, q_ = agent.env(s, a)
        s2_, _ = agent.env(s, a)
        # print(s2_)
        # s2_ = np.tanh(s2_) # predicted next state [-1:1, -1:1, -8:8]
        # print(s2_)
        # s2_[2] = s2_[2] * 8
        # print(s2_)

        # for s, a, reward, s2, done in zip(S, A, rewards, S2, dones):
        # agent.step(state, action, reward, next_state, done) # send actions to the agent
        agent.step(s, a, r, s2, done, s_, s2_)

        R += r                           # update the score (for each agent)
        s = s2                                # roll over the states to the next state
        s_ = s2_                                # roll over predicted states to the predicted next state

        # if i_step%learn_every == 0:
        #     for _ in range(n_learn):
        #         agent.start_learn()
        #print(len(agent.memory))
        agent.start_learn()

        # if np.any(dones):                                   # exit loop if episode finished
        #     break
        if done:
            break

    #meanR = np.mean(Rs)
    #Rs_deque.append(meanR)
    Rs_deque.append(R)
    meanRs.append(np.mean(Rs_deque)) # plot
    Rs.append(R)

    print('\rEpisode {}\tTotal Average Score: {:.2f}'.format(i_episode+1, np.mean(Rs_deque)))  
    if np.mean(Rs_deque) >= R_goal:
        torch.save(agent.g.state_dict(), 'g-pendulum.pth') # pendulum
        torch.save(agent.d.state_dict(), 'd-pendulum.pth') # pendulum
        break

Episode 1	Total Average Score: -1856.08
Episode 2	Total Average Score: -1376.73
Episode 3	Total Average Score: -1118.66
Episode 4	Total Average Score: -1235.17
Episode 5	Total Average Score: -1302.35
Episode 6	Total Average Score: -1320.18
Episode 7	Total Average Score: -1257.19
Episode 8	Total Average Score: -1239.40
Episode 9	Total Average Score: -1227.09
Episode 10	Total Average Score: -1208.21
Episode 11	Total Average Score: -1196.78
Episode 12	Total Average Score: -1212.01
Episode 13	Total Average Score: -1206.11
Episode 14	Total Average Score: -1199.92
Episode 15	Total Average Score: -1201.48
Episode 16	Total Average Score: -1196.17
Episode 17	Total Average Score: -1196.49
Episode 18	Total Average Score: -1190.70
Episode 19	Total Average Score: -1184.07
Episode 20	Total Average Score: -1183.65
Episode 21	Total Average Score: -1184.02
Episode 22	Total Average Score: -1184.30
Episode 23	Total Average Score: -1174.23
Episode 24	Total Average Score: -1174.51
Episode 25	Total Average 

Episode 199	Total Average Score: -1190.97
Episode 200	Total Average Score: -1187.76
Episode 201	Total Average Score: -1187.63
Episode 202	Total Average Score: -1186.37
Episode 203	Total Average Score: -1184.47
Episode 204	Total Average Score: -1182.96
Episode 205	Total Average Score: -1181.29
Episode 206	Total Average Score: -1180.44
Episode 207	Total Average Score: -1177.81
Episode 208	Total Average Score: -1174.51
Episode 209	Total Average Score: -1173.36
Episode 210	Total Average Score: -1176.09
Episode 211	Total Average Score: -1173.67
Episode 212	Total Average Score: -1172.07
Episode 213	Total Average Score: -1175.16
Episode 214	Total Average Score: -1178.35
Episode 215	Total Average Score: -1178.84
Episode 216	Total Average Score: -1179.23
Episode 217	Total Average Score: -1177.85
Episode 218	Total Average Score: -1181.48
Episode 219	Total Average Score: -1178.89
Episode 220	Total Average Score: -1177.79
Episode 221	Total Average Score: -1181.66
Episode 222	Total Average Score: -

Episode 395	Total Average Score: -1098.47
Episode 396	Total Average Score: -1099.76
Episode 397	Total Average Score: -1100.26
Episode 398	Total Average Score: -1100.34
Episode 399	Total Average Score: -1103.50
Episode 400	Total Average Score: -1105.89
Episode 401	Total Average Score: -1108.25
Episode 402	Total Average Score: -1109.65
Episode 403	Total Average Score: -1111.37
Episode 404	Total Average Score: -1114.64
Episode 405	Total Average Score: -1116.04
Episode 406	Total Average Score: -1118.50
Episode 407	Total Average Score: -1119.84
Episode 408	Total Average Score: -1124.47
Episode 409	Total Average Score: -1124.93
Episode 410	Total Average Score: -1129.40
Episode 411	Total Average Score: -1131.65
Episode 412	Total Average Score: -1133.86
Episode 413	Total Average Score: -1134.52
Episode 414	Total Average Score: -1135.02
Episode 415	Total Average Score: -1138.44
Episode 416	Total Average Score: -1141.09
Episode 417	Total Average Score: -1142.69
Episode 418	Total Average Score: -

Episode 591	Total Average Score: -1119.38
Episode 592	Total Average Score: -1118.26
Episode 593	Total Average Score: -1116.91
Episode 594	Total Average Score: -1110.88
Episode 595	Total Average Score: -1116.11
Episode 596	Total Average Score: -1110.63
Episode 597	Total Average Score: -1105.31
Episode 598	Total Average Score: -1105.21
Episode 599	Total Average Score: -1109.27
Episode 600	Total Average Score: -1104.52
Episode 601	Total Average Score: -1104.33
Episode 602	Total Average Score: -1102.73
Episode 603	Total Average Score: -1108.08
Episode 604	Total Average Score: -1108.10
Episode 605	Total Average Score: -1102.59
Episode 606	Total Average Score: -1101.40
Episode 607	Total Average Score: -1100.54
Episode 608	Total Average Score: -1105.71
Episode 609	Total Average Score: -1111.20
Episode 610	Total Average Score: -1116.56
Episode 611	Total Average Score: -1116.27
Episode 612	Total Average Score: -1116.21
Episode 613	Total Average Score: -1110.69
Episode 614	Total Average Score: -

Episode 787	Total Average Score: -989.36
Episode 788	Total Average Score: -980.07
Episode 789	Total Average Score: -979.99
Episode 790	Total Average Score: -979.13
Episode 791	Total Average Score: -985.91
Episode 792	Total Average Score: -993.49
Episode 793	Total Average Score: -984.48
Episode 794	Total Average Score: -983.75
Episode 795	Total Average Score: -984.58
Episode 796	Total Average Score: -977.92
Episode 797	Total Average Score: -977.95
Episode 798	Total Average Score: -978.96
Episode 799	Total Average Score: -977.85
Episode 800	Total Average Score: -970.85
Episode 801	Total Average Score: -971.68
Episode 802	Total Average Score: -970.39
Episode 803	Total Average Score: -975.90
Episode 804	Total Average Score: -975.54
Episode 805	Total Average Score: -975.27
Episode 806	Total Average Score: -968.13
Episode 807	Total Average Score: -968.06
Episode 808	Total Average Score: -974.54
Episode 809	Total Average Score: -965.73
Episode 810	Total Average Score: -957.79
Episode 811	Tota

### 5. Plot the result

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
fig = plt.figure()
ax = fig.add_subplot(1,1,1)

plt.plot(np.arange(1, len(meanRs)+1), meanRs)
plt.ylabel('total_average_scores')
plt.xlabel('Episode #')
plt.show()

### 6. Watch the agent running with saved weights

In [14]:
# Load the saved weights into Pytorch model
agent.g.load_state_dict(torch.load('g-pendulum.pth', map_location='cpu'))
agent.d.load_state_dict(torch.load('d-pendulum.pth', map_location='cpu'))

# env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
# S = env_info.vector_observations                  # get the current states (S) (for each agent)
s = env.reset() # s: state # get the current state (s) (for an agent)

#Rs = np.zeros(num_agents)                          # initialize the total scores (Rs) (for each agent)
R = 0 # R:total reward # initialize the total score (R) (for an agent)

while True:
    #A = agent.act(S)                        # select actions (A) from loaded model agents
    a = agent.act(np.reshape(s, [1, -1])) # a: [-1, +1]    # select action (a) from loaded model agent
    
    # env_info = env.step(A)[brain_name]           # send all actions (A) to tne environment (env)
    # S2 = env_info.vector_observations         # get next states (S2) (for each agent)
    # rewards = env_info.rewards                         # get rewards (for each agent)
    # dones = env_info.local_done                        # see if the episode is done/finished (terminal)
    s2, r, done, _ = env.step(np.reshape(a, [-1]))
    
    # Rs += env_info.rewards                         # update the total scores (Rs) (for each agent)
    # S = S2                               # roll over current states (S) to next states (S2)
    R += r # update the total score (R) (for an agent)
    s = s2 # roll over current state (s) to next state (s2)
    
    #if np.any(dones):                                  # exit loop if episode is done/finished
    if done: # exit loop if episode is done/finished (terminal)
        break
        
print('Average of total scores: {}'.format(R))

FileNotFoundError: [Errno 2] No such file or directory: 'g-pendulum.pth'

When finished, you can close the environment.

In [6]:
env.close()