# Solution

## Initialization

In [1]:
from unityagents import UnityEnvironment
import numpy as np

In [3]:
# Start the Unity environment from the Reacher executable. The path is relative,
# so it is resolved against the notebook's current working directory.
env = UnityEnvironment(file_name='./Reacher_Linux_NoVis_One/Reacher.x86_64')

KeyboardInterrupt: 

In [None]:
# Use the first brain listed by the environment. Its name is the key used to
# pick this brain's info out of the results of env.reset() / env.step().
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Reset the environment in training mode and read the selected brain's info.
env_info = env.reset(train_mode=True)[brain_name]
# The rest of the notebook indexes [0] everywhere, i.e. it handles exactly one agent.
assert len(env_info.agents) == 1
# Dimensions handed to the Agent constructor.
action_size = brain.vector_action_space_size
# assumes vector_observations is 2-D with one row per agent, so shape[1] is the
# per-agent state length — TODO confirm
state_size = env_info.vector_observations.shape[1]

## DDPG

In [None]:
import random
import torch
from collections import deque
from datetime import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from ddpg.ddpg_agent import Agent

In [None]:
# Build the DDPG agent for the state/action sizes read from the environment.
# random_seed is fixed at 2; presumably it seeds the agent's RNGs for
# reproducibility — confirm in ddpg/ddpg_agent.py.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)

In [None]:
def run_ddpg(n_episodes=1000, max_t=1000, print_every=100):
    """Train the notebook-level ``agent`` on the notebook-level Unity ``env``.

    Relies on the globals ``env``, ``brain_name`` and ``agent`` defined in
    earlier cells. Only the first agent's observation, reward and done flag
    are used (index ``[0]``).

    Args:
        n_episodes: Number of training episodes to run.
        max_t: Maximum number of environment steps per episode.
        print_every: Size of the rolling score window; a persistent progress
            line is also printed every ``print_every`` episodes.

    Returns:
        list: The total (undiscounted) reward of each episode, in order.

    Side effects:
        Overwrites ``checkpoints/checkpoint_actor.pth`` and
        ``checkpoints/checkpoint_critic.pth`` after every episode, and writes
        one extra timestamped pair the first time the rolling average over a
        *full* window reaches the target score of 30.
    """
    import os  # local import: keeps this cell runnable on its own

    # torch.save does not create missing parent directories.
    os.makedirs('checkpoints', exist_ok=True)

    solved = False
    target_score = 30
    scores_deque = deque(maxlen=print_every)  # rolling window of recent scores
    scores = []
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        # Reset the agent once per episode only. The original also called
        # agent.reset() on every timestep, which would throw away whatever
        # per-episode state it maintains (presumably the exploration-noise
        # process — confirm in ddpg_agent.py) before each action.
        agent.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state)

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            agent.step(state, action, reward, next_state, done)

            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        avg_score = np.mean(scores_deque)

        # '\r' + end="" rewrites the same output line on every episode.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
        torch.save(agent.actor_local.state_dict(), 'checkpoints/checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoints/checkpoint_critic.pth')
        if i_episode % print_every == 0:
            # Same line again, but terminated with a newline so it persists.
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))

        # Only declare the task solved once the window is full; otherwise a
        # single lucky episode triggers it and the reported episode count
        # (i_episode - print_every) goes negative.
        window_full = len(scores_deque) == scores_deque.maxlen
        if window_full and avg_score >= target_score and not solved:
            solved = True
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-print_every, avg_score))
            timestamp = dt.now().strftime("%Y-%m-%d_%H:%M:%S")
            torch.save(agent.actor_local.state_dict(), f'checkpoints/checkpoint_actor_{timestamp}.pth')
            torch.save(agent.critic_local.state_dict(), f'checkpoints/checkpoint_critic_{timestamp}.pth')

    return scores

In [None]:
%%time
# Train for 200 episodes; the returned per-episode scores feed the plot below.
scores = run_ddpg(n_episodes=200)

In [None]:
# Plot the score of every training episode against its 1-based episode number.
fig, ax = plt.subplots()
episode_numbers = np.arange(1, len(scores) + 1)
ax.plot(episode_numbers, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()