In [None]:
from unityagents import UnityEnvironment
import numpy as np
import torch
from collections import deque
import matplotlib.pyplot as plt
from DDPGAgent import Agent

# Launch the Reacher environment in training mode and select its default
# (first) brain.
# NOTE(review): the earlier comment described this as the HEADLESS build; the
# path alone does not show that — confirm which binary is installed here.
env = UnityEnvironment(file_name="Reacher_Linux/Reacher.x86_64")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# Reset once more and capture the per-agent starting observations.
# NOTE(review): ddpg() resets the environment itself and keeps its own local
# score list, so these module-level values look unused by the training loop;
# they are kept in case other notebook cells read them.
env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
states = env_info.vector_observations  # get the current state (for each agent)
scores = np.zeros(num_agents)  # initialize the score (for each agent)
count = 0


# Create the agent. The network input/output sizes come from the values read
# from the environment above instead of hard-coded literals, so the agent
# cannot silently disagree with the environment it is trained on.
agent = Agent(state_size=state_size, action_size=action_size, seed=0, add_noise=False)


# Define the DDPG training loop
def ddpg(n_episodes=300, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the module-level ``agent`` on the module-level ``env`` with DDPG.

    Each episode resets the environment, then alternates ``agent.act`` and
    ``env.step`` until the first agent reports ``done`` or ``max_t`` steps
    have been taken. Only index 0 of the reward and done lists is used.
    Training stops early, saving the actor and critic weights to
    ``my_actor.pth`` / ``my_critic.pth``, once the mean of the most recent
    (at most 100) episode scores reaches 30.0 after episode 50.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Epsilon is decayed and printed each episode but is not passed to the
    agent, so it does not influence action selection in this function.

    Returns
    =======
        list: the total reward collected in each episode that was played
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        agent.reset()
        score = 0
        # Bound the episode by max_t so a run cannot spin forever if the
        # environment never reports done.
        for _ in range(max_t):
            action = agent.act(state, random=False)
            env_info = env.step(action)[brain_name]
            next_state, reward, done = env_info.vector_observations, env_info.rewards, env_info.local_done

            agent.step(state, action, reward[0], next_state, done[0])
            state = next_state
            score += reward[0]
            if done[0]:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        avg_score = np.mean(scores_window)  # computed once, reused below
        print('\rEpisode {}\tAverage Score: {:.2f}\tepsilon: {:.2f}'.format(i_episode, avg_score, eps), end="")
        if i_episode % 100 == 0:
            print('\nEpisode {}\tAverage Score: {:.2f}\tepsilon: {:.2f}'.format(i_episode, avg_score, eps), end="")
        if avg_score >= 30.0 and i_episode > 50:
            # Report the episode at which the averaging window started. The
            # window can hold fewer than 100 scores here (the check is active
            # from episode 51), so subtract its real length rather than a
            # fixed 100, which would print a negative count.
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - len(scores_window), avg_score))
            torch.save(agent.critic.state_dict(), 'my_critic.pth')
            torch.save(agent.actor.state_dict(), 'my_actor.pth')
            break
    return scores


# Run training; the result holds one total-reward value per episode.
scores = ddpg()

# Training is finished, so shut the Unity environment down.
env.close()

# Draw the learning curve: episode index on x, episode score on y.
fig = plt.figure()
ax = fig.add_subplot(111)
episode_indices = np.arange(len(scores))
ax.plot(episode_indices, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

# Persist the scores as plain text, one fixed-point value per line.
with open('scores.txt', 'w') as f:
    f.write("".join("%f\n" % item for item in scores))



