In [None]:
from unityagents import UnityEnvironment
import numpy as np

In [None]:
# Launch the Unity Tennis environment from the Windows build that sits next to
# this notebook (path is relative to the working directory).
tennis_executable = "Tennis_Windows_x86_64/Tennis.exe"
env = UnityEnvironment(file_name=tennis_executable)

In [None]:
# Use the first brain registered with the environment as the default one.
# `brain_name` is the key used below to index the results of env.reset()/env.step().
default_brain_index = 0
brain_name = env.brain_names[default_brain_index]
brain = env.brains[brain_name]

In [None]:
# Reset the environment (train_mode=True) and keep the info for the default brain.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the problem dimensions up front:
#   - one entry in env_info.agents per agent
#   - the brain reports the size of a single action
#   - observations hold one row per agent and one column per state feature
states = env_info.vector_observations
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size = states.shape[1]

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

## Take Random Actions

In [None]:
# Play a handful of episodes with purely random actions to see the raw
# interaction loop (actions and observations are printed at every step).
n_random_episodes = 5
for i in range(1, n_random_episodes + 1):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations        # current state of every agent
    scores = np.zeros(num_agents)                # running score of every agent

    episode_finished = False
    while not episode_finished:
        # Draw standard-normal actions and clamp them into [-1, 1].
        actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
        print ("Actions: ", actions)

        # Send the actions for all agents to the environment.
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        print ("Next state: ", next_states)

        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards

        # Roll the state forward; stop as soon as any agent reports done.
        states = next_states
        episode_finished = np.any(dones)
    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

In [None]:
import torch
from MADDPG import MADDPG
# Initialize the agents
agents = MADDPG(state_size, action_size, random_seed = 1)

# Run the freshly initialised agents for 29 episodes (range(1, 30)), feeding
# every transition to agents.step(), and report the episode length and score.
for i in range(1, 30):
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
    states = env_info.vector_observations                  # get the current state (for each agent)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)

    # `t` counts the environment steps taken in this episode (1-based). It is
    # left untouched inside the loop so that the value printed after the loop
    # is exactly the number of steps executed.
    for t in range(1, 1000000):
        actions = agents.act(states)

        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += rewards                                  # update the score (for each agent)

        # Hand the transition to the agents.
        agents.step(states, actions, rewards, next_states, dones)

        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print ("Timesteps: ", t)
    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

## Train the agents

In [None]:
import numpy as np
import random
import torch
from collections import deque
from MADDPG import MADDPG

agents = MADDPG(state_size, action_size, random_seed = 2)

n_episodes = 15000      # upper bound on the number of training episodes
rand_episodes = 1200    # length (in episodes) of the initial exploration phase

scores_list = []                    # max-over-agents score of every episode
scores_window = deque(maxlen=100)   # the same scores, last 100 episodes only


def save_checkpoints(maddpg):
    """Save the local actor and critic weights of both agents.

    The four files are written to the current working directory and overwrite
    any checkpoint previously stored under the same name.
    """
    torch.save(maddpg.agent1.actor_local.state_dict(), 'checkpoint_agent1_actor.pth')
    torch.save(maddpg.agent1.critic_local.state_dict(), 'checkpoint_agent1_critic.pth')

    torch.save(maddpg.agent2.actor_local.state_dict(), 'checkpoint_agent2_actor.pth')
    torch.save(maddpg.agent2.critic_local.state_dict(), 'checkpoint_agent2_critic.pth')


# from utils import keep_awake
# for i_episode in keep_awake(range(n_episodes)):
for i_episode in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)

    while True:
        # Exploration schedule (rand=True presumably makes MADDPG.act return
        # random actions -- confirm against MADDPG.act):
        #   * episodes 1 .. rand_episodes-1: always rand=True
        #   * episodes up to 1.75*rand_episodes: rand=True on a per-step draw;
        #     randint(1, 10) yields 1..9, so the chance is 5/9
        #   * afterwards: always the agents' default act()
        if i_episode < rand_episodes:
            actions = agents.act(states, rand = True)
        elif i_episode < rand_episodes*1.75 and np.random.randint(1, 10) <= 5:
            actions = agents.act(states, rand = True)
        else:
            actions = agents.act(states)

        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += rewards                                  # update the score (for each agent)

        # Learning is enabled in every phase, including the exploration one.
        agents.step(states, actions, rewards, next_states, dones, learn = True)

        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break

    episode_score = np.max(scores)
    scores_list.append(episode_score)
    scores_window.append(episode_score)
    average_score = np.mean(scores_window)

    # Progress line, overwritten in place; shows the rolling average so that
    # the value matches its "Average Score" label.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

    if i_episode % 100 == 0:
        save_checkpoints(agents)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

    # Only declare success once the window actually holds a full 100 episodes;
    # otherwise a single lucky early episode would stop training.
    if len(scores_window) == scores_window.maxlen and average_score > 0.5:
        save_checkpoints(agents)
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average_score))
        break

# PLOT THE SCORES
import matplotlib.pyplot as plt
%matplotlib inline

score = scores_list
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(score)), score)
plt.ylabel('Score')
plt.xlabel('Episode Number')
plt.show()

## Watch trained agents

In [None]:
import torch
from collections import deque
from MADDPG import MADDPG

# Build a fresh MADDPG wrapper and restore the weights from the checkpoint
# files written by the training cell (actor and critic of each agent).
agents = MADDPG(state_size, action_size, random_seed = 1)

checkpoint_targets = (
    (agents.agent1.actor_local, 'checkpoint_agent1_actor.pth'),
    (agents.agent1.critic_local, 'checkpoint_agent1_critic.pth'),
    (agents.agent2.actor_local, 'checkpoint_agent2_actor.pth'),
    (agents.agent2.critic_local, 'checkpoint_agent2_critic.pth'),
)
for network, checkpoint_file in checkpoint_targets:
    network.load_state_dict(torch.load(checkpoint_file))

# Let the restored agents play a few episodes and print each episode's score.
n_watch_episodes = 5
for i_episode in range(1, n_watch_episodes + 1):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)

    episode_finished = False
    while not episode_finished:
        actions = agents.act(states)

        # Send the actions for all agents to the environment.
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards

        # Roll the state forward; stop as soon as any agent reports done.
        states = next_states
        episode_finished = np.any(dones)

    print("Episode", i_episode, "Score is", np.max(scores))

In [None]:
# Close the environment handle (`env`) once the notebook is finished with it.
env.close()