In [1]:

import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import sys, os
sys.path.append(os.path.abspath('..'))
from environment.grid import GridEnv, Action
from enum import Enum


%load_ext autoreload
%autoreload 2
from agent import Agent

def mean(lst):
    """Return the arithmetic mean of the values in ``lst``.

    ``lst`` must be a sized iterable of numbers; an empty one raises
    ``ZeroDivisionError``.
    """
    total = sum(lst)
    count = len(lst)
    return total / count


Not all _agents_ have fixed starting positions, rest (1) will be random
Not all _goals_ have fixed starting positions, rest (1) will be random

Final Self players:  [(5, 2)]
Final Self goals:  [(0, 4)]
[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [11.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]]
([[0.625, 0.375, 0.0, 0.5]], [-1], False)
----
[(5, 3)]
[(0, 4)]


In [2]:
# Build a three-agent grid world. Both start lists are empty, so the constructor
# reports that every agent and goal position will be chosen at random.
env = GridEnv(3, agents_start=[], goals_start=[])
# NOTE(review): the constructor has already reported the final positions before
# seed(0) runs — confirm whether seeding here affects the initial layout.
env.seed(0)
# Sizes used below to dimension the agents' networks.
print('Number of actions: ', env.action_space.n)
print('State space: ', env.state_space)


Not all _agents_ have fixed starting positions, rest (3) will be random
Not all _goals_ have fixed starting positions, rest (3) will be random

Final Self players:  [(5, 6), (7, 6), (0, 3)]
Final Self goals:  [(2, 4), (1, 4), (2, 2)]
Number of actions:  4
State space:  12


In [3]:


# Single agent sized from the environment's state/action dimensions; the
# demo cells below drive the environment with it.
agent = Agent(state_size=env.state_space, action_size=env.action_space.n, seed=0)

In [4]:
# watch an untrained agent
# The agent consumes one flat vector, so the per-agent observations returned by
# the environment are concatenated into a single list.
state = [item for sublist in env.reset() for item in sublist]
print(state)
for j in range(200):
    action = agent.act(state)
    # NOTE(review): a single action is submitted although env was built with
    # three agents — confirm env.step accepts a shorter action list.
    states, rewards, done = env.step([action])
    # Refresh the observation; without this the agent would keep acting on the
    # initial state for the whole rollout.
    state = [item for sublist in states for item in sublist]
    if done:
        break

env.close()

[0.875, 0.75, 0.125, 0.625, 0.0, 0.5, 0.25, 0.125, 0.75, 0.75, 0.625, 0.375]


In [5]:

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed and the original order is kept.
    """
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat


def dqn(agents, num_agents = 2, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        solved_score=200.0, checkpoint_path='checkpoint.pth'):
    """Deep Q-Learning for several agents acting in the module-level ``env``.

    Every agent receives the same flattened observation (all per-agent states
    concatenated), picks its own action, and learns from its own reward.

    Params
    ======
        agents (list): agents to train; each must provide ``act(state, eps)``,
            ``step(state, action, reward, next_state, done)`` and a
            ``qnetwork_local`` module
        num_agents (int): kept for backward compatibility; the score buffers are
            always sized from ``len(agents)``
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        solved_score (float): training stops once the mean score over the recent
            window (all agents pooled) reaches this value
        checkpoint_path (str): where the first agent's weights are written when
            training stops early; agent ``i > 0`` is written next to it with
            ``_i`` inserted before the file extension

    Returns
    =======
        list of lists: ``scores[i][k]`` is the total reward agent ``i`` collected
        in episode ``k``
    """
    # The buffers below are indexed by position in `agents`, so they must match
    # the list's length; a mismatching `num_agents` used to raise IndexError.
    num_agents = len(agents)
    scores = [[] for _ in range(num_agents)]                        # per-agent scores from each episode
    scores_window = [deque(maxlen=100) for _ in range(num_agents)]  # last 100 scores for each agent
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        states = env.reset()
        eps_scores = [0 for _ in range(num_agents)]
        for t in range(max_t):
            flat_states = flatten(states)
            actions = [learner.act(flat_states, eps) for learner in agents]
            next_states, rewards, done = env.step(actions)
            flat_next_states = flatten(next_states)
            for learner, action, reward in zip(agents, actions, rewards):
                learner.step(flat_states, action, reward, flat_next_states, done)
            states = next_states
            eps_scores = [score + reward for score, reward in zip(eps_scores, rewards)]
            if done:
                break

        for index in range(num_agents):
            scores_window[index].append(eps_scores[index])  # save most recent score
            scores[index].append(eps_scores[index])
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        avg_score = np.mean(scores_window)  # pooled over all agents and the window
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))
        if avg_score >= solved_score:
            # Clamp so that solving within the first 100 episodes does not
            # report a negative episode count.
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(max(i_episode-100, 0), avg_score))
            # Save the agents that were actually trained (not an unrelated
            # module-level `agent`). The first agent keeps the plain path so
            # existing loading code continues to work.
            root, ext = os.path.splitext(checkpoint_path)
            for index, learner in enumerate(agents):
                path = checkpoint_path if index == 0 else '{}_{}{}'.format(root, index, ext)
                torch.save(learner.qnetwork_local.state_dict(), path)
            break
    return scores

In [6]:
# One Agent per player in the grid; every instance is built with the same
# state/action sizes and the same seed.
num_agents = 3
agents = [
    Agent(state_size=env.state_space, action_size=env.action_space.n, seed=0)
    for _ in range(num_agents)
]

# Show the network architecture of the first agent.
print(agents[0].qnetwork_local)

QNetwork(
  (fc1): Linear(in_features=12, out_features=96, bias=True)
  (fc2): Linear(in_features=96, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=4, bias=True)
)


In [None]:
# Train all agents for up to 3000 episodes; `scores` holds one list of
# per-episode scores for each agent.
scores = dqn(agents, num_agents, n_episodes=3000)

In [None]:
import matplotlib.pyplot as plt

# Plot the first agent's score for each training episode.
# A single explicit axes is used for everything: previously the title was set
# before the subplot existed, so it went onto an implicitly created axes and a
# second axes was then added to the figure.
fig, ax = plt.subplots()
ax.set_title("Score per episode")
first_agent_scores = scores[0][:3000]
ax.plot(np.arange(len(first_agent_scores)), first_agent_scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# load the weights from file
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

for i in range(3):
    # env.reset() returns one observation per grid agent; the network expects a
    # single flat vector, so concatenate them.
    state = [item for sublist in env.reset() for item in sublist]
    for j in range(200):
        action = agent.act(state)
        env.render()
        # env.step takes a list of actions and returns (states, rewards, done);
        # unpacking four values or passing a bare action does not match that.
        states, rewards, done = env.step([action])
        state = [item for sublist in states for item in sublist]
        if done:
            break

env.close()