In [13]:

import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import sys, os
sys.path.append(os.path.abspath('..'))
from environment.grid import GridEnv, Action
from enum import Enum

%load_ext autoreload
%autoreload 2

def mean(lst):
    return sum(lst) / len(lst)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
env = GridEnv(2)
env.seed(0)
print('Number of actions: ', env.action_space.n)


Number of actions:  4


In [19]:
from agent import Agent

agent = Agent(state_size=env.state_space, action_size=env.action_space.n, seed=0)

# watch an untrained agent
state = env.reset()[0]
for j in range(200):
    action = agent.act(state)
    states, rewards, done = env.step([action])
    state, reward = states[0], rewards[0]
    if done:
        break 
        
env.close()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x4 and 8x8)

In [26]:
num_agents = 2
agents = [Agent(state_size=env.state_space, action_size=env.action_space.n, seed=0) for _ in np.arange(num_agents)]
def flatten(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]


def dqn(agents, num_agents = 2, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = [[] for _ in np.arange(num_agents)]                       # list containing scores from each episode
    scores_window = [deque(maxlen=100)  for _ in np.arange(num_agents)] # last 100 scores for each agent
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        states = env.reset()
        eps_scores = [0 for _ in np.arange(num_agents)]
        for t in range(max_t):
            actions = [agent.act(flatten(states), eps) for agent in agents]
            next_states, rewards, done = env.step(actions)
            [agent.step(flatten(states), action, reward, flatten(next_states), done) for agent, action, reward in zip(agents, actions, rewards)]
            states = next_states
            eps_scores = [score + reward for score, reward in zip(eps_scores, rewards)]
            if done:
                break 

        [scores_window[index].append(eps_scores[index]) for index, _ in enumerate(agents)]       # save most recent score
        [scores[index].append(eps_scores[index]) for index, _ in enumerate(agents)]       # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=200.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return scores

scores = dqn(agents, 2)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

[-48, -37]
[deque([], maxlen=100), deque([], maxlen=100)]
Episode 1	Average Score: -42.50[-47, -58]
[deque([-48], maxlen=100), deque([-37], maxlen=100)]
Episode 2	Average Score: -47.50[-10, -21]
[deque([-48, -47], maxlen=100), deque([-37, -58], maxlen=100)]
Episode 3	Average Score: -36.83[-55, -44]
[deque([-48, -47, -10], maxlen=100), deque([-37, -58, -21], maxlen=100)]
Episode 4	Average Score: -40.00[-9, 2]
[deque([-48, -47, -10, -55], maxlen=100), deque([-37, -58, -21, -44], maxlen=100)]
Episode 5	Average Score: -32.70[-230, -241]
[deque([-48, -47, -10, -55, -9], maxlen=100), deque([-37, -58, -21, -44, 2], maxlen=100)]
Episode 6	Average Score: -66.50[-6, 5]
[deque([-48, -47, -10, -55, -9, -230], maxlen=100), deque([-37, -58, -21, -44, 2, -241], maxlen=100)]
Episode 7	Average Score: -57.07[-72, -83]
[deque([-48, -47, -10, -55, -9, -230, -6], maxlen=100), deque([-37, -58, -21, -44, 2, -241, 5], maxlen=100)]
Episode 8	Average Score: -59.62[-337, -348]
[deque([-48, -47, -10, -55, -9, -23

KeyboardInterrupt: 

In [None]:
# load the weights from file
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

for i in range(3):
    state = env.reset()
    for j in range(200):
        action = agent.act(state)
        env.render()
        state, reward, done, _ = env.step(action)
        if done:
            break 
            
env.close()