In [106]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import heapq

In [75]:
# Seed the global NumPy and PyTorch generators so the random parameter
# perturbations drawn later are repeatable after Restart & Run All.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

env = gym.make('MountainCarContinuous-v0')
# Classic Gym environments expose `seed`; the guard skips the call on
# versions that do not provide it.
if hasattr(env, 'seed'):
    env.seed(SEED)

# Summarise the observation/action spaces and the compute device.
print('observation space:', env.observation_space)
print('action space:', env.action_space)
print('  - low:', env.action_space.low)
print('  - high:', env.action_space.high)
print('Training on {}.'.format(device))

observation space: Box(2,)
action space: Box(1,)
  - low: [-1.]
  - high: [1.]
Training on cpu.




In [149]:
class Agent(nn.Module):
    """Small two-layer policy network whose parameters are set from NumPy arrays.

    The network maps an observation to an action through one ReLU hidden layer
    and a tanh output layer. Parameters are never trained by back-propagation
    here: candidate parameter sets are sampled with `populate`, loaded with
    `assign_weights` and scored with `evaluate`.

    A parameter set is a 1-D object array of four NumPy arrays, in this order:
    fc1 weight (h_size, s_size), fc2 weight (a_size, h_size),
    fc1 bias (h_size,), fc2 bias (a_size,).

    Args:
        env: environment providing `observation_space.shape`,
            `action_space.shape`, `reset()` and `step(action)`.
        h_size: number of hidden units.
    """

    def __init__(self, env, h_size=16):
        super(Agent, self).__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)

    def forward(self, x):
        """Return the action for observation tensor `x` as a detached CPU tensor."""
        # No gradients are ever needed, so skip building the autograd graph.
        with torch.no_grad():
            x = F.relu(self.fc1(x))
            # torch.tanh replaces the deprecated F.tanh.
            x = torch.tanh(self.fc2(x))
        return x.detach().cpu()

    def populate(self, best_wb, noise):
        """Sample a parameter set around `best_wb` with Gaussian noise.

        Args:
            best_wb: indexable with four entries (fc1 weight, fc2 weight,
                fc1 bias, fc2 bias); each entry is an array of the matching
                shape or a scalar that broadcasts against it.
            noise: standard deviation multiplier for the perturbation.

        Returns:
            1-D object array of length 4 holding the perturbed arrays.
        """
        shapes = (
            (self.h_size, self.s_size),  # fc1 weight
            (self.a_size, self.h_size),  # fc2 weight
            (self.h_size,),              # fc1 bias
            (self.a_size,),              # fc2 bias
        )
        # The four arrays have different shapes. Building the container with
        # np.array((...)) relies on implicit ragged-array creation, which
        # recent NumPy versions reject, so fill an object array explicitly.
        candidate = np.empty(len(shapes), dtype=object)
        for idx, shape in enumerate(shapes):
            candidate[idx] = best_wb[idx] + noise * np.random.randn(*shape)
        return candidate

    def assign_weights(self, new_weights):
        """Copy a parameter set (same layout as `populate` returns) into the layers."""
        targets = (self.fc1.weight, self.fc2.weight, self.fc1.bias, self.fc2.bias)
        for idx, param in enumerate(targets):
            values = np.array(new_weights[idx], dtype=float)
            # copy_ converts to the parameter's dtype and device.
            param.data.copy_(torch.from_numpy(values))

    def evaluate(self, weights, gamma=1.0, max_t=5000):
        """Load `weights`, run one episode and return its discounted return.

        Uses the environment API visible in this method: `reset()` returns the
        observation array and `step(action)` returns a 4-tuple
        `(observation, reward, done, info)`.

        Args:
            weights: parameter set to load before the rollout.
            gamma: discount factor applied as gamma**t to the reward at step t.
            max_t: maximum number of environment steps.

        Returns:
            Sum of discounted rewards collected until `done` or `max_t` steps.
        """
        self.assign_weights(weights)
        # Run on whatever device the parameters live on instead of relying on
        # a module-level `device` variable.
        param_device = self.fc1.weight.device
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            state = torch.from_numpy(state).float().to(param_device)
            # Hand the environment a NumPy array rather than a torch tensor.
            action = self.forward(state).numpy()
            state, reward, done, _ = self.env.step(action)
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return

In [248]:
def cem(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, topk=10, noise=0.5):
    """Optimise the module-level `agent` with the cross-entropy method.

    Each iteration samples `pop_size` parameter sets around the current best,
    evaluates each for one episode, averages the `topk` highest-return sets
    into the new best, and records the undiscounted return of that average.

    Args:
        n_iterations: maximum number of iterations.
        max_t: maximum number of environment steps per episode.
        gamma: discount factor used when ranking the sampled candidates.
        print_every: print the running average every this many iterations.
        pop_size: number of candidates sampled per iteration.
        topk: number of best candidates averaged into the new parameters.
        noise: standard deviation multiplier of the sampling noise.

    Returns:
        List with the score of the averaged parameters after each iteration.

    Raises:
        ValueError: if `topk` is not in the range 1..pop_size.
    """
    # A slice of [-0:] would silently select the whole population.
    if not 1 <= topk <= pop_size:
        raise ValueError('topk must be between 1 and pop_size')

    scores_deque = deque(maxlen=100)  # window for the running average
    scores = []
    best_weight = agent.populate([0, 0, 0, 0], noise=1)

    for i_iteration in range(1, n_iterations + 1):
        weights_pop = [agent.populate(best_weight, noise) for _ in range(pop_size)]
        rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop])

        # argsort is ascending, so the last `topk` indices are the best ones.
        elite_idxs = rewards.argsort()[-topk:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        best_weight = np.array(elite_weights).mean(axis=0)

        # Score the averaged parameters undiscounted, with the same step
        # limit the candidates were given.
        reward = agent.evaluate(best_weight, gamma=1.0, max_t=max_t)
        scores_deque.append(reward)
        scores.append(reward)

        average_score = np.mean(scores_deque)
        if i_iteration % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, average_score))

        if average_score >= 90.0:
            # Report where the averaging window started; clamp so an early
            # solve does not print a negative iteration count.
            print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(max(i_iteration - 100, 0), average_score))
            break
    return scores

In [249]:
# State, hidden-layer and action sizes.
s_size, h_size, a_size = 2, 16, 1



In [None]:
# Build a fresh policy network, move it to the selected device, then run
# the cross-entropy search with its default hyperparameters.
agent = Agent(env)
agent = agent.to(device)
scores = cem()

Episode 10	Average Score: -4.13
Episode 20	Average Score: -3.23
Episode 30	Average Score: -3.24
Episode 40	Average Score: -3.04
Episode 50	Average Score: -2.88
Episode 60	Average Score: -2.75
Episode 70	Average Score: -2.73


In [None]:
# Plot the per-iteration score returned by cem().
fig, ax = plt.subplots()
iteration_numbers = np.arange(1, len(scores) + 1)
ax.plot(iteration_numbers, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Fresh agent and the hyperparameters used by the step-by-step cells below.
agent = Agent(env).to(device)
noise, pop_size, gamma, max_t = 0.5, 50, 1.0, 1000

In [200]:
# Draw a starting parameter set (unit noise around zero), then sample
# `pop_size` perturbed candidates around it.
best_weight = agent.populate([0, 0, 0, 0], noise=1)
weights_pop = [
    agent.populate(best_weight, noise)
    for _ in range(pop_size)
]

In [201]:
# One episode per candidate; collect the returns in population order.
rewards = np.array([
    agent.evaluate(candidate, gamma, max_t)
    for candidate in weights_pop
])

In [229]:
# Spot-check the return obtained by the candidate at index 2.
rewards[2]

-0.2305362998429407

In [243]:
# Indices of the ten highest-return candidates (ascending by return).
elite_idxs = np.argsort(rewards)[-10:]
elite_idxs

array([33, 39, 25,  0, 41, 18, 27, 21,  2, 43])

In [244]:
# Gather the elite parameter sets and average them element-wise.
elite_weights = [weights_pop[idx] for idx in elite_idxs]
best_weight = np.array(elite_weights).mean(axis=0)

In [245]:
# Undiscounted return of the averaged elite parameters (evaluate's default
# max_t applies here, not the max_t variable set above).
agent.evaluate(best_weight, gamma=1.0)

-42.85210447035296

In [237]:
# Undiscounted return of one individual candidate, for comparison with the
# averaged parameters. NOTE(review): index 21 is hard-coded from the
# elite_idxs output of one particular run — confirm it is still an elite
# member after re-sampling.
agent.evaluate(weights_pop[21], gamma=1.0)

-3.351523173804016

In [240]:
# Per-array difference between the averaged elite parameters and candidate 21.
best_weight-weights_pop[21]

array([array([[ 0.09813731,  0.15601263],
       [ 0.72544051, -0.39018466],
       [ 1.12492783, -0.36036814],
       [-0.17594344,  0.27272546],
       [-0.02291312, -0.23619531],
       [-0.66313218, -0.30721579],
       [ 0.43746565,  0.29307826],
       [-0.4971635 ,  0.03438454],
       [ 0.84659517,  0.76442069],
       [-0.03850961,  0.26902303],
       [ 0.02958754,  0.38765538],
       [-0.13086719, -0.37753442],
       [-0.38442353,  0.65246781],
       [-0.01446417,  0.10237782],
       [ 0.0822133 , -0.43648435],
       [-0.17675986,  0.00927517]]),
       array([[ 0.33061632,  0.56700223, -0.20851992, -0.14496004, -0.08518741,
         0.21187735,  0.18381412,  0.16480907,  0.3139015 , -0.657997  ,
         0.45035074,  0.87067443, -0.52884111, -0.56074772, -0.25299005,
        -0.2626033 ]]),
       array([ 0.01750513, -0.39839713, -0.35299685,  0.55293329, -0.16216973,
       -1.00737627, -0.08060279, -0.60388184, -0.22604667,  0.13959995,
        0.21225064, -0.2965577

In [239]:
# Display the raw parameter arrays of candidate 21 (prints every array in full).
weights_pop[21]

array([array([[-1.66843477,  0.00523438],
       [-0.97596543, -1.26506249],
       [-1.64361276,  2.13121824],
       [-2.39649791,  1.71295331],
       [-0.74878221,  0.63322646],
       [ 1.31969003, -1.44515961],
       [ 0.34982547, -0.6357634 ],
       [ 0.4930625 ,  0.14354734],
       [-3.49799067, -0.8796155 ],
       [-0.97634482,  0.53084925],
       [-1.8955993 ,  0.35928997],
       [-1.74462099,  0.09744174],
       [-0.99976082, -0.43186455],
       [ 2.05634493, -0.32473247],
       [ 0.39929847, -0.0238132 ],
       [ 0.36733429, -0.04152168]]),
       array([[-2.87177358, -0.80518133, -0.09384823, -1.5234505 ,  0.18728362,
        -1.43055033, -0.47133305,  0.8082374 , -0.92859169,  1.15138671,
        -1.44374902,  1.84780965,  0.20901451,  1.07447283, -0.51253749,
        -0.69369788]]),
       array([-1.2335802 , -0.9229481 , -0.28314658, -1.074112  ,  0.22399321,
        1.14859592, -2.09604614, -0.76735675,  0.37135673, -2.10138884,
       -0.39109368,  0.9542614