In [106]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import heapq

In [75]:
# Use the first CUDA device when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Environment the agent will be evaluated in.
env = gym.make('MountainCarContinuous-v0')

# Summarise the observation/action spaces, the action bounds and the chosen device.
print('observation space: {}'.format(env.observation_space))
print('action space: {}'.format(env.action_space))
print('  - low: {}'.format(env.action_space.low))
print('  - high: {}'.format(env.action_space.high))
print('Training on {}.'.format(device))

observation space: Box(2,)
action space: Box(1,)
  - low: [-1.]
  - high: [1.]
Training on cpu.




In [149]:
class Agent(nn.Module):
    """Small two-layer policy network whose weights are set from outside.

    The network maps an observation to an action in [-1, 1] (tanh output).
    It is never trained by back-propagation here: candidate weights are
    produced by `populate`, loaded with `assign_weights` and scored by
    running one episode in `evaluate`.

    A weight set is a length-4 sequence laid out as
    ``(fc1.weight, fc2.weight, fc1.bias, fc2.bias)`` with shapes
    ``(h_size, s_size)``, ``(a_size, h_size)``, ``(h_size,)``, ``(a_size,)``.
    """

    def __init__(self, env, h_size=16):
        """Build the network from the environment's space sizes.

        Args:
            env: environment exposing `observation_space.shape`,
                `action_space.shape`, `reset()` and `step(action)`.
            h_size: number of hidden units.
        """
        super(Agent, self).__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)

    def forward(self, x):
        """Return the action for observation `x` as a detached CPU tensor."""
        # No gradients are ever needed: the result is returned detached.
        with torch.no_grad():
            x = F.relu(self.fc1(x))
            # torch.tanh replaces the deprecated F.tanh
            x = torch.tanh(self.fc2(x))
        return x.detach().cpu()

    def populate(self, best_wb, noise):
        """Sample one candidate weight set around `best_wb`.

        Args:
            best_wb: length-4 sequence in the layout described on the class;
                each entry may be an array or a scalar (e.g. ``[0, 0, 0, 0]``).
            noise: scale applied to the standard-normal perturbation.

        Returns:
            1-D NumPy object array of length 4 holding the perturbed arrays.
        """
        # Draw order (fc1 weight, fc2 weight, fc1 bias, fc2 bias) is kept so
        # that seeded runs consume the random stream in the same sequence.
        w_fc1 = best_wb[0] + noise * np.random.randn(self.h_size, self.s_size)
        w_fc2 = best_wb[1] + noise * np.random.randn(self.a_size, self.h_size)
        b_fc1 = best_wb[2] + noise * np.random.randn(self.h_size)
        b_fc2 = best_wb[3] + noise * np.random.randn(self.a_size)

        # The four arrays have different shapes. Fill a pre-allocated object
        # array slot by slot: NumPy refuses to build a ragged array implicitly
        # on recent versions, and would try to merge them when shapes line up.
        candidate = np.empty(4, dtype=object)
        for slot, values in enumerate((w_fc1, w_fc2, b_fc1, b_fc2)):
            candidate[slot] = values
        return candidate

    def assign_weights(self, new_weights):
        """Copy a weight set (layout described on the class) into the layers."""
        targets = (self.fc1.weight, self.fc2.weight, self.fc1.bias, self.fc2.bias)
        for slot, param in enumerate(targets):
            # copy_ converts the float64 source to the parameter's dtype/device
            source = np.asarray(new_weights[slot], dtype=np.float64)
            param.data.copy_(torch.from_numpy(source))

    def evaluate(self, weights, gamma=1.0, max_t=5000):
        """Load `weights`, run one episode and return its discounted return.

        Args:
            weights: weight set in the layout described on the class.
            gamma: discount factor; the reward at step t is scaled by gamma**t.
            max_t: maximum number of environment steps.

        Returns:
            Sum of discounted rewards collected until `done` or `max_t` steps.
        """
        self.assign_weights(weights)
        # Run on whatever device the network itself lives on instead of
        # depending on a module-level `device` variable.
        run_device = self.fc1.weight.device
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            state = torch.from_numpy(state).float().to(run_device)
            # Hand the environment a NumPy array rather than a torch tensor.
            action = self.forward(state).numpy()
            state, reward, done, _ = self.env.step(action)
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return

In [150]:
def cem(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=10, topk=3, noise=0.5):
    """Search for policy weights with a cross-entropy-style population loop.

    Each iteration samples `pop_size` candidates around the current best
    weights, scores each with one episode, and replaces the best weights by
    the element-wise mean of the `topk` highest-scoring candidates. The mean
    is then evaluated once more and that return is recorded.

    Uses the module-level `agent` for sampling and evaluation.

    Args:
        n_iterations: maximum number of iterations.
        max_t: step limit passed to the per-candidate evaluations.
        gamma: discount factor passed to the per-candidate evaluations.
        print_every: print the running average every this many iterations.
        pop_size: number of candidates sampled per iteration.
        topk: number of best candidates averaged into the new weights.
        noise: standard-deviation scale of the sampling perturbation.

    Returns:
        List with the return of the averaged weights after every iteration.

    Raises:
        ValueError: if `topk` is smaller than 1.
    """
    # A slice of [-0:] would silently select the whole population.
    if topk < 1:
        raise ValueError('topk must be at least 1, got {}'.format(topk))

    scores_deque = deque(maxlen=100)  # window for the running average
    scores = []
    # Starting point: a standard-normal draw around zero weights.
    best_weight = agent.populate([0,0,0,0], noise=1)

    for i_iteration in range(1, n_iterations+1):
        weights_pop = [agent.populate(best_weight, noise) for _ in range(pop_size)]
        rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop])

        # argsort is ascending, so the last `topk` indices are the elite.
        elite_idxs = rewards.argsort()[-topk:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        best_weight = np.array(elite_weights).mean(axis=0)

        # Score the averaged weights undiscounted, with evaluate's default step limit.
        reward = agent.evaluate(best_weight, gamma=1.0)
        scores_deque.append(reward)
        scores.append(reward)

        if i_iteration % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque)))

        # NOTE(review): the message reports i_iteration-100, which is negative
        # when the threshold is reached in fewer than 100 iterations.
        if np.mean(scores_deque)>=90.0:
            print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration-100, np.mean(scores_deque)))
            break
    return scores

In [151]:
# Network dimensions for this task. Agent derives its state/action sizes from
# `env` and defaults to 16 hidden units, so these are not read by it.
s_size = 2
h_size = 16
a_size = 1

agent=Agent(env).to(device)

# Show the freshly initialised parameters before the search overwrites them.
for w in agent.parameters():
    print(w)

# The search routine defined in this notebook is `cem`: it takes `topk`
# (not `top_k`) and returns a single list of scores. The previous call to
# `cross_entropy` only worked with a function left over in the kernel.
scores = cem(noise=0.5, topk=3, pop_size=50)


Parameter containing:
tensor([[-0.5435,  0.4337],
        [ 0.1759, -0.1078],
        [-0.6787,  0.5970],
        [-0.5624,  0.6658],
        [ 0.0637, -0.5016],
        [-0.6866,  0.5928],
        [ 0.4005,  0.0614],
        [ 0.5549,  0.4862],
        [-0.5914,  0.0002],
        [ 0.0211,  0.5650],
        [ 0.4424, -0.0828],
        [-0.0907, -0.4006],
        [ 0.5319, -0.6775],
        [-0.2055,  0.4269],
        [ 0.2030, -0.2235],
        [ 0.3039,  0.5185]])
Parameter containing:
tensor([-0.0614,  0.6316,  0.0161, -0.1616, -0.1852, -0.3757,  0.3215,
        -0.5533,  0.3579, -0.5142,  0.0057, -0.6440, -0.2505, -0.2055,
        -0.4169,  0.4777])
Parameter containing:
tensor([[-0.1825, -0.0013, -0.2164, -0.2430,  0.2280,  0.1424, -0.1250,
          0.0698,  0.1644, -0.2071,  0.0605,  0.1910,  0.1102,  0.0124,
          0.1594, -0.2184]])
Parameter containing:
tensor(1.00000e-02 *
       [-8.6033])
Episode 10	Average Score: -0.00
Episode 20	Average Score: -0.00
Episode 30	Average

KeyboardInterrupt: 

In [152]:
# Print every parameter tensor of `agent` again. Evaluating candidates copies
# weights into the layers in place, so these values reflect the most recently
# evaluated weight set rather than the initial values.
for w in agent.parameters():
    print(w)

Parameter containing:
tensor([[ 0.0614,  0.1692],
        [-0.8823,  1.5956],
        [-0.0972,  0.0391],
        [-0.5868,  0.2889],
        [ 0.7220,  0.3663],
        [-0.2734,  0.4492],
        [ 0.3824,  0.5011],
        [-0.9493, -0.3422],
        [ 0.4634, -0.0633],
        [ 0.8418,  0.1666],
        [ 0.4793,  0.5779],
        [-0.5331,  0.0238],
        [ 0.2947, -1.0668],
        [-0.3186,  0.0386],
        [-0.0201,  0.1045],
        [ 1.4893,  1.0009]])
Parameter containing:
tensor([-0.7365,  0.1482,  0.1981, -0.0267, -0.0667,  0.8111, -0.6171,
        -0.4651, -0.0031, -0.7924,  0.7831, -1.1438, -0.4139,  0.5018,
        -0.1284, -0.2525])
Parameter containing:
tensor([[ 0.1210, -0.6038,  0.7100,  1.0209,  0.8563,  0.7840,  0.1620,
          0.0663,  0.4519,  0.3011, -0.0663, -0.3929,  0.5400, -0.0023,
          0.0354,  0.0111]])
Parameter containing:
tensor(1.00000e-02 *
       [ 2.6186])
