In [106]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import heapq

In [75]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

env = gym.make('MountainCarContinuous-v0')

print('observation space:', env.observation_space)
print('action space:', env.action_space)
print('  - low:', env.action_space.low)
print('  - high:', env.action_space.high)
print('Training on {}.'.format(device))

observation space: Box(2,)
action space: Box(1,)
  - low: [-1.]
  - high: [1.]
Training on cpu.




In [76]:
class Agent(nn.Module):
    def __init__(self, env, h_size=16):
        super(Agent, self).__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)
    
    def forward(self,x ):
        x = F.relu(self.fc1(x))
        x = F.tanh(self.fc2(x))
        return x.cpu().data
        
    def populate(self, best_wb, noise=1e-3):
        w_fc1 = best_wb[0] + noise*(np.random.rand(self.h_size, self.s_size))#noise*(np.random.rand(self.s_size, self.h_size))
        w_fc2 = best_wb[1] + noise*(np.random.rand(self.a_size, self.h_size))#noise*(np.random.rand(self.h_size, self.a_size))
        
        b_fc1 = best_wb[2] + noise*np.random.rand(self.h_size)
        b_fc2 = best_wb[3] + noise*np.random.rand(self.a_size)
        return np.array((w_fc1, w_fc2, b_fc1, b_fc2))
    
    def assign_weights(self, new_weights):
        self.fc1.weight.data.copy_(torch.from_numpy(new_weights[0].astype(float)))
        self.fc2.weight.data.copy_(torch.from_numpy(new_weights[1].astype(float)))
        self.fc1.bias.data.copy_(torch.from_numpy(new_weights[2].astype(float)))
        self.fc2.bias.data.copy_(torch.from_numpy(new_weights[3].astype(float)))

In [136]:
def cross_entropy(n_episodes=500, max_t=1000, gamma=1.0, print_every=10, pop_size=10, top_k=3, noise=1e-3):
    scores_deque = deque(maxlen=100)
    scores = []
    best_wb = agent.populate([0,0,0,0], noise=1e-5)
    
    for i_episode in range(1, n_episodes+1):
        episode_rewards = np.zeros((max_t, pop_size))
        candidate_policies = [agent.populate(best_wb, noise) for i in range(pop_size)]
        state_alpha = env.reset()
        
        for p in range(pop_size):
            state = state_alpha
            agent.assign_weights(candidate_policies[p])
        
            for t in range(max_t):
                action = agent.forward(torch.from_numpy(state).float())
                state, reward, done, _ = env.step(action)
                episode_rewards[t,p] = reward
                if done:
                    env.reset()
                    break
                    
        discounts = np.array([gamma**i for i in range(max_t)])
        returns = np.sum(np.array([a*b for a,b in zip(episode_rewards, discounts)]), axis = 0)
        
        best_p = np.argmax(returns)
        top_ps = heapq.nlargest( top_k , range(len(returns)), returns.__getitem__)
        
        top_policies = [candidate_ps[i] for i in top_ps]
        best_parameters = np.mean(top_policies, axis=0)
        
        agent.assign_weights(best_parameters)
        
        scores_deque.append(np.sum(episode_rewards[:,best_p]))
        scores.append(np.sum(episode_rewards[:,best_p]))
        
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) >= 90.0:
            if len(scores_deque) == 100:
                print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-99, np.mean(scores_deque)))
                agent.assign_weights(best_parameters)
                break
            
    return scores, returns

In [141]:
s_size = 2
h_size = 16
a_size = 1

agent=Agent(env).to(device)
scores, returns = cross_entropy(noise=1e-1, top_k=2)


Parameter containing:
tensor([[ 0.2843, -0.3614],
        [ 0.3059,  0.4609],
        [-0.2916,  0.1454],
        [-0.1825,  0.6808],
        [-0.1339,  0.4716],
        [ 0.6329,  0.1993],
        [ 0.4111, -0.6844],
        [ 0.5990,  0.1772],
        [-0.4535, -0.2630],
        [-0.2680, -0.2584],
        [-0.5455, -0.3634],
        [-0.0699,  0.1330],
        [-0.2852,  0.5207],
        [-0.5103, -0.4414],
        [-0.0993, -0.6163],
        [-0.0735,  0.0413]])
Parameter containing:
tensor([-0.2862, -0.0851, -0.1273, -0.4504, -0.1355, -0.6108, -0.5397,
         0.1756,  0.6461, -0.0182,  0.1546,  0.3281,  0.1728, -0.6930,
        -0.2634, -0.0949])
Parameter containing:
tensor([[-0.1489, -0.0494,  0.2436, -0.0175,  0.0619, -0.2255,  0.1341,
          0.0889, -0.1292,  0.0803,  0.0871, -0.1735,  0.2345, -0.1314,
         -0.2450,  0.1375]])
Parameter containing:
tensor(1.00000e-02 *
       [-1.6531])
Episode 10	Average Score: -2.35
Episode 20	Average Score: -2.42
