In [1]:
# import Libraries
import os
import numpy as np
import gym
from gym import wrappers
import pybullet_envs

In [8]:
# Setting the Hyper Parameters

class Hp():
    """Container for the hyperparameters of the training run.

    Every value can be overridden by keyword; calling ``Hp()`` with no
    arguments reproduces the configuration originally hard-coded here.

    Attributes:
        nb_steps: number of policy-update iterations performed by ``train``.
        episode_length: upper bound on environment steps per ``explore`` call.
        learning_rate: scale factor applied to the update step in ``Policy.update``.
        nb_directions: number of perturbation matrices sampled per iteration.
        nb_best_directions: number of top-scoring directions kept for the update;
            must not exceed ``nb_directions``.
        noise: scale applied to a perturbation before it is added to / subtracted
            from the policy weights.
        seed: value passed to ``np.random.seed`` by the driver code.
        env_name: environment id passed to ``gym.make`` by the driver code.

    Raises:
        ValueError: if ``nb_best_directions`` is greater than ``nb_directions``.
    """

    def __init__(self,
                 nb_steps = 1000,
                 episode_length = 1000,
                 learning_rate = 0.02,
                 nb_directions = 16,
                 nb_best_directions = 16,
                 noise = 0.03,
                 seed = 1,
                 env_name = 'HalfCheetahBulletEnv-v0'):
        # Explicit exception instead of `assert`, so the check survives `python -O`.
        if nb_best_directions > nb_directions:
            raise ValueError(
                'nb_best_directions (%s) must be <= nb_directions (%s)'
                % (nb_best_directions, nb_directions))
        self.nb_steps = nb_steps
        self.episode_length = episode_length
        self.learning_rate = learning_rate
        self.nb_directions = nb_directions
        self.nb_best_directions = nb_best_directions
        self.noise = noise
        self.seed = seed
        self.env_name = env_name

In [9]:
# Normalizing the states (For performance purposes)

class Normalizer():
    """Running per-component mean and variance used to standardize states.

    Statistics are updated incrementally, one observation at a time, via
    ``observe``; ``normalize`` then rescales an input with the current
    mean and standard deviation.
    """

    def __init__(self, nb_inputs):
        """Create zero-initialized statistics for vectors of length ``nb_inputs``."""
        self.n = np.zeros(nb_inputs)          # observation counter (kept as an array)
        self.mean = np.zeros(nb_inputs)       # running mean
        self.mean_diff = np.zeros(nb_inputs)  # running sum of squared deviations
        self.var = np.zeros(nb_inputs)        # running variance, floored in observe()

    def observe(self, x):
        """Fold the observation ``x`` into the running statistics."""
        self.n += 1.
        previous_mean = self.mean.copy()
        offset_from_previous = x - previous_mean
        self.mean += offset_from_previous / self.n
        self.mean_diff += offset_from_previous * (x - self.mean)
        raw_variance = self.mean_diff / self.n
        # Floor the variance so the later division by sqrt(var) stays bounded.
        self.var = raw_variance.clip(min = 1e-2)

    def normalize(self, inputs):
        """Return ``inputs`` centred on the running mean and divided by the running std."""
        return (inputs - self.mean) / np.sqrt(self.var)


In [10]:
# Building the AI

class Policy():
    """Linear policy: the action is ``theta.dot(state)``.

    The methods read the module-level ``hp`` object (``hp.noise``,
    ``hp.nb_directions``, ``hp.nb_best_directions``, ``hp.learning_rate``),
    so ``hp`` must be defined before any of them is called.
    """

    def __init__(self, input_size, output_size):
        """Start from an all-zero weight matrix of shape (output_size, input_size)."""
        self.theta = np.zeros((output_size, input_size))

    def evaluate(self, input, delta = None, direction = None):
        """Compute the action for ``input``.

        Args:
            input: state vector multiplied by the weight matrix.
            delta: perturbation with the same shape as ``theta``; only used
                when ``direction`` is not None.
            direction: None to use the unperturbed weights, "positive" to use
                ``theta + hp.noise * delta``; any other value uses
                ``theta - hp.noise * delta``.

        Returns:
            The result of the matrix-vector product.
        """
        if direction is None:
            return self.theta.dot(input)
        elif direction == "positive":
            return (self.theta + hp.noise*delta).dot(input)
        else:
            return (self.theta - hp.noise*delta).dot(input)

    def sample_deltas(self):
        """Return ``hp.nb_directions`` standard-normal matrices shaped like ``theta``."""
        return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]

    def update(self, rollouts, sigma_r):
        """Move ``theta`` along the reward-weighted sum of the given perturbations.

        Args:
            rollouts: iterable of ``(r_pos, r_neg, delta)`` triples.
            sigma_r: standard deviation of the collected rewards, used to
                scale the step.

        When ``sigma_r`` is not strictly positive (zero or NaN) the update is
        skipped: dividing by it would either raise ZeroDivisionError or write
        inf/NaN into ``theta``. A zero spread means every reward was identical,
        so every ``r_pos - r_neg`` term is zero anyway.
        """
        if not sigma_r > 0:
            return
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, d in rollouts:
            step += (r_pos - r_neg) * d
        self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step


In [11]:
# Exploring the policy on one specific direction and over one episode

def explore(env, normalizer, policy, direction = None, delta = None):
    """Play one episode with the (optionally perturbed) policy.

    Each state is first fed to ``normalizer.observe`` and then normalized
    before being handed to ``policy.evaluate`` together with ``delta`` and
    ``direction``. The episode stops when the environment reports ``done``
    or after ``hp.episode_length`` steps (``hp`` is the module-level
    hyperparameter object).

    Returns:
        The sum of the per-step rewards, each clipped to the range [-1, 1].
    """
    observation = env.reset()
    episode_over = False
    steps_taken = 0.
    total_reward = 0
    while True:
        if episode_over or not (steps_taken < hp.episode_length):
            break
        normalizer.observe(observation)
        scaled_observation = normalizer.normalize(observation)
        action = policy.evaluate(scaled_observation, delta, direction)
        observation, reward, episode_over, _ = env.step(action)
        # Clip the reward so a single outlier step cannot dominate the total.
        capped = min(reward, 1)
        total_reward += max(capped, -1)
        steps_taken += 1
    return total_reward


In [12]:
# Training the AI

def train(env, policy, normalizer, hp):
    """Run ``hp.nb_steps`` iterations of perturb / evaluate / update.

    Per iteration: sample ``hp.nb_directions`` perturbations, roll out every
    positive perturbation and then every negative one with ``explore``,
    keep the ``hp.nb_best_directions`` directions whose better rollout scored
    highest, pass them to ``policy.update`` together with the standard
    deviation of all collected rewards, and finally print the reward of one
    rollout of the updated, unperturbed policy.
    """
    for step in range(hp.nb_steps):
        deltas = policy.sample_deltas()
        direction_ids = range(hp.nb_directions)

        # All positive rollouts are run before any negative one.
        positive_rewards = [
            explore(env, normalizer, policy, direction = "positive", delta = deltas[k])
            for k in direction_ids
        ]
        negative_rewards = [
            explore(env, normalizer, policy, direction = "negative", delta = deltas[k])
            for k in direction_ids
        ]

        # Spread of every reward collected this iteration (both signs).
        sigma_r = np.array(positive_rewards + negative_rewards).std()

        # Rank directions by the better of their two rollouts, best first.
        ranked = sorted(
            direction_ids,
            key = lambda k: max(positive_rewards[k], negative_rewards[k]),
            reverse = True,
        )
        rollouts = [
            (positive_rewards[k], negative_rewards[k], deltas[k])
            for k in ranked[:hp.nb_best_directions]
        ]

        policy.update(rollouts, sigma_r)

        # Report how the updated policy performs without any perturbation.
        reward_evaluation = explore(env, normalizer, policy)
        print('Step:', step, 'Reward:', reward_evaluation)


In [13]:
# Running the main code

def mkdir(base, name):
    """Ensure the directory ``base/name`` exists and return its path.

    Missing parent directories are created as well. ``exist_ok=True`` makes
    the call idempotent and avoids the race between a separate existence
    check and the creation.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok = True)
    return path
# Output directories: exp/brs for the experiment, exp/brs/monitor for the Monitor wrapper.
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')

# `hp` must be a module-level name: Policy's methods and explore() read it as a
# global instead of receiving it as an argument.
hp = Hp()
# Seeds NumPy's global RNG, which Policy.sample_deltas draws from via np.random.randn.
np.random.seed(hp.seed)
env = gym.make(hp.env_name)
# Wrap the environment so its episodes are recorded under monitor_dir.
# NOTE(review): force = True presumably overwrites earlier monitor files — confirm against the gym docs.
env = wrappers.Monitor(env, monitor_dir, force = True)
# Policy and normalizer are sized from the first dimension of the observation/action spaces.
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]
policy = Policy(nb_inputs, nb_outputs)
normalizer = Normalizer(nb_inputs)
# Runs hp.nb_steps training iterations and prints one "Step: ... Reward: ..." line per iteration.
train(env, policy, normalizer, hp)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
WalkerBase::__init__ start
[33mWARN: Environment '<class 'pybullet_envs.gym_locomotion_envs.HalfCheetahBulletEnv'>' has deprecated methods '_step' and '_reset' rather than 'step' and 'reset'. Compatibility code invoked. Set _gym_disable_underscore_compat = True to disable this behavior.[0m
Step: 0 Reward: -960.8180044257189
Step: 1 Reward: -962.0348783831652
Step: 2 Reward: -937.6406633444094
Step: 3 Reward: -949.1764930916991
Step: 4 Reward: -955.7390501904398
Step: 5 Reward: -955.808016672099
Step: 6 Reward: -926.2073772029693
Step: 7 Reward: -864.2011214432

Step: 209 Reward: 706.3682150889264
Step: 210 Reward: 752.8612180297097
Step: 211 Reward: 733.6733094857133
Step: 212 Reward: 716.3082683514871
Step: 213 Reward: 726.1085516194314
Step: 214 Reward: 742.2171248481856
Step: 215 Reward: 676.8339827116222
Step: 216 Reward: 727.7194846118008
Step: 217 Reward: 719.6967204870238
Step: 218 Reward: 724.4401977313242
Step: 219 Reward: 745.5421080333215
Step: 220 Reward: 750.2864298217526
Step: 221 Reward: 736.0997717049994
Step: 222 Reward: 739.7398869748121
Step: 223 Reward: 730.5783703331638
Step: 224 Reward: 707.0237887255944
Step: 225 Reward: 714.2254422161164
Step: 226 Reward: 715.3195480736371
Step: 227 Reward: 757.5189712767692
Step: 228 Reward: 693.1450395549165
Step: 229 Reward: 752.0670489457957
Step: 230 Reward: 727.1836687231227
Step: 231 Reward: 763.5712729621346
Step: 232 Reward: 743.9027310462195
Step: 233 Reward: 741.2033332863709
Step: 234 Reward: 767.1726852467574
Step: 235 Reward: 768.873179026524
Step: 236 Reward: 762.4766129

Step: 438 Reward: 838.231642961429
Step: 439 Reward: 833.4779411032229
Step: 440 Reward: 832.9031341572419
Step: 441 Reward: 833.0448591275403
Step: 442 Reward: 852.1477890660938
Step: 443 Reward: 841.6194163555368
Step: 444 Reward: 825.4490088456471
Step: 445 Reward: 840.1885107209174
Step: 446 Reward: 836.9912299686495
Step: 447 Reward: 872.8707432484833
Step: 448 Reward: 853.4927926086401
Step: 449 Reward: 856.5655160358614
Step: 450 Reward: 844.6913207675872
Step: 451 Reward: 860.1607995625614
Step: 452 Reward: 843.5900216404309
Step: 453 Reward: 864.5213206992619
Step: 454 Reward: 867.5028243435697
Step: 455 Reward: 850.492264944266
Step: 456 Reward: 852.1664418083558
Step: 457 Reward: 859.8709726970388
Step: 458 Reward: 873.1622658363539
Step: 459 Reward: 852.9878338679827
Step: 460 Reward: 861.9455485213456
Step: 461 Reward: 856.7026095410171
Step: 462 Reward: 872.5271544428465
Step: 463 Reward: 849.3746127343957
Step: 464 Reward: 857.6105376150884
Step: 465 Reward: 875.14967600

Step: 667 Reward: 951.316315855763
Step: 668 Reward: 960.7496831406487
Step: 669 Reward: 952.3717288233794
Step: 670 Reward: 951.0767313556947
Step: 671 Reward: 960.7783791368108
Step: 672 Reward: 961.0708189889283
Step: 673 Reward: 949.9489775243535
Step: 674 Reward: 968.2836354330016
Step: 675 Reward: 958.2432195630871
Step: 676 Reward: 964.5188112282653
Step: 677 Reward: 965.8516710901121
Step: 678 Reward: 961.2648202782335
Step: 679 Reward: 958.1441394434376
Step: 680 Reward: 963.155480813735
Step: 681 Reward: 953.6709563290146
Step: 682 Reward: 962.4266191473099
Step: 683 Reward: 962.8659244246672
Step: 684 Reward: 959.7242041857827
Step: 685 Reward: 968.5646884639376
Step: 686 Reward: 957.7207434984257
Step: 687 Reward: 957.9847400435066
Step: 688 Reward: 968.768329949523
Step: 689 Reward: 960.2954648158416
Step: 690 Reward: 966.7430179893455
Step: 691 Reward: 965.7523913394949
Step: 692 Reward: 955.4547352060587
Step: 693 Reward: 963.2560000731196
Step: 694 Reward: 961.754681709

Step: 896 Reward: 973.858075991601
Step: 897 Reward: 978.057925137345
Step: 898 Reward: 978.1776275500758
Step: 899 Reward: 979.9995527874108
Step: 900 Reward: 976.3842131648915
Step: 901 Reward: 979.7291649630748
Step: 902 Reward: 966.4675005818166
Step: 903 Reward: 979.6861863021362
Step: 904 Reward: 974.1585762089011
Step: 905 Reward: 973.830771354839
Step: 906 Reward: 978.1281220577055
Step: 907 Reward: 979.4331875016298
Step: 908 Reward: 976.9464628675653
Step: 909 Reward: 972.7354227389753
Step: 910 Reward: 975.1810639624263
Step: 911 Reward: 983.1357148507175
Step: 912 Reward: 980.2017661596233
Step: 913 Reward: 977.3076745060905
Step: 914 Reward: 982.7565119099054
Step: 915 Reward: 977.9015968833255
Step: 916 Reward: 975.276620278174
Step: 917 Reward: 980.5247522051327
Step: 918 Reward: 979.8138734848096
Step: 919 Reward: 978.1698426192529
Step: 920 Reward: 975.3538683259001
Step: 921 Reward: 984.117496522782
Step: 922 Reward: 980.90204358074
Step: 923 Reward: 976.8595777589866