In [1]:
import os
import time
import numpy as np
import gym
from gym import wrappers
from PIL import Image

from ARS.ars import Normalizer, Policy
from ARS.utils import mkdir

In [2]:
# Experiment configuration. Every name below is forwarded to ARSTrainer in the
# cell that builds the agent, so tune the run here rather than in the class.

# Environment and run length.
env_name = 'MountainCarContinuous-v0'
max_episodes, max_timesteps = 30000, 2000   # training iterations / step cap per rollout

# Search hyperparameters.
learning_rate, noise = 0.5, 0.9             # both handed straight to the ARS Policy
num_deltas = 32                             # perturbations sampled per training iteration
num_best_deltas = 32                        # top perturbations kept for the update (<= num_deltas)

# Reproducibility and console logging.
random_seed = 42                            # seeds both the gym env and numpy
log_interval = 10                           # print progress every N training iterations

In [3]:
class ARSTrainer():
    """Train and evaluate an Augmented Random Search (ARS) policy on a gym environment.

    The trainer owns the environment (optionally wrapped in a video-recording
    ``Monitor``), an observation ``Normalizer`` and an ARS ``Policy``. Each
    training iteration samples ``num_deltas`` perturbations, rolls out one
    episode with the perturbation added and one with it subtracted, keeps the
    ``num_best_deltas`` best-scoring perturbations, and asks the policy to
    update itself from those rollouts.

    Args:
        env_name: gym environment id passed to ``gym.make``.
        max_episodes: maximum number of training iterations.
        max_timesteps: step cap for a single rollout.
        learning_rate: forwarded to ``Policy``.
        num_deltas: number of perturbations sampled per iteration.
        num_best_deltas: number of top perturbations used for the update;
            must not exceed ``num_deltas``.
        noise: forwarded to ``Policy``.
        random_seed: if not ``None``, seeds the environment and numpy.
        input_size: policy input size; defaults to the observation dimension.
        output_size: policy output size; defaults to the action dimension.
        normalizer: pre-built normalizer; a fresh ``Normalizer`` is created
            when omitted.
        record_videos: wrap the environment in ``wrappers.Monitor``.
        record_interval: record the evaluation rollout every this many
            iterations.
        log_interval: print progress every this many iterations.
        threshold: average evaluation return (over the last 100 iterations)
            at which training stops; defaults to the environment spec's
            ``reward_threshold``.
    """

    def __init__(self,
                 env_name='BipedalWalker-v2',
                 max_episodes=30000,
                 max_timesteps=2000,
                 learning_rate=0.02,
                 num_deltas=16,
                 num_best_deltas=16,
                 noise=0.03,
                 random_seed=1,
                 input_size=None,
                 output_size=None,
                 normalizer=None,
                 record_videos=True, record_interval=100, log_interval=5, threshold=None):

        self.algorithm_name = 'ars'
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.record_videos = record_videos
        self.record_interval = record_interval
        # Initialised before the Monitor is built so its callback can never
        # observe a missing attribute.
        self.should_record = False
        if self.record_videos:
            videos_dir = mkdir('.', 'videos')
            monitor_dir = mkdir(videos_dir, self.algorithm_name)
            # The Monitor asks this callable whether to record an episode; the
            # training loop flips self.should_record around evaluation rollouts.
            should_record = lambda i: self.should_record
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_low = self.env.action_space.low
        self.action_high = self.env.action_space.high
        if threshold is not None:
            self.threshold = threshold
        else:
            # May itself be None when the environment spec defines no threshold.
            self.threshold = self.env.spec.reward_threshold

        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.learning_rate = learning_rate
        self.num_deltas = num_deltas
        self.num_best_deltas = num_best_deltas
        assert self.num_best_deltas <= self.num_deltas, \
            "num_best_deltas must not exceed num_deltas"
        self.noise = noise
        self.random_seed = random_seed

        self.input_size = input_size or self.state_dim
        self.output_size = output_size or self.action_dim
        self.log_interval = log_interval

        self.rewards_list = []
        self.optimal_policy = None

        self.normalizer = normalizer or Normalizer(self.input_size)
        self.policy = Policy(self.input_size, self.output_size, self.noise, self.learning_rate,
                             self.num_deltas, self.num_best_deltas)

        if random_seed is not None:
            print("Random Seed: {} ".format(self.random_seed))
            self.env.seed(self.random_seed)
            np.random.seed(self.random_seed)

    def explore(self, direction=None, delta=None, clip_reward=True):
        """Run one episode with the (optionally perturbed) policy and return its summed reward.

        Args:
            direction: forwarded to ``Policy.evaluate``; the training loop
                passes ``"+"`` or ``"-"``, and ``None`` for evaluation.
            delta: perturbation forwarded to ``Policy.evaluate``.
            clip_reward: when True (the default, used for the perturbation
                rollouts) every step reward is clipped to ``[-1, 1]`` before
                summing. Pass False to obtain the raw environment return.

        Returns:
            Sum of the (possibly clipped) step rewards; the episode ends when
            the environment reports ``done`` or after ``max_timesteps`` steps.
        """
        state = self.env.reset()
        done = False
        num_plays = 0
        sum_rewards = 0.0
        while not done and num_plays < self.max_timesteps:
            # Running statistics are updated on every visited state, then the
            # state is normalised before being fed to the policy.
            self.normalizer.observe(state)
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)
            state, reward, done, _ = self.env.step(action)
            if clip_reward:
                reward = max(min(reward, 1), -1)
            sum_rewards += reward
            num_plays += 1
        return sum_rewards

    def train(self):
        """Run the ARS training loop until the threshold is reached or episodes run out.

        Every iteration appends ``"<iteration>,<evaluation return>"`` to
        ``train_<algorithm_name>.txt``. The evaluation return is the raw
        (unclipped) environment return so that it is comparable with
        ``self.threshold``, which defaults to the environment's own
        ``reward_threshold``. When the mean of the last 100 evaluation returns
        reaches the threshold, the current policy is stored in
        ``self.optimal_policy`` and training stops.
        """
        start_time = time.time()
        print("Training started ... \n")
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={}".format(self.threshold))
        print("action_low={} action_high={} \n".format(self.action_low, self.action_high))

        # The context manager guarantees the log is closed on every exit path,
        # including a manual interrupt of a long-running notebook cell.
        with open("train_{}.txt".format(self.algorithm_name), "w+") as log_f:
            for episode in range(self.max_episodes):
                # Sample the perturbations and play one episode in each direction.
                deltas = self.policy.sample_deltas()
                positive_rewards = [0] * self.num_deltas
                negative_rewards = [0] * self.num_deltas
                for k in range(self.num_deltas):
                    positive_rewards[k] = self.explore(direction="+", delta=deltas[k])
                    negative_rewards[k] = self.explore(direction="-", delta=deltas[k])

                # Standard deviation over all 2 * num_deltas rollout returns.
                sigma_rewards = np.array(positive_rewards + negative_rewards).std()

                # Rank perturbations by the better of their two returns and
                # keep the top num_best_deltas of them.
                order = sorted(
                    range(self.num_deltas),
                    key=lambda k: max(positive_rewards[k], negative_rewards[k]),
                    reverse=True,
                )[:self.num_best_deltas]
                rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

                self.policy.update(rollouts, sigma_rewards)

                # Only the evaluation rollout is ever recorded, and only every
                # record_interval iterations.
                self.should_record = (episode % self.record_interval == 0)
                # Evaluate with the raw return: the clipped sum cannot be
                # compared against the environment's reward threshold.
                reward_evaluation = self.explore(clip_reward=False)
                self.should_record = False

                self.rewards_list.append(reward_evaluation)
                avg_reward = np.mean(self.rewards_list[-100:])

                log_f.write('{},{}\n'.format(episode, reward_evaluation))
                log_f.flush()

                if episode % self.log_interval == 0:
                    print('Episode: {}  reward: {:6.4f}  avg_reward: {:6.4f}   sigma: {:6.4f}'.format(
                        episode, reward_evaluation, avg_reward, sigma_rewards))

                # Stop once the running average reaches the threshold; with no
                # threshold available, train for the full episode budget.
                if self.threshold is not None and avg_reward >= self.threshold:
                    print("########## Solved! ###########")
                    self.optimal_policy = self.policy
                    training_time = time.time() - start_time
                    print("Training time: {:6.2f} sec".format(training_time))
                    break

    def test(self, direction=None, delta=None, episodes=3, render=True, save_gif=True):
        """Roll out the trained policy and optionally dump every frame as a JPEG.

        Uses ``self.optimal_policy`` when training reached the threshold and
        falls back to the current ``self.policy`` otherwise (for example after
        training was interrupted). The printed reward is the raw environment
        return. The environment is closed once, after the last episode.

        Args:
            direction: forwarded to ``Policy.evaluate``.
            delta: forwarded to ``Policy.evaluate``.
            episodes: number of test episodes to play.
            render: accepted for interface compatibility; currently unused.
            save_gif: when True, frames are written to
                ``./gif/<algorithm_name>/<episode>/<t>.jpg``.
        """
        policy = self.optimal_policy if self.optimal_policy is not None else self.policy

        algdir = None
        if save_gif:
            gifdir = mkdir('.', 'gif')
            algdir = mkdir(gifdir, self.algorithm_name)

        try:
            for episode in range(1, episodes + 1):
                state = self.env.reset()
                epdir = mkdir(algdir, str(episode)) if save_gif else None
                ep_reward = 0.0

                for t in range(self.max_timesteps):
                    self.normalizer.observe(state)
                    state = self.normalizer.normalize(state)
                    action = policy.evaluate(state, delta, direction)
                    state, reward, done, _ = self.env.step(action)
                    ep_reward += reward

                    if save_gif:
                        img = self.env.render(mode='rgb_array')
                        img = Image.fromarray(img)
                        img.save('{}/{}.jpg'.format(epdir, t))
                    if done:
                        break

                print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))
        finally:
            # Close exactly once, after all episodes (or on failure), instead
            # of tearing the environment down between episodes.
            self.env.close()

In [4]:
# Build the trainer from the notebook-level hyperparameters and start training.
# Arguments not listed here (video recording, threshold, ...) keep the
# ARSTrainer defaults.
agent = ARSTrainer(
    env_name,
    max_episodes=max_episodes,
    max_timesteps=max_timesteps,
    learning_rate=learning_rate,
    num_deltas=num_deltas,
    num_best_deltas=num_best_deltas,
    noise=noise,
    random_seed=random_seed,
    log_interval=log_interval,
)
agent.train()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Random Seed: 42 
Training started ... 

action_space=Box(1,)
obs_space=Box(2,)
threshold=90.0
action_low=[-1.] action_high=[1.] 

Episode: 0  reward: -0.2045  avg_reward: -0.2045   sigma: 125.6280
Episode: 10  reward: -1.1656  avg_reward: -0.6261   sigma: 75.4637
Episode: 20  reward: -1.1421  avg_reward: -0.6319   sigma: 2.1912
Episode: 30  reward: -1.1492  avg_reward: -0.6316   sigma: 107.1849
Episode: 40  reward: -3.2531  avg_reward: -1.0077   sigma: 6.2239
Episode: 50  reward: -0.3810  avg_reward: -0.9132   sigma: 2.3816
Episode: 60  reward: -0.0049  avg_reward: -0.9020   sigma: 19.0239
Episode: 70  reward: -1.4769  avg_reward: -0.8622   sigma: 79.7218
Episode: 80  reward: -0.7453  avg_reward: -0.8174   sigma: 18.8226
Episode: 90  reward: -1.9211  avg_reward: -0.7717   s

Episode: 1170  reward: -0.0683  avg_reward: -0.3175   sigma: 1.0326
Episode: 1180  reward: -0.0492  avg_reward: -0.3212   sigma: 0.6623
Episode: 1190  reward: -0.0028  avg_reward: -0.3152   sigma: 0.6742
Episode: 1200  reward: -0.4808  avg_reward: -0.2985   sigma: 104.9186
Episode: 1210  reward: -0.4909  avg_reward: -0.3042   sigma: 0.4679
Episode: 1220  reward: -0.1557  avg_reward: -0.3252   sigma: 1.1021
Episode: 1230  reward: -0.3314  avg_reward: -0.3262   sigma: 31.7650
Episode: 1240  reward: -0.0008  avg_reward: -0.3119   sigma: 0.6518
Episode: 1250  reward: -0.0668  avg_reward: -0.3065   sigma: 11.0517
Episode: 1260  reward: -0.1218  avg_reward: -0.3096   sigma: 0.4136
Episode: 1270  reward: -0.0641  avg_reward: -0.2936   sigma: 0.7323
Episode: 1280  reward: -0.3188  avg_reward: -0.2869   sigma: 7.0541
Episode: 1290  reward: -0.2728  avg_reward: -0.2871   sigma: 0.8259
Episode: 1300  reward: -0.1788  avg_reward: -0.2798   sigma: 0.5671
Episode: 1310  reward: -0.0136  avg_reward: 

KeyboardInterrupt: 

In [None]:
# Roll out the trained agent for three episodes, saving each frame as a JPEG
# (these are the defaults of ARSTrainer.test, spelled out for the reader).
agent.test(episodes=3, render=True, save_gif=True)