In [None]:
import os
import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

from TD3_keras.td3 import TD3

# Run-time switches read by the training and evaluation loops below.
args = dict(render=True, log_interval=1)

# Environment plus the bookkeeping shared by the training loop:
# `episodes` caps the number of training episodes and `reward_history`
# collects one total reward per finished episode.
env = gym.make('BipedalWalker-v2')
episodes = 100000
reward_history = []

# Problem description handed to the TD3 constructor.
task = dict(
    state_size=24,
    action_size=4,
    action_high=1,
    action_low=-1,
)
agent = TD3(task)


def main(agent):
    """Train `agent` on the module-level `env`, logging progress per episode.

    Runs at most `episodes` episodes (module-level). Each episode's total
    reward is appended to the module-level `reward_history`. Training stops
    early once the mean reward over the most recent (up to) 100 episodes
    exceeds `env.spec.reward_threshold` and more than 100 episodes have
    been played.

    Reads `args['render']` to decide whether to draw each step and
    `args['log_interval']` to decide how often to print a progress line.

    Args:
        agent: object exposing `act(state, i_episode)`, which must return an
            `(action, noise_coeff)` pair, and
            `step(action, reward, state, done, t)`.
    """
    for i_episode in range(episodes):
        running_reward = 0
        state = env.reset()
        for t in range(10000):  # Don't infinite loop while learning
            action, noise_coeff = agent.act(state, i_episode)
            state, reward, done, _ = env.step(action)
            # `state` has already been rebound to the post-step observation
            # here, so that is what the agent receives.
            agent.step(action, reward, state, done, t)
            if args['render']:
                env.render()
            running_reward += reward
            if done:
                break

        reward_history.append(running_reward)

        # Refresh the rolling average every episode. Previously it was only
        # assigned inside the logging branch, so with log_interval > 1 the
        # "solved" check below compared a stale value.
        avg_reward = np.mean(reward_history[-100:])

        if i_episode % args['log_interval'] == 0:
            # `t` is the zero-based index of the episode's final step.
            print('Episode {}\tLast length: {:5d}\t Reward: {:7.2f}\t Avg Reward: {:7.2f}\t Noise: {:.2f}'.format(
                i_episode, t, running_reward, avg_reward, noise_coeff))
        if avg_reward > env.spec.reward_threshold and i_episode > 100:
            print("Solved! Average 100-episode reward is now {}!".format(avg_reward))
            break
            
# Report the environment's interface and solve threshold, then start training.
for _template, _value in (
    ("action_space={}", env.action_space),
    ("obs_space={}", env.observation_space),
    ("threshold={} \n", env.spec.reward_threshold),
):
    print(_template.format(_value))
main(agent)



Using TensorFlow backend.


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
action_space=Box(4,)
obs_space=Box(24,)
threshold=300 

Episode 0	Last length:    84	 Reward: -101.03	 Avg Reward: -101.03	 Noise: 1.00
Episode 1	Last length:    89	 Reward: -108.50	 Avg Reward: -104.77	 Noise: 1.00
Episode 2	Last length:    46	 Reward: -110.69	 Avg Reward: -106.74	 Noise: 1.00
Episode 3	Last length:    62	 Reward: -117.69	 Avg Reward: -109.48	 Noise: 1.00
Episode 4	Last length:   119	 Reward: -122.91	 Avg Reward: -112.17	 Noise: 1.00
Episode 5	Last length:    81	 Reward: -116.83	 Avg Reward: -112.94	 Noise: 0.99
Episode 6	Last length:    61	 Reward: -113.38	 Avg Reward: -113.01	 Noise: 0.99
Episode 7	Last length:    35	 Reward: -109.67	 Avg Reward: -112.59	 Noise: 0.99
Episode 8	Last length:  1599	 Reward: -132.44	 Avg Reward: -114.79	 Noise: 0.99
Episode 

In [None]:
from PIL import Image
def test(agent, episodes=3, max_timesteps=10000, save_gif=True):
    """Roll out `agent` on the module-level `env` and optionally dump frames.

    Episodes are numbered from 1 to `episodes` inclusive. When rendering is
    enabled (`args['render']`) and `save_gif` is true, every rendered frame
    is written to `./gif/td3_keras/<episode>/<step>.jpg`. The environment is
    closed once, after the last episode (or on error).

    Args:
        agent: object exposing `act(state, i_episode)`, which must return a
            2-tuple whose first element is the action, and
            `step(action, reward, state, done, t)`.
        episodes: number of episodes to run.
        max_timesteps: upper bound on steps per episode.
        save_gif: whether to save each rendered frame as a JPEG.
    """
    render = args['render']
    try:
        # 1-based numbering is kept for the output directory names; the upper
        # bound is inclusive so that `episodes` episodes actually run.
        for i_episode in range(1, episodes + 1):
            running_reward = 0
            state = env.reset()

            if render and save_gif:
                dirname = './gif/td3_keras/{}'.format(i_episode)
                # Create missing parent directories too, once per episode
                # instead of checking on every frame.
                os.makedirs(dirname, exist_ok=True)

            for t in range(max_timesteps):
                # NOTE(review): the episode index is forwarded to act() exactly
                # as in training; if it drives exploration noise, evaluation
                # is noisy for these low indices - confirm against TD3.act.
                action, _ = agent.act(state, i_episode)
                state, reward, done, _ = env.step(action)
                # NOTE(review): step() is still called during evaluation, as
                # in the original; confirm whether it should be skipped here.
                agent.step(action, reward, state, done, t)
                if render:
                    env.render()
                    if save_gif:
                        frame = env.render(mode='rgb_array')
                        Image.fromarray(frame).save('{}/{}.jpg'.format(dirname, t))
                running_reward += reward
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(i_episode, int(running_reward)))
    finally:
        # Close once at the end rather than after every episode, so later
        # episodes do not reset/render an already-closed environment.
        env.close()
                
# Run the rollout loop defined above with the module-level `agent`.
test(agent)