In [None]:
import torch
import gym
import numpy as np
from TD3 import TD3
from PIL import Image
from utils import ReplayBuffer

# ---- Environment / run identity ----
env_name = 'BipedalWalker-v2'
random_seed = 0             # falsy value (0) means train() skips seeding entirely
episodes = 100000           # NOTE(review): not read by train() or test() in the visible cells

# ---- TD3 hyperparameters (all forwarded to policy.update in train()) ----
gamma = 0.99                # discount factor applied to future rewards
batch_size = 100            # transitions drawn from the replay buffer per update
polyak = 0.995              # target-network update coefficient (1 - tau)
policy_noise = 0.2          # noise used for target policy smoothing
noise_clip = 0.5            # handed to policy.update alongside policy_noise
policy_delay = 2            # delayed policy update parameter

# ---- Exploration ----
exploration_noise = 0.1     # std-dev of the Gaussian noise added to each action in train()

# ---- Training schedule / console logging ----
max_episodes = 1500         # upper bound on training episodes
max_timesteps = 2000        # upper bound on steps within a single episode
log_interval = 1            # print the running average every this many episodes

# ---- Where train() stores checkpoints ----
directory = "./preTrained/%s" % env_name
filename = "TD3_%s_%s" % (env_name, random_seed)

# Per-episode total rewards; train() appends to this list.
reward_history = list()


def train():
    """Train a TD3 agent on ``env_name`` and record per-episode rewards.

    All settings are read from the module-level configuration
    (``env_name``, ``max_episodes``, ``max_timesteps``, ``gamma``, ...).

    Side effects:
        * writes ``"<episode>,<reward>"`` lines to ``log.txt`` (truncated on start),
        * appends every episode reward to the module-level ``reward_history``,
        * saves checkpoints through ``policy.save`` into ``directory``
          (every episode after episode 500, plus a ``*_solved`` checkpoint
          when the running average reaches 300),
        * prints progress to stdout.

    Training stops early once the mean reward over the last (up to) 100
    episodes of *this* run is >= 300.
    """
    import os  # local import: only used to make sure the checkpoint directory exists

    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        print("action_space={}".format(env.action_space))
        print("obs_space={}".format(env.observation_space))
        print("threshold={} \n".format(env.spec.reward_threshold))

        # Seed BEFORE the policy is constructed so that any random
        # initialisation done inside TD3 (not visible here) is covered too.
        # A falsy seed (0 / None) leaves every RNG unseeded, as before.
        if random_seed:
            print("Random Seed: {}".format(random_seed))
            env.seed(random_seed)
            torch.manual_seed(random_seed)
            np.random.seed(random_seed)

        policy = TD3(state_dim, action_dim, max_action)
        replay_buffer = ReplayBuffer()

        # policy.save() is given this directory; make sure it exists up front
        # instead of failing hundreds of episodes into the run.
        os.makedirs(directory, exist_ok=True)

        # Rewards of this run only. The global reward_history may still hold
        # entries from an earlier call in the same kernel, which must not leak
        # into the running average / stop criterion of a fresh policy.
        run_rewards = []

        # The context manager closes the log on every exit path
        # (solved, episode budget exhausted, or an exception).
        with open("log.txt", "w+") as log_f:
            # training procedure:
            for episode in range(1, max_episodes+1):
                ep_reward = 0
                state = env.reset()
                for t in range(max_timesteps):
                    # select action and add exploration noise:
                    action = policy.select_action(state)
                    action = action + np.random.normal(0, exploration_noise, size=action_dim)
                    action = action.clip(env.action_space.low, env.action_space.high)

                    # take action in env:
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add((state, action, reward, next_state, float(done)))
                    state = next_state

                    ep_reward += reward

                    # if episode is done then update policy
                    # (t, the index of the last step, is passed as the second argument):
                    if done or t==(max_timesteps-1):
                        policy.update(replay_buffer, t, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay)
                        break

                run_rewards.append(ep_reward)
                reward_history.append(ep_reward)
                avg_reward = np.mean(run_rewards[-100:])

                # logging updates:
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                # if avg reward >= 300 then save and stop training:
                if avg_reward >= 300:
                    print("########## Solved! ###########")
                    name = filename + '_solved'
                    policy.save(directory, name)
                    break

                if episode > 500:
                    policy.save(directory, filename)

                # print avg reward every log interval:
                if episode % log_interval == 0:
                    print("Episode: {}\tReward: {}\tAverage Reward: {}".format(episode, ep_reward, avg_reward))
    finally:
        # Release the environment even if training is interrupted.
        env.close()



# Start training; this runs as soon as the cell is executed.
train()








[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
action_space=Box(4,)
obs_space=Box(24,)
threshold=300 

Episode: 1	Reward: -117.38364837284166	Average Reward: -117.38364837284166
Episode: 2	Reward: -126.15036346807246	Average Reward: -121.76700592045705
Episode: 3	Reward: -159.2917101232462	Average Reward: -134.27524065472008
Episode: 4	Reward: -134.5799279937211	Average Reward: -134.35141248947033
Episode: 5	Reward: -107.80148504645553	Average Reward: -129.04142700086737
Episode: 6	Reward: -106.55710944241395	Average Reward: -125.29404074112513
Episode: 7	Reward: -123.92387453411007	Average Reward: -125.09830271155155
Episode: 8	Reward: -107.63605870294455	Average Reward: -122.9155222104757
Episode: 9	Reward: -121.90678385961833	Average Reward: -122.80344017149156
Episode: 10	Reward: -100.34347067362197	Average Reward: 

Episode: 106	Reward: -108.7059167199559	Average Reward: -113.92556948596885
Episode: 107	Reward: -110.17181592048047	Average Reward: -113.78804889983259
Episode: 108	Reward: -107.56271374501422	Average Reward: -113.78731545025329
Episode: 109	Reward: -91.26323039170036	Average Reward: -113.48087991557408
Episode: 110	Reward: -130.6876588873643	Average Reward: -113.78432179771148
Episode: 111	Reward: -92.8741092530532	Average Reward: -113.70341865289257
Episode: 112	Reward: -125.7380270756353	Average Reward: -113.95349700663247
Episode: 113	Reward: -119.74557199535198	Average Reward: -114.13654185355868
Episode: 114	Reward: -123.74916249418882	Average Reward: -114.35553161844629
Episode: 115	Reward: -112.146923895015	Average Reward: -114.46654659184526
Episode: 116	Reward: -147.0620669034024	Average Reward: -114.93021479963915
Episode: 117	Reward: -133.24938323745044	Average Reward: -114.90410833709919
Episode: 118	Reward: -141.46073135089046	Average Reward: -115.26309611262833
Episode:

Episode: 214	Reward: -143.0680615006412	Average Reward: -112.75274456106972
Episode: 215	Reward: -96.29601060889941	Average Reward: -112.59423542820858
Episode: 216	Reward: -106.81813395882389	Average Reward: -112.19179609876281
Episode: 217	Reward: -112.32354242590125	Average Reward: -111.98253769064733
Episode: 218	Reward: -79.31342834594489	Average Reward: -111.36106466059788
Episode: 219	Reward: -96.4996510107235	Average Reward: -111.23880357539922
Episode: 220	Reward: -97.17802140467847	Average Reward: -110.78326436471271
Episode: 221	Reward: -112.82714751141644	Average Reward: -110.8397549365869
Episode: 222	Reward: -107.01563296387118	Average Reward: -110.93578959764102
Episode: 223	Reward: -99.49317347766414	Average Reward: -110.96640688171489
Episode: 224	Reward: -67.65176933771633	Average Reward: -110.5672066181252
Episode: 225	Reward: -39.468847207414946	Average Reward: -109.89603596854347
Episode: 226	Reward: -114.4823157763033	Average Reward: -110.00645364712813
Episode: 2

In [None]:

def test(env_name="BipedalWalker-v2", random_seed=0, n_episodes=3,
         max_timesteps=2000, render=True, save_gif=False):
    """Roll out a saved TD3 actor and print the total reward of each episode.

    The actor is loaded via ``policy.load_actor`` from
    ``./preTrained/<env_name>/ONE`` using the name
    ``TD3_<env_name>_<random_seed>_solved``.

    Args:
        env_name: gym environment id to create.
        random_seed: only used to build the checkpoint file name; no RNG is
            seeded here.
        n_episodes: number of evaluation episodes to run.
        max_timesteps: step cap per episode.
        render: if true, call ``env.render()`` after every step.
        save_gif: if true (and ``render`` is true), additionally save each
            frame as ``./gif/<t>.jpg``.

    All defaults reproduce the previously hard-coded values, so ``test()``
    behaves as before.
    """
    import os  # local import: only needed to create the frame directory

    filename = "TD3_{}_{}".format(env_name, random_seed)
    filename += '_solved'
    directory = "./preTrained/{}/ONE".format(env_name)

    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        policy = TD3(state_dim, action_dim, max_action)

        policy.load_actor(directory, filename)

        if render and save_gif:
            # img.save() below does not create missing directories.
            os.makedirs('./gif', exist_ok=True)

        for ep in range(1, n_episodes+1):
            ep_reward = 0
            state = env.reset()
            for t in range(max_timesteps):
                # No exploration noise here: the action comes straight from the policy.
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        # NOTE: frames are named by step index only, so each
                        # episode overwrites the frames of the previous one.
                        img = env.render(mode = 'rgb_array')
                        img = Image.fromarray(img)
                        img.save('./gif/{}.jpg'.format(t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close once, after ALL episodes (previously this sat inside the
        # episode loop and closed the env while it was still in use).
        env.close()
                
# Run the evaluation roll-outs; this executes as soon as the cell is run.
test()
    
    