In [None]:
import os
import torch
import gym
import numpy as np
from TD3_torch.TD3 import TD3
from PIL import Image
from TD3_torch.utils import ReplayBuffer

# ---------------------------------------------------------------------------
# Experiment configuration (read as module-level globals by train()/test()).
# ---------------------------------------------------------------------------

# Environment and run identity
env_name = 'BipedalWalkerHardcore-v2'
random_seed = 234

# Run length
max_episodes = 100000         # upper bound on the number of training episodes
max_timesteps = 3000          # upper bound on the steps taken in one episode
log_interval = 10             # a progress line is printed every this many episodes

# Optimisation
learning_rate_base = 1e-4     # starting learning rate handed to TD3
gamma = 0.99                  # discount factor for future rewards
batch_size = 512              # transitions sampled from the replay buffer per update
max_buffer_length = 2000000   # replay buffer capacity

# TD3-specific knobs
exploration_noise = 0.3       # std-dev of the Gaussian noise added to actions while training
policy_noise = 0.2            # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2              # delayed policy updates parameter
polyak_int = [0.9999, 0.999999]   # target policy update parameter (1-tau), as an interval

# Checkpoint location: <directory>/<filename>
directory = "./preTrained/td3_torch/{}".format(env_name)
filename = "TD3_torch_{}_{}".format(env_name, random_seed)

# Per-episode returns; train() appends to this list.
reward_history = []



def train():
    """Train a TD3 agent on ``env_name`` until solved or ``max_episodes`` is reached.

    Reads its configuration from the module-level globals (``env_name``,
    ``learning_rate_base``, ``batch_size``, ...). Per episode it:

    1. rolls out the current policy with Gaussian exploration noise, storing
       every transition in the replay buffer;
    2. calls ``policy.update`` once, when the episode ends;
    3. appends the episode return to the global ``reward_history`` and writes
       ``episode,ep_reward`` to ``log.txt``;
    4. rescales the learning rate from the mean return of the last 100
       entries of ``reward_history``.

    Training stops early, saving ``<filename>_solved``, once that mean reaches
    ``env.spec.reward_threshold``. After episode 500 a checkpoint named
    ``filename`` is written at the end of every episode.

    Side effects: truncates and writes ``log.txt`` in the working directory,
    creates ``directory`` if missing, and mutates the global ``reward_history``
    (which is not reset, so a second call in the same kernel continues from
    the previous history).
    """
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    # polyak is held fixed at the lower end of polyak_int; polyak_int[1] is
    # not used by this function.
    polyak = polyak_int[0]

    print("action_space={}".format(env.action_space))
    print("obs_space={}".format(env.observation_space))
    print("threshold={} \n".format(env.spec.reward_threshold))

    # Seed *before* the agent is constructed so that any random
    # initialisation performed inside TD3(...) is covered by the seed.
    # A falsy seed (0 / None) leaves every RNG unseeded.
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    policy = TD3(state_dim, action_dim, max_action, lr=learning_rate_base)
    replay_buffer = ReplayBuffer(max_length=max_buffer_length)

    # Make sure the checkpoint directory exists before the first policy.save().
    os.makedirs(directory, exist_ok=True)

    try:
        # The context manager closes the log on every exit path (solved,
        # episode budget exhausted, or an exception / keyboard interrupt).
        with open("log.txt", "w+") as log_f:
            # training procedure:
            for episode in range(1, max_episodes+1):
                ep_reward = 0
                state = env.reset()

                for t in range(max_timesteps):
                    # select action and add exploration noise:
                    action = policy.select_action(state)
                    action = action + np.random.normal(0, exploration_noise, size=action_dim)
                    action = action.clip(env.action_space.low, env.action_space.high)

                    # take action in env:
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add((state, action, reward, next_state, float(done)))
                    state = next_state

                    ep_reward += reward

                    # update the policy once, at the end of the episode; the
                    # index of the last step is passed as the second argument
                    # (presumably the number of update iterations - confirm
                    # against TD3.update).
                    if done or t==(max_timesteps-1):
                        policy.update(replay_buffer, t, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay)
                        break

                reward_history.append(ep_reward)
                avg_reward = np.mean(reward_history[-100:])

                # logging updates:
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                # stop once the running average reaches the environment's
                # reward threshold, saving the solved model first:
                if avg_reward >= env.spec.reward_threshold:
                    print("########## Solved! ###########")
                    name = filename + '_solved'
                    policy.save(directory, name)
                    break

                # Learning-rate schedule: `part` is the remaining distance to
                # the threshold, normalised by (threshold + 150) and capped at
                # 1. part == 1 keeps the base LR; as part approaches 0 the LR
                # falls towards 10% of the base value.
                part = (env.spec.reward_threshold - avg_reward) / (env.spec.reward_threshold + 150)
                if part > 1:
                    part = 1
                learning_rate = learning_rate_base - learning_rate_base * (1 - part) * 0.9
                policy.set_optimizers(lr=learning_rate)

                # periodic checkpoint (every episode once past episode 500):
                if episode > 500:
                    policy.save(directory, filename)

                # print avg reward every log interval:
                if episode % log_interval == 0:
                    print("Ep: {}   Rew: {:3.2f}   Avg Rew: {:3.2f}   LR: {:8.8f}   Polyak: {:6.6f}   Bf: {:2.0f}   Loss: {:5.3f}  {:5.3f}  {:5.3f}".format(
                        episode, ep_reward, avg_reward, learning_rate, polyak, replay_buffer.get_fill(), policy.actor_loss, policy.loss_Q1, policy.loss_Q2))
    finally:
        # Release the environment even if training is interrupted.
        env.close()

# Start training. This rewrites log.txt in the working directory and runs
# until the reward threshold is reached or max_episodes have been played.
train()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
action_space=Box(4,)
obs_space=Box(24,)
threshold=300 

Random Seed: 234
Ep: 10   Rew: -107.11   Avg Rew: -120.00   LR: 0.00009400   Polyak: 0.999900   Bf:  0   Loss: -1.852  0.807  1.389
Ep: 20   Rew: -113.38   Avg Rew: -115.61   LR: 0.00009312   Polyak: 0.999900   Bf:  0   Loss: -1.594  1.927  1.811
Ep: 30   Rew: -111.10   Avg Rew: -114.20   LR: 0.00009284   Polyak: 0.999900   Bf:  0   Loss: -1.214  10.083  7.537
Ep: 40   Rew: -116.17   Avg Rew: -113.02   LR: 0.00009260   Polyak: 0.999900   Bf:  0   Loss: -0.791  36.276  34.943
Ep: 50   Rew: -107.77   Avg Rew: -111.96   LR: 0.00009239   Polyak: 0.999900   Bf:  0   Loss: -1.091  6.533  7.460
Ep: 60   Rew: -116.55   Avg Rew: -112.11   LR: 0.00009242   Polyak: 0.999900   Bf:  0   Loss: -0.352  36.690  31.461
Ep: 70   Rew: -1

Ep: 690   Rew: -101.45   Avg Rew: -102.48   LR: 0.00009050   Polyak: 0.999900   Bf:  3   Loss: -1.262  2.673  2.281
Ep: 700   Rew: -105.93   Avg Rew: -102.41   LR: 0.00009048   Polyak: 0.999900   Bf:  3   Loss: -0.085  1.077  1.335
Ep: 710   Rew: -101.96   Avg Rew: -102.44   LR: 0.00009049   Polyak: 0.999900   Bf:  3   Loss: 0.230  4.044  3.900
Ep: 720   Rew: -101.12   Avg Rew: -102.61   LR: 0.00009052   Polyak: 0.999900   Bf:  3   Loss: -0.364  3.377  2.959
Ep: 730   Rew: -102.02   Avg Rew: -102.52   LR: 0.00009050   Polyak: 0.999900   Bf:  3   Loss: 0.812  2.222  2.214
Ep: 740   Rew: -101.49   Avg Rew: -102.50   LR: 0.00009050   Polyak: 0.999900   Bf:  3   Loss: -0.204  2.040  2.416
Ep: 750   Rew: -103.17   Avg Rew: -102.57   LR: 0.00009051   Polyak: 0.999900   Bf:  3   Loss: -0.105  2.082  2.340
Ep: 760   Rew: -102.00   Avg Rew: -102.60   LR: 0.00009052   Polyak: 0.999900   Bf:  4   Loss: 0.344  1.821  2.536
Ep: 770   Rew: -102.44   Avg Rew: -102.54   LR: 0.00009051   Polyak: 0.9999

Ep: 1400   Rew: -120.03   Avg Rew: -129.40   LR: 0.00009588   Polyak: 0.999900   Bf: 10   Loss: 5.050  0.642  0.679
Ep: 1410   Rew: -114.41   Avg Rew: -128.47   LR: 0.00009569   Polyak: 0.999900   Bf: 10   Loss: 3.764  1.211  0.828
Ep: 1420   Rew: -121.41   Avg Rew: -128.46   LR: 0.00009569   Polyak: 0.999900   Bf: 10   Loss: 4.853  1.323  1.395
Ep: 1430   Rew: -128.66   Avg Rew: -128.17   LR: 0.00009563   Polyak: 0.999900   Bf: 11   Loss: 4.314  0.878  0.853
Ep: 1440   Rew: -118.69   Avg Rew: -128.81   LR: 0.00009576   Polyak: 0.999900   Bf: 11   Loss: 5.307  0.665  0.660
Ep: 1450   Rew: -115.96   Avg Rew: -126.88   LR: 0.00009538   Polyak: 0.999900   Bf: 11   Loss: 4.763  0.938  1.279
Ep: 1460   Rew: -115.67   Avg Rew: -126.08   LR: 0.00009522   Polyak: 0.999900   Bf: 12   Loss: 4.236  0.454  0.302
Ep: 1470   Rew: -124.05   Avg Rew: -126.12   LR: 0.00009522   Polyak: 0.999900   Bf: 12   Loss: 5.975  0.875  0.899
Ep: 1480   Rew: -135.84   Avg Rew: -128.32   LR: 0.00009566   Polyak: 0.

Ep: 2110   Rew: -132.69   Avg Rew: -114.28   LR: 0.00009286   Polyak: 0.999900   Bf: 34   Loss: 7.124  1.076  0.827
Ep: 2120   Rew: -99.24   Avg Rew: -113.12   LR: 0.00009262   Polyak: 0.999900   Bf: 35   Loss: 7.678  1.552  1.136
Ep: 2130   Rew: -118.69   Avg Rew: -114.01   LR: 0.00009280   Polyak: 0.999900   Bf: 35   Loss: 10.505  0.626  0.691
Ep: 2140   Rew: -98.27   Avg Rew: -114.42   LR: 0.00009288   Polyak: 0.999900   Bf: 36   Loss: 8.025  0.665  0.677
Ep: 2150   Rew: -95.19   Avg Rew: -116.07   LR: 0.00009321   Polyak: 0.999900   Bf: 36   Loss: 8.340  1.058  1.233
Ep: 2160   Rew: -108.04   Avg Rew: -114.53   LR: 0.00009291   Polyak: 0.999900   Bf: 36   Loss: 7.037  0.827  1.021
Ep: 2170   Rew: -94.17   Avg Rew: -115.55   LR: 0.00009311   Polyak: 0.999900   Bf: 37   Loss: 9.160  1.015  1.094
Ep: 2180   Rew: -108.32   Avg Rew: -113.62   LR: 0.00009272   Polyak: 0.999900   Bf: 37   Loss: 10.240  0.732  1.070
Ep: 2190   Rew: -114.15   Avg Rew: -112.60   LR: 0.00009252   Polyak: 0.99

Ep: 2820   Rew: -108.15   Avg Rew: -102.97   LR: 0.00009059   Polyak: 0.999900   Bf: 86   Loss: 8.525  2.032  2.022
Ep: 2830   Rew: -124.68   Avg Rew: -103.99   LR: 0.00009080   Polyak: 0.999900   Bf: 87   Loss: 8.709  1.357  1.513
Ep: 2840   Rew: -121.98   Avg Rew: -105.88   LR: 0.00009118   Polyak: 0.999900   Bf: 88   Loss: 10.281  0.901  0.876
Ep: 2850   Rew: -104.08   Avg Rew: -106.11   LR: 0.00009122   Polyak: 0.999900   Bf: 89   Loss: 10.721  1.088  1.091
Ep: 2860   Rew: -107.56   Avg Rew: -106.77   LR: 0.00009135   Polyak: 0.999900   Bf: 90   Loss: 9.332  0.879  1.332
Ep: 2870   Rew: -80.07   Avg Rew: -106.34   LR: 0.00009127   Polyak: 0.999900   Bf: 91   Loss: 8.047  1.029  1.759
Ep: 2880   Rew: -115.58   Avg Rew: -110.25   LR: 0.00009205   Polyak: 0.999900   Bf: 92   Loss: 10.079  0.601  0.907
Ep: 2890   Rew: -100.38   Avg Rew: -110.73   LR: 0.00009215   Polyak: 0.999900   Bf: 92   Loss: 9.995  0.991  0.975
Ep: 2900   Rew: -103.59   Avg Rew: -110.76   LR: 0.00009215   Polyak: 

Ep: 3530   Rew: -69.69   Avg Rew: -89.81   LR: 0.00008796   Polyak: 0.999900   Bf: 100   Loss: 8.237  0.215  0.165
Ep: 3540   Rew: -78.62   Avg Rew: -89.76   LR: 0.00008795   Polyak: 0.999900   Bf: 100   Loss: 7.639  0.510  0.583
Ep: 3550   Rew: -104.23   Avg Rew: -89.99   LR: 0.00008800   Polyak: 0.999900   Bf: 100   Loss: 7.619  0.171  0.083
Ep: 3560   Rew: -101.70   Avg Rew: -90.34   LR: 0.00008807   Polyak: 0.999900   Bf: 100   Loss: 7.373  0.114  0.148
Ep: 3570   Rew: -98.32   Avg Rew: -91.38   LR: 0.00008828   Polyak: 0.999900   Bf: 100   Loss: 7.399  0.349  0.388
Ep: 3580   Rew: -80.17   Avg Rew: -89.62   LR: 0.00008792   Polyak: 0.999900   Bf: 100   Loss: 7.828  0.199  0.453
Ep: 3590   Rew: -134.35   Avg Rew: -88.48   LR: 0.00008770   Polyak: 0.999900   Bf: 100   Loss: 7.834  0.155  0.185
Ep: 3600   Rew: -177.50   Avg Rew: -87.85   LR: 0.00008757   Polyak: 0.999900   Bf: 100   Loss: 7.785  0.212  0.117
Ep: 3610   Rew: -109.36   Avg Rew: -88.67   LR: 0.00008773   Polyak: 0.99990

Ep: 4250   Rew: -91.61   Avg Rew: -76.26   LR: 0.00008525   Polyak: 0.999900   Bf: 100   Loss: 6.906  0.260  0.325
Ep: 4260   Rew: -102.89   Avg Rew: -75.61   LR: 0.00008512   Polyak: 0.999900   Bf: 100   Loss: 6.864  0.083  0.073
Ep: 4270   Rew: -81.54   Avg Rew: -75.24   LR: 0.00008505   Polyak: 0.999900   Bf: 100   Loss: 7.112  0.111  0.097
Ep: 4280   Rew: -77.58   Avg Rew: -76.48   LR: 0.00008530   Polyak: 0.999900   Bf: 100   Loss: 6.604  0.056  0.058
Ep: 4290   Rew: -68.90   Avg Rew: -74.98   LR: 0.00008500   Polyak: 0.999900   Bf: 100   Loss: 6.920  0.196  0.130
Ep: 4300   Rew: -71.57   Avg Rew: -75.76   LR: 0.00008515   Polyak: 0.999900   Bf: 100   Loss: 6.405  0.104  0.097
Ep: 4310   Rew: -67.33   Avg Rew: -77.23   LR: 0.00008545   Polyak: 0.999900   Bf: 100   Loss: 6.798  0.100  0.205
Ep: 4320   Rew: -65.89   Avg Rew: -77.52   LR: 0.00008550   Polyak: 0.999900   Bf: 100   Loss: 6.779  0.112  0.130
Ep: 4330   Rew: -102.40   Avg Rew: -78.39   LR: 0.00008568   Polyak: 0.999900  

In [None]:
def test():
    """Run a saved TD3 actor for a few episodes, optionally rendering and dumping frames.

    Loads the actor checkpoint ``TD3_torch_<env_name>_<random_seed>`` from
    ``./preTrained/td3_torch/<env_name>`` and plays ``n_episodes`` episodes of
    at most ``max_timesteps`` steps each, without exploration noise. When
    ``render`` and ``save_gif`` are both enabled, every step's frame is written
    to ``./gif/td3_torch/<episode>/<step>.jpg``. The integer-truncated return
    of each episode is printed.
    """
    # NOTE(review): this local seed only selects which checkpoint file is
    # loaded. The training cell saves under its own random_seed (234), so a
    # model produced by train() as-is is not the one loaded here - confirm
    # which checkpoint is intended. To load a solved model, append '_solved'
    # to `filename`.
    random_seed = 0
    n_episodes = 3
    max_timesteps = 2000
    render = True
    save_gif = True

    filename = "TD3_torch_{}_{}".format(env_name, random_seed)
    directory = "./preTrained/td3_torch/{}".format(env_name)

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    policy = TD3(state_dim, action_dim, max_action)

    policy.load_actor(directory, filename)

    try:
        for ep in range(1, n_episodes+1):
            ep_reward = 0
            state = env.reset()

            # Create the per-episode frame directory once, including any
            # missing parents (os.mkdir would fail when ./gif/td3_torch does
            # not exist yet).
            dirname = './gif/td3_torch/{}'.format(ep)
            if render and save_gif:
                os.makedirs(dirname, exist_ok=True)

            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        img = env.render(mode = 'rgb_array')
                        img = Image.fromarray(img)
                        img.save('./gif/td3_torch/{}/{}.jpg'.format(ep,t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close the environment once, after the last episode, rather than
        # after every episode while it is still needed.
        env.close()
                
# Evaluate the saved actor; needs a checkpoint on disk and a display for rendering.
test()
    
    