In [None]:
import torch
import gym
import numpy as np
from TD3 import TD3
from PIL import Image
from utils import ReplayBuffer

# ---------------------------------------------------------------------------
# Run configuration (read as module-level globals by train()).
# ---------------------------------------------------------------------------

# Environment / run identity
env_name = 'BipedalWalker-v2'
random_seed = 0             # falsy value (0) means train() does not seed any RNG

# Training length and logging
episodes = 100000           # NOTE(review): not referenced by the visible code; max_episodes is what train() uses
max_episodes = 1500         # max num of episodes
max_timesteps = 2000        # max timesteps in one episode
log_interval = 1            # print avg reward after interval

# TD3 hyperparameters
gamma = 0.99                # discount for future rewards
batch_size = 100            # num of transitions sampled from replay buffer
exploration_noise = 0.1     # std-dev of the Gaussian noise added to actions while training
polyak = 0.995              # target policy update parameter (1-tau)
policy_noise = 0.2          # target policy smoothing noise
noise_clip = 0.5            # passed to policy.update together with policy_noise
policy_delay = 2            # delayed policy updates parameter

# Checkpoint location and base name
directory = "./preTrained/{}".format(env_name)  # save trained models
filename = "TD3_{}_{}".format(env_name, random_seed)

# Per-episode total rewards, appended to by train()
reward_history = list()


def train():
    """Train a TD3 policy on ``env_name`` and log the reward of every episode.

    All hyperparameters are read from the module-level configuration
    (``env_name``, ``random_seed``, ``max_episodes``, ``max_timesteps``,
    ``exploration_noise``, ``batch_size``, ``gamma``, ``polyak``,
    ``policy_noise``, ``noise_clip``, ``policy_delay``, ``log_interval``,
    ``directory``, ``filename``).

    Side effects:
        * appends each episode's total reward to the global ``reward_history``;
        * (re)creates ``log.txt`` and writes one ``episode,reward`` line per episode;
        * saves the policy via ``policy.save`` after every episode past 500, and
          once more under ``filename + '_solved'`` when the running average
          reward over the last 100 episodes reaches 300 (training then stops).
    """
    import os  # local import: only needed to make sure the save directory exists

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    print("action_space={}".format(env.action_space))
    print("obs_space={}".format(env.observation_space))
    print("threshold={} \n".format(env.spec.reward_threshold))

    # Seed *before* the policy is constructed so that anything TD3 draws from
    # the torch/numpy RNGs at construction time is covered by the seed as well.
    # A seed of 0 is falsy and therefore means "do not seed".
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    policy = TD3(state_dim, action_dim, max_action)
    replay_buffer = ReplayBuffer()

    # NOTE(review): policy.save presumably writes into `directory` without
    # creating it - creating it up front is harmless either way.
    os.makedirs(directory, exist_ok=True)

    try:
        # The context manager closes the log on every exit path (solved,
        # max_episodes reached, or an exception), not only on "solved".
        with open("log.txt", "w+") as log_f:
            # training procedure:
            for episode in range(1, max_episodes + 1):
                ep_reward = 0
                state = env.reset()
                for t in range(max_timesteps):
                    # select action and add exploration noise:
                    action = policy.select_action(state)
                    action = action + np.random.normal(0, exploration_noise, size=action_dim)
                    action = action.clip(env.action_space.low, env.action_space.high)

                    # take action in env:
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add((state, action, reward, next_state, float(done)))
                    state = next_state

                    ep_reward += reward

                    # if episode is done (or hit the step cap) then update policy:
                    if done or t == (max_timesteps - 1):
                        # `t` is forwarded as the second argument of update();
                        # presumably the number of update iterations - confirm in TD3.update.
                        policy.update(replay_buffer, t, batch_size, gamma, polyak,
                                      policy_noise, noise_clip, policy_delay)
                        break

                reward_history.append(ep_reward)
                # running average over (at most) the last 100 episodes
                avg_reward = np.mean(reward_history[-100:])

                # logging updates (flushed so the file is usable while training runs):
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                # if avg reward >= 300 then save and stop training:
                if avg_reward >= 300:
                    print("########## Solved! ###########")
                    name = filename + '_solved'
                    policy.save(directory, name)
                    break

                # checkpoint after every episode once past episode 500
                if episode > 500:
                    policy.save(directory, filename)

                # print avg reward every log interval:
                if episode % log_interval == 0:
                    print("Episode: {}\tReward: {}\tAverage Reward: {}".format(episode, ep_reward, avg_reward))
    finally:
        # release simulator resources even if training was interrupted
        env.close()

# Run the full training loop when this cell is executed (writes log.txt and
# prints one progress line per `log_interval` episodes).
train()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
action_space=Box(4,)
obs_space=Box(24,)
threshold=300 

Episode: 1	Reward: -111.94799280005101	Average Reward: -111.94799280005101
Episode: 2	Reward: -169.39124473457474	Average Reward: -140.66961876731287
Episode: 3	Reward: -104.10741320528936	Average Reward: -128.48221691330502
Episode: 4	Reward: -104.16521062110635	Average Reward: -122.40296534025535
Episode: 5	Reward: -148.81723593470284	Average Reward: -127.68581945914484
Episode: 6	Reward: -115.46865353348791	Average Reward: -125.64962513820201
Episode: 7	Reward: -174.01234525231067	Average Reward: -132.55858515450325
Episode: 8	Reward: -112.59571118287062	Average Reward: -130.0632259080492
Episode: 9	Reward: -123.0018789985723	Average Reward: -129.2786318069962
Episode: 10	Reward: -129.79082206622985	Average Reward: 

Episode: 106	Reward: -100.77963167258562	Average Reward: -116.12273489792928
Episode: 107	Reward: -99.5317904891655	Average Reward: -115.37792935029783
Episode: 108	Reward: -101.59261023417662	Average Reward: -115.2678983408109
Episode: 109	Reward: -100.45902319233355	Average Reward: -115.0424697827485
Episode: 110	Reward: -105.02379312776337	Average Reward: -114.79479949336383
Episode: 111	Reward: -99.65810292033525	Average Reward: -114.11264678634063
Episode: 112	Reward: -100.29190202059813	Average Reward: -114.07684347333378
Episode: 113	Reward: -107.7522186392473	Average Reward: -113.68457453276852
Episode: 114	Reward: -147.76324815952006	Average Reward: -114.11492880654525
Episode: 115	Reward: -130.21664333756675	Average Reward: -114.35741703312105
Episode: 116	Reward: -117.59012726857691	Average Reward: -114.49306026719323
Episode: 117	Reward: -118.4455529520438	Average Reward: -114.617392856112
Episode: 118	Reward: -127.01861564508118	Average Reward: -114.7893116154326
Episode: 

Episode: 214	Reward: -68.92463747856989	Average Reward: -119.34513515123548
Episode: 215	Reward: -117.1460507479781	Average Reward: -119.21442922533959
Episode: 216	Reward: -75.5451007163691	Average Reward: -118.79397895981751
Episode: 217	Reward: -101.66613128659148	Average Reward: -118.62618474316298
Episode: 218	Reward: -107.09673518674725	Average Reward: -118.42696593857963
Episode: 219	Reward: -72.31125885839445	Average Reward: -117.9412779993639
Episode: 220	Reward: -108.95455740179631	Average Reward: -117.9030014534183
Episode: 221	Reward: -70.64472711861407	Average Reward: -117.49169443724506
Episode: 222	Reward: -109.3769403010366	Average Reward: -117.40181320178247
Episode: 223	Reward: -73.97449891395677	Average Reward: -116.92063393731
Episode: 224	Reward: -74.11769288077893	Average Reward: -116.60608574400133
Episode: 225	Reward: -78.99056482591126	Average Reward: -116.17273678118673
Episode: 226	Reward: -106.06983785455071	Average Reward: -116.06506924403806
Episode: 227	R

Episode: 322	Reward: -72.80691323936489	Average Reward: -93.26036528046046
Episode: 323	Reward: -77.78049479516197	Average Reward: -93.29842523927253
Episode: 324	Reward: -118.18904747778309	Average Reward: -93.73913878524257
Episode: 325	Reward: -58.742101096057	Average Reward: -93.53665414794405
Episode: 326	Reward: -76.25235094783314	Average Reward: -93.23847927887685
Episode: 327	Reward: -124.32968398471132	Average Reward: -93.49341130837387
Episode: 328	Reward: -94.70692856598846	Average Reward: -93.64560745002355
Episode: 329	Reward: -83.18643005488661	Average Reward: -93.08422759850191
Episode: 330	Reward: -67.91019313244875	Average Reward: -92.6090366825805
Episode: 331	Reward: -80.67112787283953	Average Reward: -92.5472711608386
Episode: 332	Reward: -21.50838306404464	Average Reward: -91.6349434486042
Episode: 333	Reward: -51.124166925768925	Average Reward: -91.38364828461037
Episode: 334	Reward: -33.90689101355089	Average Reward: -90.44216518207742
Episode: 335	Reward: -32.57

Episode: 432	Reward: 218.87315019024015	Average Reward: 90.4192269523343
Episode: 433	Reward: 220.4726093983586	Average Reward: 93.13519471557558
Episode: 434	Reward: 224.13333983258522	Average Reward: 95.71559702403697
Episode: 435	Reward: 222.33678263816276	Average Reward: 98.2647420010476
Episode: 436	Reward: 224.18444652791956	Average Reward: 100.74884880576921
Episode: 437	Reward: 212.18561935748522	Average Reward: 103.31348360968934
Episode: 438	Reward: 220.59567977796036	Average Reward: 105.77134007117117
Episode: 439	Reward: 189.6078055591241	Average Reward: 108.20039633154148
Episode: 440	Reward: 225.34832600540537	Average Reward: 110.5899364090972
Episode: 441	Reward: 226.0881217289594	Average Reward: 113.34422960331194
Episode: 442	Reward: 219.52238026206186	Average Reward: 115.9554237540985
Episode: 443	Reward: 214.64193391819148	Average Reward: 118.31844921581065
Episode: 444	Reward: 220.7271617309921	Average Reward: 120.85926414357216
Episode: 445	Reward: 231.020701983706

Episode: 543	Reward: 265.12256846234084	Average Reward: 239.1282261912254
Episode: 544	Reward: 249.13565695252288	Average Reward: 239.41231114344072
Episode: 545	Reward: 259.4193390926212	Average Reward: 239.69629751452987
Episode: 546	Reward: 252.59675512341423	Average Reward: 239.96447934938143
Episode: 547	Reward: 261.6546006718762	Average Reward: 240.23868977658128
Episode: 548	Reward: 258.7501667819723	Average Reward: 240.50775224935126
Episode: 549	Reward: 264.1716188502226	Average Reward: 240.87702524461466
Episode: 550	Reward: 254.40587958397728	Average Reward: 241.075893432074
Episode: 551	Reward: 255.83091047623634	Average Reward: 241.30247280339205
Episode: 552	Reward: 248.99106130560375	Average Reward: 241.4961797916751
Episode: 553	Reward: 259.83255638093067	Average Reward: 241.77042026435137
Episode: 554	Reward: 258.608587892843	Average Reward: 242.11440933681658
Episode: 555	Reward: 260.72926961743633	Average Reward: 242.40769513176645
Episode: 556	Reward: 264.2939670470

Episode: 654	Reward: 263.0895136364635	Average Reward: 250.14708932041
Episode: 655	Reward: 133.79832714996314	Average Reward: 248.87777989573524
Episode: 656	Reward: 265.6544579197504	Average Reward: 248.89138480446243
Episode: 657	Reward: 269.2619143454584	Average Reward: 248.9495804518558
Episode: 658	Reward: 268.93823068637624	Average Reward: 249.04160022374484
Episode: 659	Reward: 265.3391531889636	Average Reward: 249.10347919394314
Episode: 660	Reward: 258.0937353617029	Average Reward: 249.03092038115764
Episode: 661	Reward: 261.29047082314077	Average Reward: 249.04074005316124
Episode: 662	Reward: 265.8424851275979	Average Reward: 249.11805728836853
Episode: 663	Reward: 262.83259557698733	Average Reward: 249.07781285919467
Episode: 664	Reward: 261.29611540986866	Average Reward: 249.0125382497109
Episode: 665	Reward: 272.0476746338553	Average Reward: 249.15478831929576
Episode: 666	Reward: 264.8308807056754	Average Reward: 249.26178237778643
Episode: 667	Reward: -112.175445257629

In [None]:

def test():
    """Roll out a saved TD3 actor on BipedalWalker-v2 and print episode rewards.

    Loads the actor stored under ``./preTrained/<env_name>/ONE`` with base name
    ``TD3_<env_name>_<seed>_solved``, runs ``n_episodes`` episodes of at most
    ``max_timesteps`` steps without exploration noise, optionally renders each
    step, and optionally dumps every rendered frame to ``./gif/<t>.jpg``.

    Frame files are keyed by the step index only, so frames of a later episode
    overwrite those of earlier episodes.
    """
    import os  # local import: only needed to create the frame directory

    env_name = "BipedalWalker-v2"
    random_seed = 0
    n_episodes = 3
    max_timesteps = 2000
    render = True
    save_gif = True

    filename = "TD3_{}_{}".format(env_name, random_seed)
    filename += '_solved'
    directory = "./preTrained/{}/ONE".format(env_name)

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    policy = TD3(state_dim, action_dim, max_action)

    policy.load_actor(directory, filename)

    # Frames are only written when rendering is on; make sure the target
    # directory exists so the first img.save does not fail.
    if render and save_gif:
        os.makedirs('./gif', exist_ok=True)

    try:
        for ep in range(1, n_episodes + 1):
            ep_reward = 0
            state = env.reset()
            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        img = env.render(mode='rgb_array')
                        img = Image.fromarray(img)
                        img.save('./gif/{}.jpg'.format(t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close once, after all episodes: closing inside the episode loop
        # tore the env down after episode 1 and then kept using it.
        env.close()
                
# Evaluate the saved '_solved' actor when this cell is executed (renders the
# environment and writes frames under ./gif).
test()
    
    