from Replay_Memory import *
from TD3_agent import *
from Render import *
import gym
import tqdm
import math
import pickle
import numpy as np  # needed for np.mean below
"""
Implementation of Twin Delayed Deep Deterministic Policy Gradient (TD3) using pytorch
Reference 1: https://github.com/sfujim/TD3/blob/master/main.py
Reference 2: https://github.com/sfujim/TD3
Reference 3: DQN programming assignment
Environments
The following two environments chosen have a continuous state-space and continuous action-space:
Hopper: Make a two-dimensional one-legged robot hop forward as fast as possible (https://gym.openai.com/envs/Hopper-v2/).
HalfCheetah: (https://gym.openai.com/envs/HalfCheetah-v2/).
"""
Env_Hopper = gym.make('Hopper-v3')
Env_HalfCheetah = gym.make('HalfCheetah-v2')


def train_td3(env, replay_size, replay_populate_steps, total_timesteps, exploration_noise, gamma, reward_threshold,
              sample_batch_size, tau, policy_noise, noise_clip, policy_update_frequency, path, visulize_num_episodes):
    """Train a TD3 (agent) policy network model.

    :param env: OpenAI Gym environment.
    :param replay_size: Maximum size of the replay buffer.
    :param replay_populate_steps: Number of steps for which to populate the replay memory.
    :param total_timesteps: Total number of timesteps for which to run policy training.
    :param exploration_noise: Exploration noise added to the selected actions.
    :param gamma: The discount factor.
    :param reward_threshold: Reward threshold at which training stops.
    :param sample_batch_size: Number of experiences in a batch.
    :param tau: Soft-update coefficient for copying the main networks to the target networks.
    :param policy_noise: Amount of noise added to the target-policy actions.
    :param noise_clip: Range within which the target-policy noise is clipped.
    :param policy_update_frequency: Number of timesteps after which the policy network is updated.
    :param path: Directory in which to store data.
    :param visulize_num_episodes: Number of episodes for which to visualize the trained agent.
    """
    # get the state size, action size, and maximum action value from the environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # initialize the replay memory and prepopulate it
    memory = ReplayMemory(state_size, action_size, replay_size)
    memory.populate(env, replay_populate_steps)

    # initialize the TD3 agent (policy) network model
    policy_network = TD3(env, state_size, action_size, max_action)

    # initialize the first state of the episode
    state = env.reset()

    # initialize the episode length and the best average reward (mean of the rewards from the last 100 timesteps)
    ep_length = 0
    best_average_reward = -2000

    # lists for episode returns, current-episode rewards, rewards from all timesteps, and
    # average rewards (mean of the rewards from the last 100 timesteps)
    returns = []
    ep_rewards = []
    all_rewards = []
    average_rewards = []

    # iterate for a total number of timesteps
    pbar = tqdm.tnrange(total_timesteps, ncols='100%')
    for step in pbar:
        # Sample a clipped action with exploration noise from the TD3 policy and use the action to
        # advance the environment by one step. Store the transition in the replay memory.
        action = policy_network.select_clipped_action(state, exploration_noise)
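        # select_clipped_action presumably adds Gaussian exploration noise to the deterministic
        # actor output and clips the result to [-max_action, max_action] (an assumption based on
        # the reference TD3 implementation; the actual logic lives in TD3_agent).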
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        ep_rewards.append(reward)
        all_rewards.append(reward)
        state = next_state
        ep_length += 1

        # the maximum length of an episode is capped at 200 timesteps
        if ep_length + 1 == 200:
            done = True
        if done:
            # compute the discounted episode return
            G = sum([math.pow(gamma, i) * ep_rewards[i] for i in range(len(ep_rewards))])
            returns.append(G)

            # compute the average reward (mean of the rewards from the last 100 timesteps)
            average_reward = np.mean(all_rewards[-100:])
            average_rewards.append(average_reward)
            pbar.set_description(
                f'Episode Length: {ep_length} | Return: {G:5.2f} | Average Reward: {average_reward:4.2f}'
            )

            # if average_reward exceeds the best average reward seen so far, record the new best
            # and save the best policy achieved so far
            if best_average_reward < average_reward:
                best_average_reward = average_reward
                policy_network.custom_save(path)

            # if average_reward reaches reward_threshold, stop training the policy
            if average_reward >= reward_threshold:
                break

            # train and update the policy network
            policy_network.train_network(memory, ep_length, sample_batch_size, gamma, tau,
                                         policy_noise, noise_clip, policy_update_frequency)
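            # As in the reference TD3 main.py, training is presumably run for ep_length gradient
            # steps once per finished episode rather than after every environment step
            # (an assumption about how train_network uses its second argument).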
            # reset the episode bookkeeping and the environment state
            ep_length = 0
            ep_rewards = []
            state = env.reset()

    # save the list of average rewards
    with open(path + 'average_rewards.txt', 'wb') as fp:
        pickle.dump(average_rewards, fp)

    # save the list of episode returns
    with open(path + 'returns.txt', 'wb') as fp:
        pickle.dump(returns, fp)

    # visualize the performance of the trained agent / learned policy
    render(env, path, policy_network, visulize_num_episodes)
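

# A minimal usage sketch. The hyperparameter values below are illustrative assumptions
# (roughly in line with the reference TD3 implementation), not values prescribed by this
# repository; adjust them and the output path as needed.
if __name__ == '__main__':
    train_td3(
        env=Env_Hopper,                 # or Env_HalfCheetah
        replay_size=1_000_000,          # assumed replay buffer capacity
        replay_populate_steps=10_000,   # assumed number of warm-up transitions
        total_timesteps=1_000_000,      # assumed training budget
        exploration_noise=0.1,          # assumed exploration noise scale
        gamma=0.99,                     # assumed discount factor
        reward_threshold=3000,          # assumed early-stopping threshold
        sample_batch_size=256,          # assumed minibatch size
        tau=0.005,                      # assumed soft-update coefficient
        policy_noise=0.2,               # assumed target-policy smoothing noise
        noise_clip=0.5,                 # assumed noise clipping range
        policy_update_frequency=2,      # assumed delayed policy update period
        path='./results/',              # assumed output directory (must exist)
        visulize_num_episodes=3,        # assumed number of evaluation episodes to render
    )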