"""
The goal of this script is to train a TD3 RL algorithm. An object is falling with
constant speed and needs to be sttered to a certain point on the ground. The
dynamics is given by the usual differential equations governing the relation-
ship between acceleration, velocity, and position. The acceleration can be chosen
by the RL agent as reaction to a state observation.
For this, do the following
1. Definitions and imports
2. Initialize the environment class
3. Step method
4. Reset method
5. Render method
6. Train with stable baselines
7. Summarize and plot results
Control landing environment:
This environment consists of a model of a falling object that can act by
accelerating to the left or the right. Each step it falls closer to the ground
and is punished by its deviation on the x axis from a target value. The parameters
passed to it upon its creation are the the action dimension and the target x value.
INPUTS
Name Interpretation Type
dims Array of form [n_action] Array
x_target Target coordinate Number
action The action to be performed during step Index
The script is meant solely for educational and illustrative purposes. Written by
Jemil Avers Butt, Atlas optimization GmbH, www.atlasoptimization.ch.
"""
"""
1. Definitions and imports -----------------------------------------------
"""
# i) Imports
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import spaces
from collections import namedtuple
from stable_baselines3.common.env_checker import check_env
# ii) Definitions
n_learn=30000 # Determines the amount of learning - change
# to adjust training time
Transition = namedtuple('Transition',
('state', 'action', 'next_state', 'reward'))
"""
2. Initialize the environment class ---------------------------------------
"""
class Control_landing_env(gym.Env):
def __init__(self, dims, x_target):
super(Control_landing_env, self).__init__()
# i) Definition of fixed quantities
z_max=10
n_a=dims[0]
x=1*(np.random.normal(0,1))
vx=0*(np.random.normal(0,1))
self.n_state=3
# ii) Space definitions
self.action_space = spaces.Box(low=-0.2, high=0.2,
shape=(n_a,), dtype=np.float32)
self.observation_space = spaces.Box(low=-10, high=10,
shape=(self.n_state,), dtype=np.float32)
# iii) Initialization of updeatable logging parameters
self.state=np.hstack((z_max,x,vx))
self.epoch=0
self.max_epoch=z_max
self.state_sequence=self.state
self.action_sequence=[]
self.reward_sequence=[]
self.n_action=n_a
self.dim=np.array([3,n_a])
self.z_max=z_max
self.x_target=x_target
"""
3. Step method -------------------------------------------------------
"""
def step(self,action):
# i) Dynamics and differential relations
acceleration_value=action
state_change=np.array([-1,self.state[2].item(),acceleration_value.item()])
state=self.state
new_state=state+state_change
# ii) Logging of changes
self.state=new_state
self.state_sequence=np.vstack((self.state_sequence,new_state))
self.action_sequence.append(acceleration_value)
# iii) Reward calculation and update
reward=-np.abs(self.state[1]-self.x_target)
self.reward_sequence=np.hstack((self.reward_sequence,reward))
self.last_transition=Transition(state,action,new_state,reward)
self.epoch=self.epoch+1
if self.epoch==self.max_epoch:
done=True
else:
done=False
info = {}
return self.state, reward, done, info
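
    # For illustration: starting from state [10, 1, 0] (height 10, x = 1,
    # vx = 0) with action a = 0.1 and x_target = 0, one step yields
    # new_state = [9, 1, 0.1] and reward = -|1 - 0| = -1.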
"""
4. Reset method ------------------------------------------------------
"""
def reset(self):
# i) Generate starting position randomly
x=1*(np.random.normal(0,1))
vx=0*(np.random.normal(0,1))
# ii) Reset updateable logging parameters
self.state=np.hstack((self.max_epoch,x,vx))
self.epoch=0
self.state_sequence=self.state
self.action_sequence=[]
self.reward_sequence=[]
observation=self.state
return observation
"""
5. Render method -----------------------------------------------------
"""
# i) Render plot with trajectories
def render(self, reward, mode='console'):
plt.figure(1,dpi=300,figsize=(2,4))
plt.plot(self.state_sequence[:,1],self.state_sequence[:,0])
plt.title('Steered trajectory')
plt.xlabel('x axis')
plt.ylabel('z axis')
print('Reward is ', reward)
# ii) Close method
def close (self):
pass
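

# For a quick manual check (illustrative only, not required for training),
# the environment can be exercised directly with a random action:
#   env = Control_landing_env(np.array([1]), 0)
#   obs = env.reset()
#   obs, reward, done, info = env.step(env.action_space.sample())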
"""
6. Train with stable baselines -------------------------------------------
"""
# i) Generate Environment
new_env=Control_landing_env(np.array([1]),0)
check_env(new_env)
# ii) Import and use stable baselines
from stable_baselines3 import PPO
model = PPO("MlpPolicy", new_env,verbose=1)
model.learn(total_timesteps=n_learn)
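
# iii) Optionally persist and reload the trained policy; the file name
#      "ppo_control_landing" is illustrative. model.save and PPO.load are
#      standard stable-baselines3 calls.
# model.save("ppo_control_landing")
# model = PPO.load("ppo_control_landing", env=new_env)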
"""
7. Summarize and plot results --------------------------------------------
"""
# i) Let the agent run on a few episodes and plot the results
obs = new_env.reset()
n_episodes=30
for k in range(n_episodes):
done=False
obs = new_env.reset()
while done ==False:
action, _states = model.predict(obs, deterministic=True)
obs, reward, done, info = new_env.step(action)
if done:
new_env.render(reward)
# time.sleep(0.5)
break
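
# ii) As an optional summary sketch (using the env's own reward_sequence
#     attribute), plot the per-step rewards logged during the final episode.
plt.figure(2, dpi=300)
plt.plot(new_env.reward_sequence)
plt.title('Rewards during final episode')
plt.xlabel('Step nr')
plt.ylabel('Reward')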