/
acrobot.js
116 lines (99 loc) · 2.8 KB
/
acrobot.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import { RLRealRange, RLEnvironmentBase } from './base.js'
/**
 * Acrobot environment
 *
 * The state is `[theta1, theta2, dtheta1, dtheta2]`: two joint angles and
 * their angular velocities. The single action dimension takes one of
 * `-1`, `0`, `1` and is added to the numerator of the second joint's
 * acceleration equation.
 */
export default class AcrobotRLEnvironment extends RLEnvironmentBase {
    constructor() {
        super()
        // Current state.
        this._theta1 = 0
        this._theta2 = 0
        this._dtheta1 = 0
        this._dtheta2 = 0

        // Parameters read by the dynamics in `test`.
        this._link_len1 = 1
        this._link_len2 = 1
        this._link_mass1 = 1
        this._link_mass2 = 1
        this._link_com_pos1 = 0.5
        this._link_com_pos2 = 0.5
        this._moi = 1

        // Velocity limits; velocities are clipped to [-max, max] after each update.
        this._max_vel1 = 4 * Math.PI
        this._max_vel2 = 9 * Math.PI

        this._g = 9.8
        // Integration step used by the explicit Euler update in `test`.
        this._dt = 0.1
        // Once `this.epoch` reaches this value the episode is flagged as failed.
        this._max_step = 200

        this._reward = {
            goal: 0,
            step: -1,
            fail: 0,
        }
    }

    /**
     * Available actions: one dimension with the discrete values -1, 0 and 1.
     *
     * @type {number[][]}
     */
    get actions() {
        return [[-1, 0, 1]]
    }

    /**
     * Range of every state component, in the order
     * `[theta1, theta2, dtheta1, dtheta2]`.
     *
     * @type {RLRealRange[]}
     */
    get states() {
        return [
            new RLRealRange(-Math.PI, Math.PI),
            new RLRealRange(-Math.PI, Math.PI),
            new RLRealRange(-this._max_vel1, this._max_vel1),
            new RLRealRange(-this._max_vel2, this._max_vel2),
        ]
    }

    /**
     * Reset the environment; every state component is drawn uniformly from [-0.1, 0.1).
     *
     * @returns {number[]} Initial state
     */
    reset() {
        super.reset()
        this._theta1 = Math.random() * 0.2 - 0.1
        this._theta2 = Math.random() * 0.2 - 0.1
        this._dtheta1 = Math.random() * 0.2 - 0.1
        this._dtheta2 = Math.random() * 0.2 - 0.1
        return this.state()
    }

    /**
     * Returns the current state.
     *
     * @returns {number[]} `[theta1, theta2, dtheta1, dtheta2]`
     */
    state() {
        return [this._theta1, this._theta2, this._dtheta1, this._dtheta2]
    }

    /**
     * Overwrite the current state.
     *
     * @param {number[]} state `[theta1, theta2, dtheta1, dtheta2]`
     */
    setState(state) {
        this._theta1 = state[0]
        this._theta2 = state[1]
        this._dtheta1 = state[2]
        this._dtheta2 = state[3]
    }

    /**
     * Compute the result of applying an action to a state without modifying
     * the stored state of this environment.
     *
     * @param {number[]} state `[theta1, theta2, dtheta1, dtheta2]`
     * @param {number[]} action Array whose first element is the applied action value
     * @returns {{state: number[], reward: number, done: boolean}} Next state, reward and termination flag
     */
    test(state, action) {
        let [t1, t2, dt1, dt2] = state
        const a = action[0]
        const m1 = this._link_mass1
        const m2 = this._link_mass2
        const l1 = this._link_len1
        const lc1 = this._link_com_pos1
        const lc2 = this._link_com_pos2
        const i1 = this._moi
        const i2 = this._moi
        const g = this._g

        const d1 = m1 * lc1 ** 2 + m2 * (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * Math.cos(t2)) + i1 + i2
        const d2 = m2 * (lc2 ** 2 + l1 * lc2 * Math.cos(t2)) + i2
        const phi2 = m2 * lc2 * g * Math.cos(t1 + t2 - Math.PI / 2)
        const phi1 =
            -m2 * l1 * lc2 * dt2 ** 2 * Math.sin(t2) -
            2 * m2 * l1 * lc2 * dt2 * dt1 * Math.sin(t2) +
            (m1 * lc1 + m2 * l1) * g * Math.cos(t1 - Math.PI / 2) +
            phi2
        const ddt2 =
            (a + (d2 / d1) * phi1 - m2 * l1 * lc2 * dt1 ** 2 * Math.sin(t2) - phi2) /
            (m2 * lc2 ** 2 + i2 - d2 ** 2 / d1)
        // The first joint obeys d1 * ddt1 + d2 * ddt2 + phi1 = 0. The
        // `(d2 / d1) * phi1` term in `ddt2` comes from substituting this very
        // relation, so `phi1` (not `phi2`) has to be used here.
        const ddt1 = -(d2 * ddt2 + phi1) / d1

        const clip = (x, min, max) => (x < min ? min : x > max ? max : x)
        // Bring an angle back into [-PI, PI] by whole turns.
        const wrap = angle => {
            while (angle < -Math.PI) angle += 2 * Math.PI
            while (angle > Math.PI) angle -= 2 * Math.PI
            return angle
        }

        // Explicit Euler step: angles advance with the old velocities,
        // then the velocities advance with the accelerations and are clipped.
        t1 = wrap(t1 + this._dt * dt1)
        t2 = wrap(t2 + this._dt * dt2)
        dt1 = clip(dt1 + this._dt * ddt1, -this._max_vel1, this._max_vel1)
        dt2 = clip(dt2 + this._dt * ddt2, -this._max_vel2, this._max_vel2)

        const fail = this.epoch >= this._max_step
        // Goal condition: -cos(theta1) - cos(theta1 + theta2) > 1.
        const done = -Math.cos(t1) - Math.cos(t2 + t1) > 1 || fail
        const reward = fail ? this._reward.fail : done ? this._reward.goal : this._reward.step
        return {
            state: [t1, t2, dt1, dt2],
            reward,
            done,
        }
    }
}