"""
Custom gymnasium environment with pettingzoo that defines agents playing Euchre
(It's in progress)
"""
import functools
import gymnasium
import numpy as np
from gymnasium.spaces import Discrete
from pettingzoo import AECEnv
from pettingzoo.utils import agent_selector, wrappers
# NOTE: ROCK/PAPER/SCISSORS, MOVES, NUM_ITERS, and REWARD_MAP are leftovers from
# the PettingZoo rock-paper-scissors template this file is based on. They keep
# the environment runnable until the Euchre logic replaces them.
ROCK = 0
PAPER = 1
SCISSORS = 2
NONE = 3
MOVES = ["ROCK", "PAPER", "SCISSORS", "None"]
NUM_ITERS = 100
REWARD_MAP = {
    (ROCK, ROCK): (0, 0),
    (ROCK, PAPER): (-1, 1),
    (ROCK, SCISSORS): (1, -1),
    (PAPER, ROCK): (1, -1),
    (PAPER, PAPER): (0, 0),
    (PAPER, SCISSORS): (-1, 1),
    (SCISSORS, ROCK): (-1, 1),
    (SCISSORS, PAPER): (1, -1),
    (SCISSORS, SCISSORS): (0, 0),
}
"""
Defining the Environment
Represent cards as numbers 0-23
Clubs(c) | Diamond (d) | Heart (h) | Spade (s)
9 - A | 9 - A | 9 - A | 9 - A
0 - 5 | 6 - 11 | 12 - 17 | 18 - 23
Jacks: Jc=3, Jd=9, Jh=15, Js=21
An empty card slot is a -1. For example if a hand had 4 cards of:
Hand: "Ac, 9h, 10h, Js"
Observation: [ 6, 13, 14, 21, -1]
### Observation Space
Observation Shape: (11,)
Observation Values: [-1,23]
Observation Space
| Index | Description
|----------|-------------
| 0 - 4 | Agent's hand
| 5 | Current trump suit [0,4] in [c, d, h, s], -1 for no called suit yet
| 6 - 8 | Cards previously played this trick. Ordered. So the first entry is the first played and thus the suit to follow
| 9 | Who called the suit. [0,4] in agent id and -1 for no called suit yet
| 10 | Current offered suit [0,4] in [c, d, h, s] (only after deal until a suit is called)
#### Maybe leave these out, but it's information that people do have.
| 1 | Who led last trick. [0,4] agent id
| 4 | Cards played last trick. Ordered.
| 1 | Who led 2 tricks ago. [0,4] agent id
| 4 | Cards played 2 tricks ago. Ordered.
| 1 | Who led 3 tricks ago. [0,4] agent id
| 4 | Cards played 3 tricks ago. Ordered.
| 1 | Who led 4 tricks ago. [0,4] agent id
| 4 | Cards played 4 tricks ago. Ordered.
### Action Space
| Index | Description
|----------|-------------
| 0 - 4 | Play card from agent's hand
| 5 - 13 | Call trump suit [5=c,6=d,7=h,8=s], for alone:[9=c,10=d,11=h,12=s]. 13=pass
### Rewards
Taking a trick should be worth something
Winning the hand should be worth more. Bonus for however many points the hand was worth
"""
def env(render_mode=None):
    """
    The env function often wraps the environment in wrappers by default.
    You can find full documentation for these wrappers
    in the PettingZoo developer documentation.
    """
    internal_render_mode = render_mode if render_mode != "ansi" else "human"
    env = raw_env(render_mode=internal_render_mode)
    # This wrapper is only for environments which print results to the terminal
    if render_mode == "ansi":
        env = wrappers.CaptureStdoutWrapper(env)
    # This wrapper helps error handling for discrete action spaces
    env = wrappers.AssertOutOfBoundsWrapper(env)
    # Provides a wide variety of helpful user errors
    # Strongly recommended
    env = wrappers.OrderEnforcingWrapper(env)
    return env

class raw_env(AECEnv):
    """
    The metadata holds environment constants. From gymnasium, we inherit the
    "render_modes" metadata, which specifies which modes can be passed to the
    render() method. At least human mode should be supported.
    The "name" metadata allows the environment to be pretty printed.
    """

    metadata = {"render_modes": ["human"], "name": "euchre_v1"}

    def __init__(self, render_mode=None):
        """
        The init method takes in environment arguments and
        should define the following attributes:
        - possible_agents
        - render_mode

        Note: as of v1.18.1, the action_spaces and observation_spaces attributes are deprecated.
        Spaces should be defined in the action_space() and observation_space() methods.
        If these methods are not overridden, spaces will be inferred from self.observation_spaces/action_spaces, raising a warning.

        These attributes should not be changed after initialization.
        """
        # 4 players
        self.possible_agents = ["player_" + str(r) for r in range(4)]

        # optional: a mapping between agent name and ID
        self.agent_name_mapping = dict(
            zip(self.possible_agents, list(range(len(self.possible_agents))))
        )

        # TODO: Delete this block and the comment above once we know we don't need it.
        # optional: we can define the observation and action spaces here as attributes to be used in their corresponding methods
        # self._action_spaces = {agent: Discrete(3) for agent in self.possible_agents}
        # self._observation_spaces = {
        #     agent: Discrete(4) for agent in self.possible_agents
        # }
        self.render_mode = render_mode

    # Observation space should be defined here.
    # lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def observation_space(self, agent):
        # gymnasium spaces are defined and documented here: https://gymnasium.farama.org/api/spaces/
        # Placeholder from the template; see the sketch below for the planned Euchre space.
        return Discrete(4)

    # Action space should be defined here.
    # If your spaces change over time, remove this line (disable caching).
    @functools.lru_cache(maxsize=None)
    def action_space(self, agent):
        # Placeholder from the template; the Euchre action space is Discrete(14) per the docstring above.
        return Discrete(3)
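
    # A minimal sketch of what the spaces described in the module docstring
    # could look like once the Euchre logic lands. This is an assumption, not
    # the final design: Box matches the documented shape (11,) with values in
    # [-1, 23], and Discrete(14) covers actions 0-13. Kept commented out so the
    # template above stays runnable.
    #
    # from gymnasium.spaces import Box
    #
    # def observation_space(self, agent):
    #     return Box(low=-1, high=23, shape=(11,), dtype=np.int8)
    #
    # def action_space(self, agent):
    #     return Discrete(14)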
    def render(self):
        """
        Renders the environment. In human mode, it can print to terminal, open
        up a graphical window, or open up some other display that a human can see and understand.
        TODO: Rewrite this. The body below is still the two-player template logic.
        """
        if self.render_mode is None:
            gymnasium.logger.warn(
                "You are calling render method without specifying any render mode."
            )
            return

        if len(self.agents) == 2:
            string = "Current state: Agent1: {} , Agent2: {}".format(
                MOVES[self.state[self.agents[0]]], MOVES[self.state[self.agents[1]]]
            )
        else:
            string = "Game over"
        print(string)
    def observe(self, agent):
        """
        Observe should return the observation of the specified agent. This function
        should return a sane observation (though not necessarily the most up to date possible)
        at any time after reset() is called.
        """
        # observation of one agent is the previous state of the other
        return np.array(self.observations[agent])

    def close(self):
        """
        Close should release any graphical displays, subprocesses, network connections
        or any other environment data which should not be kept around after the
        user is no longer using the environment.
        """
        pass
    def reset(self, seed=None, options=None):
        """
        Reset needs to initialize the following attributes
        - agents
        - rewards
        - _cumulative_rewards
        - terminations
        - truncations
        - infos
        - agent_selection
        And must set up the environment so that render(), step(), and observe()
        can be called without issues.
        Here it sets up the state dictionary which is used by step() and the observations dictionary which is used by step() and observe()
        """
        self.agents = self.possible_agents[:]
        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.state = {agent: NONE for agent in self.agents}
        self.observations = {agent: NONE for agent in self.agents}
        self.num_moves = 0
        """
        Our agent_selector utility allows easy cyclic stepping through the agents list.
        """
        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.next()
    def step(self, action):
        """
        step(action) takes in an action for the current agent (specified by
        agent_selection) and needs to update
        - rewards
        - _cumulative_rewards (accumulating the rewards)
        - terminations
        - truncations
        - infos
        - agent_selection (to the next agent)
        And any internal state used by observe() or render()
        """
        if (
            self.terminations[self.agent_selection]
            or self.truncations[self.agent_selection]
        ):
            # handles stepping an agent which is already dead
            # accepts a None action for the one agent, and moves the agent_selection to
            # the next dead agent, or if there are no more dead agents, to the next live agent
            self._was_dead_step(action)
            return

        agent = self.agent_selection

        # the agent which stepped last had its _cumulative_rewards accounted for
        # (because it was returned by last()), so the _cumulative_rewards for this
        # agent should start again at 0
        self._cumulative_rewards[agent] = 0

        # stores action of current agent
        self.state[self.agent_selection] = action

        # NOTE: everything below is still the two-player rock-paper-scissors
        # template logic; it runs with 4 agents but is not meaningful until it
        # is replaced with trick-taking and scoring for Euchre.
        # collect reward if it is the last agent to act
        if self._agent_selector.is_last():
            # rewards for all agents are placed in the .rewards dictionary
            self.rewards[self.agents[0]], self.rewards[self.agents[1]] = REWARD_MAP[
                (self.state[self.agents[0]], self.state[self.agents[1]])
            ]

            self.num_moves += 1
            # The truncations dictionary must be updated for all players.
            self.truncations = {
                agent: self.num_moves >= NUM_ITERS for agent in self.agents
            }

            # observe the current state
            for i in self.agents:
                self.observations[i] = self.state[
                    self.agents[1 - self.agent_name_mapping[i]]
                ]
        else:
            # necessary so that observe() returns a reasonable observation at all times.
            self.state[self.agents[1 - self.agent_name_mapping[agent]]] = NONE
            # no rewards are allocated until both players give an action
            self._clear_rewards()

        # selects the next agent.
        self.agent_selection = self._agent_selector.next()
        # Adds .rewards to ._cumulative_rewards
        self._accumulate_rewards()

        if self.render_mode == "human":
            self.render()
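

# A minimal smoke-test loop using the standard PettingZoo AEC API. This is a
# sketch for local experimentation, not part of the environment itself.
if __name__ == "__main__":
    demo_env = env(render_mode="human")
    demo_env.reset(seed=42)
    for agent in demo_env.agent_iter():
        observation, reward, termination, truncation, info = demo_env.last()
        if termination or truncation:
            action = None
        else:
            # Random legal action; a real agent would choose based on observation.
            action = demo_env.action_space(agent).sample()
        demo_env.step(action)
    demo_env.close()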