#!/usr/bin/env python3
from __future__ import division, print_function
import sys
import time

import gym
import numpy as np
from optparse import OptionParser

import gym_minigrid  # importing this registers the MiniGrid environments with gym
import expert
import inverse_agent
import hirl

from tempfile import TemporaryFile
import ipdb
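
# This script runs in one of two hard-coded modes (MODE below):
#   "expert"  -- train an expert agent (expert.ExpertClass) on the MiniGrid
#                capture-the-flag task and save successful trajectories to
#                expert_traj.npy;
#   "inverse" -- load those trajectories and run MaxEnt IRL
#                (hirl.HInverseAgentClass), first on the full trajectories and
#                then on the segment after the discovered subgoal.
# The descriptions of the expert/hirl interfaces here are inferred from how
# they are called in main(), not from separate documentation.
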
def main():
    MODE = "inverse"
    risk_mode = False

    parser = OptionParser()
    parser.add_option(
        "-e",
        "--env-name",
        dest="env_name",
        help="gym environment to load",
        default='MiniGrid-CaptureTheFlag-Static-v0',
        #default='MiniGrid-LockedRoom-v0',
    )
    (options, args) = parser.parse_args()
    # trajectory data parameters
    tau_num = 1000  # number of trajectories
    tau_len = 100   # length of each trajectory

    # Load the gym environments
    test_env_name = 'MiniGrid-CaptureTheFlag-Test-v0'
    test_env = gym.make(test_env_name)
    env = gym.make(options.env_name)
    env.maxSteps = tau_len  # maximum time for an episode = length of our trajectory
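    # NOTE: the attribute name for the episode cap differs across gym_minigrid
    # versions (maxSteps in early releases, max_steps later); this assumes the
    # spelling used by the version this code was written against.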
    if MODE == "expert":
        # Load expert agent
        q_expert = expert.ExpertClass(env, tau_num, tau_len)

        # training
        for episode in range(25000):
            q_expert.reset(env, True)
            for t in range(tau_len - 1):
                done, r = q_expert.update(env, episode, False)
                if done:
                    q_expert.reset(env, True)
                    break
                #if episode % 1000 == 0:
                #    env.render('human')
                #    time.sleep(0.05)
            if episode % 1000 == 0:
                print('Training expert episode:', episode)
                q_expert.reset(env, False)

        # testing (store successful expert trajectories)
        success_episode = 0
        while success_episode < tau_num:
            q_expert.reset(env, False)
            for t in range(tau_len - 1):
                done, r = q_expert.update(env, episode, True)
                if r:  # if main goal reached
                    success_episode += 1
                if done:  # if episode done
                    q_expert.reset(env, False)
                    break
                #env.render('human')
                #time.sleep(0.05)

        ## get trajectories
        TAU = q_expert.get_tau()
        np.save('expert_traj.npy', TAU)
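        # The layout of TAU is whatever expert.get_tau() returns; judging from
        # how it is indexed in the "inverse" branch (TAU[:, :, 0:100], TAU[0],
        # TAU[1]), it seems to hold state and action sequences with time on the
        # last axis, but that is an inference from usage, not a documented contract.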
    elif MODE == "inverse":
        print("inverse mode")

        # load expert trajectories
        TAU = np.load('expert_traj.npy')
        TAU = TAU[:, :, 0:100]

        # load inverse RL agent
        maxent_learner = hirl.HInverseAgentClass(env, test_env, tau_num, tau_len, risk_mode=risk_mode)
        #maxent_learner = inverse_agent.InverseAgentClass(env, test_env, tau_num, tau_len, risk_mode=risk_mode)

        ## inverse RL mode: learn MaxEnt IRL from trajectories
        maxent_learner.store_trajectories(TAU)
        subgoal = maxent_learner.update(env, PRINT=True)

        ############
        ## split trajectories at the subgoal and keep the latter part
        TAU_SPLIT = (np.split(TAU[0], [int(subgoal)])[1], np.split(TAU[1], [int(subgoal)])[1])

        # load a second inverse RL agent for the remaining segment
        #maxent_learner = inverse_agent.InverseAgentClass(env, test_env, tau_num, tau_len, risk_mode=risk_mode)
        maxent_learner = hirl.HInverseAgentClass(env, test_env, tau_num, tau_len, risk_mode=risk_mode)

        ## inverse RL mode: learn MaxEnt IRL from trajectories
        maxent_learner.store_trajectories(TAU_SPLIT)
        subgoal = maxent_learner.update(env, PRINT=True)
if __name__ == "__main__":
    main()
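
# Usage sketch (the module imports above assume this file sits next to
# expert.py, inverse_agent.py and hirl.py in the repo):
#   1. Set MODE = "expert" in main() and run
#        python standalone.py -e MiniGrid-CaptureTheFlag-Static-v0
#      to train the expert and write expert_traj.npy.
#   2. Restore MODE = "inverse" (the default) and rerun the same command to
#      perform MaxEnt IRL on the saved trajectories.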