
Commit

Merge pull request ikostrikov#2 from avivt/master
sync with avivt
EvZissel committed Jun 1, 2021
2 parents 5f9fdec + eabdc2c commit f2b656e
Showing 17 changed files with 454 additions and 15 deletions.
8 changes: 8 additions & 0 deletions a2c_ppo_acktr/algo/ppo.py
@@ -8,6 +8,7 @@
from grad_tools.noisygrad import NoisyGrad
from grad_tools.testgrad import TestGrad
from grad_tools.graddrop import GradDrop
from grad_tools.mediangrad import MedianGrad


class PPO():
@@ -26,6 +27,7 @@ def __init__(self,
use_pcgrad=False,
use_testgrad=False,
use_testgrad_median=False,
use_median_grad=False,
testgrad_quantile=-1,
use_noisygrad=False,
use_graddrop=False,
@@ -54,6 +56,7 @@ def __init__(self,
self.use_pcgrad = use_pcgrad
self.use_testgrad = use_testgrad
self.use_noisygrad = use_noisygrad
self.use_median_grad = use_median_grad
self.use_privacy = use_privacy
if use_pcgrad:
self.optimizer = PCGrad(self.optimizer)
@@ -77,6 +80,9 @@ def __init__(self,
self.optimizer = NoisyGrad(self.optimizer,
max_grad_norm=num_mini_batch * max_task_grad_norm,
noise_ratio=grad_noise_ratio)
if use_median_grad:
self.optimizer = MedianGrad(self.optimizer,
noise_ratio=grad_noise_ratio)
if use_privacy:
privacy_engine = PrivacyEngine(
actor_critic,
@@ -149,6 +155,8 @@ def update(self, rollouts):
self.optimizer.pc_backward(task_losses)
elif self.use_noisygrad:
self.optimizer.noisy_backward(task_losses)
elif self.use_median_grad:
self.optimizer.median_backward(task_losses)
else:
total_loss.backward()
nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
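The wrapper itself, grad_tools/mediangrad.py, is among the 17 changed files but its body is not shown on this page. A minimal sketch of what it could look like, inferred only from the call sites above (MedianGrad(optimizer, noise_ratio=...) and optimizer.median_backward(task_losses)); the internals here are assumptions, not the repository's actual implementation:

import torch

class MedianGrad:
    """Optimizer wrapper: steps along the element-wise median of per-task
    gradients plus Gaussian noise scaled by noise_ratio (behavior inferred
    from how ppo.py calls this class, not from the actual grad_tools code)."""

    def __init__(self, optimizer, noise_ratio=0.0):
        self._optim = optimizer
        self._noise_ratio = noise_ratio

    def _params(self):
        return [p for g in self._optim.param_groups for p in g['params']]

    def zero_grad(self):
        self._optim.zero_grad()

    def step(self):
        self._optim.step()

    def median_backward(self, task_losses):
        # One flattened gradient vector per task loss.
        flat_grads = []
        for loss in task_losses:
            self._optim.zero_grad()
            loss.backward(retain_graph=True)
            flat_grads.append(torch.cat([
                p.grad.detach().flatten() if p.grad is not None
                else torch.zeros(p.numel(), device=p.device)
                for p in self._params()]))
        # Element-wise median across tasks, then additive noise.
        combined = torch.stack(flat_grads).median(dim=0).values
        combined = combined + self._noise_ratio * torch.randn_like(combined)
        # Unflatten the combined gradient back into p.grad for step().
        offset = 0
        for p in self._params():
            n = p.numel()
            p.grad = combined[offset:offset + n].view_as(p).clone()
            offset += n

Under this reading, the element-wise median makes the update robust to a minority of outlier task gradients, and the additive noise term is what the --grad_noise_ratio flag (used in the scripts below) would scale.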
5 changes: 5 additions & 0 deletions a2c_ppo_acktr/arguments.py
@@ -169,6 +169,11 @@ def get_args():
action='store_true',
default=False,
help='use testgrad with median gradient instead of mean in ppo')
parser.add_argument(
'--use_median_grad',
action='store_true',
default=False,
help='use median gradient + noise instead of mean in ppo')
parser.add_argument(
'--use_noisygrad',
action='store_true',
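A quick, hypothetical sanity check that the new flag parses (assuming get_args() supplies defaults for every argument not given on the command line):

import sys
from a2c_ppo_acktr.arguments import get_args

# --grad_noise_ratio already exists in the repo (see the scripts below);
# both flags are passed here the same way the scripts pass them.
sys.argv = ['main.py', '--algo', 'ppo',
            '--use_median_grad', '--grad_noise_ratio', '2.0']
args = get_args()
print(args.use_median_grad, args.grad_noise_ratio)  # expected: True 2.0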
20 changes: 7 additions & 13 deletions a2c_ppo_acktr/scripts/script_rec36.sh
@@ -1,25 +1,19 @@
python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 1 &

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 1 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 2 &

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 2 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 3 &

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 3 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 4 &

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 4 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 5 &

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 5 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 6 &

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 6 &
wait

echo "graddrop obs recurrent 10 arms and free exploration"
echo "recurrent 25 arms and free exploration, different hyperparams"
25 changes: 25 additions & 0 deletions a2c_ppo_acktr/scripts/script_rec37.sh
@@ -0,0 +1,25 @@
python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 1 &

sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 2 &

sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 3 &

sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 4 &

sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 5 &

sleep 3

python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 6 &

wait

echo "graddrop obs recurrent 10 arms and free exploration"
19 changes: 19 additions & 0 deletions a2c_ppo_acktr/scripts/script_rec38.sh
@@ -0,0 +1,19 @@
python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 1 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 2 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 3 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 4 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 5 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 6 &
wait

echo "recurrent 25 arms and free exploration, different hyperparams"
20 changes: 20 additions & 0 deletions a2c_ppo_acktr/scripts/script_rec39.sh
@@ -0,0 +1,20 @@
python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 1.0 --free_exploration 6 --seed 1 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 1.0 --free_exploration 6 --seed 2 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 3 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 4 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 5 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 6 &
wait

echo "obs recurrent 25 arms and free exploration, different hyperparams"
echo "seed 4 works!!! Seed 4 Iter 2300 five_arms 18.0 ten_arms 18.5 many_arms 14.36"
20 changes: 20 additions & 0 deletions a2c_ppo_acktr/scripts/script_rec40.sh
@@ -0,0 +1,20 @@
python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.5 --free_exploration 6 --seed 1 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.5 --free_exploration 6 --seed 2 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.5 --free_exploration 6 --seed 3 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.5 --free_exploration 6 --seed 4 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.0 --free_exploration 6 --seed 5 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.0 --free_exploration 6 --seed 6 &
wait

echo "obs recurrent 25 arms and free exploration, median gradient, different hyperparams"
echo "so far not good"
21 changes: 21 additions & 0 deletions a2c_ppo_acktr/scripts/script_rec41.sh
@@ -0,0 +1,21 @@
python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 1 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 2 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 3 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 4 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 5 &
sleep 3

python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 6 &
wait

echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8"
echo "a previous seed gave surprising results, checking if this is consistent"
echo "2 seeds out of 6 are good (13,14)"
