From be39480fc3f992529e82fc0c04e6eca62a4195aa Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 08:41:02 +0300 Subject: [PATCH 01/15] more envs --- a2c_ppo_acktr/scripts/script_rec36.sh | 20 +++++++------------- a2c_ppo_acktr/scripts/script_rec37.sh | 25 +++++++++++++++++++++++++ grad_tools/noisygrad.py | 11 +++++++++-- 3 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 a2c_ppo_acktr/scripts/script_rec37.sh diff --git a/a2c_ppo_acktr/scripts/script_rec36.sh b/a2c_ppo_acktr/scripts/script_rec36.sh index fe3bea12f..cb5b2c1d2 100644 --- a/a2c_ppo_acktr/scripts/script_rec36.sh +++ b/a2c_ppo_acktr/scripts/script_rec36.sh @@ -1,25 +1,19 @@ -python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 1 & - +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 1 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 2 & - +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 2 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 3 & - +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 3 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 4 & - +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 
16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 4 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 5 & - +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 5 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 6 & - +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 6 & wait -echo "graddrop obs recurrent 10 arms and free exploration" +echo "recurrent 25 arms and free exploration, different hyperparams" diff --git a/a2c_ppo_acktr/scripts/script_rec37.sh b/a2c_ppo_acktr/scripts/script_rec37.sh new file mode 100644 index 000000000..fe3bea12f --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec37.sh @@ -0,0 +1,25 @@ +python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 1 & + +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 2 & + +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 3 & + +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 4 & + +sleep 3 + +python3 main.py --env-name 
"h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 5 & + +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v5" --algo ppo --log-interval 25 --num-steps 100 --num-processes 10 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 1 --num-mini-batch 10 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 5000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --free_exploration 6 --use_graddrop --seed 6 & + +wait + +echo "graddrop obs recurrent 10 arms and free exploration" diff --git a/grad_tools/noisygrad.py b/grad_tools/noisygrad.py index 0202d655d..458cfe0f1 100644 --- a/grad_tools/noisygrad.py +++ b/grad_tools/noisygrad.py @@ -9,10 +9,11 @@ class NoisyGrad(): - def __init__(self, optimizer, max_grad_norm=1.0, noise_ratio=1.0): + def __init__(self, optimizer, max_grad_norm=1.0, noise_ratio=1.0, use_median=False): self._optim = optimizer self._max_grad_norm = max_grad_norm self._noise_ratio = noise_ratio + self._use_median = use_median return @property @@ -59,8 +60,14 @@ def _clip_and_add_noise(self, grads, has_grads, shapes=None): g_i *= self._max_grad_norm / g_i.norm() g_i += torch.normal(torch.zeros_like(grads[0]), noise_std) merged_grad = torch.zeros_like(grads[0]).to(grads[0].device) - merged_grad[shared] = torch.stack([g[shared] + if self._use_median: + merged_grad[shared] = torch.median(torch.stack([g[shared] + for g in pc_grad]), dim=0)[0] + else: + merged_grad[shared] = torch.stack([g[shared] for g in pc_grad]).mean(dim=0) + if noise_std > 0: + merged_grad += torch.normal(torch.zeros_like(grads[0]), noise_std) merged_grad[~shared] = torch.stack([g[~shared] for g in pc_grad]).sum(dim=0) return merged_grad From fe881d5063ea8e7a67412b102bc4038d7a53c065 Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 10:43:25 +0300 Subject: [PATCH 02/15] more envs --- a2c_ppo_acktr/scripts/script_rec38.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 a2c_ppo_acktr/scripts/script_rec38.sh diff --git a/a2c_ppo_acktr/scripts/script_rec38.sh b/a2c_ppo_acktr/scripts/script_rec38.sh new file mode 100644 index 000000000..aa5933e1c --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec38.sh @@ -0,0 +1,19 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 1 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 2 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log 
--task_steps=20 --recurrent-policy --free_exploration 6 --seed 3 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 4 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 5 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 6 & +wait + +echo "recurrent 25 arms and free exploration, different hyperparams" From 4a78183ec69c575e59d13da89bcabe1250eda399 Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 10:44:21 +0300 Subject: [PATCH 03/15] more envs --- a2c_ppo_acktr/scripts/script_rec38.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/a2c_ppo_acktr/scripts/script_rec38.sh b/a2c_ppo_acktr/scripts/script_rec38.sh index aa5933e1c..0c438ba34 100644 --- a/a2c_ppo_acktr/scripts/script_rec38.sh +++ b/a2c_ppo_acktr/scripts/script_rec38.sh @@ -1,19 +1,19 @@ -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 1 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 1 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 2 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 5 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 2 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 
--recurrent-policy --free_exploration 6 --seed 3 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 3 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 4 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 3e-4 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 7 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 4 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 5 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 5 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 16000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 6 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --recurrent-policy --free_exploration 6 --seed 6 & wait echo "recurrent 25 arms and free exploration, different hyperparams" From 0bac75b839a84a7e4c251114df93fa7863ccfaac Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 15:12:49 +0300 Subject: [PATCH 04/15] more envs --- a2c_ppo_acktr/scripts/script_rec39.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 a2c_ppo_acktr/scripts/script_rec39.sh diff --git a/a2c_ppo_acktr/scripts/script_rec39.sh b/a2c_ppo_acktr/scripts/script_rec39.sh new file mode 100644 index 000000000..2a2ef72cd --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec39.sh @@ -0,0 +1,19 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent 
--use_testgrad --testgrad_beta 1.0 --free_exploration 6 --seed 1 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 1.0 --free_exploration 6 --seed 2 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 3 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 4 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 5 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 6 & +wait + +echo "recurrent 25 arms and free exploration, different hyperparams" From c4765a1199a14d9a352bb4505a1ea9e905384ae3 Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 21:22:45 +0300 Subject: [PATCH 05/15] median gradient --- a2c_ppo_acktr/algo/ppo.py | 8 ++ a2c_ppo_acktr/arguments.py | 5 + grad_tools/mediangrad.py | 195 +++++++++++++++++++++++++++++++++++++ main.py | 5 + 4 files changed, 213 insertions(+) create mode 100644 grad_tools/mediangrad.py diff --git a/a2c_ppo_acktr/algo/ppo.py b/a2c_ppo_acktr/algo/ppo.py index 70883289f..53a6a8a89 100644 --- a/a2c_ppo_acktr/algo/ppo.py +++ b/a2c_ppo_acktr/algo/ppo.py @@ -8,6 +8,7 @@ from grad_tools.noisygrad import NoisyGrad from grad_tools.testgrad import TestGrad from grad_tools.graddrop import GradDrop +from grad_tools.mediangrad import MedianGrad class PPO(): @@ -26,6 +27,7 @@ def __init__(self, use_pcgrad=False, use_testgrad=False, use_testgrad_median=False, + use_median_grad=False, testgrad_quantile=-1, use_noisygrad=False, use_graddrop=False, @@ -54,6 +56,7 @@ def __init__(self, self.use_pcgrad = use_pcgrad self.use_testgrad = use_testgrad self.use_noisygrad = use_noisygrad + self.use_median_grad = use_median_grad self.use_privacy = use_privacy if use_pcgrad: self.optimizer = PCGrad(self.optimizer) @@ -77,6 +80,9 @@ def __init__(self, self.optimizer = NoisyGrad(self.optimizer, max_grad_norm=num_mini_batch * max_task_grad_norm, noise_ratio=grad_noise_ratio) + if 
use_median_grad:
+            self.optimizer = MedianGrad(self.optimizer,
+                                        noise_ratio=grad_noise_ratio)
         if use_privacy:
             privacy_engine = PrivacyEngine(
                 actor_critic,
@@ -149,6 +155,8 @@ def update(self, rollouts):
                     self.optimizer.pc_backward(task_losses)
                 elif self.use_noisygrad:
                     self.optimizer.noisy_backward(task_losses)
+                elif self.use_median_grad:
+                    self.optimizer.median_backward(task_losses)
                 else:
                     total_loss.backward()
                     nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
diff --git a/a2c_ppo_acktr/arguments.py b/a2c_ppo_acktr/arguments.py
index 2d01a3e35..4db32898d 100644
--- a/a2c_ppo_acktr/arguments.py
+++ b/a2c_ppo_acktr/arguments.py
@@ -169,6 +169,11 @@ def get_args():
         action='store_true',
         default=False,
         help='use testgrad with median gradient instead of mean in ppo')
+    parser.add_argument(
+        '--use_median_grad',
+        action='store_true',
+        default=False,
+        help='use median gradient + noise instead of mean in ppo')
     parser.add_argument(
         '--use_noisygrad',
         action='store_true',
diff --git a/grad_tools/mediangrad.py b/grad_tools/mediangrad.py
new file mode 100644
index 000000000..b18e04a78
--- /dev/null
+++ b/grad_tools/mediangrad.py
@@ -0,0 +1,195 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import pdb
+import numpy as np
+import copy
+import random
+
+
+class MedianGrad():
+    def __init__(self, optimizer, noise_ratio=1.0):
+        self._optim = optimizer
+        self._noise_ratio = noise_ratio
+        return
+
+    @property
+    def optimizer(self):
+        return self._optim
+
+    def zero_grad(self):
+        '''
+        clear the gradient of the parameters
+        '''
+
+        return self._optim.zero_grad(set_to_none=True)
+
+    def step(self):
+        '''
+        update the parameters with the gradient
+        '''
+
+        return self._optim.step()
+
+    def median_backward(self, objectives):
+        '''
+        calculate the gradient of the parameters
+
+        input:
+        - objectives: a list of objectives
+        '''
+
+        grads, shapes, has_grads = self._pack_grad(objectives)
+        noisy_grad = self._clip_and_add_noise(grads, has_grads)
+        noisy_grad = self._unflatten_grad(noisy_grad, shapes[0])
+        self._set_grad(noisy_grad)
+        return
+
+    def _clip_and_add_noise(self, grads, has_grads, shapes=None):
+        shared = torch.stack(has_grads).prod(0).bool()
+        pc_grad, num_task = copy.deepcopy(grads), len(grads)
+
+        grad_norms = []
+        noise_std = np.minimum(np.max(np.array(grad_norms)), self._max_grad_norm) * self._noise_ratio
+
+        merged_grad = torch.zeros_like(grads[0]).to(grads[0].device)
+        stacked_grads = torch.stack([g[shared] for g in grads])
+        merged_grad[shared] = torch.median(stacked_grads, dim=0)[0]
+
+        u = torch.rand(merged_grad.shape)
+        top_quantile = np.minimum((num_task + 1) / (2 * num_task), 1.0)
+        bottom_quantile = np.maximum((num_task - 1) / (2 * num_task), 0.0)
+        noise_max = torch.quantile(stacked_grads.abs(), top_quantile, dim=0) - merged_grad
+        noise_min = merged_grad - torch.quantile(stacked_grads.abs(), bottom_quantile, dim=0)
+        noise = (u * (noise_max - noise_min) + noise_min) * self._noise_ratio
+        merged_grad += noise
+
+        merged_grad[~shared] = torch.stack([g[~shared]
+                                            for g in pc_grad]).sum(dim=0)
+        return merged_grad
+
+    def _set_grad(self, grads):
+        '''
+        set the modified gradients to the network
+        '''
+
+        idx = 0
+        for group in self._optim.param_groups:
+            for p in group['params']:
+                # if p.grad is None: continue
+                p.grad = grads[idx]
+                idx += 1
+        return
+
+    def _pack_grad(self, objectives):
+        '''
+        pack the gradient of the parameters of the network for each objective
+
+        output:
+        - grad: a list of the gradient of the parameters
+        - shape: a list of the shape of the parameters
+        - has_grad: a list of mask represent whether the parameter has gradient
+        '''
+
+        grads, shapes, has_grads = [], [], []
+        for obj in objectives:
+            self._optim.zero_grad(set_to_none=True)
+            obj.backward(retain_graph=True)
+            grad, shape, has_grad = self._retrieve_grad()
+            grads.append(self._flatten_grad(grad, shape))
+            has_grads.append(self._flatten_grad(has_grad, shape))
+            shapes.append(shape)
+        return grads, shapes, has_grads
+
+    def _unflatten_grad(self, grads, shapes):
+        unflatten_grad, idx = [], 0
+        for shape in shapes:
+            length = np.prod(shape)
+            unflatten_grad.append(grads[idx:idx + length].view(shape).clone())
+            idx += length
+        return unflatten_grad
+
+    def _flatten_grad(self, grads, shapes):
+        flatten_grad = torch.cat([g.flatten() for g in grads])
+        return flatten_grad
+
+    def _retrieve_grad(self):
+        '''
+        get the gradient of the parameters of the network with specific
+        objective
+
+        output:
+        - grad: a list of the gradient of the parameters
+        - shape: a list of the shape of the parameters
+        - has_grad: a list of mask represent whether the parameter has gradient
+        '''
+
+        grad, shape, has_grad = [], [], []
+        for group in self._optim.param_groups:
+            for p in group['params']:
+                # if p.grad is None: continue
+                # tackle the multi-head scenario
+                if p.grad is None:
+                    shape.append(p.shape)
+                    grad.append(torch.zeros_like(p).to(p.device))
+                    has_grad.append(torch.zeros_like(p).to(p.device))
+                    continue
+                shape.append(p.grad.shape)
+                grad.append(p.grad.clone())
+                has_grad.append(torch.ones_like(p).to(p.device))
+        return grad, shape, has_grad
+
+
+class TestNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._linear = nn.Linear(3, 4)
+
+    def forward(self, x):
+        return self._linear(x)
+
+
+class MultiHeadTestNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._linear = nn.Linear(3, 2)
+        self._head1 = nn.Linear(2, 4)
+        self._head2 = nn.Linear(2, 4)
+
+    def forward(self, x):
+        feat = self._linear(x)
+        return self._head1(feat), self._head2(feat)
+
+
+if __name__ == '__main__':
+
+    # fully shared network test
+    torch.manual_seed(4)
+    x, y = torch.randn(2, 3), torch.randn(2, 4)
+    net = TestNet()
+    y_pred = net(x)
+    median_adam = MedianGrad(optim.Adam(net.parameters()))
+    median_adam.zero_grad()
+    loss1_fn, loss2_fn = nn.L1Loss(), nn.MSELoss()
+    loss1, loss2 = loss1_fn(y_pred, y), loss2_fn(y_pred, y)
+
+    median_adam.median_backward([loss1, loss2])
+    for p in net.parameters():
+        print(p.grad)
+
+    print('-' * 80)
+    # separated shared network test
+
+    torch.manual_seed(4)
+    x, y = torch.randn(2, 3), torch.randn(2, 4)
+    net = MultiHeadTestNet()
+    y_pred_1, y_pred_2 = net(x)
+    median_adam = MedianGrad(optim.Adam(net.parameters()))
+    median_adam.zero_grad()
+    loss1_fn, loss2_fn = nn.MSELoss(), nn.MSELoss()
+    loss1, loss2 = loss1_fn(y_pred_1, y), loss2_fn(y_pred_2, y)
+
+    median_adam.median_backward([loss1, loss2])
+    for p in net.parameters():
+        print(p.grad)
diff --git a/main.py b/main.py
index d4c5c337d..bbb68e7c9 100755
--- a/main.py
+++ b/main.py
@@ -57,6 +57,8 @@ def main():
         logdir = logdir + '_pcgrad'
     elif args.use_testgrad:
         logdir = logdir + '_testgrad'
+    elif args.use_median_grad:
+        logdir = logdir + '_mediangrad'
     logdir = os.path.join('runs', logdir)
     logdir = os.path.join(os.path.expanduser(args.log_dir), logdir)
     utils.cleanup_log_dir(logdir)
@@ -71,6 +73,7 @@ def main():
         'use_testgrad': args.use_testgrad,
         'use_testgrad_median': args.use_testgrad_median,
         'testgrad_quantile': args.testgrad_quantile,
+        'median_grad': 
args.use_median_grad, 'use_privacy': args.use_privacy, 'seed': args.seed, 'recurrent': args.recurrent_policy, @@ -89,6 +92,7 @@ def main(): 'use_testgrad': args.use_testgrad, 'use_testgrad_median': args.use_testgrad_median, 'testgrad_quantile': args.testgrad_quantile, + 'median_grad': args.use_median_grad, 'use_privacy': args.use_privacy, 'seed': args.seed, 'recurrent': args.recurrent_policy, @@ -143,6 +147,7 @@ def main(): use_testgrad_median=args.use_testgrad_median, testgrad_quantile=args.testgrad_quantile, use_privacy=args.use_privacy, + use_median_grad=args.use_median_grad, max_task_grad_norm=args.max_task_grad_norm, grad_noise_ratio=args.grad_noise_ratio, testgrad_alpha=args.testgrad_alpha, From cc8fec04ea8cb4ec4fd3e8951b6f9020d0c5743c Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 21:26:05 +0300 Subject: [PATCH 06/15] median gradient --- a2c_ppo_acktr/scripts/script_rec40.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 a2c_ppo_acktr/scripts/script_rec40.sh diff --git a/a2c_ppo_acktr/scripts/script_rec40.sh b/a2c_ppo_acktr/scripts/script_rec40.sh new file mode 100644 index 000000000..87af7a94c --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec40.sh @@ -0,0 +1,19 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.0 --free_exploration 6 --seed 1 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.0 --free_exploration 6 --seed 2 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.5 --free_exploration 6 --seed 3 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.5 --free_exploration 6 --seed 4 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.0 --free_exploration 6 --seed 5 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 
6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.0 --free_exploration 6 --seed 6 & +wait + +echo "obs recurrent 25 arms and free exploration, median gradient, different hyperparams" From 542334c049edc955cc8006e615ef65ef03efe988 Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 21:34:08 +0300 Subject: [PATCH 07/15] median gradient --- a2c_ppo_acktr/scripts/script_rec39.sh | 3 ++- grad_tools/mediangrad.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/a2c_ppo_acktr/scripts/script_rec39.sh b/a2c_ppo_acktr/scripts/script_rec39.sh index 2a2ef72cd..be8c3630f 100644 --- a/a2c_ppo_acktr/scripts/script_rec39.sh +++ b/a2c_ppo_acktr/scripts/script_rec39.sh @@ -16,4 +16,5 @@ sleep 3 python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 6 & wait -echo "recurrent 25 arms and free exploration, different hyperparams" +echo "obs recurrent 25 arms and free exploration, different hyperparams" +echo "seed 4 works!!! Seed 4 Iter 2300 five_arms 18.0 ten_arms 18.5 many_arms 14.36" \ No newline at end of file diff --git a/grad_tools/mediangrad.py b/grad_tools/mediangrad.py index b18e04a78..f7e7e4a42 100644 --- a/grad_tools/mediangrad.py +++ b/grad_tools/mediangrad.py @@ -50,9 +50,6 @@ def _clip_and_add_noise(self, grads, has_grads, shapes=None): shared = torch.stack(has_grads).prod(0).bool() pc_grad, num_task = copy.deepcopy(grads), len(grads) - grad_norms = [] - noise_std = np.minimum(np.max(np.array(grad_norms)), self._max_grad_norm) * self._noise_ratio - merged_grad = torch.zeros_like(grads[0]).to(grads[0].device) stacked_grads = torch.stack([g[shared] for g in grads]) merged_grad[shared] = torch.median(stacked_grads, dim=0)[0] From 777b5e6f786b0a6554cee0837045f3cdc643ae78 Mon Sep 17 00:00:00 2001 From: avivt Date: Thu, 27 May 2021 22:13:49 +0300 Subject: [PATCH 08/15] median gradient --- a2c_ppo_acktr/scripts/script_rec41.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 a2c_ppo_acktr/scripts/script_rec41.sh diff --git a/a2c_ppo_acktr/scripts/script_rec41.sh b/a2c_ppo_acktr/scripts/script_rec41.sh new file mode 100644 index 000000000..298d6b05c --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec41.sh @@ -0,0 +1,20 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 1 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 2 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 
--entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 3 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 4 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 5 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 6 & +wait + +echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" +echo "a previous seed gave surprising results, checking if this is consistent" \ No newline at end of file From 1aaa610160b7fc834af7be0d811694b50b581143 Mon Sep 17 00:00:00 2001 From: avivt Date: Fri, 28 May 2021 07:53:10 +0300 Subject: [PATCH 09/15] median gradient --- a2c_ppo_acktr/scripts/script_rec42.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 a2c_ppo_acktr/scripts/script_rec42.sh diff --git a/a2c_ppo_acktr/scripts/script_rec42.sh b/a2c_ppo_acktr/scripts/script_rec42.sh new file mode 100644 index 000000000..2f8e9f293 --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec42.sh @@ -0,0 +1,20 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_tesgrad_median --free_exploration 6 --seed 1 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_tesgrad_median --free_exploration 6 --seed 2 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_tesgrad_median --free_exploration 6 --seed 3 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo 
ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_tesgrad_median --free_exploration 6 --seed 4 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_tesgrad_median --free_exploration 6 --seed 5 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_tesgrad_median --free_exploration 6 --seed 6 & +wait + +echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" +echo "a previous seed gave surprising results, checking if this is consistent" \ No newline at end of file From 799d99322b70a219f3600668cc5d606bb8387587 Mon Sep 17 00:00:00 2001 From: avivt Date: Fri, 28 May 2021 07:53:52 +0300 Subject: [PATCH 10/15] median gradient --- a2c_ppo_acktr/scripts/script_rec42.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/a2c_ppo_acktr/scripts/script_rec42.sh b/a2c_ppo_acktr/scripts/script_rec42.sh index 2f8e9f293..7647344db 100644 --- a/a2c_ppo_acktr/scripts/script_rec42.sh +++ b/a2c_ppo_acktr/scripts/script_rec42.sh @@ -1,19 +1,19 @@ -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_tesgrad_median --free_exploration 6 --seed 1 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_testgrad_median --free_exploration 6 --seed 1 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_tesgrad_median --free_exploration 6 --seed 2 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_testgrad_median 
--free_exploration 6 --seed 2 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_tesgrad_median --free_exploration 6 --seed 3 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_testgrad_median --free_exploration 6 --seed 3 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_tesgrad_median --free_exploration 6 --seed 4 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_testgrad_median --free_exploration 6 --seed 4 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_tesgrad_median --free_exploration 6 --seed 5 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_testgrad_median --free_exploration 6 --seed 5 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_tesgrad_median --free_exploration 6 --seed 6 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --use_testgrad_median --free_exploration 6 --seed 6 & wait echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" From f1962f1b4b8ea21de2118f6d38df9c57efd5ad17 Mon Sep 17 00:00:00 2001 From: 
avivt Date: Fri, 28 May 2021 08:23:52 +0300 Subject: [PATCH 11/15] median gradient --- a2c_ppo_acktr/scripts/script_rec43.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 a2c_ppo_acktr/scripts/script_rec43.sh diff --git a/a2c_ppo_acktr/scripts/script_rec43.sh b/a2c_ppo_acktr/scripts/script_rec43.sh new file mode 100644 index 000000000..addbfa3c3 --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec43.sh @@ -0,0 +1,20 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 1 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 2 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 3 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 4 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 5 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.5 --free_exploration 6 --seed 6 & +wait + +echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" +echo "a previous seed gave surprising results, checking if this is consistent" \ No newline at end of file From ddd8d9af592ec8bfc0727ca470f85fa3d3ef890e Mon Sep 17 00:00:00 2001 From: avivt Date: Fri, 28 May 2021 11:37:58 +0300 Subject: [PATCH 12/15] median gradient --- a2c_ppo_acktr/scripts/script_rec41.sh | 3 ++- a2c_ppo_acktr/scripts/script_rec42.sh | 3 ++- a2c_ppo_acktr/scripts/script_rec43.sh | 3 ++- a2c_ppo_acktr/scripts/script_rec44.sh | 21 +++++++++++++++++++++ 4 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 a2c_ppo_acktr/scripts/script_rec44.sh diff 
--git a/a2c_ppo_acktr/scripts/script_rec41.sh b/a2c_ppo_acktr/scripts/script_rec41.sh index 298d6b05c..19eb4e3ba 100644 --- a/a2c_ppo_acktr/scripts/script_rec41.sh +++ b/a2c_ppo_acktr/scripts/script_rec41.sh @@ -17,4 +17,5 @@ python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 2 wait echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" -echo "a previous seed gave surprising results, checking if this is consistent" \ No newline at end of file +echo "a previous seed gave surprising results, checking if this is consistent" +echo "2 seeds out of 6 are good (13,14)" \ No newline at end of file diff --git a/a2c_ppo_acktr/scripts/script_rec42.sh b/a2c_ppo_acktr/scripts/script_rec42.sh index 7647344db..2dfddbcda 100644 --- a/a2c_ppo_acktr/scripts/script_rec42.sh +++ b/a2c_ppo_acktr/scripts/script_rec42.sh @@ -17,4 +17,5 @@ python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 2 wait echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" -echo "a previous seed gave surprising results, checking if this is consistent" \ No newline at end of file +echo "a previous seed gave surprising results, checking if this is consistent" +echo "no good results" \ No newline at end of file diff --git a/a2c_ppo_acktr/scripts/script_rec43.sh b/a2c_ppo_acktr/scripts/script_rec43.sh index addbfa3c3..06a3bc2fc 100644 --- a/a2c_ppo_acktr/scripts/script_rec43.sh +++ b/a2c_ppo_acktr/scripts/script_rec43.sh @@ -17,4 +17,5 @@ python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 2 wait echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" -echo "a previous seed gave surprising results, checking if this is consistent" \ No newline at end of file +echo "a previous seed gave surprising results, checking if this is consistent" +echo "no good results" \ No newline at end of file diff --git a/a2c_ppo_acktr/scripts/script_rec44.sh b/a2c_ppo_acktr/scripts/script_rec44.sh new file mode 100644 index 000000000..8f40829e7 --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec44.sh @@ -0,0 +1,21 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 11 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 12 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 13 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 
0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 14 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 15 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --free_exploration 6 --seed 16 & +wait + +echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.8" +echo "a previous seed gave surprising results, checking if this is consistent" +echo "2 seeds out of 6 are good (13,14)" \ No newline at end of file From 53a5acc4643554019573cbd3fda645103d9f4860 Mon Sep 17 00:00:00 2001 From: avivt Date: Fri, 28 May 2021 17:24:00 +0300 Subject: [PATCH 13/15] median gradient --- a2c_ppo_acktr/scripts/script_rec40.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/a2c_ppo_acktr/scripts/script_rec40.sh b/a2c_ppo_acktr/scripts/script_rec40.sh index 87af7a94c..ee9461222 100644 --- a/a2c_ppo_acktr/scripts/script_rec40.sh +++ b/a2c_ppo_acktr/scripts/script_rec40.sh @@ -1,19 +1,19 @@ -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.0 --free_exploration 6 --seed 1 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.5 --free_exploration 6 --seed 1 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.0 --free_exploration 6 --seed 2 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 1.5 --free_exploration 6 --seed 2 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 
--gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.5 --free_exploration 6 --seed 3 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.5 --free_exploration 6 --seed 3 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.5 --free_exploration 6 --seed 4 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.5 --free_exploration 6 --seed 4 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.0 --free_exploration 6 --seed 5 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.0 --free_exploration 6 --seed 5 & sleep 3 -python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 0.0 --free_exploration 6 --seed 6 & +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_median_grad --grad_noise_ratio 2.0 --free_exploration 6 --seed 6 & wait echo "obs recurrent 25 arms and free exploration, median gradient, different hyperparams" From bb02d6a2f978f0416d1efa56b5f3cd28917bd6e5 Mon Sep 17 00:00:00 2001 From: avivt Date: Fri, 28 May 2021 22:23:38 +0300 Subject: [PATCH 14/15] median gradient --- a2c_ppo_acktr/scripts/script_rec40.sh | 1 + a2c_ppo_acktr/scripts/script_rec45.sh | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 a2c_ppo_acktr/scripts/script_rec45.sh diff --git a/a2c_ppo_acktr/scripts/script_rec40.sh 
b/a2c_ppo_acktr/scripts/script_rec40.sh index ee9461222..2b875bc7c 100644 --- a/a2c_ppo_acktr/scripts/script_rec40.sh +++ b/a2c_ppo_acktr/scripts/script_rec40.sh @@ -17,3 +17,4 @@ python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 2 wait echo "obs recurrent 25 arms and free exploration, median gradient, different hyperparams" +echo "so far not good" diff --git a/a2c_ppo_acktr/scripts/script_rec45.sh b/a2c_ppo_acktr/scripts/script_rec45.sh new file mode 100644 index 000000000..b08408a07 --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec45.sh @@ -0,0 +1,19 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.8 --use_testgrad_median --free_exploration 6 --seed 11 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.6 --use_testgrad_median --free_exploration 6 --seed 12 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.4 --use_testgrad_median --free_exploration 6 --seed 13 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.4 --use_testgrad_median --free_exploration 6 --seed 14 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.2 --use_testgrad_median --free_exploration 6 --seed 15 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.2 --use_testgrad_median --free_exploration 6 --seed 16 & +wait + +echo "obs recurrent 25 arms and free exploration, testgrad_" From eabdc2cff0f70892709e53c1878ce9d36dbf9367 Mon Sep 17 00:00:00 2001 From: avivt Date: Sat, 29 May 2021 08:48:06 +0300 Subject: [PATCH 15/15] scripts --- a2c_ppo_acktr/scripts/script_rec45.sh | 3 ++- a2c_ppo_acktr/scripts/script_rec46.sh | 20 ++++++++++++++++++++ a2c_ppo_acktr/scripts/script_rec47.sh | 20 
++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 a2c_ppo_acktr/scripts/script_rec46.sh create mode 100644 a2c_ppo_acktr/scripts/script_rec47.sh diff --git a/a2c_ppo_acktr/scripts/script_rec45.sh b/a2c_ppo_acktr/scripts/script_rec45.sh index b08408a07..2f18a5408 100644 --- a/a2c_ppo_acktr/scripts/script_rec45.sh +++ b/a2c_ppo_acktr/scripts/script_rec45.sh @@ -16,4 +16,5 @@ sleep 3 python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.2 --use_testgrad_median --free_exploration 6 --seed 16 & wait -echo "obs recurrent 25 arms and free exploration, testgrad_" +echo "obs recurrent 25 arms and free exploration, testgrad_median" +echo "Seed 16 Iter 2000 five_arms 16.0 ten_arms 15.6 many_arms 10.18 , didn't work well" \ No newline at end of file diff --git a/a2c_ppo_acktr/scripts/script_rec46.sh b/a2c_ppo_acktr/scripts/script_rec46.sh new file mode 100644 index 000000000..95db8408c --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec46.sh @@ -0,0 +1,20 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.7 --free_exploration 6 --seed 11 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.7 --free_exploration 6 --seed 12 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.7 --free_exploration 6 --seed 13 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.7 --free_exploration 6 --seed 14 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.7 --free_exploration 6 --seed 15 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 
--gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.7 --free_exploration 6 --seed 16 & +wait + +echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.7" +echo "a previous seed with 0.8 gave surprising results, checking if this is consistent" diff --git a/a2c_ppo_acktr/scripts/script_rec47.sh b/a2c_ppo_acktr/scripts/script_rec47.sh new file mode 100644 index 000000000..eda17e40e --- /dev/null +++ b/a2c_ppo_acktr/scripts/script_rec47.sh @@ -0,0 +1,20 @@ +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 11 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 12 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 13 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 14 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 15 & +sleep 3 + +python3 main.py --env-name "h_bandit-randchoose-v8" --algo ppo --log-interval 25 --num-steps 100 --num-processes 25 --lr 1e-3 --entropy-coef 0.05 --value-loss-coef 0.5 --ppo-epoch 3 --num-mini-batch 25 --gamma 0.9 --gae-lambda 0.95 --num-env-steps 6000000 --eval-interval 100 --log-dir ./ppo_log --task_steps=20 --obs_recurrent --use_testgrad --testgrad_beta 0.9 --free_exploration 6 --seed 16 & +wait + +echo "obs recurrent 25 arms and free exploration, testgrad_beta 0.9" +echo "a previous seed with 0.8 gave surprising results, checking if this is consistent"
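
The --use_median_grad / --grad_noise_ratio sweep in script_rec40.sh above never shows the aggregation code itself, so the following is a minimal sketch of one plausible reading, assuming per-task gradients are flattened, combined by elementwise median, and then perturbed with Gaussian noise scaled by grad_noise_ratio. The function name, the noise model, and the std-based scaling are assumptions for illustration, not the repository's implementation.

    # Hypothetical sketch only: the flag names come from script_rec40.sh, but the
    # aggregation rule and noise model below are assumptions, not the repo's code.
    import torch

    def median_grad(per_task_grads, grad_noise_ratio=0.0):
        # per_task_grads: list of flattened 1-D gradient tensors, one per task/minibatch
        stacked = torch.stack(per_task_grads)            # (num_tasks, num_params)
        agg = stacked.median(dim=0).values               # elementwise median across tasks
        if grad_noise_ratio > 0:
            # assumed noise model: Gaussian noise scaled by the per-parameter spread
            noise_std = grad_noise_ratio * stacked.std(dim=0)
            agg = agg + noise_std * torch.randn_like(agg)
        return agg

Under that reading, the rec40 change above simply widens the noise sweep from {0.0, 0.5, 1.0} to {1.5, 2.0, 2.5}; the aggregated vector would then be copied back into the parameters' .grad fields before the optimizer step.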
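
script_rec45-47.sh sweep --testgrad_beta (0.2-0.9) with and without --use_testgrad_median, but the testgrad logic itself is not part of this patch. Purely as an illustration of how such a beta knob could blend a robust per-task aggregate with the plain mean gradient, assuming flattened per-task gradients as above:

    # Illustrative only: testgrad_mix, the beta interpolation, and the fallback to
    # the mean when the median flag is off are assumptions about the flags seen in
    # script_rec45-47.sh, not the project's actual testgrad implementation.
    import torch

    def testgrad_mix(per_task_grads, beta=0.8, use_median=False):
        stacked = torch.stack(per_task_grads)            # (num_tasks, num_params)
        mean_grad = stacked.mean(dim=0)
        if use_median:
            consensus = stacked.median(dim=0).values     # rec45 adds --use_testgrad_median
        else:
            # rec46/47 omit the median flag; the default consensus rule is not
            # visible in this patch, so the mean stands in for it here.
            consensus = mean_grad
        # beta = 1.0 keeps only the consensus aggregate, beta = 0.0 the plain mean
        return beta * consensus + (1.0 - beta) * mean_grad

The echo lines record the motivation for the sweep: a single beta = 0.8 seed looked promising, so rec46 and rec47 bracket it with beta = 0.7 and beta = 0.9 over six seeds each.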