### Task 2 - REINFORCE (Vanilla Policy Gradient)


In [12]:
import torch
import gym

from env.custom_hopper import *

In [13]:
# Evaluation settings shared by the cells below: base seed and episodes per model.
seed_value, n_episodes = 1234, 50

# Build the source-domain Hopper environment
# (the id is presumably registered by the env.custom_hopper import; verify).
env = gym.make('CustomHopper-source-v0')

# Sizes of the action and state vectors, read from the last axis of each space's shape.
action_space_dim = env.action_space.shape[-1]
observation_space_dim = env.observation_space.shape[-1]

# Show the spaces and the environment's dynamics parameters for reference.
print('Action space:', env.action_space)
print('State space:', env.observation_space)
print('Dynamics parameters:', env.get_parameters())

Action space: Box([-1. -1. -1.], [1. 1. 1.], (3,), float32)
State space: Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf inf inf inf], (11,), float64)
Dynamics parameters: [2.53429174 3.92699082 2.71433605 5.0893801 ]


In [19]:
from agent_reinforce import Agent, Policy

# Evaluate the saved REINFORCE policy for n_episodes episodes and collect the
# summed (undiscounted) reward of each episode in reward_list_reinforce.
reward_list_reinforce = []

model = "REINFORCE-100k.mdl"

policy = Policy(observation_space_dim, action_space_dim)
# map_location="cpu" matches the device the Agent is created with below, so a
# checkpoint written from a GPU run still loads on a CPU-only machine.
# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# torch version; only load checkpoints from a trusted source.
policy.load_state_dict(torch.load(model, map_location="cpu"), strict=True)

agent = Agent(policy, device = "cpu")

for episode in range(n_episodes):
    done = False
    test_reward = 0
    # Seed is derived from the episode index, so every run uses the same
    # sequence of seeds (seed_value, seed_value + 1, ...).
    env.seed(seed_value + episode)
    state = env.reset()
    while not done:

        # evaluation=True is forwarded to the agent; the second return value is not needed here.
        action, _ = agent.get_action(state, evaluation=True)

        # The action tensor is detached and converted to a NumPy array before stepping the env.
        state, reward, done, info = env.step(action.detach().cpu().numpy())

        test_reward += reward
        # env.render()
    reward_list_reinforce.append(test_reward)
    print(f"Episode: {episode} | Return: {test_reward}")



Episode: 0 | Return: 372.0458397563965
Episode: 1 | Return: 368.2702959528808
Episode: 2 | Return: 231.33360258129952
Episode: 3 | Return: 373.0867081360459
Episode: 4 | Return: 350.08055084521726
Episode: 5 | Return: 582.7491480306913
Episode: 6 | Return: 521.3258152743631
Episode: 7 | Return: 341.4859424331279
Episode: 8 | Return: 272.0349742523086
Episode: 9 | Return: 326.8920841747951
Episode: 10 | Return: 276.69841873845326
Episode: 11 | Return: 328.74050984505345
Episode: 12 | Return: 344.4689223000142
Episode: 13 | Return: 363.00528601493227
Episode: 14 | Return: 772.7539171098514
Episode: 15 | Return: 364.26280387523013
Episode: 16 | Return: 243.38486056399816
Episode: 17 | Return: 336.50032377433325
Episode: 18 | Return: 364.27791995880375
Episode: 19 | Return: 331.2671943934878
Episode: 20 | Return: 343.48674366356215
Episode: 21 | Return: 362.5634711123964
Episode: 22 | Return: 740.7889543833462
Episode: 23 | Return: 444.9906907348974
Episode: 24 | Return: 360.2046985595709


In [20]:
from agent_reinforce import Agent, Policy

# Evaluate the saved REINFORCE-with-baseline policy for n_episodes episodes and
# collect the summed (undiscounted) reward of each episode.
reward_list_reinforce_with_baseline = []

model = "REINFORCE-with_baseline_100k.mdl"

policy = Policy(observation_space_dim, action_space_dim)
# map_location="cpu" matches the device the Agent is created with below, so a
# checkpoint written from a GPU run still loads on a CPU-only machine.
# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# torch version; only load checkpoints from a trusted source.
policy.load_state_dict(torch.load(model, map_location="cpu"), strict=True)

agent = Agent(policy, device = "cpu")

for episode in range(n_episodes):
    done = False
    test_reward = 0
    # Seed is derived from the episode index, so every run uses the same
    # sequence of seeds (seed_value, seed_value + 1, ...).
    env.seed(seed_value + episode)
    state = env.reset()
    while not done:

        # evaluation=True is forwarded to the agent; the second return value is not needed here.
        action, _ = agent.get_action(state, evaluation=True)

        # The action tensor is detached and converted to a NumPy array before stepping the env.
        state, reward, done, info = env.step(action.detach().cpu().numpy())

        test_reward += reward
        # env.render()
    reward_list_reinforce_with_baseline.append(test_reward)
    print(f"Episode: {episode} | Return: {test_reward}")



Episode: 0 | Return: 340.0227342709311
Episode: 1 | Return: 167.98717826572607
Episode: 2 | Return: 374.5498351279817
Episode: 3 | Return: 374.7337886845524
Episode: 4 | Return: 188.05971077338134
Episode: 5 | Return: 388.15119892626666
Episode: 6 | Return: 188.9818675237329
Episode: 7 | Return: 197.83860713641332
Episode: 8 | Return: 365.0956455743209
Episode: 9 | Return: 368.5039534422479
Episode: 10 | Return: 373.2816643524105
Episode: 11 | Return: 316.174922260255
Episode: 12 | Return: 316.54662962220664
Episode: 13 | Return: 253.3717649266839
Episode: 14 | Return: 404.31497464261304
Episode: 15 | Return: 369.727774472464
Episode: 16 | Return: 438.0005140418449
Episode: 17 | Return: 389.7525418821754
Episode: 18 | Return: 374.1237761266403
Episode: 19 | Return: 375.9434284249575
Episode: 20 | Return: 234.4117427541253
Episode: 21 | Return: 419.5283164641726
Episode: 22 | Return: 375.9712930437545
Episode: 23 | Return: 372.5761466945517
Episode: 24 | Return: 377.9571145648134
Episod

In [22]:
# Agent and Policy come from the `agent` module in this cell, rebinding the
# names that the earlier cells imported from `agent_reinforce`.
from agent import Agent, Policy

# Evaluate the saved Actor-Critic policy for n_episodes episodes and collect
# the summed (undiscounted) reward of each episode.
reward_list_actor_critic = []

model = "ACTOR_CRITIC_100k.mdl"

policy = Policy(observation_space_dim, action_space_dim)
# map_location="cpu" matches the device the Agent is created with below, so a
# checkpoint written from a GPU run still loads on a CPU-only machine.
# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# torch version; only load checkpoints from a trusted source.
policy.load_state_dict(torch.load(model, map_location="cpu"), strict=True)

agent = Agent(policy, device = "cpu")

for episode in range(n_episodes):
    done = False
    test_reward = 0
    # Seed is derived from the episode index, so every run uses the same
    # sequence of seeds (seed_value, seed_value + 1, ...).
    env.seed(seed_value + episode)
    state = env.reset()
    while not done:

        # evaluation=True is forwarded to the agent; the second return value is not needed here.
        action, _ = agent.get_action(state, evaluation=True)

        # The action tensor is detached and converted to a NumPy array before stepping the env.
        state, reward, done, info = env.step(action.detach().cpu().numpy())

        test_reward += reward
        # env.render()
    reward_list_actor_critic.append(test_reward)
    print(f"Episode: {episode} | Return: {test_reward}")


Episode: 0 | Return: 300.2326721777124
Episode: 1 | Return: 314.7175922495674
Episode: 2 | Return: 262.41011069660203
Episode: 3 | Return: 239.16858139373315
Episode: 4 | Return: 302.0821774505435
Episode: 5 | Return: 348.36643468468924
Episode: 6 | Return: 224.42370003888544
Episode: 7 | Return: 265.8663167592419
Episode: 8 | Return: 203.59837448621457
Episode: 9 | Return: 209.4784829681353
Episode: 10 | Return: 256.11697182020293
Episode: 11 | Return: 237.356934021136
Episode: 12 | Return: 217.1448463604666
Episode: 13 | Return: 226.68471576476642
Episode: 14 | Return: 337.9210100875975
Episode: 15 | Return: 252.71554139918268
Episode: 16 | Return: 237.7844284621892
Episode: 17 | Return: 207.32764050123143
Episode: 18 | Return: 227.55678428685997
Episode: 19 | Return: 244.13151103307416
Episode: 20 | Return: 305.16237377144046
Episode: 21 | Return: 335.800646150063
Episode: 22 | Return: 255.02569906903437
Episode: 23 | Return: 329.80525941677627
Episode: 24 | Return: 345.988165127126

In [24]:
# numpy is imported explicitly so this cell does not depend on `np` leaking in
# from a star-import or from a cell that is no longer in the notebook.
import numpy as np

# Print the mean and standard deviation of the per-episode returns of each
# evaluated model, rounded to the nearest integer.
print(f"REINFORCE: {round(np.mean(reward_list_reinforce))} +- {round(np.std(reward_list_reinforce))}")
print(f"REINFORCE with baseline: {round(np.mean(reward_list_reinforce_with_baseline))} +- {round(np.std(reward_list_reinforce_with_baseline))}")
print(f"Actor Critic: {round(np.mean(reward_list_actor_critic))} +- {round(np.std(reward_list_actor_critic))}")



REINFORCE: 370 +- 108
REINFORCE with baseline: 340 +- 71
Actor Critic: 257 +- 56
