In [1]:
import model as m

import torch
import torch.optim as optim
import gym
from collections import deque
import statistics
from visualize import update_viz

Setting up a new session...


In [2]:
# Simple gym environment
env = gym.make("CartPole-v1")

In [3]:
# Policy for choosing actions
policy = m.DiscretePolicy(
    n_obs = env.observation_space.shape[0],
    n_acts = env.action_space.n,
    n_hidden = 64
)

In [4]:
# Method for training the policy
policy_optimizer = optim.Adam(policy.parameters(), lr = 1e-3)

In [5]:
episode_reward = 0
last_episode_rewards = []

In [6]:
for episode in range(10000):
    # Part 1: gather experience
    
    trajectories = []
    state = env.reset()

    # Display and plot recent performance
    if len(last_episode_rewards) == 100:
        avg_ep_reward = statistics.mean(last_episode_rewards)
        print(avg_ep_reward)
        update_viz(episode, avg_ep_reward)
        last_episode_rewards.clear()

    # Run a simulation until completion
    while len(trajectories) == 0 or not trajectories[-1]["done"]:
        action = policy(torch.tensor(state, dtype=torch.float32))
        new_state, reward, done, _ = env.step(action.item())
        episode_reward += reward

        # Record data to replay memory
        trajectories.append({
            "state": state,
            "action": action,
            "reward": torch.tensor([reward]),
            "done": done
        })

        state = new_state

    last_episode_rewards.append(episode_reward)
    episode_reward = 0

    if episode % 10 == 0:
        # Part 2: learn from experience
        
        states = torch.tensor([trajectory["state"] for trajectory in trajectories], dtype=torch.float32)
        actions = torch.tensor([trajectory["action"] for trajectory in trajectories], dtype=torch.float32)
        dones = torch.tensor([torch.tensor([1.0]) if trajectory["done"] else torch.tensor([0.0]) for trajectory in trajectories], dtype=torch.float32)
        rewards = [trajectory["reward"] for trajectory in trajectories]

        # Compute return, i.e. cumulative reward
        returns = [0] * len(rewards)
        discounted_future = 0
        
        for i in reversed(range(len(rewards))):
            if dones[i]:
                returns[i] = rewards[i]
            else:
                returns[i] = rewards[i] + discounted_future
            
            discounted_future = returns[i] * 0.99

        returns = torch.tensor(returns)

        # Compute the loss function
        mean = returns.mean()
        std = returns.std() + 1e-6
        returns = (returns - mean)/std
        
        log_probs = policy.log_prob(states, actions)

        policy_loss = -(torch.dot(returns, log_probs)).mean()

        # Train through backpropagation on the loss
        policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_optimizer.step()

24.45
22.1
22.91
24.28
22.55
23.14
25.02
29.24
32.01
33.85
40.75
43.3
55.53
62.37
69.81
98.64
120.42
142.74
161.08
177.47
206.17
209.83
235.42
196.59
167.91
154.41
287.97
345.1
373.79
446.65
468.67
468.76
463.14
445.86
371.63
329.56
349.66
407.48
416.11
455.89
467.58
487.07
487.93
475.31
456.11
399.36
447.96
484.82
472.38
486.07
499.38
499.81
496.65
487.93
492.06
484.07
494.73
498.28
498.23
493.62
485.24
460.76
448.67
458.62
478.77
486.48
490.16
493.49
494.24
496.13
498.87
493.7
474.61
486.96
497.17
498.1
479.11
475.44
480.95
475.1
484.61
495.73
496.46
496.29
498.83
494.97
492.33
483.83
471.4
458.59
435.74
473.32
461.49
456.45
459.91
464.01
467.89
435.89
456.91


In [7]:
# Run a rendered simulation
state = env.reset()
done = False
while not done:
    env.render()
    action = policy(torch.tensor(state, dtype=torch.float32)).item()
    state, reward, done, _ = env.step(action)
env.close()