In [1]:
import model as m

import torch
import torch.optim as optim
import gym
from collections import deque
import statistics
from visualize import update_viz

Setting up a new session...


In [2]:
# Gym environment the agent interacts with; the id is kept in one place so it
# is easy to swap for another task.
ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)

In [3]:
# Size the policy from the environment's spaces: the first dimension of the
# observation shape and the number of discrete actions.
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

policy = m.DiscretePolicy(n_obs=obs_dim, n_acts=n_actions, n_hidden=64)

In [4]:
# Adam optimiser over every parameter exposed by the policy.
LEARNING_RATE = 1e-3
policy_optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

In [5]:
# Reward bookkeeping shared with the training loop:
#   last_episode_rewards - per-episode totals collected since the last log line
#   episode_reward       - running total for the episode currently being played
last_episode_rewards = list()
episode_reward = 0

In [6]:
# Policy-gradient training loop.
#
# Each iteration plays one full episode with the current policy. The mean
# episode reward is printed and sent to `update_viz` every LOG_WINDOW
# episodes, and a single gradient step is taken on every UPDATE_EVERY-th
# episode using only that episode's rollout (the other rollouts are used for
# reward logging only).
NUM_EPISODES = 100000
LOG_WINDOW = 100     # number of episode rewards averaged per log point
UPDATE_EVERY = 10    # a gradient step is taken when episode % UPDATE_EVERY == 0
GAMMA = 0.99         # discount applied to the return of the following step
RETURN_EPS = 1e-6    # added to the std so normalisation never divides by zero

for episode in range(NUM_EPISODES):
    trajectories = []

    # Reset the running total here instead of trusting whatever value an
    # earlier cell or an interrupted run left behind; otherwise re-running
    # this cell after an interrupt adds a stale partial total to the first
    # episode's reward.
    episode_reward = 0

    # Written against the classic gym API: reset() returns the observation
    # and step() returns a 4-tuple (observation, reward, done, info).
    state = env.reset()

    # Flush the reward window once it is full.
    if len(last_episode_rewards) == LOG_WINDOW:
        avg_ep_reward = statistics.mean(last_episode_rewards)
        print(avg_ep_reward)
        update_viz(episode, avg_ep_reward)
        last_episode_rewards.clear()

    # Roll out one episode, recording every transition.
    done = False
    while not done:
        action = policy(torch.tensor(state, dtype=torch.float32))
        new_state, reward, done, _ = env.step(action.item())

        episode_reward += reward

        trajectories.append({
            "state": state,
            "action": action,
            "reward": torch.tensor([reward]),
            "done": done
        })

        state = new_state

    last_episode_rewards.append(episode_reward)
    episode_reward = 0

    if episode % UPDATE_EVERY != 0:
        continue

    if len(trajectories) < 2:
        # The sample std of a single return is NaN, which would turn the loss
        # and then the weights into NaN. Skip the update for a one-step episode.
        continue

    # Stack per-step tensors instead of handing torch.tensor a list of numpy
    # arrays, which PyTorch converts element by element (slow, and warns).
    states = torch.stack([
        torch.tensor(step["state"], dtype=torch.float32) for step in trajectories
    ])
    actions = torch.tensor([step["action"] for step in trajectories], dtype=torch.float32)

    # Discounted return per step, accumulated backwards:
    #   G_t = r_t + GAMMA * G_{t+1}, with the future term dropped on a
    #   terminal step.
    returns = [0.0] * len(trajectories)
    future_return = 0.0
    for i in reversed(range(len(trajectories))):
        step = trajectories[i]
        if step["done"]:
            future_return = 0.0
        returns[i] = step["reward"].item() + GAMMA * future_return
        future_return = returns[i]

    returns = torch.tensor(returns, dtype=torch.float32)

    # Standardise the returns to zero mean / unit variance.
    returns = (returns - returns.mean()) / (returns.std() + RETURN_EPS)

    # NOTE(review): presumably one log-probability per (state, action) pair as
    # a 1-D tensor, since it is fed to torch.dot - confirm against model.py.
    log_probs = policy.log_prob(states, actions)

    # torch.dot already reduces to a scalar, so this is the negated *sum* of
    # return-weighted log-probabilities over the episode.
    policy_loss = -torch.dot(returns, log_probs)

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

21.75
21.06
22.29
23.23
22.49
22.78
22.02
25.1
24.44
26.75
30.95
31.12
40.53
38.04
49.85
58.53
77.43
92.1
110.73
126.27
142.61
147.42
107.04
218.48
113.82
102.16
178.45
278.29
175.17
99.1
136.98
284.47
338.0
407.26
421.15
418.46
358.13
393.35
395.19
436.12
442.36
482.31
491.15
487.42
400.94
404.7
489.07
493.22
499.81
496.78
500.0
500.0
487.46
424.22
203.31
207.69
440.94
474.9
448.98
446.38
389.78
346.11
214.13
153.28
280.9
489.69
492.61
480.75
491.79
483.22
492.64
493.35
488.5
494.09
452.98
432.77
473.59
473.51
494.59
488.56
495.11
499.37
500.0
500.0
493.47
252.35
346.9
498.89
500.0
500.0
500.0


KeyboardInterrupt: 

In [7]:
#[trajectory["state"] for trajectory in trajectories]
