In [1]:
import argparse
import gym
import numpy as np
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.distributions import Categorical

In [2]:
env = gym.make('CartPole-v0')
gamma = 0.99
# env.seed(args.seed)
# torch.manual_seed(args.seed)

In [3]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)

        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_scores = self.affine2(x)
        return F.softmax(action_scores, dim=1)

In [4]:
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state) #returns the probability of an action with the given state (eg. output of the NN)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()

In [5]:
def finish_episode():
    R = 0
    policy_loss = []
    rewards = []
    for r in policy.rewards[::-1]:
        R = r + gamma * R
        rewards.insert(0, R)
    rewards = torch.tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
    for log_prob, reward in zip(policy.saved_log_probs, rewards):
        policy_loss.append(-log_prob * reward)
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]

In [6]:
running_reward = 10 # what is the running reward?
policy = Policy()
# optimizer updates the weights of the NN
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
eps = np.finfo(np.float32).eps.item()

In [8]:
state=env.reset()
state

In [None]:
state = torch.from_numpy(state).float().unsqueeze(0)

In [None]:
probs = policy(state) #returns the probability of an action with the given state (eg. output of the NN)

In [None]:
m = Categorical(probs)

In [None]:
action = m.sample()

In [None]:
policy.saved_log_probs.append(m.log_prob(action))

In [10]:
action = select_action(state)
action

1

In [7]:
for i_episode in count(1):
    state = env.reset()
    for t in range(10000):  # 10000 steps per episode, when the episode is done, done = True
        action = select_action(state) # select action
        state, reward, done, _ = env.step(action)

        policy.rewards.append(reward)
        if done:
             break

    running_reward = running_reward * 0.99 + t * 0.01
    finish_episode()
    print("finish episode")

finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish episode
finish epi

KeyboardInterrupt: 