In [1]:
import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

args = {
    'gamma': 0.99,
    'render': True,
    'log_interval': 10
}

env = gym.make('MountainCar-v0')
#env.seed(args['seed'])
#torch.manual_seed(args['seed'])
episodes = 10000
print("action_space={}".format(env.action_space))
print("obs_space={}".format(env.observation_space))

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(2, 128)
        self.action_head = nn.Linear(128, 3)
        self.value_head = nn.Linear(128, 1)

        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_scores = self.action_head(x)
        state_values = self.value_head(x)
        return F.softmax(action_scores, dim=-1), state_values


model = Policy()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float()
    probs, state_value = model(state)
    m = Categorical(probs)
    action = m.sample()
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()


def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []
    value_losses = []
    rewards = []
    for r in model.rewards[::-1]:
        R = r + args['gamma'] * R
        rewards.insert(0, R)
    rewards = torch.tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
    for (log_prob, value), r in zip(saved_actions, rewards):
        reward = r - value.item()
        policy_losses.append(-log_prob * reward)
        value_losses.append(F.smooth_l1_loss(value, torch.tensor([r])))
    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]


def main():    
    for i_episode in range(episodes):
        running_reward = 0
        heights = []
        velocities = []
        state = env.reset()
        for t in range(10000):  # Don't infinite loop while learning
            action = select_action(state)            
            state, reward, done, _ = env.step(action)            
            if args['render']:
                env.render()
            model.rewards.append(reward)            
            running_reward += reward
            heights.append(state[0])
            velocities.append(state[1])
            if done:
                break
                
        #avg_height = 1 + np.mean(np.array(heights))         
        max_height = np.abs(np.max(np.array(heights))) 
        max_velocity = np.abs(np.max(np.array(velocities))) 
        running_reward = running_reward + max_height * 10 + max_velocity * 10 # + avg_velocity * 10
        
        finish_episode()
        if i_episode % args['log_interval'] == 0:
            print('Episode {}\tLast length: {:5d}\t Reward: {:.2f} Max height: {:.2f} Max velocity: {:.6f}'.format(
                i_episode, t, running_reward, max_height, max_velocity))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

print("threshold={} \n".format(env.spec.reward_threshold))
main()



[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
action_space=Discrete(3)
obs_space=Box(2,)
threshold=-110.0 

Episode 0	Last length:   199	 Reward: -196.48 Max height: 0.34 Max velocity: 0.016668
Episode 10	Last length:   199	 Reward: -195.44 Max height: 0.45 Max velocity: 0.007596
Episode 20	Last length:   199	 Reward: -195.26 Max height: 0.47 Max velocity: 0.005445
Episode 30	Last length:   199	 Reward: -196.47 Max height: 0.33 Max velocity: 0.019249
Episode 40	Last length:   199	 Reward: -195.40 Max height: 0.45 Max velocity: 0.011185
Episode 50	Last length:   199	 Reward: -196.95 Max height: 0.29 Max velocity: 0.019837
Episode 60	Last length:   199	 Reward: -196.09 Max height: 0.38 Max velocity: 0.010786
Episode 70	Last length:   199	 Reward: -195.52 Max height: 0.44 Max velocity: 0.008378
Episode 80	Last length:   199	 Reward: -196.03 Max height: 0.39 Max velocity: 0.011483
Episode 90	Last length:   199	 Reward: -195.40 M

Episode 920	Last length:   199	 Reward: -196.82 Max height: 0.30 Max velocity: 0.016882
Episode 930	Last length:   199	 Reward: -195.77 Max height: 0.41 Max velocity: 0.011138
Episode 940	Last length:   199	 Reward: -196.09 Max height: 0.38 Max velocity: 0.012349
Episode 950	Last length:   199	 Reward: -195.10 Max height: 0.48 Max velocity: 0.008210
Episode 960	Last length:   199	 Reward: -196.35 Max height: 0.35 Max velocity: 0.011889
Episode 970	Last length:   199	 Reward: -196.05 Max height: 0.38 Max velocity: 0.012038
Episode 980	Last length:   199	 Reward: -195.26 Max height: 0.47 Max velocity: 0.003764
Episode 990	Last length:   199	 Reward: -196.34 Max height: 0.35 Max velocity: 0.012804
Episode 1000	Last length:   199	 Reward: -196.36 Max height: 0.35 Max velocity: 0.014936
Episode 1010	Last length:   199	 Reward: -195.81 Max height: 0.41 Max velocity: 0.007414
Episode 1020	Last length:   199	 Reward: -197.45 Max height: 0.23 Max velocity: 0.024253
Episode 1030	Last length:   1

Episode 1850	Last length:   199	 Reward: -195.29 Max height: 0.47 Max velocity: 0.005238
Episode 1860	Last length:   199	 Reward: -196.30 Max height: 0.36 Max velocity: 0.013973
Episode 1870	Last length:   199	 Reward: -197.13 Max height: 0.27 Max velocity: 0.020451
Episode 1880	Last length:   199	 Reward: -195.72 Max height: 0.42 Max velocity: 0.011284
Episode 1890	Last length:   199	 Reward: -195.89 Max height: 0.40 Max velocity: 0.011846
Episode 1900	Last length:   199	 Reward: -195.66 Max height: 0.42 Max velocity: 0.009838
Episode 1910	Last length:   199	 Reward: -196.50 Max height: 0.33 Max velocity: 0.015857
Episode 1920	Last length:   199	 Reward: -196.06 Max height: 0.38 Max velocity: 0.009880
Episode 1930	Last length:   199	 Reward: -195.56 Max height: 0.44 Max velocity: 0.008894
Episode 1940	Last length:   199	 Reward: -195.66 Max height: 0.42 Max velocity: 0.012345
Episode 1950	Last length:   199	 Reward: -195.50 Max height: 0.44 Max velocity: 0.006208
Episode 1960	Last len

Episode 2780	Last length:   199	 Reward: -195.51 Max height: 0.44 Max velocity: 0.010329
Episode 2790	Last length:   199	 Reward: -195.78 Max height: 0.41 Max velocity: 0.011806
Episode 2800	Last length:   199	 Reward: -195.84 Max height: 0.41 Max velocity: 0.010071
Episode 2810	Last length:   199	 Reward: -195.13 Max height: 0.48 Max velocity: 0.004930
Episode 2820	Last length:   199	 Reward: -196.73 Max height: 0.30 Max velocity: 0.023919
Episode 2830	Last length:   199	 Reward: -195.32 Max height: 0.46 Max velocity: 0.008806
Episode 2840	Last length:   199	 Reward: -196.09 Max height: 0.38 Max velocity: 0.012742
Episode 2850	Last length:   199	 Reward: -196.89 Max height: 0.29 Max velocity: 0.019140
Episode 2860	Last length:   199	 Reward: -196.28 Max height: 0.36 Max velocity: 0.013540
Episode 2870	Last length:   199	 Reward: -195.76 Max height: 0.41 Max velocity: 0.010928
Episode 2880	Last length:   199	 Reward: -195.97 Max height: 0.39 Max velocity: 0.012231
Episode 2890	Last len

Episode 3710	Last length:   199	 Reward: -196.49 Max height: 0.34 Max velocity: 0.014847
Episode 3720	Last length:   199	 Reward: -195.49 Max height: 0.44 Max velocity: 0.008475
Episode 3730	Last length:   199	 Reward: -196.79 Max height: 0.29 Max velocity: 0.028152
Episode 3740	Last length:   199	 Reward: -195.47 Max height: 0.45 Max velocity: 0.007892
Episode 3750	Last length:   199	 Reward: -196.21 Max height: 0.37 Max velocity: 0.013835
Episode 3760	Last length:   199	 Reward: -195.95 Max height: 0.39 Max velocity: 0.015306
Episode 3770	Last length:   199	 Reward: -195.73 Max height: 0.42 Max velocity: 0.010501
Episode 3780	Last length:   199	 Reward: -196.17 Max height: 0.37 Max velocity: 0.016521
Episode 3790	Last length:   199	 Reward: -195.02 Max height: 0.49 Max velocity: 0.006972
Episode 3800	Last length:   199	 Reward: -195.70 Max height: 0.42 Max velocity: 0.008681
Episode 3810	Last length:   199	 Reward: -196.49 Max height: 0.33 Max velocity: 0.017929
Episode 3820	Last len

KeyboardInterrupt: 

In [None]:
print(env.spec.reward_threshold)