In [68]:
import numpy as np
import gym
import random

env = gym.make('MountainCarContinuous-v0')



In [25]:
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline

# actor-critic

In [26]:
def phi(state, action):
    return np.array([state[0], state[1], action[0]])

def Q_approx(state, action, w):
    return phi(state, action) @ w

def mu(state, θ):
    return np.array([state @ θ])

def policy_approx(mu_action, σ, env):
    return np.clip(np.random.normal(mu_action, σ, size=(1,)), env.action_space.low[0], env.action_space.high[0])


In [27]:
import time

In [28]:
γ = 0.8
α = 0.1
β = 0.2
θ = np.random.rand(2,)
w = np.random.rand(3,)
σ = 1


In [29]:
train = True
render = True

In [142]:
num_episodes = 200
num_steps = 300
thetaList = []

for i in tqdm(range(num_episodes)):
    # diminishing exploration rate
    e = (0.8/num_episodes) * i + 0.1
    
    state = env.reset()
    rewards = []
    actions = []
    thetas = []

    # sometimes random
    mu_action = mu(state, θ) if np.random.uniform() < e else np.random.uniform(-1, 1, size=(1,))
    for _ in range(num_steps):    
        # TAKE ACTION
        # gradient policy - sample
        action = policy_approx(mu_action, σ, env)

        new_state, reward, done, _ = env.step(mu_action)
        # get next action using trained weights
        mu_new_action = mu(new_state, θ)
        
        # convert reward to positive
        reward *= -1

        # learning parameters
        if train:
            dlogpi = state * ((action - mu_action)/σ**2)
            θ = α * dlogpi * Q_approx(state, mu_action, w)
            δ = reward + γ * Q_approx(new_state, mu_new_action, w) - Q_approx(state, mu_action, w)
            w += β * δ * phi(state, mu_action)

        # update state
        state = new_state
        mu_action = mu_new_action

        # keep track of actions and rewards
        rewards.append(reward)
        actions.append(action)
        thetas.append(Q_approx(state, mu_action, w))

        if render:
            env.render()

        if done:
            print("successful")

#         assert abs(mu_new_action) <= 2
#         assert np.prod(abs(θ) < 100)
#         assert np.prod(abs(w) < 100)
            
    env.close()
    
    thetaList.extend(thetas)


  """
  """
  0%|          | 0/200 [00:00<?, ?it/s]


OverflowError: math range error

# Vanilla Policy Gradient

In [69]:
def mu(state, θ):
    return np.array([state @ θ])

def policy_approx(mu_action, σ, env):
    return np.clip(np.random.normal(mu_action, σ, size=(1,)), env.action_space.low[0], env.action_space.high[0])


In [75]:
num_episodes = 10000
num_steps = 300

α = 0.1
θ = np.random.rand(2,)
σ = 1
γ = 0.8

for i in tqdm(range(num_episodes)):
    # diminishing exploration rate
    e = (0.8/num_episodes) * i + 0.1
    
    state = env.reset()
    rewards = []
    mu_actions = []
    actions = []
    states = []

    # sometimes random
    for _ in range(num_steps):    
        # TAKE ACTION
        # gradient policy - sample
        mu_action = mu(state, θ) if np.random.uniform() < e else np.random.uniform(-1, 1, size=(1,))
        action = policy_approx(mu_action, σ, env)

        states.append(state)
        mu_actions.append(mu_action)
        actions.append(action)
        
        state, reward, done, _ = env.step(action)
        
        assert abs(action) <= 1
        assert abs(reward) < 1

        # keep track of actions and rewards
        rewards.append(reward)
        
        if done:
            print("done")
            # replay
            state = env.reset()
            for action in actions:
                env.step(action)
                env.render()

            env.close()
            raise Exception("")
            break
        
                        
    # train
    i = 0
    for state, action, mu_action in zip(states, actions, mu_actions):
        dlogpi = state * ((action - mu_action)/σ**2)
        R = 0
        for r in rewards[i:][::-1]:
            R = r + γ * R
        # minimise reward
        θ = θ + α * dlogpi * R

        i += 1
        
θ


  0%|          | 0/10000 [00:00<?, ?it/s][A
  0%|          | 5/10000 [00:00<03:36, 46.08it/s][A
  0%|          | 11/10000 [00:00<03:30, 47.52it/s][A
  0%|          | 17/10000 [00:00<03:25, 48.70it/s][A
  0%|          | 22/10000 [00:00<03:25, 48.49it/s][A
  0%|          | 28/10000 [00:00<03:22, 49.26it/s][A
  0%|          | 33/10000 [00:00<03:21, 49.34it/s][A
  0%|          | 39/10000 [00:00<03:20, 49.80it/s][A
  0%|          | 44/10000 [00:00<03:22, 49.10it/s][A
  0%|          | 49/10000 [00:00<03:22, 49.04it/s][A
  1%|          | 54/10000 [00:01<03:45, 44.03it/s][A
  1%|          | 60/10000 [00:01<03:36, 45.91it/s][A
  1%|          | 66/10000 [00:01<03:30, 47.22it/s][A
  1%|          | 72/10000 [00:01<03:25, 48.38it/s][A
  1%|          | 77/10000 [00:01<03:26, 48.13it/s][A
  1%|          | 82/10000 [00:01<03:28, 47.59it/s][A
  1%|          | 88/10000 [00:01<03:25, 48.13it/s][A
  1%|          | 93/10000 [00:01<03:24, 48.36it/s][A
  1%|          | 98/10000 [00:02<03:2

KeyboardInterrupt: 

In [76]:
state = env.reset()
for action in actions:
    env.step(action)
    env.render()
    
env.close()


  5%|▌         | 528/10000 [00:25<03:17, 47.96it/s][A

In [23]:
import time

In [24]:
# sometimes random
done = False
state = env.reset()
while not done:
    # TAKE ACTION
    # gradient policy - sample
    mu_action = mu(state, θ)
    action = policy_approx(mu_action, σ, env)
    state, reward, done, _ = env.step(action)
    env.render()
    time.sleep(0.005)
    
    
env.close()

In [17]:
env.close()

# with momentum

In [7]:
import numpy as np
import gym
import random
from tqdm import tqdm

env = gym.make('MountainCarContinuous-v0')

In [2]:
def mu(state, θ):
    return np.array([state @ θ])

def policy_approx(mu_action, σ, env):
    return np.clip(np.random.normal(mu_action, σ, size=(1,)), env.action_space.low[0], env.action_space.high[0])

In [5]:
env.observation_space, env.action_space

(Box(2,), Box(1,))

In [27]:
num_episodes = 100
num_steps = 300

α = 0.1
θ = np.random.rand(4,)
σ = 1
γ = 0.8

for i in tqdm(range(num_episodes)):
    # diminishing exploration rate
    e = (0.8/num_episodes) * i + 0.1
    
    rewards = []
    mu_actions = []
    actions = []
    states = []

    state = np.concatenate((env.reset(), np.array([0, 0])))
#     mu_action = np.random.uniform(-1, 1, size=(1,))
    mu_action = mu(state, θ) if np.random.uniform() < e else np.random.uniform(-1, 1, size=(1,))

    # sometimes random
    for _ in range(num_steps):    
        action = policy_approx(mu_action, σ, env)
        new_state, reward, done, _ = env.step(action)

        diff = new_state - state[:2]
        state = np.concatenate((new_state, diff))
    
        # state includes diff
        states.append(state)
        mu_actions.append(mu_action)
        actions.append(action)
        
        # keep track of actions and rewards
        rewards.append(reward)
        
        if done:
            print("done")
#             # replay
#             state = env.reset()
#             for action in actions:
#                 env.step(action)
#                 env.render()

#             env.close()
#             raise Exception("")
            break
            
        mu_action = mu(state, θ) if np.random.uniform() < e else np.random.uniform(-1, 1, size=(1,))
        
#         assert abs(mu_action) < 2

        
    if done:
        print("done", sum(rewards), np.linalg.norm(θ))
    else:
        print("not done", sum(rewards), np.linalg.norm(θ))
                        
    # train minimise reward
    for state, action, mu_action in zip(states, actions, mu_actions):
        mu_action = abs(mu_action) if mu_action < -1 else mu_action
        dlogpi = state * ((action - mu_action)/σ**2)
        θ = θ + α * dlogpi * sum(rewards)
    
#     i = 0
#     for state, action, mu_action in zip(states, actions, mu_actions):
#         dlogpi = state * ((action - mu_action)/σ**2)
#         R = 0
#         for r in rewards[i:][::-1]:
#             R = r + γ * R
#         # minimise reward
#         θ = θ - α * dlogpi * R

#         i += 1
        
θ



  0%|          | 0/100 [00:00<?, ?it/s][A[A

  6%|▌         | 6/100 [00:00<00:01, 51.86it/s][A[A

not done -16.74805077381793 1.233394246917018
not done -18.129011697181312 3.9401118588497073
not done -19.177383894704068 26.630290032456717
not done -17.834418604529922 546.6261285189524
not done -17.989010354671503 6918.423584942755
not done -18.53021263369004 124506.38999977185
not done -19.134204275081633 2271636.9395728107
not done -19.50529198683212 46154337.871301144
not done -20.13142425924694 1137501672.062231
not done -18.735265884781132 30961301935.915325
not done -19.969549498111896 696914337113.884




 12%|█▏        | 12/100 [00:00<00:01, 51.98it/s][A[A

 17%|█▋        | 17/100 [00:00<00:01, 51.36it/s][A[A

 23%|██▎       | 23/100 [00:00<00:01, 51.81it/s][A[A

not done -19.185258061127712 24225544817011.64
not done -20.332916510169973 556574815925400.6
not done -18.647817010782934 1.7997290351230952e+16
not done -18.657605587453176 4.5093716216765805e+17
not done -19.408456624155434 1.264090676264891e+19
not done -19.49897674213282 4.378131147557734e+20
not done -20.49379621852876 1.5466481951205147e+22
not done -20.353715259629812 5.646995490334824e+23
not done -20.03884384710873 1.9126525533422827e+25
not done -20.3184299974841 7.921376450929932e+26
not done -19.839084613356807 3.063735362361651e+28
not done -20.44013451169417 9.932756179532377e+29




 28%|██▊       | 28/100 [00:00<00:01, 51.14it/s][A[A

 34%|███▍      | 34/100 [00:00<00:01, 50.83it/s][A[A

not done -20.04075299843055 3.518131692492725e+31
not done -21.123389188070153 1.410613814979247e+33
not done -21.283571509573342 5.854956046135888e+34
not done -21.68841660256954 2.5193203391103974e+36
not done -22.678707654675012 1.0624026476706032e+38
not done -21.513647893046564 6.074384215164577e+39
not done -20.078712301429988 2.929076854305714e+41
not done -22.761439443450524 1.2517674726859492e+43
not done -21.939230929752107 6.396727684343452e+44
not done -21.305016976180568 3.480694739901344e+46
not done -21.84869327927837 1.7995216513957124e+48




 40%|████      | 40/100 [00:00<00:01, 50.95it/s][A[A

not done -21.895118717003992 7.625601343993035e+49
not done -22.466257730879875 4.0810492740573116e+51
not done -22.622522135655537 2.162239331061987e+53
not done -21.730205055495166 1.1172669221587132e+55
not done -22.971100304744464 5.944857855198022e+56
not done -23.343924522935872 3.588255568112727e+58
not done -21.687457498821995 2.2604737484884866e+60
not done -22.20372561219976 1.2088295418724025e+62
not done -21.7058120698849 6.634900262630296e+63
not done -22.95277295705212 3.7363920566369803e+65
not done -22.687285945269686 2.4517156793267295e+67




 46%|████▌     | 46/100 [00:00<00:01, 51.20it/s][A[A

 52%|█████▏    | 52/100 [00:01<00:00, 50.88it/s][A[A

not done -23.195387251710446 1.4530075610907567e+69
not done -23.690087714150465 9.086580994192378e+70
not done -22.11417369775364 6.2731579131170455e+72
not done -24.349054502984224 3.431911768366452e+74
not done -24.42402724936077 2.839193587295447e+76
not done -24.723393235864567 1.9877802436985825e+78
not done -23.892122906495278 1.6883754456276832e+80
not done -23.0128713771504 1.1828606372176138e+82
not done -23.47928777226786 7.945321911945831e+83
not done -23.13547021895887 6.093332954594699e+85
not done -23.08762986945904 3.6616112943708203e+87




 57%|█████▋    | 57/100 [00:01<00:00, 50.05it/s][A[A

 63%|██████▎   | 63/100 [00:01<00:00, 50.39it/s][A[A

not done -24.494979534679523 2.4258832415581341e+89
not done -24.66312416247243 1.6077819897583078e+91
not done -24.058442714997526 1.4829838262039137e+93
not done -24.225846530645118 9.549057431869909e+94
not done -25.271794798705418 6.81910745534636e+96
not done -26.137149945343918 5.435689101426617e+98
not done -24.604065399454655 4.068820443103317e+100
not done -24.93973656601131 2.917865316973786e+102
not done -24.963409764477642 2.6121488821548336e+104
not done -25.969216790590874 1.7420613765264394e+106
not done -25.33578442056387 1.3736525259193911e+108




 68%|██████▊   | 68/100 [00:01<00:00, 49.50it/s][A[A

 73%|███████▎  | 73/100 [00:01<00:00, 49.14it/s][A[A



not done -24.785876757648897 1.1220377335576637e+110
not done -25.578092739502306 8.742018405837738e+111
not done -25.38419263944214 7.520192233413964e+113
not done -25.04293246068911 6.017127876852775e+115
not done -24.94028530659065 4.9619918645506975e+117
not done -25.568621257507225 4.229036198231918e+119
not done -25.427501650061505 3.3252040176413373e+121
not done -24.263875350857607 2.550073115126443e+123
not done -25.373538275934003 1.918985152936047e+125
not done -26.742140854583422 1.5410132519764395e+127
not done -26.434150696970626 1.2896642137656785e+129


 78%|███████▊  | 78/100 [00:01<00:00, 49.33it/s][A[A

 84%|████████▍ | 84/100 [00:01<00:00, 49.94it/s][A[A

not done -26.647557352469633 1.1888120488306058e+131
not done -26.530653277686724 1.1849424947260503e+133
not done -26.43232637596304 1.0712039451999796e+135
not done -26.376949443319 8.378789207291082e+136
not done -27.363089763996868 7.141064868174264e+138
not done -26.132632424346898 6.276351555171364e+140
not done -27.45707585116785 5.107114006865082e+142
not done -26.78962801628681 4.77302403709278e+144
not done -27.903883115115462 4.259330412473467e+146
not done -27.557989014733828 3.795562831298436e+148




 89%|████████▉ | 89/100 [00:01<00:00, 47.86it/s][A[A

 95%|█████████▌| 95/100 [00:01<00:00, 49.30it/s][A[A

not done -27.750329201286238 3.38791339136729e+150
not done -27.0808892379263 3.0132666253151885e+152
not done -27.719267617546222 inf
not done -27.53100343547358 inf
not done -27.864212568820946 inf
not done -27.783338271842748 inf
not done -27.86551387735264 inf
not done -28.51110555088399 inf
not done -28.292558512696267 inf
not done -28.449992494206317 inf
not done -28.33462131802401 inf




100%|██████████| 100/100 [00:01<00:00, 50.09it/s][A[A

not done -28.74566964147958 inf





array([-1.35689828e+174,  2.69636911e+171,  2.69636911e+171,
        1.02270050e+171])

In [89]:
state = env.reset()
for action in actions:
    env.step(action)
    env.render()
    time.sleep(0.005)
    
env.close()

In [90]:
# sometimes random
done = False
state = np.concatenate((env.reset(), np.array([0, 0])))
# mu_action = np.random.uniform(-1, 1, size=(1,))
mu_action = mu(state, θ)
for _ in range(num_steps): 
    action = policy_approx(mu_action, σ, env)
    new_state, reward, done, _ = env.step(action)

    diff = new_state - state[:2]
    state = np.concatenate((new_state, diff))

    mu_action = mu(state, θ) #if np.random.uniform() < 0.3 else np.random.uniform(-1, 1, size=(1,))
    
    env.render()
    time.sleep(0.005)
    
    if done:
        print("done")
        break
    
env.close()

# with featurizer

In [47]:
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Used to converte a state to a featurizes represenation.
# We use RBF kernels with different variances to cover different parts of the space
featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
featurizer.fit(scaler.transform(observation_examples))

def featurize_state(state):
    """
    Returns the featurized representation for a state.
    """
    scaled = scaler.transform([state])
    featurized = featurizer.transform(scaled)
    return featurized[0]

state = env.observation_space.sample()
featState = featurize_state(state)
state, len(featState), featState

(array([0.2800193 , 0.05769776], dtype=float32),
 400,
 array([-1.02214268e-01,  9.10168543e-02,  1.01379090e-01, -1.41332601e-01,
         2.93913883e-02,  1.39476665e-01,  1.95481497e-02,  1.04866269e-01,
        -1.41376966e-01, -6.05327312e-02,  1.34208409e-04, -1.37791173e-01,
        -8.28713953e-02,  2.93601850e-02,  9.67093197e-02, -1.36176980e-01,
        -1.16757212e-01,  5.45302113e-02,  4.22399678e-02, -2.86192697e-02,
         1.22874267e-01,  9.81621030e-02, -1.38032889e-01,  1.21929988e-01,
         1.31356573e-01,  1.30028255e-01, -1.39268045e-01,  1.04703258e-01,
         1.41417678e-01, -1.33806642e-01, -4.34702387e-02, -1.40373701e-01,
        -7.25861297e-02, -9.84651844e-02,  1.08626791e-01,  6.50991227e-02,
         1.24387060e-01,  1.36417675e-01,  1.33312870e-01, -1.71161794e-02,
        -7.02103764e-02, -1.36140019e-01,  5.43915250e-02, -9.79312036e-02,
         1.28723526e-01,  7.76717552e-02, -6.37048887e-02,  5.29013539e-02,
        -1.41262207e-01, -1.34632

In [49]:
num_episodes = 10
num_steps = 300

α = 0.1
θ = np.random.rand(400,)
σ = 1
γ = 0.8

for i in tqdm(range(num_episodes)):
    # diminishing exploration rate
    e = (0.8/num_episodes) * i + 0.1
    
    rewards = []
    mu_actions = []
    actions = []
    states = []

    state = env.reset()
    state = featurize_state(state)
    
    mu_action = mu(state, θ) if np.random.uniform() < e else np.random.uniform(-1, 1, size=(1,))

    # sometimes random
    for _ in range(num_steps):    
        action = policy_approx(mu_action, σ, env)
        state, reward, done, _ = env.step(action)

        state = featurize_state(state)
    
        # state includes diff
        states.append(state)
        mu_actions.append(mu_action)
        actions.append(action)
        
        # keep track of actions and rewards
        rewards.append(reward)
        
        if done:
            print("done")
#             # replay
#             state = env.reset()
#             for action in actions:
#                 env.step(action)
#                 env.render()

#             env.close()
#             raise Exception("")
            break
            
        mu_action = mu(state, θ) if np.random.uniform() < e else np.random.uniform(-1, 1, size=(1,))
        
#         assert abs(mu_action) < 2
                                
    # train minimise reward
    for state, action, mu_action in zip(states, actions, mu_actions):
        mu_action = abs(mu_action) if mu_action < -1 else mu_action
        dlogpi = state * ((action - mu_action)/σ**2)
        θ = θ + α * dlogpi * sum(rewards)
    
#     i = 0
#     for state, action, mu_action in zip(states, actions, mu_actions):
#         dlogpi = state * ((action - mu_action)/σ**2)
#         R = 0
#         for r in rewards[i:][::-1]:
#             R = r + γ * R
#         # minimise reward
#         θ = θ + α * dlogpi * R

#         i += 1
        
θ



  0%|          | 0/10 [00:00<?, ?it/s][A[A

 10%|█         | 1/10 [00:00<00:01,  5.85it/s][A[A

 20%|██        | 2/10 [00:00<00:01,  5.99it/s][A[A

 30%|███       | 3/10 [00:00<00:01,  6.16it/s][A[A

 40%|████      | 4/10 [00:00<00:00,  6.10it/s][A[A

 50%|█████     | 5/10 [00:00<00:00,  6.26it/s][A[A

 60%|██████    | 6/10 [00:00<00:00,  6.29it/s][A[A

 70%|███████   | 7/10 [00:01<00:00,  6.38it/s][A[A

 80%|████████  | 8/10 [00:01<00:00,  6.24it/s][A[A

 90%|█████████ | 9/10 [00:01<00:00,  6.14it/s][A[A

100%|██████████| 10/10 [00:01<00:00,  6.24it/s][A[A


array([ 6.08328959e+26, -1.85805726e+26,  8.50762986e+26,  1.00253944e+27,
       -1.08845524e+27,  1.51516845e+26,  4.95188248e+26,  7.92468584e+26,
        5.73051834e+26, -9.75896230e+26,  2.95974368e+26, -3.12469319e+25,
       -1.84534288e+26, -2.29740703e+26, -1.91736007e+26, -1.96955533e+26,
        8.04596676e+25,  6.70508477e+25, -7.56034292e+26,  6.32639151e+26,
        1.02487732e+27, -5.11286116e+26, -1.64554115e+26,  6.35927638e+26,
       -6.45490669e+26,  5.19082855e+26,  4.40992396e+26, -8.02255684e+25,
        3.71914562e+26, -2.65434767e+26, -9.54874483e+26, -6.63951482e+26,
        5.51222306e+26, -7.99700431e+25,  1.12765380e+26,  1.05039457e+27,
        4.71671532e+26,  5.03587524e+26, -2.99372284e+26, -9.75017212e+26,
       -8.51201366e+26, -2.87381982e+26,  3.46525630e+26, -7.34881558e+26,
        6.44159238e+26,  2.24373581e+26,  1.01660290e+27,  6.97961644e+26,
       -3.91404341e+26, -1.18560499e+26,  7.37182533e+26, -6.14799276e+26,
        6.00370080e+26, -

# actor-critic with tensorflow

In [60]:
import gym
import itertools
import matplotlib
import numpy as np
import sys
import collections

# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()
import tensorflow as tf

In [None]:
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Used to converte a state to a featurizes represenation.
# We use RBF kernels with different variances to cover different parts of the space
featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
featurizer.fit(scaler.transform(observation_examples))

def featurize_state(state):
    """
    Returns the featurized representation for a state.
    """
    scaled = scaler.transform([state])
    featurized = featurizer.transform(scaled)
    return featurized[0]

state = env.observation_space.sample()
featState = featurize_state(state)
state, len(featState), featState

In [79]:
from tensorflow.keras.layers import Lambda, Input, Dense
from tensorflow.keras.backend import random_normal
from tensorflow.keras.models import Model

In [80]:
state = Input(shape=400)
μ = Dense(1, activation=None)(state)
σ = Dense(1, activation=None)(state)
# σ = tf.math.softplus(σ) + 1e-5

action = random_normal(shape=(1, 1), mean=μ, stddev=σ)
action = tf.clip_by_value(action, env.action_space.low[0], env.action_space.high[0])

policyEstimator = Model(state, action)

In [90]:
s = env.observation_space.sample()
s = featurize_state(s)
s = np.array([s])
s, policyEstimator.predict(s)

(array([[ 0.13246266, -0.12773119,  0.14095157, -0.08511366,  0.09100377,
         -0.01563032,  0.01494   , -0.11525381,  0.12598155,  0.01621066,
         -0.07662953, -0.04708563, -0.0379208 ,  0.05464486, -0.07814908,
          0.14069131, -0.13721569, -0.10242794, -0.13826155,  0.1314298 ,
          0.13952601, -0.03343887,  0.08508133, -0.08104398, -0.13889242,
         -0.13320044,  0.1212669 ,  0.05445997,  0.10991691, -0.06585452,
         -0.03454165, -0.03879303,  0.03079203, -0.00737028, -0.13605325,
         -0.04802686, -0.13897802, -0.05802051, -0.09685986,  0.02399825,
         -0.14137235, -0.1232097 , -0.13721324, -0.12164906,  0.13809611,
          0.11326175, -0.01346296, -0.10940263, -0.01316769,  0.14140378,
          0.09248487, -0.10120197, -0.06947694,  0.11920745, -0.14071681,
          0.04742168,  0.14115267,  0.07627147,  0.05492381, -0.10158316,
          0.07357183, -0.13508643, -0.04825777, -0.14098352,  0.01246688,
          0.0263538 ,  0.13978305,  0.

In [56]:
class PolicyEstimator():
    """ policy function approximator """
    def __init__(self, α=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [400], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # This is just linear classifier
            self.μ = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.state, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)
            self.μ = tf.squeeze(self.μ)
            
            self.σ = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.state, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)
            self.σ = tf.squeeze(self.σ)
            self.σ = tf.nn.softplus(self.σ) + 1e-5

            self.normal_dist = tf.contrib.distributions.Normal(self.μ, self.σ)
            self.action = self.normal_dist._sample_n(1)
            self.action = tf.clip_by_value(self.action, env.action_space.low[0], env.action_space.high[0])

            # Loss and train op
            self.loss = -self.normal_dist.log_prob(self.action) * self.target
            # Add cross entropy cost to encourage exploration
            self.loss -= 1e-1 * self.normal_dist.entropy()
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=α)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.contrib.framework.get_global_step())
            
        def predict(self, state, sess=None):
            sess = sess or tf.get_default_session()
            state = featurize_state(state)
            return sess.run(self.action, { self.state: state })
        
        def update(self, state, target, action, sess=None):
            sess = sess or tf.get_default_session()
            state = featurize_state(state)
            feed_dict = { self.state: state, self.target: target, self.action: action  }
            _, loss = sess.run([self.train_op, self.loss], feed_dict)
            return loss

In [57]:
class ValueEstimator():
    def __init__(self, α=0.1, scope="value_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [400], "state")
            
            self.target = tf.placeholder(dtype=tf.float32, name="target")
            # This is just linear classifier
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.state, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            self.value_estimate = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimate, self.target)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=α)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.contrib.framework.get_global_step())        
    
    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        return sess.run(self.value_estimate, { self.state: state })

    def update(self, state, target, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        feed_dict = { self.state: state, self.target: target }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        
        return loss

In [59]:
policy_estimator = PolicyEstimator(α=0.001)
value_estimator = ValueEstimator(α=0.1)

AttributeError: module 'tensorflow_core.compat.v1' has no attribute 'contrib'